In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the train and test splits from the CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train[train.text.isnull()]

In [None]:
# Remove rows whose text is missing and renumber the remaining rows from 0.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column; without it, running this cell a second time raises because the
# 'index' column already exists.
train = train[train.text.notnull()].reset_index(drop=True)

In [None]:
train.shape

In [None]:
%matplotlib inline

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Bar chart of the number of training tweets per sentiment class.
sentiment_names = ['positive','neutral','negative']
bar_positions = [0,1,2]
tweets_per_sentiment = [len(train[train.sentiment==name]) for name in sentiment_names]
plt.bar(x=bar_positions, height=tweets_per_sentiment, color=['g','orange','firebrick'])
plt.xticks(bar_positions, sentiment_names)
plt.xlabel('Sentiment')
plt.ylabel('# of tweets')
plt.show()

**Bag of words**

In [None]:
#create count_vector and fit it to the train data
from sklearn.feature_extraction.text import CountVectorizer
# lowercase every token and exclude scikit-learn's built-in list of common English words
count_vector = CountVectorizer(lowercase=True, stop_words='english')
# Series.get_values() was removed in pandas 1.0; the vectorizer accepts the
# Series of raw strings directly, so no conversion is needed.
count_vector.fit(train['text'])

In [None]:
#transform train to bag of words matrix (one row per tweet, one column per vocabulary word)
words_array = count_vector.transform(train['text']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out()
else:
    feature_names = count_vector.get_feature_names()
words_matrix = pd.DataFrame(words_array, columns=feature_names)

Count the number of samples that contain each word in the entire training dataset, and in the positive and negative samples separately:

In [None]:
# Number of tweets containing each word (any non-zero count counts once),
# ordered from the most to the least widespread word.
total_freq = (
    words_matrix
    .astype(bool)
    .sum(axis=0)
    .sort_values(ascending=False)
)

In [None]:
# Attach each tweet's label to the bag-of-words frame (rows are matched on the index).
# NOTE(review): if 'sentiment' is also a vocabulary token, this assignment
# overwrites that word's count column rather than appending a new one.
words_matrix['sentiment'] = train['sentiment']

In [None]:
# Per-class document frequency: in how many positive / negative tweets each word occurs.
# The label column is removed by name rather than by position (iloc[:, :-1]),
# so the result stays correct even if 'sentiment' is not the last column.
word_presence = words_matrix.drop(columns=['sentiment']).astype(bool)
positive_freq = word_presence[words_matrix.sentiment == 'positive'].sum(axis=0).sort_values(ascending=False)
negative_freq = word_presence[words_matrix.sentiment == 'negative'].sum(axis=0).sort_values(ascending=False)

In [None]:
# One row per word, in total_freq order, with the overall, positive and negative
# tweet counts side by side.
words_stats = pd.concat(
    [
        total_freq,
        positive_freq.reindex(total_freq.index),
        negative_freq.reindex(total_freq.index),
    ],
    axis=1,
    keys=['total_count', 'pos_count', 'neg_count'],
)

In [None]:
words_stats.head(25)

In [None]:
words_stats.loc['awesome']

Remove common words that appear in both pos and neg classes with a similar frequency (or with a high frequency in neutral sentiment records):

In [None]:
# Share of the tweets containing each word that are labelled positive / negative.
tweets_with_word = words_stats['total_count']
words_stats['pos_pct'] = words_stats['pos_count'].div(tweets_with_word)
words_stats['neg_pct'] = words_stats['neg_count'].div(tweets_with_word)

In [None]:
# Keep only words for which at least thres_pct of the tweets containing them
# are positive, or at least thres_pct are negative.
thres_pct = 0.5
mostly_positive = words_stats.pos_pct.ge(thres_pct)
mostly_negative = words_stats.neg_pct.ge(thres_pct)
words_stats_reduced = words_stats[mostly_positive | mostly_negative]

In [None]:
words_stats_reduced.head()

Top words associated with positive sentiment:

In [None]:
words_stats_reduced[words_stats_reduced.pos_pct>words_stats_reduced.neg_pct].head(10)

In [None]:
# Words whose positive share exceeds their negative share, as an array of tokens.
leans_positive = words_stats_reduced.pos_pct.gt(words_stats_reduced.neg_pct)
pos_words = words_stats_reduced.loc[leans_positive].index.values

Top words associated with negative sentiment:

In [None]:
words_stats_reduced[words_stats_reduced.pos_pct<words_stats_reduced.neg_pct].head(10)

In [None]:
# Words whose negative share exceeds their positive share, as an array of tokens.
leans_negative = words_stats_reduced.pos_pct.lt(words_stats_reduced.neg_pct)
neg_words = words_stats_reduced.loc[leans_negative].index.values

**Baseline model**

We will first test a simple model which is based on the balance between the count of words that are commonly encountered in positive sentiment tweets and words commonly used in negative sentiment tweets. If a given tweet contains more 'positive' words than 'negative' words it is classified as positive while if it contains more 'negative' words it is classified as negative. If there is the same number of 'positive' and 'negative' words (or the difference between the two counts is not greater than a pre-defined limit) then the tweet is classified as 'neutral'.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def baseline_classifier(matrix, pos_words, neg_words, limit=0):
    """Label each row by the balance of 'positive' versus 'negative' word counts.

    Parameters
    ----------
    matrix : DataFrame with one row per sample and one column per word.
    pos_words : column labels of the words treated as positive.
    neg_words : column labels of the words treated as negative.
    limit : margin the balance must exceed (in either direction) before a
        non-neutral label is assigned.

    Returns
    -------
    Array with one of 'positive', 'negative' or 'neutral' per row of ``matrix``.
    """
    pos_total = matrix[pos_words].sum(axis=1)
    neg_total = matrix[neg_words].sum(axis=1)
    balance = pos_total - neg_total

    def label_for(score):
        # Strict comparisons: a balance exactly at +/-limit stays neutral.
        if score > limit:
            return 'positive'
        if score < -limit:
            return 'negative'
        return 'neutral'

    return balance.apply(label_for).values

Run on the training set with limit=0 (i.e. a tweet is 'positive' if n_pos > n_neg):

In [None]:
train_pred = baseline_classifier(words_matrix,pos_words,neg_words,limit=0)

In [None]:
# Overall accuracy plus per-class precision / recall / F1 (average=None gives one value per class).
train_true = train.sentiment.values
print('Results on training set:')
print('Accuracy score: ', accuracy_score(train_true, train_pred))
for score_label, score_fn in [('Precision score: ', precision_score),
                              ('Recall score: ', recall_score),
                              ('F1 score: ', f1_score)]:
    print(score_label, score_fn(train_true, train_pred, average=None))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix on the training set: rows are true classes, columns are
# predicted classes, both in class_order.
class_order = ['positive','neutral','negative']
# labels must be passed by keyword; current scikit-learn rejects it as a third positional argument
cm = confusion_matrix(train.sentiment.values, train_pred, labels=class_order)

ax = plt.subplot()
# fmt='d' prints the integer counts in full instead of seaborn's default short float format
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(class_order)
ax.yaxis.set_ticklabels(class_order)
plt.show()

Run on the training set with limit=1 (i.e. a tweet is 'positive' only if n_pos - n_neg > 1):

In [None]:
train_pred = baseline_classifier(words_matrix,pos_words,neg_words,limit=1)

In [None]:
# Same report as before, for the predictions currently stored in train_pred.
train_true = train.sentiment.values
print('Results on training set:')
print('Accuracy score: ', accuracy_score(train_true, train_pred))
for score_label, score_fn in [('Precision score: ', precision_score),
                              ('Recall score: ', recall_score),
                              ('F1 score: ', f1_score)]:
    print(score_label, score_fn(train_true, train_pred, average=None))

While recall improves for 'neutral' when limit>0, it drops significantly for 'positive' and 'negative' tweets.

Run on the test set with limit=0:

In [None]:
#transform test to bag of words matrix, using count vectorizer fitted to train data
test_array = count_vector.transform(test['text']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(count_vector, 'get_feature_names_out'):
    test_feature_names = count_vector.get_feature_names_out()
else:
    test_feature_names = count_vector.get_feature_names()
test_matrix = pd.DataFrame(test_array, columns=test_feature_names)

In [None]:
test_pred = baseline_classifier(test_matrix,pos_words,neg_words,limit=0)

In [None]:
# Overall accuracy plus per-class precision / recall / F1 on the test set.
# The header previously said "training set" although these are test-set scores.
test_true = test.sentiment.values
print('Results on test set:')
print('Accuracy score: ', accuracy_score(test_true, test_pred))
print('Precision score: ', precision_score(test_true, test_pred, average=None))
print('Recall score: ', recall_score(test_true, test_pred, average=None))
print('F1 score: ', f1_score(test_true, test_pred, average=None))

In [None]:
# Confusion matrix on the test set: rows are true classes, columns are
# predicted classes, both in class_order.
class_order = ['positive','neutral','negative']
# labels must be passed by keyword; current scikit-learn rejects it as a third positional argument
cm = confusion_matrix(test.sentiment.values, test_pred, labels=class_order)

ax = plt.subplot()
# fmt='d' prints the integer counts in full instead of seaborn's default short float format
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(class_order)
ax.yaxis.set_ticklabels(class_order)
plt.show()

**Naive Bayes Model**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the sentiment strings to integer class ids; `le` keeps the fitted mapping
# so the ids can be translated back to strings later.
le = LabelEncoder()
train_labels = le.fit_transform(train.sentiment.astype(str))

In [None]:
# Feature matrix for the Naive Bayes model: the bag-of-words counts as integers,
# without the 'sentiment' label column that was added to words_matrix earlier.
# Previously train_matrix was used here without ever being assigned, which
# raises NameError on a fresh kernel.
train_matrix = words_matrix.drop(columns=['sentiment']).astype(np.int64)
train_matrix.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyper-parameters.
naive_bayes = MultinomialNB()
# Fit on the feature matrix against the label-encoded sentiments in train_labels.
# NOTE(review): train_matrix must already exist in the kernel when this cell
# runs — confirm that an earlier cell defines it.
naive_bayes.fit(train_matrix, train_labels)