# Sentiment Analysis on Tweets

For this project, we use a large dataset of tweets about US airlines, each labelled with one of three sentiments: positive, negative, or neutral. Our goal is to train a model on this dataset that predicts the sentiment (positive, negative, or neutral) of new tweets about airlines.
 

## Load Data

In [17]:
import pandas as pd
import numpy as np

# Read the raw tweet table from disk.
df = pd.read_csv('Tweets.csv')

# Keep only the tweets whose sentiment label has a confidence of exactly 1
# (labelled with 100% confidence), and only the two columns used below:
# the tweet text and its sentiment label.
fully_confident = df['airline_sentiment_confidence'] == 1
df2 = df.loc[fully_confident, ['text', 'airline_sentiment']]

# Tweet texts as a NumPy array.
textArray = np.array(df2['text'])

## Vectorize Tweets

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# Bag-of-words counts: tokenize each tweet with NLTK's word tokenizer and
# keep only terms that occur in at least two tweets (min_df=2).
tweet_vec = CountVectorizer(
    tokenizer=nltk.word_tokenize,
    min_df=2,
)
tweet_counts = tweet_vec.fit_transform(textArray)

# Report the matrix dimensions as (number of tweets, vocabulary size).
print(tweet_counts.shape)

(10445, 5442)


## Create TF-IDF Vector

In [19]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts with TF-IDF. The transformer is fitted on
# the count matrix and then applied to that same matrix.
tfidf_transformer = TfidfTransformer()
tweet_tfidf = tfidf_transformer.fit(tweet_counts).transform(tweet_counts)

# Same dimensions as the count matrix: only the cell values change.
print(tweet_tfidf.shape)

(10445, 5442)


## Create Target Vector

In [20]:
# Encode the sentiment labels numerically:
#   positive -> 1, negative -> -1, anything else (neutral) -> 0.
# The result is a float array with one entry per retained tweet.
sentiment_labels = df2['airline_sentiment']
target = np.select(
    [
        (sentiment_labels == 'positive').to_numpy(),
        (sentiment_labels == 'negative').to_numpy(),
    ],
    [1.0, -1.0],
    default=0.0,
)

## Using SVM from scikit-learn

### Use k-fold cross validation

In [21]:
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

# RBF-kernel support vector classifier. A truthy `verbose` makes LibSVM
# emit its "[LibSVM]" progress marker on every fit.
svc = svm.SVC(probability=False, kernel="rbf", C=2.8, gamma=.0073, verbose=10)

nfold = 10
# Seed the shuffle: with shuffle=True and no random_state the folds are
# drawn differently on every run, so the reported accuracy and confusion
# matrix could not be reproduced.
kf = KFold(n_splits=nfold, shuffle=True, random_state=0)

# Confusion counts summed over all folds. Rows are true labels, columns are
# predicted labels, both ordered [positive (1), negative (-1), neutral (0)].
C = np.zeros([3, 3])
# Accuracy measured on the held-out part of each fold.
acc = []
for i, (train, test) in enumerate(kf.split(tweet_tfidf), start=1):
    # Split features and labels into this fold's train / held-out parts.
    Xtr = tweet_tfidf[train, :]
    ytr = target[train]
    Xts = tweet_tfidf[test, :]
    yts = target[test]

    # Refit the same estimator on the training part, predict the rest.
    svc.fit(Xtr, ytr)
    yhat = svc.predict(Xts)

    C += confusion_matrix(yts, yhat, labels=[1, -1, 0])
    acci = np.mean(yhat == yts)
    acc.append(acci)
    # Progress indicator: 1-based number of the fold just finished.
    print(i)

[LibSVM]1
[LibSVM]2
[LibSVM]3
[LibSVM]4
[LibSVM]5
[LibSVM]6
[LibSVM]7
[LibSVM]8
[LibSVM]9
[LibSVM]10


### Confusion Matrix and Accuracy

In [22]:
# Mean accuracy over the folds and its standard error. Dividing the
# population standard deviation by sqrt(nfold - 1) is the standard error of
# the mean when `acc` holds one value per fold.
accm = np.mean(acc)
acc_se = np.std(acc) / np.sqrt(nfold - 1)

# Row-normalise the summed confusion matrix so that each row (one true
# class) sums to 1.
Cm = C / C.sum(axis=1, keepdims=True)

# Print the raw counts first, then the row-normalised rates.
for title, matrix in (('C = ', C), ('Cm = ', Cm)):
    print(title)
    print(np.array_str(matrix, precision=4, suppress_small=True))
print('Accuracy =  {0:.4f}, SE={1:.4f}'.format(accm, acc_se))


C = 
[[ 420. 1094.    1.]
 [   3. 7378.    1.]
 [  11. 1499.   38.]]
Cm = 
[[0.2772 0.7221 0.0007]
 [0.0004 0.9995 0.0001]
 [0.0071 0.9683 0.0245]]
Accuracy =  0.7502, SE=0.0034


## Using Multinomial Naive Bayes Classifier from scikit-learn

### Use k-fold cross validation

In [25]:
from sklearn.naive_bayes import MultinomialNB

nfold = 10
# Seed the shuffle: with shuffle=True and no random_state the folds are
# drawn differently on every run, so the reported accuracy and confusion
# matrix could not be reproduced.
kf = KFold(n_splits=nfold, shuffle=True, random_state=0)

# Confusion counts summed over all folds. Rows are true labels, columns are
# predicted labels, both ordered [positive (1), negative (-1), neutral (0)].
C = np.zeros([3, 3])
# Accuracy measured on the held-out part of each fold.
acc = []
for train, test in kf.split(tweet_tfidf):
    # Split features and labels into this fold's train / held-out parts.
    Xtr = tweet_tfidf[train, :]
    ytr = target[train]
    Xts = tweet_tfidf[test, :]
    yts = target[test]

    # A fresh classifier is fitted for every fold.
    clf = MultinomialNB().fit(Xtr, ytr)
    yhat = clf.predict(Xts)

    C += confusion_matrix(yts, yhat, labels=[1, -1, 0])
    acci = np.mean(yhat == yts)
    acc.append(acci)

### Confusion Matrix and Accuracy

In [26]:
# Mean accuracy over the folds and its standard error. Dividing the
# population standard deviation by sqrt(nfold - 1) is the standard error of
# the mean when `acc` holds one value per fold.
accm = np.mean(acc)
acc_se = np.std(acc) / np.sqrt(nfold - 1)

# Row-normalise the summed confusion matrix so that each row (one true
# class) sums to 1.
Cm = C / C.sum(axis=1, keepdims=True)

# Print the raw counts first, then the row-normalised rates.
for title, matrix in (('C = ', C), ('Cm = ', Cm)):
    print(title)
    print(np.array_str(matrix, precision=4, suppress_small=True))
print('Accuracy =  {0:.4f}, SE={1:.4f}'.format(accm, acc_se))


C = 
[[ 450. 1058.    7.]
 [   1. 7371.   10.]
 [  18. 1276.  254.]]
Cm = 
[[0.297  0.6983 0.0046]
 [0.0001 0.9985 0.0014]
 [0.0116 0.8243 0.1641]]
Accuracy =  0.7731, SE=0.0052
