## DEFINE

### ---- Define the problem ----

The purpose of this project is to analyze the movie review statements left by customers in order to determine the sentiment.  Sarcasm and use of certain adjectives might lead to false positive and false negative statements. 

In [1]:
##install packages
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

### ---- Load the data ----

In [2]:
##read in csv
import pandas as p
train=p.read_csv("/Users/arielledortch/Downloads/kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

In [3]:
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train[0])
print(y_train[0])
print(X_test[0])
print(y_test[0])

(93636,) (93636,) (62424,) (62424,)
almost in a class with that of Wilde
3
escape movie
2


## DISCOVER

### ---- Examine the data ----

In [4]:
# Check how many training examples in each category
# this is important to see whether the data set is balanced or skewed

training_labels = set(y_train)
print(training_labels)
from scipy.stats import itemfreq
training_category_dist = itemfreq(y_train)
print(training_category_dist)

{0, 1, 2, 3, 4}
[[    0  4141]
 [    1 16449]
 [    2 47718]
 [    3 19859]
 [    4  5469]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  import sys


In [5]:
# Check how many testing examples in each category
# this is important to see whether the data set is balanced or skewed

testing_labels = set(y_test)
print(testing_labels)
from scipy.stats import itemfreq
testing_category_dist = itemfreq(y_test)
print(testing_category_dist)

{0, 1, 2, 3, 4}
[[    0  2931]
 [    1 10824]
 [    2 31864]
 [    3 13068]
 [    4  3737]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  import sys


### ---- Vectorize the data ----

In [6]:
# Read the sklearn documentation to understand all vectorization options

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# several commonly used vectorizer setting

#  unigram boolean vectorizer, set minimum document frequency to 5
unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')

#  unigram term frequency vectorizer, set minimum document frequency to 5
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')

#  unigram and bigram term frequency vectorizer, set minimum document frequency to 5
bigram_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')

#  unigram tfidf vectorizer, set minimum document frequency to 5
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')

In [7]:
# The vectorizer can do "fit" and "transform"
# fit is a process to collect unique tokens into the vocabulary
# transform is a process to convert each document to vector based on the vocabulary
# These two processes can be done together using fit_transform(), or used individually: fit() or transform()

# fit vocabulary in training documents and transform the training documents into vectors
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec.shape)
print(X_train_vec[0].toarray())

# check the size of the constructed vocabulary
print(len(unigram_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_count_vectorizer.vocabulary_.items())[:10])

(93636, 11967)
[[0 0 0 ... 0 0 0]]
11967
[('class', 1858), ('wilde', 11742), ('derring', 2802), ('chilling', 1764), ('affecting', 313), ('meanspirited', 6557), ('personal', 7662), ('low', 6296), ('involved', 5602), ('worth', 11868)]


In [8]:
# use the vocabulary constructed from the training data to vectorize the test data. 
# Therefore, use "transform" only, not "fit_transform", 
# otherwise "fit" would generate a new vocabulary from the test data

X_test_vec = unigram_count_vectorizer.transform(X_test)

# print out #examples and #features in the test set
print(X_test_vec.shape)

(62424, 11967)


## DEVELOP

### ---- Create models ----

In [9]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
nb_clf.fit(X_train_vec,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
nb_clf.score(X_test_vec,y_test)

0.606401384083045

In [11]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred = nb_clf.fit(X_train_vec, y_train).predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4])
print(cm)

[[  742  1276   797   105    11]
 [  614  4126  5397   655    32]
 [  248  2385 25756  3239   236]
 [   19   456  5570  6253   770]
 [    1    53   729  1977   977]]


In [12]:
# print classification report

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred, target_names=target_names))

[0.45689655 0.49734812 0.67337708 0.51132554 0.482231  ]
[0.25315592 0.38118995 0.80831032 0.47849709 0.26143966]
              precision    recall  f1-score   support

           0       0.46      0.25      0.33      2931
           1       0.50      0.38      0.43     10824
           2       0.67      0.81      0.73     31864
           3       0.51      0.48      0.49     13068
           4       0.48      0.26      0.34      3737

   micro avg       0.61      0.61      0.61     62424
   macro avg       0.52      0.44      0.47     62424
weighted avg       0.59      0.61      0.59     62424



In [24]:
feature_ranks = sorted(zip(nb_clf.coef_[0], unigram_count_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_negative_10 = feature_ranks[-10:]
print("Very Negative")
for i in range(0, len(very_negative_10)):
    print(very_negative_10[i])
print()

## get 10 features that are least relevant to "very negative" sentiment (they are at the top of the ranked list)
not_very_negative_10 = feature_ranks[:10]
print("Very Positive")
for i in range(0, len(not_very_negative_10)):
    print(not_very_negative_10[i])
print()

Very Negative
(-5.941598005980322, 'time')
(-5.931015896649785, 'characters')
(-5.92054459678249, 'minutes')
(-5.92054459678249, 'story')
(-5.910181809746943, 'comedy')
(-5.689102242653584, 'just')
(-5.137785257532857, 'like')
(-4.975504451622348, 'bad')
(-4.832403607981675, 'film')
(-4.3215779842156845, 'movie')

Very Positive
(-10.484892788250326, '102')
(-10.484892788250326, '10th')
(-10.484892788250326, '127')
(-10.484892788250326, '13th')
(-10.484892788250326, '14')
(-10.484892788250326, '16')
(-10.484892788250326, '163')
(-10.484892788250326, '168')
(-10.484892788250326, '170')
(-10.484892788250326, '1790')



In [13]:
# import the LinearSVC module
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
svm_clf = LinearSVC(C=1)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [15]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred2 = svm_clf.fit(X_train_vec, y_train).predict(X_test_vec)
cm2=confusion_matrix(y_test, y_pred2, labels=[0,1,2,3,4])
print(cm2)

[[  918  1221   697    82    13]
 [  701  4080  5504   514    25]
 [  195  2106 27081  2310   172]
 [   34   396  6048  5533  1057]
 [    3    51   590  1772  1321]]


In [16]:
# print classification report

print(precision_score(y_test, y_pred2, average=None))
print(recall_score(y_test, y_pred2, average=None))
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred2, target_names=target_names))

[0.49594814 0.51948052 0.67838176 0.54186661 0.51043277]
[0.31320368 0.37694013 0.8498933  0.42340067 0.35349211]
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.75     31864
           3       0.54      0.42      0.48     13068
           4       0.51      0.35      0.42      3737

   micro avg       0.62      0.62      0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424



In [22]:
## Linear SVC also ranks all features based on their contribution to distinguish the two concepts in each binary classifier
## For category "0" (very negative), get all features and their weights and sort them in increasing order
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names()))

## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
very_negative_10 = feature_ranks[-10:]
print("Very Negative")
for i in range(0, len(very_negative_10)):
    print(very_negative_10[i])
print()

## get 10 features that are least relevant to "very negative" sentiment (they are at the top of the ranked list)
not_very_negative_10 = feature_ranks[:10]
print("Very Positive")
for i in range(0, len(not_very_negative_10)):
    print(not_very_negative_10[i])
print()

Very Negative
(1.62160999004353, 'cesspool')
(1.6484881338928794, 'disappointment')
(1.6592494240393827, 'pompous')
(1.6683696592808896, 'stinks')
(1.6927739439360225, 'distasteful')
(1.6955904869852574, 'unwatchable')
(1.7526397573716288, 'unbearable')
(1.7873567405771817, 'stinker')
(1.8228706330454685, 'disgusting')
(1.8233057147577225, 'worthless')

Very Positive
(-1.8329269147029115, 'hawke')
(-1.7372807353160562, 'giddy')
(-1.6832953441761966, 'collar')
(-1.5847292421661077, 'swimfan')
(-1.5720764835196204, 'blue')
(-1.4801113634537777, 'dogtown')
(-1.4138361223835711, 'clamoring')
(-1.409353239619115, 'joan')
(-1.3918162689843427, 'victim')
(-1.3400001519321338, 'compulsively')



In [59]:
### Task 2 ####
# The vectorizer can do "fit" and "transform"
# fit is a process to collect unique tokens into the vocabulary
# transform is a process to convert each document to vector based on the vocabulary
# These two processes can be done together using fit_transform(), or used individually: fit() or transform()

# fit vocabulary in training documents and transform the training documents into vectors
X_train_vec2 = bigram_count_vectorizer.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec2.shape)
print(X_train_vec2[0].toarray())

# check the size of the constructed vocabulary
print(len(bigram_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(bigram_count_vectorizer.vocabulary_.items())[:10])

(55, 61)
[[1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 3
  2 0 0 0 1 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
61
[('restaurant', 41), ('overall', 36), ('ordered', 35), ('really', 40), ('good', 21), ('sauce', 44), ('best', 3), ('amazing', 0), ('experience', 14), ('service', 46)]


In [60]:
# use the vocabulary constructed from the training data to vectorize the test data. 
# Therefore, use "transform" only, not "fit_transform", 
# otherwise "fit" would generate a new vocabulary from the test data

X_test_vec2 = bigram_count_vectorizer.transform(X_test)

# print out #examples and #features in the test set
print(X_test_vec2.shape)

(37, 61)


In [61]:
# import the MNB module
from sklearn.naive_bayes import MultinomialNB

# initialize the MNB model
nb_clf2= MultinomialNB()

# use the training data to train the MNB model
nb_clf2.fit(X_train_vec2,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
nb_clf2.score(X_test_vec2,y_test)

0.8947368421052632

In [31]:
# print confusion matrix (row: ground truth; col: prediction)

y_pred3 = nb_clf2.fit(X_train_vec2, y_train).predict(X_test_vec2)
cm3=confusion_matrix(y_test, y_pred3, labels=['p','n'])
print(cm3)

[[8 0]
 [2 9]]


In [32]:
print(precision_score(y_test, y_pred3, average=None))
print(recall_score(y_test, y_pred3, average=None))
target_names = ['p','n']
print(classification_report(y_test, y_pred3, target_names=target_names))

[1.  0.8]
[0.81818182 1.        ]
              precision    recall  f1-score   support

           p       1.00      0.82      0.90        11
           n       0.80      1.00      0.89         8

   micro avg       0.89      0.89      0.89        19
   macro avg       0.90      0.91      0.89        19
weighted avg       0.92      0.89      0.90        19



In [33]:
# initialize the LinearSVC model
svm_clf2 = LinearSVC(C=1)

# use the training data to train the model
svm_clf2.fit(X_train_vec2,y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [34]:
svm_clf2.score(X_test_vec2,y_test)

0.7894736842105263

In [35]:
# print confusion matrix (row: ground truth; col: prediction)

y_pred4 = svm_clf2.fit(X_train_vec2, y_train).predict(X_test_vec2)
cm4=confusion_matrix(y_test, y_pred4, labels=['p','n'])
print(cm4)

[[ 5  3]
 [ 1 10]]


In [36]:
print(precision_score(y_test, y_pred4, average=None))
print(recall_score(y_test, y_pred4, average=None))
target_names = ['p','n']
print(classification_report(y_test, y_pred4, target_names=target_names))

[0.76923077 0.83333333]
[0.90909091 0.625     ]
              precision    recall  f1-score   support

           p       0.77      0.91      0.83        11
           n       0.83      0.62      0.71         8

   micro avg       0.79      0.79      0.79        19
   macro avg       0.80      0.77      0.77        19
weighted avg       0.80      0.79      0.78        19



## DEPLOY

In [None]:
kaggle_test=pd.read_csv("/Users/arielledortch/Downloads/kaggle-sentiment/test.tsv", delimiter='\t') 
# preserve the id column of the test examples 
kaggle_ids=kaggle_test['PhraseId'].values 
# read in the text content of the examples 
kaggle_X_test=kaggle_test['Phrase'].values 
# vectorize the test examples using the vocabulary fitted from the 60% training data 
kaggle_X_test_vec=unigram_count_vectorizer.transform(kaggle_X_test) 
# predict using the NB classifier that we built 
kaggle_pred=svm_clf.fit(X_train_vec, y_train).predict(kaggle_X_test_vec) 
# combine the test example ids with their predictions 
kaggle_submission=zip(kaggle_ids, kaggle_pred) 
# prepare output file 
outf=open('kaggle_SVC_submission.csv', 'w') 
# write header outf.write('PhraseId,Sentiment\n') # write predictions with ids to the output file 
for x, value in enumerate(kaggle_submission): 
    outf.write(str(value[0]) + ',' + str(value[1]) + '\n') 
    # close the output file 
    outf.close()