# Sentiment Analysis for Movie Reviews

In [27]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [14]:
# Load the tab-separated movie review file and preview its first rows.
movies = pd.read_csv(
    'moviereviews.tsv',
    sep='\t',
)
movies.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [17]:
# Drop rows that contain a missing value in any column.
movies.dropna(inplace=True)

# Drop rows whose review consists solely of whitespace characters.
# Only the 'review' column is iterated, so this cell keeps working when it is
# re-run after later cells have added more columns to `movies` (unpacking
# whole rows from itertuples() fails as soon as the column count changes).
blanks = [
    index
    for index, review in movies['review'].items()
    if isinstance(review, str) and review.isspace()
]
movies.drop(blanks, inplace=True)

In [20]:
# Class-balance check: how many reviews carry each sentiment label.
movies.loc[:, 'label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [25]:
# Sentiment Analysis
# Building the analyzer loads the VADER lexicon through NLTK's data loader,
# which raises LookupError when the resource has not been downloaded yet
# (e.g. on a fresh environment). Fetch it once and retry in that case.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

# polarity_scores() returns one dict per review; keep the full dict in
# 'scores' and pull its 'compound' entry out into a separate column.
movies['scores'] = movies['review'].apply(sid.polarity_scores)
movies['compound'] = movies['scores'].apply(lambda score_dict: score_dict['compound'])
movies.head()

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264


In [26]:
# Turn the numeric compound score into a predicted label in 'comp_score':
# scores greater than or equal to 0 become "pos", everything else "neg"
# (so a compound of exactly 0 is labelled "pos").
movies['comp_score'] = (movies['compound'] >= 0).map({True: 'pos', False: 'neg'})
movies.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264,neg


In [28]:
# Accuracy of the predicted labels against the true labels.
# Spoiler alert: it's not very good...
accuracy_score(y_true=movies['label'], y_pred=movies['comp_score'])

0.63673890608875128

In [29]:
# Per-class precision, recall and F1 of the predicted labels.
print(classification_report(y_true=movies['label'], y_pred=movies['comp_score']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

   micro avg       0.64      0.64      0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



In [30]:
# Confusion matrix: rows are the true labels, columns the predicted labels,
# both in sorted label order ('neg' first, then 'pos').
print(confusion_matrix(y_true=movies['label'], y_pred=movies['comp_score']))

[[427 542]
 [162 807]]
