In [1]:
import numpy as np
import pandas as pd
from textblob import TextBlob
import nltk

In [2]:
# Import the movie review data as a data frame and ensure that the data is loaded properly.

data = pd.read_csv("labeledTrainData.tsv",sep ="\t")
data.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [3]:
# How many of each positive and negative reviews are there?

data["sentiment"].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [4]:
# Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than or 
# equal to zero is a positive sentiment and less than 0 is a negative sentiment.

# Create polarity score (sentiment_2)

data['sentiment_2'] = data["review"].apply(lambda x: (TextBlob(x).sentiment.polarity))
data.head(10)

Unnamed: 0,id,sentiment,review,sentiment_2
0,5814_8,1,With all this stuff going down at the moment w...,0.001277
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941
3,3630_4,0,It must be assumed that those who praised this...,0.134753
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842
5,8196_8,1,I dont know why people think this is such a ba...,0.105882
6,7166_2,0,"This movie could have been very good, but come...",-0.027054
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.06875
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.09881
9,8713_10,1,<br /><br />This movie is full of references. ...,0.258333


In [5]:
# Convert polarity score to 0 (negative) and 1 (positive) (sentiment_3)

data["sentiment_3"]= ""
data.loc[data.sentiment_2>=0,"sentiment_3"]=1
data.loc[data.sentiment_2<0,"sentiment_3"]=0

data.head(10)

Unnamed: 0,id,sentiment,review,sentiment_2,sentiment_3
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0
5,8196_8,1,I dont know why people think this is such a ba...,0.105882,1
6,7166_2,0,"This movie could have been very good, but come...",-0.027054,0
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.06875,1
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.09881,1
9,8713_10,1,<br /><br />This movie is full of references. ...,0.258333,1


In [6]:
# Number of positive and negative reviews based on Sentiment_3

data["sentiment_3"].value_counts()

1    19017
0     5983
Name: sentiment_3, dtype: int64

In [19]:
# Check the accuracy of this model. Is this model better than random guessing? TextBlob is more Accurate.

# Original model accuracy (0.5)

original = 12500/len(data)
original

0.5

In [20]:
# TextBlob model accuracy (0.76068)

textblob = 19017/len(data)
textblob

0.76068

In [9]:
# Using VADER

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

Int_Analyzer = SentimentIntensityAnalyzer()

In [10]:
# Int_Analyzer polarity (Overall)

data["Overall"] = data["review"].apply(lambda review:Int_Analyzer.polarity_scores(review))
data.head(10)

Unnamed: 0,id,sentiment,review,sentiment_2,sentiment_3,Overall
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co..."
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co..."
5,8196_8,1,I dont know why people think this is such a ba...,0.105882,1,"{'neg': 0.177, 'neu': 0.607, 'pos': 0.215, 'co..."
6,7166_2,0,"This movie could have been very good, but come...",-0.027054,0,"{'neg': 0.158, 'neu': 0.717, 'pos': 0.125, 'co..."
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.06875,1,"{'neg': 0.059, 'neu': 0.903, 'pos': 0.038, 'co..."
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.09881,1,"{'neg': 0.069, 'neu': 0.746, 'pos': 0.185, 'co..."
9,8713_10,1,<br /><br />This movie is full of references. ...,0.258333,1,"{'neg': 0.062, 'neu': 0.759, 'pos': 0.179, 'co..."


In [12]:
# Cmpound Overall to polarity scores (vader_total)

data["vader_total"] = data["Overall"].apply(lambda score_dict:score_dict["compound"])
data.head(10)

Unnamed: 0,id,sentiment,review,sentiment_2,sentiment_3,Overall,vader_total
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",-0.8879
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.9736
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",-0.9883
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",-0.1202
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.6115
5,8196_8,1,I dont know why people think this is such a ba...,0.105882,1,"{'neg': 0.177, 'neu': 0.607, 'pos': 0.215, 'co...",0.3935
6,7166_2,0,"This movie could have been very good, but come...",-0.027054,0,"{'neg': 0.158, 'neu': 0.717, 'pos': 0.125, 'co...",-0.6863
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.06875,1,"{'neg': 0.059, 'neu': 0.903, 'pos': 0.038, 'co...",-0.4517
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.09881,1,"{'neg': 0.069, 'neu': 0.746, 'pos': 0.185, 'co...",0.9707
9,8713_10,1,<br /><br />This movie is full of references. ...,0.258333,1,"{'neg': 0.062, 'neu': 0.759, 'pos': 0.179, 'co...",0.7184


In [17]:
# Convert vader_total to 1 (positive) and 0 (Negative)

data["Overall_2"]=''
data.loc[data.vader_total>=0,"Overall_2"]=1
data.loc[data.vader_total<0,"Overall_2"]=0

data.head(10)

Unnamed: 0,id,sentiment,review,sentiment_2,sentiment_3,Overall,vader_total,Overall_2
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,"{'neg': 0.128, 'neu': 0.751, 'pos': 0.121, 'co...",-0.8879,0
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,"{'neg': 0.08, 'neu': 0.713, 'pos': 0.207, 'com...",0.9736,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,"{'neg': 0.135, 'neu': 0.809, 'pos': 0.055, 'co...",-0.9883,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,"{'neg': 0.062, 'neu': 0.884, 'pos': 0.054, 'co...",-0.1202,0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,"{'neg': 0.122, 'neu': 0.743, 'pos': 0.135, 'co...",0.6115,1
5,8196_8,1,I dont know why people think this is such a ba...,0.105882,1,"{'neg': 0.177, 'neu': 0.607, 'pos': 0.215, 'co...",0.3935,1
6,7166_2,0,"This movie could have been very good, but come...",-0.027054,0,"{'neg': 0.158, 'neu': 0.717, 'pos': 0.125, 'co...",-0.6863,0
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.06875,1,"{'neg': 0.059, 'neu': 0.903, 'pos': 0.038, 'co...",-0.4517,0
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.09881,1,"{'neg': 0.069, 'neu': 0.746, 'pos': 0.185, 'co...",0.9707,1
9,8713_10,1,<br /><br />This movie is full of references. ...,0.258333,1,"{'neg': 0.062, 'neu': 0.759, 'pos': 0.179, 'co...",0.7184,1


In [18]:
# Number of positive and negative (Overall_2)

data["Overall_2"].value_counts()

1    16611
0     8389
Name: Overall_2, dtype: int64

In [21]:
# VADER accuracy (0.66444). The VADER model accuracy is higher than the original model but less than TextBlob model

vader = 16611/len(data)
vader

0.66444