# Sentiment analysis of IMDB Dataset

#### The dataset consists of 50000 movie reviews labelled as positive or negative. I have used Multinomial NB and SVMs and Logistic Regression and VADER to model this sentiment analysis

## Imports

In [18]:
import pandas as pd
import numpy as np
import spacy
import nltk
nlp = spacy.load('en_core_web_lg')

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## EDA

In [4]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
len(df)

50000

In [6]:
df['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

## Data Cleaning

In [7]:
import re
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

In [8]:
for i in range(0, len(df)):
    df['review'][i] = df['review'][i].strip()

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [11]:
reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in df['review']]
reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in df['review']]
reviews = [line.lower() for line in reviews]

In [12]:
df['reviews'] = reviews
df['reviews'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me. the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word. it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away. i would say the main appeal of the show is due to the fact that it goes where other shows

In [13]:
df.drop('review', inplace = True, axis = 1)

In [14]:
df.head()

Unnamed: 0,sentiment,reviews
0,positive,one of the other reviewers has mentioned that ...
1,positive,a wonderful little production. the filming te...
2,positive,i thought this was a wonderful way to spend ti...
3,negative,basically there's a family where a little boy ...
4,positive,"petter mattei's ""love in the time of money"" is..."


In [16]:
blanks = []

for i,sentiment,rv in df.itertuples():
    if type(rv)==str:  
        if rv.isspace():
            blanks.append(i)
        
print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


## Splitting into training and testing datasets

In [19]:
from sklearn.model_selection import train_test_split

X = df['reviews']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Creating a pipeline to perform TF-IDF Vectorizing and Modelling of Algorithm

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),('clf', MultinomialNB()),])
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),('clf', LinearSVC()),])

## Multinomial NB model and its results

In [27]:
text_clf_nb.fit(X_train, y_train)
predictions = text_clf_nb.predict(X_test)

In [28]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[7280  928]
 [1394 6898]]


In [29]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

    negative       0.84      0.89      0.86      8208
    positive       0.88      0.83      0.86      8292

   micro avg       0.86      0.86      0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500



In [30]:
print(metrics.accuracy_score(y_test,predictions))

0.8592727272727273


## Linear SVC Model and its results

In [31]:
text_clf_lsvc.fit(X_train, y_train)
predictions1 = text_clf_lsvc.predict(X_test)

In [32]:
print(metrics.confusion_matrix(y_test,predictions1))

[[7289  919]
 [ 736 7556]]


In [33]:
print(metrics.classification_report(y_test,predictions1))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      8208
    positive       0.89      0.91      0.90      8292

   micro avg       0.90      0.90      0.90     16500
   macro avg       0.90      0.90      0.90     16500
weighted avg       0.90      0.90      0.90     16500



In [34]:
print(metrics.accuracy_score(y_test,predictions1))

0.8996969696969697


## Adding selected stopwords and performing the same models

In [35]:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', 'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', 'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this','to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

In [36]:
text_clf_nb2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),('clf', MultinomialNB()),])
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),('clf', LinearSVC()),])

## Multinomial NB model with stopwords and its results

In [37]:
text_clf_nb2.fit(X_train, y_train)
predictions2 = text_clf_nb2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions2))

[[7255  953]
 [1295 6997]]


In [38]:
print(metrics.classification_report(y_test,predictions2))

              precision    recall  f1-score   support

    negative       0.85      0.88      0.87      8208
    positive       0.88      0.84      0.86      8292

   micro avg       0.86      0.86      0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500



In [39]:
print(metrics.accuracy_score(y_test,predictions2))

0.8637575757575757


## Linear SVC Model with stopwords and its results

In [42]:
text_clf_lsvc2.fit(X_train, y_train)
predictions3 = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions3))

[[7287  921]
 [ 755 7537]]


In [43]:
print(metrics.classification_report(y_test,predictions3))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      8208
    positive       0.89      0.91      0.90      8292

   micro avg       0.90      0.90      0.90     16500
   macro avg       0.90      0.90      0.90     16500
weighted avg       0.90      0.90      0.90     16500



In [44]:
print(metrics.accuracy_score(y_test,predictions3))

0.8984242424242425


## Using countvectorizer and performing Logistic regression

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(X_train)
X_train1 = cv.transform(X_train)
X_test1 = cv.transform(X_test)

In [63]:
from sklearn.linear_model import LogisticRegression

## Logistic regression for various values of C

In [131]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train1, y_train)
    print (f"Confusion matric for for C = {c}: \n {metrics.confusion_matrix(y_test, lr.predict(X_test1))}")
    print('\n')
    print (f"Classification report for C = {c}: \n {metrics.classification_report(y_test, lr.predict(X_test1))}")
    print('\n')
    print (f"Accuracy for C = {c}: {metrics.accuracy_score(y_test, lr.predict(X_test1))}")
    print('\n')

Confusion matric for for C = 0.01: 
 [[7178 1030]
 [ 852 7440]]


Classification report for C = 0.01: 
               precision    recall  f1-score   support

    negative       0.89      0.87      0.88      8208
    positive       0.88      0.90      0.89      8292

   micro avg       0.89      0.89      0.89     16500
   macro avg       0.89      0.89      0.89     16500
weighted avg       0.89      0.89      0.89     16500



Accuracy for C = 0.01: 0.8859393939393939


Confusion matric for for C = 0.05: 
 [[7246  962]
 [ 804 7488]]


Classification report for C = 0.05: 
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      8208
    positive       0.89      0.90      0.89      8292

   micro avg       0.89      0.89      0.89     16500
   macro avg       0.89      0.89      0.89     16500
weighted avg       0.89      0.89      0.89     16500



Accuracy for C = 0.05: 0.892969696969697


Confusion matric for for C = 0.25: 
 [[7244  96

## Lemmatizing every review and performing analysis

### Function to lemmatize sentences in pandas

In [76]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()


def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_sentence(sentence):
    
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            
            lemmatized_sentence.append(word)
        else:
            
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [77]:
df['LemmatizedReviews'] = df['reviews'].apply(lambda x: lemmatize_sentence(x))

## Performing analysis on lemmatized reviews

In [79]:
X1 = df['LemmatizedReviews']

In [80]:
X_train2, X_test2, y_train, y_test = train_test_split(X1, y, test_size=0.33, random_state=42)

In [85]:
text_clf_nb.fit(X_train2, y_train)
pred = text_clf_nb.predict(X_test2)
print(metrics.confusion_matrix(y_test,pred))

[[7230  978]
 [1407 6885]]


In [86]:
print(metrics.classification_report(y_test,pred))

              precision    recall  f1-score   support

    negative       0.84      0.88      0.86      8208
    positive       0.88      0.83      0.85      8292

   micro avg       0.86      0.86      0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500



In [87]:
print(metrics.accuracy_score(y_test,pred))

0.8554545454545455


In [88]:
text_clf_lsvc.fit(X_train2, y_train)
pred1 = text_clf_lsvc.predict(X_test2)
print(metrics.confusion_matrix(y_test,pred1))

[[7274  934]
 [ 763 7529]]


In [89]:
print(metrics.classification_report(y_test,pred1))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      8208
    positive       0.89      0.91      0.90      8292

   micro avg       0.90      0.90      0.90     16500
   macro avg       0.90      0.90      0.90     16500
weighted avg       0.90      0.90      0.90     16500



In [91]:
print(metrics.accuracy_score(y_test,pred1))

0.8971515151515151


## Applying stopwords after Lemmatization and performing same analysis

In [92]:
text_clf_nb2.fit(X_train2, y_train)
pred2 = text_clf_nb2.predict(X_test2)
print(metrics.confusion_matrix(y_test,pred2))

[[7209  999]
 [1341 6951]]


In [93]:
print(metrics.classification_report(y_test,pred2))

              precision    recall  f1-score   support

    negative       0.84      0.88      0.86      8208
    positive       0.87      0.84      0.86      8292

   micro avg       0.86      0.86      0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500



In [94]:
print(metrics.accuracy_score(y_test,pred2))

0.8581818181818182


In [95]:
text_clf_lsvc2.fit(X_train2, y_train)
pred3 = text_clf_lsvc2.predict(X_test2)
print(metrics.confusion_matrix(y_test,pred3))

[[7260  948]
 [ 794 7498]]


In [96]:
print(metrics.classification_report(y_test,pred3))

              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      8208
    positive       0.89      0.90      0.90      8292

   micro avg       0.89      0.89      0.89     16500
   macro avg       0.89      0.89      0.89     16500
weighted avg       0.89      0.89      0.89     16500



In [97]:
print(metrics.accuracy_score(y_test,pred3))

0.8944242424242425


In [132]:
cv = CountVectorizer(binary=True)
cv.fit(X_train)
X_train3 = cv.transform(X_train2)
X_test3 = cv.transform(X_test2)

In [133]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train3, y_train)
    print (f"Confusion matric for for C = {c}: \n {metrics.confusion_matrix(y_test, lr.predict(X_test3))}")
    print('\n')
    print (f"Classification report for C = {c}: \n {metrics.classification_report(y_test, lr.predict(X_test3))}")
    print('\n')
    print (f"Accuracy for C = {c}: {metrics.accuracy_score(y_test, lr.predict(X_test3))}")
    print('\n')

Confusion matric for for C = 0.01: 
 [[7144 1064]
 [ 942 7350]]


Classification report for C = 0.01: 
               precision    recall  f1-score   support

    negative       0.88      0.87      0.88      8208
    positive       0.87      0.89      0.88      8292

   micro avg       0.88      0.88      0.88     16500
   macro avg       0.88      0.88      0.88     16500
weighted avg       0.88      0.88      0.88     16500



Accuracy for C = 0.01: 0.8784242424242424


Confusion matric for for C = 0.05: 
 [[7215  993]
 [ 878 7414]]


Classification report for C = 0.05: 
               precision    recall  f1-score   support

    negative       0.89      0.88      0.89      8208
    positive       0.88      0.89      0.89      8292

   micro avg       0.89      0.89      0.89     16500
   macro avg       0.89      0.89      0.89     16500
weighted avg       0.89      0.89      0.89     16500



Accuracy for C = 0.05: 0.8866060606060606


Confusion matric for for C = 0.25: 
 [[7227  9

## Stemming using PorterStemmer and Snowball Stemmer

In [100]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
stemmer = SnowballStemmer("english")
stemmer1 = PorterStemmer()

In [110]:
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [stemmer1.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [106]:
df['porterReviews'] = df['reviews'].apply(stem_sentences)

In [109]:
def stem1_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [111]:
df['snowballReviews'] = df['reviews'].apply(stem1_sentences)

In [113]:
X2 = df['porterReviews']
X3 = df['snowballReviews']

## Performing same analysis of both models on reviews obtained after Porter stemming and Snowball stemming

In [114]:
X_train3, X_test3, y_train, y_test = train_test_split(X2, y, test_size=0.33, random_state=42)

In [115]:
X_train4, X_test4, y_train, y_test = train_test_split(X3, y, test_size=0.33, random_state=42)

In [116]:
text_clf_nb.fit(X_train3, y_train)
predict1 = text_clf_nb.predict(X_test3)
print(metrics.confusion_matrix(y_test,predict1))

[[7255  953]
 [1400 6892]]


In [117]:
print(metrics.classification_report(y_test,predict1))

              precision    recall  f1-score   support

    negative       0.84      0.88      0.86      8208
    positive       0.88      0.83      0.85      8292

   micro avg       0.86      0.86      0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500



In [118]:
print(metrics.accuracy_score(y_test,predict1))

0.8573939393939394


In [119]:
text_clf_lsvc.fit(X_train3, y_train)
predict2 = text_clf_lsvc.predict(X_test3)
print(metrics.confusion_matrix(y_test,predict2))

[[7270  938]
 [ 751 7541]]


In [120]:
print(metrics.classification_report(y_test,predict2))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      8208
    positive       0.89      0.91      0.90      8292

   micro avg       0.90      0.90      0.90     16500
   macro avg       0.90      0.90      0.90     16500
weighted avg       0.90      0.90      0.90     16500



In [121]:
print(metrics.accuracy_score(y_test,predict2))

0.8976363636363637


In [123]:
text_clf_nb.fit(X_train4, y_train)
predict3 = text_clf_nb.predict(X_test4)
print(metrics.confusion_matrix(y_test,predict3))

[[7245  963]
 [1417 6875]]


In [124]:
print(metrics.classification_report(y_test,predict3))

              precision    recall  f1-score   support

    negative       0.84      0.88      0.86      8208
    positive       0.88      0.83      0.85      8292

   micro avg       0.86      0.86      0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500



In [125]:
print(metrics.accuracy_score(y_test,predict3))

0.8557575757575757


In [126]:
text_clf_lsvc.fit(X_train4, y_train)
predict4 = text_clf_lsvc.predict(X_test4)
print(metrics.confusion_matrix(y_test,predict4))

[[7256  952]
 [ 765 7527]]


In [127]:
print(metrics.classification_report(y_test,predict4))

              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      8208
    positive       0.89      0.91      0.90      8292

   micro avg       0.90      0.90      0.90     16500
   macro avg       0.90      0.90      0.90     16500
weighted avg       0.90      0.90      0.90     16500



In [128]:
print(metrics.accuracy_score(y_test,predict4))

0.8959393939393939


## Using VADER to perform analysis

In [134]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [138]:
df['scores'] = df['reviews'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 'positive' if c >=0 else 'negative')

In [139]:
df.head()

Unnamed: 0,sentiment,reviews,LemmatizedReviews,porterReviews,snowballReviews,scores,compound,comp_score
0,positive,one of the other reviewers has mentioned that ...,one of the other reviewer have mention that af...,one of the other review ha mention that after ...,one of the other review has mention that after...,"{'neg': 0.198, 'neu': 0.746, 'pos': 0.057, 'co...",-0.9947,negative
1,positive,a wonderful little production. the filming te...,a wonderful little production . the filming te...,a wonder littl production. the film techniqu i...,a wonder littl production. the film techniqu i...,"{'neg': 0.054, 'neu': 0.771, 'pos': 0.175, 'co...",0.9641,positive
2,positive,i thought this was a wonderful way to spend ti...,i think this be a wonderful way to spend time ...,i thought thi wa a wonder way to spend time on...,i thought this was a wonder way to spend time ...,"{'neg': 0.092, 'neu': 0.69, 'pos': 0.217, 'com...",0.978,positive
3,negative,basically there's a family where a little boy ...,basically there 's a family where a little boy...,basic there' a famili where a littl boy (jake)...,basic there a famili where a littl boy (jake) ...,"{'neg': 0.141, 'neu': 0.777, 'pos': 0.083, 'co...",-0.8996,negative
4,positive,"petter mattei's ""love in the time of money"" is...",petter mattei 's `` love in the time of money ...,"petter mattei' ""love in the time of money"" is ...","petter mattei ""love in the time of money"" is a...","{'neg': 0.052, 'neu': 0.791, 'pos': 0.157, 'co...",0.9766,positive


In [141]:
print(metrics.confusion_matrix(df['sentiment'],df['comp_score']))

[[13507 11493]
 [ 3624 21376]]


In [143]:
print(metrics.classification_report(df['sentiment'],df['comp_score']))

              precision    recall  f1-score   support

    negative       0.79      0.54      0.64     25000
    positive       0.65      0.86      0.74     25000

   micro avg       0.70      0.70      0.70     50000
   macro avg       0.72      0.70      0.69     50000
weighted avg       0.72      0.70      0.69     50000



In [144]:
metrics.accuracy_score(df['sentiment'],df['comp_score'])

0.69766