In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('IMDB Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [None]:
#Remove HTML formatting

In [5]:
from bs4 import BeautifulSoup

for i in range(len(df)):
    df['review'][i] = BeautifulSoup(df['review'][i]).get_text()

In [None]:
#Remove punctuation

In [7]:
import re

for i in range(len(df)):
    df["review"][i] = re.sub("[,.;:'-*]", " ", df["review"][i])

In [72]:
#Covert to lowercase

In [8]:
for i in range(len(df)):
    df["review"][i] = df["review"][i].lower()

In [None]:
#Remove Stopwords

In [9]:
import nltk
from nltk.corpus import stopwords

for w in stopwords.words('english'):
    df['review'] = df["review"].str.replace(' ' + str(w) + ' ', ' ')

In [None]:
#Stemming the words

In [10]:
from nltk.stem import LancasterStemmer

lancaster = LancasterStemmer()

for i in range(len(df)):
    df['review'][i] = df['review'][i].split(' ')
    df['review'][i] = [lancaster.stem(y) for y in df['review'][i]]
    df['review'][i] = ' '.join(df['review'][i])

In [11]:
#df['review'][0]

'on review ment watch 1 oz episod hook  right  exact hap first thing struck oz brut unflinch scen viol  set right word go  trust  show faint heart timid  show pul punch regard drug  sex viol  hardc  class us word cal oz nicknam giv oswald maxim sec stat penit  focus main emerald city  expery sect prison cel glass front fac inward  priv high agend  em city hom many  ary  muslim  gangsta  latino  christians  it  ir    scuffl  dea star  dodgy deal shady agr nev far away would say main ap show due fact goe show dar  forget pretty pict paint mainstream audy  forget charm  forget rom   oz mess around  first episod ev saw struck nasty sur  say ready  watch  develop tast oz  got accustom high level graph viol  viol  injust  crook guard sold nickel  inm kil ord get away  wel man  middl class inm turn prison bitch due lack street skil prison expery  watch oz  may becom comfort uncomfort view    that get touch dark sid '

In [None]:
#Vectorization of words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 5000) 

x = vectorizer.fit_transform(df['review']).toarray()
y = df['sentiment']

In [13]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [None]:
#Split data

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=32)

In [16]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [17]:
X_train

array([[-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223,  1.61884613, ..., -0.07828176,
        -0.05862699, -0.04765582],
       ...,
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223,  5.52030691, ..., -0.07828176,
        -0.05862699, -0.04765582]])

In [18]:
X_test

array([[-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       ...,
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582],
       [-0.05371202, -0.08443223, -0.33188426, ..., -0.07828176,
        -0.05862699, -0.04765582]])

In [19]:
y_train

38274    0
49945    0
36190    1
44920    0
14665    0
        ..
24828    1
20414    1
9526     0
42539    0
10967    0
Name: sentiment, Length: 37500, dtype: int64

In [20]:
y_test

35083    1
9005     0
23836    0
42777    1
13222    1
        ..
28652    0
8070     0
35178    1
17922    0
23951    0
Name: sentiment, Length: 12500, dtype: int64

In [None]:
#Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

logClassifier = LogisticRegression(random_state=11) 
logreg = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
         intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
         penalty='elasticnet', random_state=None, solver='saga', tol=0.0001,l1_ratio=1,
         verbose=0, warm_start=False)

In [22]:
logreg.fit(X_train, y_train)

LogisticRegression(l1_ratio=1, multi_class='ovr', n_jobs=1,
                   penalty='elasticnet', solver='saga')

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score,fbeta_score, recall_score, f1_score

lb_predict = logreg.predict(X_test)
print("***********************")
print(f"{'Recall Score:':18}{recall_score(y_test, lb_predict):.3f}")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, lb_predict):.3f}")
print(f"{'Precision Score: ':18}{precision_score(y_test, lb_predict):.3f}")
print(f"{'F1 Score: ':18}{f1_score(y_test, lb_predict ):.3f}")
print("***********************")

***********************
Recall Score:     0.887
Accuracy Score:   0.875
Precision Score:  0.869
F1 Score:         0.878
***********************


In [None]:
#Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB

#Creating the model
nb_model = GaussianNB()

In [25]:
nb_model.fit(X_train, y_train)

GaussianNB()

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score,fbeta_score, recall_score, f1_score

nb_predict = nb_model.predict(X_test)
print("***********************")
print(f"{'Recall Score:':18}{recall_score(y_test, nb_predict):.3f}")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, nb_predict):.3f}")
print(f"{'Precision Score: ':18}{precision_score(y_test, nb_predict):.3f}")
print(f"{'F1 Score: ':18}{f1_score(y_test, nb_predict):.3f}")
print("***********************")

***********************
Recall Score:     0.552
Accuracy Score:   0.701
Precision Score:  0.794
F1 Score:         0.651
***********************


In [None]:
#SVM

In [27]:
from sklearn.svm import SVC

SVM = SVC()

In [None]:
SVM.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score,fbeta_score, recall_score, f1_score

svm_predict = SVM.predict(X_test)
print("***********************")
print(f"{'Recall Score:':18}{recall_score(y_test, svm_predict):.3f}")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, svm_predict):.3f}")
print(f"{'Precision Score: ':18}{precision_score(y_test, svm_predict):.3f}")
print(f"{'F1 Score: ':18}{f1_score(y_test, svm_predict):.3f}")
print("***********************")