In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("dark_background")
sns.set_palette("dark")
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score,accuracy_score,recall_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
# Location of the preprocessed reviews CSV (absolute path on the author's machine).
DATA_PATH = "D:\\Portfolio\\03_IMDB_reviews\\IMDB_Preprocessed.csv"

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text_clean
0,0,1,one reviewers mentioned watching 1 oz episode ...
1,1,1,wonderful little production br br filming tech...
2,2,1,thought wonderful way spend time hot summer we...
3,3,0,basically family little boy jake thinks zombie...
4,4,1,petter mattei love time money visually stunnin...


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(49582, 3)

In [4]:
# Class balance: number of rows per sentiment label.
df["sentiment"].value_counts()

1    24884
0    24698
Name: sentiment, dtype: int64

In [5]:
# Labels as a NumPy array; the text feature is kept as a one-column DataFrame.
y = df['sentiment'].values
X = df[['text_clean']]

# Stratified 70/30 split. The fixed random_state makes the split -- and every
# metric computed from it further down -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

In [6]:
# Rows whose cleaned text is missing; the first entry of the shape is their count.
missing_text = df['text_clean'].isna()
df.loc[missing_text].shape

(0, 3)

In [7]:
# TF-IDF features for the train/test text.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print("="*100)

# Replace missing text with an empty string. The column is reassigned on a new
# frame instead of being filled through chained `inplace=True`: the chained
# form is deprecated in pandas and does not modify the parent frame under
# copy-on-write (the warning is hidden here by the global warnings filter).
X_train = X_train.assign(text_clean=X_train['text_clean'].fillna(''))
X_test = X_test.assign(text_clean=X_test['text_clean'].fillna(''))

# Vocabulary and IDF weights are learned from the training split only; the
# test split is transformed with the already-fitted vectorizer.
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,4), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train['text_clean'])
X_test_tfidf = vectorizer.transform(X_test['text_clean'])
print("After vectorizations")
print(X_train_tfidf.shape, y_train.shape)
print(X_test_tfidf.shape, y_test.shape)
print("="*100)

(34707, 1) (34707,)
(14875, 1) (14875,)
After vectorizations
(34707, 10000) (34707,)
(14875, 10000) (14875,)


In [8]:
# (documents, TF-IDF features) for the training split.
X_train_tfidf.shape

(34707, 10000)

In [9]:
# (documents, TF-IDF features) for the test split.
X_test_tfidf.shape

(14875, 10000)

In [10]:
# Number of training labels; should match the row count of X_train_tfidf.
y_train.shape

(34707,)

In [11]:
# Convert the sparse TF-IDF matrices to dense NumPy arrays for the models below
# (presumably because GaussianNB does not accept sparse input -- confirm).
# NOTE: X_train / X_test are rebound here from the text DataFrames to arrays.
X_train, X_test = X_train_tfidf.toarray(), X_test_tfidf.toarray()

# Gaussian Naive Bayes

In [12]:
# Gaussian Naive Bayes on the dense TF-IDF features.
# Reports the confusion matrix, classification report, ROC AUC, accuracy and recall.
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()
NB.fit(X_train, y_train)

# Hard predictions for both splits, plus class probabilities for the test split.
predT = NB.predict(X_train)
pred = NB.predict(X_test)
y_pred_prob = NB.predict_proba(X_test)

# Column 1 of predict_proba is used as the score for the positive label (1).
scores_pos = y_pred_prob[:, 1]
accNB = accuracy_score(y_test, pred)
aucScoreNB = roc_auc_score(y_test, scores_pos)
fprNB, tprNB, thresholds = roc_curve(y_test, scores_pos)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

print("***AUC score***")
print("AUC score for NB is ", aucScoreNB)

print("***Accuracy score***")
print("Train Accuracy score for NB is ", accuracy_score(y_train, predT))
print("Test Accuracy score for NB is ", accNB)

print("***Recall score***")
print("Train recall score for NB is ", recall_score(y_train, predT))
print("Test recall score for NB is ", recall_score(y_test, pred))

[[6083 1327]
 [1207 6258]]
              precision    recall  f1-score   support

           0       0.83      0.82      0.83      7410
           1       0.83      0.84      0.83      7465

    accuracy                           0.83     14875
   macro avg       0.83      0.83      0.83     14875
weighted avg       0.83      0.83      0.83     14875

***AUC score***
AUC score for NB is  0.8579570880935142
***Accuracy score***
Train Accuracy score for NB is  0.8739447373728643
Test Accuracy score for NB is  0.8296470588235294
***Recall score***
Train recall score for NB is  0.880073482978357
Test recall score for NB is  0.838312123241795


# Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression with default hyperparameters on the dense TF-IDF features.
LR = LogisticRegression()
LR.fit(X_train, y_train)

# Hard predictions for both splits, plus class probabilities for the test split.
predT = LR.predict(X_train)
pred = LR.predict(X_test)
y_pred_prob = LR.predict_proba(X_test)

# Column 1 of predict_proba is used as the score for the positive label (1).
scores_pos = y_pred_prob[:, 1]
accLR = accuracy_score(y_test, pred)
aucScoreLR = roc_auc_score(y_test, scores_pos)
fprLR, tprLR, thresholds = roc_curve(y_test, scores_pos)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

print("***AUC score***")
print("AUC score for LR is ", aucScoreLR)

print("***Accuracy score***")
print("Train Accuracy score for LR is ", accuracy_score(y_train, predT))
print("Test Accuracy score for LR is ", accLR)

print("***Recall score***")
print("Train recall score for LR is ", recall_score(y_train, predT))
print("Test recall score for LR is ", recall_score(y_test, pred))

[[6552  858]
 [ 692 6773]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      7410
           1       0.89      0.91      0.90      7465

    accuracy                           0.90     14875
   macro avg       0.90      0.90      0.90     14875
weighted avg       0.90      0.90      0.90     14875

***AUC score***
AUC score for LR is  0.9600131608324227
***Accuracy score***
Train Accuracy score for LR is  0.9294378655602616
Test Accuracy score for LR is  0.8957983193277311
***Recall score***
Train recall score for LR is  0.9376542855502612
Test recall score for LR is  0.9073007367716008


# Decision tree

In [14]:
# Decision tree: a learned hierarchy of if/else splits on the TF-IDF features.
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the fitted tree, and therefore the metrics printed
# below, are the same on every run.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X_train, y_train)

# Hard predictions for both splits, plus class probabilities for the test split.
pred = DT.predict(X_test)
predT = DT.predict(X_train)
y_pred_prob = DT.predict_proba(X_test)

# Column 1 of predict_proba is used as the score for the positive label (1).
accDT = accuracy_score(y_test, pred)
aucScoreDT = roc_auc_score(y_test,  y_pred_prob[:,1])
fprDT, tprDT, thresholds = roc_curve(y_test, y_pred_prob[:,1] )

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("***AUC score***")
print("AUC score for DT is ",aucScoreDT)
print("***Accuracy score***")
print("Train Accuracy score for DT is ",accuracy_score(y_train, predT))
print("Test Accuracy score for DT is ",accDT)
print("***Recall score***")
print("Train recall score for DT is ",recall_score(y_train, predT))
print("Test recall score for DT is ",recall_score(y_test, pred))

[[5303 2107]
 [2184 5281]]
              precision    recall  f1-score   support

           0       0.71      0.72      0.71      7410
           1       0.71      0.71      0.71      7465

    accuracy                           0.71     14875
   macro avg       0.71      0.71      0.71     14875
weighted avg       0.71      0.71      0.71     14875

***AUC score***
AUC score for DT is  0.7115446080810764
***Accuracy score***
Train Accuracy score for DT is  1.0
Test Accuracy score for DT is  0.7115294117647059
***Recall score***
Train recall score for DT is  1.0
Test recall score for DT is  0.7074346952444742


# Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters on the dense TF-IDF features.
# random_state pins the bootstrap samples and per-split feature draws so the
# metrics are reproducible; n_jobs=-1 uses all available cores for
# fit/predict without changing the seeded result.
RF = RandomForestClassifier(random_state=42, n_jobs=-1)
RF.fit(X_train, y_train)

# Hard predictions for both splits, plus class probabilities for the test split.
pred = RF.predict(X_test)
predT = RF.predict(X_train)
y_pred_prob = RF.predict_proba(X_test)

# Column 1 of predict_proba is used as the score for the positive label (1).
accRF = accuracy_score(y_test, pred)
aucScoreRF = roc_auc_score(y_test,  y_pred_prob[:,1])
fprRF, tprRF, thresholds = roc_curve(y_test, y_pred_prob[:,1] )

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("***AUC score***")
print("AUC score for RF is ",aucScoreRF)
print("***Accuracy score***")
print("Train Accuracy score for RF is ",accuracy_score(y_train, predT))
print("Test Accuracy score for RF is ",accRF)
print("***Recall score***")
print("Train recall score for RF is ",recall_score(y_train, predT))
print("Test recall score for RF is ",recall_score(y_test, pred))

[[6284 1126]
 [1206 6259]]
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      7410
           1       0.85      0.84      0.84      7465

    accuracy                           0.84     14875
   macro avg       0.84      0.84      0.84     14875
weighted avg       0.84      0.84      0.84     14875

***AUC score***
AUC score for RF is  0.925608087403836
***Accuracy score***
Train Accuracy score for RF is  1.0
Test Accuracy score for RF is  0.8432268907563025
***Recall score***
Train recall score for RF is  1.0
Test recall score for RF is  0.8384460817146685


# XGBoost

In [16]:
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost) with default hyperparameters on the dense TF-IDF features.
XGB = XGBClassifier()
XGB.fit(X_train, y_train)

# Hard predictions for both splits, plus class probabilities for the test split.
predT = XGB.predict(X_train)
pred = XGB.predict(X_test)
y_pred_prob = XGB.predict_proba(X_test)

# Column 1 of predict_proba is used as the score for the positive label (1).
scores_pos = y_pred_prob[:, 1]
accXGB = accuracy_score(y_test, pred)
aucScoreXGB = roc_auc_score(y_test, scores_pos)
fprXGB, tprXGB, thresholds = roc_curve(y_test, scores_pos)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

print("***AUC score***")
print("AUC score for XGB is ", aucScoreXGB)

print("***Accuracy score***")
print("Train Accuracy score for XGB is ", accuracy_score(y_train, predT))
print("Test Accuracy score for XGB is ", accXGB)

print("***Recall score***")
print("Train recall score for XGB is ", recall_score(y_train, predT))
print("Test recall score for XGB is ", recall_score(y_test, pred))

[[6181 1229]
 [ 944 6521]]
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      7410
           1       0.84      0.87      0.86      7465

    accuracy                           0.85     14875
   macro avg       0.85      0.85      0.85     14875
weighted avg       0.85      0.85      0.85     14875

***AUC score***
AUC score for XGB is  0.93306736881877
***Accuracy score***
Train Accuracy score for XGB is  0.9341343244878555
Test Accuracy score for XGB is  0.8539159663865546
***Recall score***
Train recall score for XGB is  0.9497674952637924
Test recall score for XGB is  0.8735432016075017


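Logistic regression performs best of the five models trained on the TF-IDF features: in the run recorded above it has the highest test accuracy (≈0.90) and AUC (≈0.96).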