In [62]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import re
# import string
# string.punctuation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score
from tqdm import tqdm_notebook as tqdm
import sklearn.metrics as metrics

[nltk_data] Downloading package stopwords to /home/arooba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
path_to_dataset = r'./dataset/IMDB Dataset.csv' # path to your dataset
df = pd.read_csv(path_to_dataset)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [54]:
le = LabelEncoder()
le.fit(df['sentiment'])
df['encoded_sentiment'] = le.transform(df['sentiment'])

Text preprocessing

In [55]:
for index, row in tqdm(df.iterrows()):
    text = row['review']
    text = re.sub(r"\xa0"," ",text)
    text = text.split("\n") # splitting using new line character
    text = [re.sub(r'[^a-zA-Z0-9.,)\-(/?\t ]','',sentence) for sentence in text] # removing everything other than these a-zA-Z0-9.,)\-(/?\t
    text = [re.sub(r'(?<=[^0-9])/(?=[^0-9])',' ',sentence) for sentence in text]
    text = [re.sub("\t+"," ",sentence) for sentence in text] # converting multiple tabs and spaces ito a single tab or space
    text = [re.sub(" +"," ",sentence) for sentence in text]
    text = [re.sub("\.\.+","",sentence) for sentence in text]# these were the commmon noises in out data, depends on data
    text = [re.sub("\A ?","",sentence) for sentence in text]
    text = [sentence for sentence in text if(len(sentence) != 1 and not re.fullmatch("(\d|\d\d|\d\d\d)",sentence))]
    text = [re.sub('\A\(?(\d|\d\d\d|\d\d|[a-zA-Z])(\.|\))\s?(?=[A-Z])','\n',sentence) for sentence in text]#dividing into para wrt to points
    text = [re.sub("\A\(([ivx]+)\)\s?(?=[a-zA-Z0-9])",'\n',sentence) for sentence in text] #dividing into para wrt to roman points
    text_new = " ".join(text) # joining all the lines into a single text
    text_new = re.sub(" +"," ",text_new)
    df['review'][index] = text_new
            
print("Completed")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(df.iterrows()):


0it [00:00, ?it/s]

Completed


In [56]:
X_train, X_test, Y_train, Y_test = train_test_split(df['review'], df['encoded_sentiment'], test_size = 0.2, shuffle = True, random_state= 32)

In [57]:
## vectorization
max_feature_num = 50000
vectorizer = TfidfVectorizer(max_features = max_feature_num)
train_vecs = vectorizer.fit_transform(X_train)
test_vecs = TfidfVectorizer(max_features = max_feature_num, vocabulary = vectorizer.vocabulary_).fit_transform(X_test)
# check the dimensions of feature vectors
train_vecs.shape, test_vecs.shape

((40000, 50000), (10000, 50000))

Custom Confusion Matrix

In [58]:
x_axes = ['positive', 'negative']
y_axes =  ['negative', 'positive']

def conf_matrix(z, x=x_axes, y=y_axes):
    
    z = np.flip(z, 0)

    # change each element of z to type string for annotations
    z_text = [[str(y) for y in x] for x in z]

    # set up figure 
    fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')

    # add title
    fig.update_layout(title_text='<b>Confusion matrix</b>',
                      xaxis = dict(title='Predicted value'),
                      yaxis = dict(title='Real value')
                     )

    # add colorbar
    fig['data'][0]['showscale'] = True
    
    return fig

### Support Vector Machine

In [59]:
clf = svm.SVC(kernel='rbf', max_iter = 300) # Linear Kernel

#Train the model using the training sets
clf.fit(train_vecs, Y_train)

# testing
y_predsvm = clf.predict(test_vecs)



In [60]:
train_accuracy = clf.score(train_vecs, Y_train)
print("Training accuracy:", train_accuracy)

Training accuracy: 0.726225


In [63]:
print("Test accuracy:", metrics.accuracy_score(Y_test, y_predsvm))

Test accuracy: 0.7201


In [64]:
print("F1 score: " + str(f1_score(Y_test, y_predsvm, average = "macro")))
print("Precision score: " +str(precision_score(Y_test, y_predsvm, average = "macro")))
print("Recall score: " +str(recall_score(Y_test, y_predsvm, average = "macro")))  

F1 score: 0.7200102884965371
Precision score: 0.7212006870311105
Recall score: 0.7206382649089382


In [65]:
conf_matrix(metrics.confusion_matrix(Y_test, y_predsvm))

### Logistic Regression

In [66]:
LR = LogisticRegression()

#Train the model using the training sets
LR.fit(train_vecs, Y_train)

# testing
y_predLR = LR.predict(test_vecs)

In [67]:
train_accuracy = LR.score(train_vecs, Y_train)
print("Training accuracy:", train_accuracy)

Training accuracy: 0.9305


In [68]:
print("Testing accuracy:",metrics.accuracy_score(Y_test, y_predLR))

Testing accuracy: 0.8975


In [69]:
print("F1 score: " + str(f1_score(Y_test, y_predLR, average = "macro")))
print("Precision score: " +str(precision_score(Y_test, y_predLR, average = "macro")))
print("Recall score: " +str(recall_score(Y_test, y_predLR, average = "macro"))) 

F1 score: 0.8974353832658113
Precision score: 0.8976357113285716
Recall score: 0.8973361988609592


In [70]:
conf_matrix(metrics.confusion_matrix(Y_test, y_predLR))

### Random Forest

In [72]:
RF = RandomForestClassifier(n_estimators = 450, max_depth = 13, random_state = 43)

#Train the model using the training sets
RF.fit(train_vecs, Y_train)

# testing
y_predRF = RF.predict(test_vecs)

In [73]:
train_accuracy = RF.score(train_vecs, Y_train)
print("Training accuracy:", train_accuracy)

Training accuracy: 0.89785


In [74]:
print("Testing accuracy:",metrics.accuracy_score(Y_test, y_predRF))

Testing accuracy: 0.839


In [75]:
print("F1 score: " + str(f1_score(Y_test, y_predRF, average = "macro")))
print("Precision score: " +str(precision_score(Y_test, y_predRF, average = "macro")))
print("Recall score: " +str(recall_score(Y_test, y_predRF, average = "macro"))) 

F1 score: 0.8388860021797022
Precision score: 0.8391165026908503
Recall score: 0.8387916641058735


In [76]:
conf_matrix(metrics.confusion_matrix(Y_test, y_predRF))

### Naive Bayes

In [77]:
nb = MultinomialNB()

# Train the model
nb.fit(train_vecs, Y_train)

MultinomialNB()

In [78]:
# Make class anf probability predictions
y_pred_class = nb.predict(test_vecs)
y_pred_prob = nb.predict_proba(test_vecs)[:, 1]

In [79]:
train_accuracy = nb.score(train_vecs, Y_train)
print("Training accuracy:", train_accuracy)

Training accuracy: 0.896675


In [80]:
print('Test accuracy:', metrics.accuracy_score(Y_test, y_pred_class))

Test accuracy: 0.8623


In [81]:
print("F1 score: " + str(f1_score(Y_test, y_pred_class, average = "macro")))
print("Precision score: " +str(precision_score(Y_test, y_pred_class, average = "macro")))
print("Recall score: " +str(recall_score(Y_test, y_pred_class, average = "macro"))) 

F1 score: 0.8622885961186446
Precision score: 0.8630803783360907
Recall score: 0.8627273984389404


In [82]:
conf_matrix(metrics.confusion_matrix(Y_test, y_pred_class))

### Decision Trees

In [83]:
#criterion = entropy or gini

clf_model = DecisionTreeClassifier(criterion = "entropy", random_state = 42, 
                                   max_depth = 23, 
                                   min_samples_leaf = 11)

# testing
clf_model.fit(train_vecs, Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=23, min_samples_leaf=11,
                       random_state=42)

In [84]:
y_predDT = clf_model.predict(test_vecs)

In [85]:
train_accuracy = clf_model.score(train_vecs, Y_train)
print("Training accuracy:", train_accuracy)

Training accuracy: 0.83325


In [86]:
print("Testing accuracy:",metrics.accuracy_score(Y_test, y_predDT))

Testing accuracy: 0.7188


In [87]:
print("F1 score: " + str(f1_score(Y_test, y_predDT, average = "macro")))
print("Precision score: " +str(precision_score(Y_test, y_predDT, average = "macro")))
print("Recall score: " +str(recall_score(Y_test, y_predDT, average = "macro"))) 

F1 score: 0.7174232748922058
Precision score: 0.7210160021763934
Recall score: 0.7177938638272592


In [88]:
conf_matrix(metrics.confusion_matrix(Y_test, y_predDT))