## Mounting Google Drive

In [1]:
# Mount Google Drive so that files under /content/drive (read in the "Reading Data" section) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading Required Libraries

In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Data

In [3]:
# Read the reviews from the SQLite database stored on the mounted Drive.
con = sqlite3.connect('/content/drive/MyDrive/Aazon_Review_Data/database.sqlite')
try:
    # Keep only clearly positive / negative reviews, i.e. ignore neutral reviews with Score = 3.
    data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3""", con)
finally:
    # The frame is fully loaded in memory at this point, so release the database handle.
    con.close()
print(data.shape)
data.head()

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
#sampling 100k reviews (fixed random_state for a reproducible sample; the index is renumbered from 0)
df = data.sample(n = 100000, random_state = 1).reset_index(drop = True)
df.shape

(100000, 10)

In [5]:
#distribution of the raw review scores in the sample
df.Score.value_counts()

5    69027
4    15328
1     9963
2     5682
Name: Score, dtype: int64

In [6]:
# Binarise the target: reviews with Score > 3 (positive) are encoded as 0, every other review
# (Score < 3, because Score = 3 was excluded by the SQL query) is encoded as 1, i.e. 1 = negative review.
# NOTE: this cell is not idempotent - running it again on the already binarised column maps every row to 1.
df['Score'] = np.where(df['Score'] > 3, 0, 1)
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,87501,B001D0GV6I,AXL5GJLYKKLMX,"KCBrad ""KCBrad""",0,0,0,1272672000,Great flavor,This is one of my favorite flavors. This Fren...
1,476117,B001EQ4EHE,A261ERFPYHP556,"Vinsanity18 ""Vince""",1,1,0,1274054400,Tastes Great,I bought this for my girl friend that recently...
2,225032,B001LNTY70,A2GH0L50430WJF,E. Reynolds,0,0,0,1336262400,Mostly lime!,Not a big fan of chili actually... so these al...


## Data Cleaning

In [7]:
#Sorting data (in place) according to ProductId in ascending order, rows with a missing ProductId last
df.sort_values('ProductId', axis = 0, ascending = True, inplace = True, kind = 'quicksort', na_position = 'last')

In [8]:
#Deduplication of entries
# Rows sharing the same user, profile name, timestamp and review text are treated as one review;
# only the first occurrence (in the current row order) is kept.
# The key columns are given as a list: `subset` is documented as a label or a sequence of labels,
# and a set is neither ordered nor a sequence.
df = df.drop_duplicates(subset = ["UserId", "ProfileName", "Time", "Text"], keep = 'first')
df.shape

(86856, 10)

<b>Insight:-</b> In some rows the value of HelpfulnessNumerator is greater than HelpfulnessDenominator, which is not practically possible, so such rows are also removed from the calculations.
It is not possible for more people to have found a review useful than the number of people who have seen that review.

In [9]:
#removing records with HelpfulnessNumerator > HelpfulnessDenominator
# (the boolean mask keeps only the rows whose numerator does not exceed the denominator)
df = df[df.HelpfulnessNumerator <= df.HelpfulnessDenominator]
df.shape

(86855, 10)

In [10]:
#selecting required columns: the review text (model input) and the binarised Score (target)
df = df[['Text', 'Score']]
df.head()

Unnamed: 0,Text,Score
77580,"The same author wrote ""Where the Wild Things A...",0
73066,This book contains a collection of twelve shor...,0
72413,This copy is smaller than I expected (mostly b...,0
70563,I can remember seeing the show when it aired o...,0
94893,"Great book, perfect condition arrived in a sho...",0


In [11]:
#Distribution of +ve and -ve Reviews
# Score is 1 for negative and 0 for positive reviews, so its mean is the share of negative reviews.
print("Proportion of -ve Reviews :", round(df['Score'].mean(), 2), "\n")
df['Score'].value_counts()

Proportion of -ve Reviews : 0.16 



0    73233
1    13622
Name: Score, dtype: int64

## Text Preprocessing

In [12]:
#stopwords
# Negation words (and the apostrophe-less fragments NLTK lists for contractions, e.g. 'didn')
# are excluded from the stop-word collection so that they survive stop-word removal.
negative = ["no", "nor", "not", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
          "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn',
          "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
          'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't"]
# Stored as a set: every token of every review is tested against it, and set membership
# is a constant-time lookup instead of a scan through a list.
stop_words = {word for word in stopwords.words('english') if word not in negative}

#stemming
stemmer = PorterStemmer()

#function to replace negative words by not to reduce the dimension of data
def replace_by_not(x):
    """Map any negation word to the single token 'not'.

    Parameters
    ----------
    x : str
        A token.

    Returns
    -------
    str
        'not' when `x` is listed in the module-level `negative` collection,
        otherwise `x` unchanged.
    """
    return 'not' if x in negative else x

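For word2vec, stemming and lemmatization are generally not advised. Note, however, that the text pre-processing function below still applies Porter stemming, and its output is used for the Bag of Words, TF-IDF and word2vec features alike.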

In [13]:
#text pre-processing function
def preprocess(text):
    """Clean one raw review and return it as a list of stemmed tokens.

    Steps, in order: lower-casing; removal of URLs, HTML tags, words that
    contain digits, non-letter characters and single-letter words; whitespace
    normalisation; tokenisation; stop-word removal; replacement of negation
    words by 'not'; Porter stemming.

    Relies on the module-level `stop_words`, `replace_by_not` and `stemmer`.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned, stemmed tokens in their original order.
    """
    # All patterns are raw strings so that \S, \d, \s and \b reach the regex
    # engine as written instead of being (invalid) Python string escapes.
    text = text.lower()                                                         #to lower case
    text = re.sub(r'http[s]?://\S+', ' ', text)                                 #removing urls
    text = re.sub(r'<[^<]+?>', ' ', text)                                       #removing html tags
    text = re.sub(r'\S*\d\S*', ' ', text)                                       #removing alphanumeric words
    text = re.sub(r'[^A-Za-z]+', ' ', text)                                     #removing special characters
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)                                   #removing single character (length) words
    text = re.sub(r'\s{2,}', ' ', text)                                         #removing multiple white spaces
    text = text.strip()                                                         #removing spaces from start & end of the text

    # Tokenise, drop stop words, collapse negation words into 'not', then stem.
    return [stemmer.stem(replace_by_not(token))
            for token in word_tokenize(text)
            if token not in stop_words]                                         #returns tokenized text in list

In [14]:
#applying text pre-processing function: every review text becomes a list of cleaned, stemmed tokens
df['Text'] = df['Text'].apply(preprocess)
df.head()

Unnamed: 0,Text,Score
77580,"[author, wrote, wild, thing, carol, king, wrot...",0
73066,"[book, contain, collect, twelv, short, stateme...",0
72413,"[copi, smaller, expect, mostli, not, pay, atte...",0
70563,"[rememb, see, show, air, televis, year, ago, c...",0
94893,"[great, book, perfect, condit, arriv, short, a...",0


In [15]:
#Extracting X & y for Training
X = df['Text']      # tokenised reviews (one list of stemmed tokens per review)
y = df['Score']     # binary label: 1 = negative review, 0 = positive review

# Only X and y are used from here on, so the name `df` is removed.
del df

## Featurization - Bag of Words, TF-IDF

### Bag of Words: Uni-gram & Bi-gram Combined (Count)

In [16]:
#Bag of words : Uni-gram & Bi-gram
# X already holds token lists, so tokenizer and preprocessor are identity functions and token_pattern is disabled.
# min_df = 20 ignores n-grams found in fewer than 20 reviews; max_features = 2500 keeps the 2500 most frequent n-grams.
# NOTE(review): the vocabulary is fitted on all reviews, i.e. before the train/test split performed later,
# so test reviews influence the feature set - confirm this is acceptable.
count_vect = CountVectorizer(ngram_range = (1, 2), token_pattern = None, tokenizer = lambda doc: doc, preprocessor = lambda doc: doc, min_df = 20, max_features = 2500) 
count_vect.fit(X)

X_uni_bi = count_vect.transform(X)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (replaced by get_feature_names_out()) - confirm the installed version.
print("Some Feature Names ", count_vect.get_feature_names()[:20])

Some Feature Names  ['abl', 'abl find', 'abl get', 'absolut', 'absolut delici', 'absolut love', 'absorb', 'accept', 'accord', 'acid', 'across', 'act', 'activ', 'actual', 'actual tast', 'ad', 'ad sugar', 'add', 'add littl', 'add water']


### TF-IDF

In [17]:
#tf-idf
# Same tokenisation settings as the Bag-of-Words vectorizer: X already holds token lists, so tokenizer and
# preprocessor are identity functions; uni-grams and bi-grams, min_df = 20, at most 2500 features.
# NOTE(review): vocabulary and IDF weights are fitted on all reviews, i.e. before the train/test split
# performed later, so test reviews influence the features - confirm this is acceptable.
tf_idf_vect = TfidfVectorizer(ngram_range = (1, 2), token_pattern = None, tokenizer = lambda doc: doc, preprocessor = lambda doc: doc, min_df = 20, max_features = 2500)
tf_idf_vect.fit(X)

X_tf_idf = tf_idf_vect.transform(X)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (replaced by get_feature_names_out()) - confirm the installed version.
print("Some Feature Names ", tf_idf_vect.get_feature_names()[0:10])

Some Feature Names  ['abl', 'abl find', 'abl get', 'absolut', 'absolut delici', 'absolut love', 'absorb', 'accept', 'accord', 'acid']


## Word2Vec

### Building Word2Vec Model

In [18]:
#word2vec model using gensim library
# Trained on the tokenised reviews: 200-dimensional vectors, context window of 10,
# words occurring fewer than 10 times are ignored.
# NOTE(review): `size` is the gensim 3.x name of this parameter (renamed `vector_size` in gensim 4) - confirm the installed version.
# NOTE(review): no `seed` is passed and 3 worker threads are used, so the embeddings may vary between runs.
word2vec = Word2Vec(X, min_count = 10, window = 10, size = 200, workers = 3)

### Vocabulary of Word2Vec Model

In [19]:
#vocabulary of word2vec model (the words that received a vector); used for membership checks when averaging
# NOTE(review): `wv.vocab` is the gensim 3.x API (gensim 4 exposes `wv.key_to_index` instead) - confirm the installed version.
vocabulary = word2vec.wv.vocab

### Average of Word2Vec Vectors for Document Vector

In [20]:
#avg word2vec function
def avg_word2vec(x):
    """Return the document vector of one review: the mean of its word vectors.

    Only tokens present in the module-level `vocabulary` contribute; their
    vectors are looked up in the module-level `word2vec` model.

    Parameters
    ----------
    x : iterable of str
        Tokens of one review.

    Returns
    -------
    numpy.ndarray
        Vector of length 200; all zeros when none of the tokens is in the
        vocabulary.
    """
    vectors = np.zeros((200, ), dtype = "float32")
    words = 0
    for item in x:
        if item in vocabulary:                                    #checking if word is present in word2vec vocabulary
            words += 1
            vectors = np.add(vectors, word2vec.wv[item])          #vector representation of the word
    if words == 0:
        # Nothing to average: return the zero vector directly instead of
        # first computing 0 / 0 (NaNs plus a RuntimeWarning) and discarding it.
        return vectors
    return np.divide(vectors, words)                              #average of all available vectors

In [21]:
#calculating avg word2vec
# One row per review, one column per embedding dimension; rows are filled in the iteration order of X.
df_size = len(X)
ndim = 200                                                  # must match the vector length returned by avg_word2vec
X_word2vec = np.zeros((df_size, ndim),  dtype = "float32")
for i, x in enumerate(X):
    X_word2vec[i] = avg_word2vec(x)
X_word2vec.shape

(86855, 200)

### Creating Train & Test Data

In [22]:
#train & test partitioning for ngram data
# 70/30 shuffled split with a fixed random_state (no `stratify` argument is passed).
X_train_ngram, X_test_ngram, y_train_ngram, y_test_ngram = train_test_split(X_uni_bi, y, test_size = 0.3, shuffle = True, random_state = 1)
del X_uni_bi

In [23]:
#train & test partitioning for tf-idf data
# toarray() converts the sparse TF-IDF matrix into a dense array before the 70/30 split.
# NOTE(review): a dense (n_reviews x 2500) array is memory-heavy - confirm every downstream model needs dense input.
X_train_tf_idf, X_test_tf_idf, y_train_tf_idf, y_test_tf_idf = train_test_split(X_tf_idf.toarray(), y, test_size = 0.3, shuffle = True, random_state = 1)
del X_tf_idf

In [24]:
#train & test partitioning for word2vec data
# 70/30 shuffled split with a fixed random_state (no `stratify` argument is passed).
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(X_word2vec, y, test_size = 0.3, shuffle = True, random_state = 1)
del X_word2vec

In [25]:
#shape of training data of ngram, tf-idf and word2vec data respectively (feature matrix first, then labels)
print(X_train_ngram.shape, y_train_ngram.shape)
print(X_train_tf_idf.shape, y_train_tf_idf.shape)
print(X_train_word2vec.shape, y_train_word2vec.shape)

(60798, 2500) (60798,)
(60798, 2500) (60798,)
(60798, 200) (60798,)


In [26]:
#checking proportion of -ve reviews (label 1) in training & test data of ngram, tf-idf and word2vec data respectively
print("Train n-gram: ", round(np.mean(y_train_ngram), 2), "   Test n-gram", round(np.mean(y_test_ngram), 2))
print("Train tf-idf: ", round(np.mean(y_train_tf_idf), 2),  "   Test tf-idf", round(np.mean(y_test_tf_idf), 2))
print("Train word2vec: ", round(np.mean(y_train_word2vec), 2),  "   Test word2vec", round(np.mean(y_test_word2vec), 2))

Train n-gram:  0.16    Test n-gram 0.16
Train tf-idf:  0.16    Test tf-idf 0.16
Train word2vec:  0.16    Test word2vec 0.16


## Applying Machine Learning Algorithms for Sentiment Analysis
1. Naive Bayes
2. Logistic Regression
3. Decision Tree
4. Random Forest

### Function for Performance Metrics

In [27]:
#function to get performance metrics
def accuracy_metrics(y, y_pred, y_prob):
    """Print an evaluation report for a binary classifier.

    The report contains, in this order: confusion matrix, accuracy,
    precision, recall, F1 score and area under the ROC curve. Precision and
    recall use the scorers' default settings; the F1 score is computed with
    average='macro'.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like
        Predicted scores/probabilities passed to roc_auc_score.

    Returns
    -------
    None
        The metrics are only printed.
    """
    matrix = confusion_matrix(y, y_pred)
    print('Confusion Matrix :')
    print(matrix)

    accuracy = accuracy_score(y, y_pred)
    print('Accuracy Score :', accuracy)

    precision = precision_score(y, y_pred)
    print('Precision : ', precision)

    recall = recall_score(y, y_pred)
    print('Recall : ', recall)

    macro_f1 = f1_score(y, y_pred, average='macro')
    print('FI Score : ', macro_f1)

    auc = roc_auc_score(y, y_prob)
    print('Area under the Curve : ', auc)

## Naive Bayes

### Naive Bayes on Bag of Words

In [28]:
#naive bayes on bag of words
# MultinomialNB with default settings, fitted on the n-gram count features.
nb = MultinomialNB()
nb.fit(X_train_ngram, y_train_ngram)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
#accuracy metrics on train data (Naive Bayes, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
train_prob = nb.predict_proba(X_train_ngram)[:,1]
train_pred = nb.predict(X_train_ngram)

accuracy_metrics(y_train_ngram, train_pred, train_prob)

Confusion Matrix :
[[47779  3474]
 [ 2227  7318]]
Accuracy Score : 0.9062304681075035
Precision :  0.6780948851000741
Recall :  0.7666841278156102
FI Score :  0.831686166604258
Area under the Curve :  0.9359902600905132


In [30]:
#accuracy metrics on test data (Naive Bayes, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
test_prob = nb.predict_proba(X_test_ngram)[:,1]
test_pred = nb.predict(X_test_ngram)

accuracy_metrics(y_test_ngram, test_pred, test_prob)

Confusion Matrix :
[[20505  1475]
 [  997  3080]]
Accuracy Score : 0.9051310588325594
Precision :  0.6761800219538968
Recall :  0.755457444199166
FI Score :  0.8283863074337239
Area under the Curve :  0.9318589234131056


### Naive Bayes on TF-IDF

In [31]:
#naive bayes on tf-idf
# GaussianNB (not MultinomialNB as for the counts) with default settings, fitted on the dense TF-IDF array.
nb = GaussianNB()
nb.fit(X_train_tf_idf, y_train_tf_idf)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
#accuracy metrics on train data (Naive Bayes, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
train_prob = nb.predict_proba(X_train_tf_idf)[:,1]
train_pred = nb.predict(X_train_tf_idf)

accuracy_metrics(y_train_tf_idf, train_pred, train_prob)

Confusion Matrix :
[[40929 10324]
 [ 1208  8337]]
Accuracy Score : 0.8103227079838152
Precision :  0.4467606237607845
Recall :  0.8734415924567837
FI Score :  0.7338343237179887
Area under the Curve :  0.8662790072199789


In [33]:
#accuracy metrics on test data (Naive Bayes, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
test_prob = nb.predict_proba(X_test_tf_idf)[:,1]
test_pred = nb.predict(X_test_tf_idf)

accuracy_metrics(y_test_tf_idf, test_pred, test_prob)

Confusion Matrix :
[[17514  4466]
 [  654  3423]]
Accuracy Score : 0.8035076946693787
Precision :  0.4338952972493345
Recall :  0.8395879323031641
FI Score :  0.7222964318333382
Area under the Curve :  0.8482404288421499


### Naive Bayes on Word2Vec

In [34]:
#naive bayes on word2vec
# GaussianNB with default settings, fitted on the averaged word2vec document vectors.
nb = GaussianNB()
nb.fit(X_train_word2vec, y_train_word2vec)

GaussianNB(priors=None, var_smoothing=1e-09)

In [35]:
#accuracy metrics on train data (Naive Bayes, word2vec); [:,1] keeps the probability of label 1 (negative review)
train_prob = nb.predict_proba(X_train_word2vec)[:,1]
train_pred = nb.predict(X_train_word2vec)

accuracy_metrics(y_train_word2vec, train_pred, train_prob)

Confusion Matrix :
[[39822 11431]
 [ 1965  7580]]
Accuracy Score : 0.7796638047304187
Precision :  0.39871653253379624
Recall :  0.7941330539549503
FI Score :  0.6934527977007843
Area under the Curve :  0.8667273720358287


In [36]:
#accuracy metrics on test data (Naive Bayes, word2vec); [:,1] keeps the probability of label 1 (negative review)
test_prob = nb.predict_proba(X_test_word2vec)[:,1]
test_pred = nb.predict(X_test_word2vec)

accuracy_metrics(y_test_word2vec, test_pred, test_prob)

Confusion Matrix :
[[17182  4798]
 [  801  3276]]
Accuracy Score : 0.7851249184480178
Precision :  0.40574684171414416
Recall :  0.8035320088300221
FI Score :  0.6995551413408979
Area under the Curve :  0.870759205806871


## Logistic Regression

### Logistic Regression on Bag of Words

In [37]:
#logistic regression on bag of words
# Iteration limit set to 1000 and random_state fixed; all other settings are the defaults.
lr = LogisticRegression(max_iter = 1000, random_state = 1)
lr.fit(X_train_ngram, y_train_ngram)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
#accuracy metrics on train data (Logistic Regression, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
train_prob = lr.predict_proba(X_train_ngram)[:,1]
train_pred = lr.predict(X_train_ngram)

accuracy_metrics(y_train_ngram, train_pred, train_prob)

Confusion Matrix :
[[50188  1065]
 [ 2522  7023]]
Accuracy Score : 0.9410013487285765
Precision :  0.8683234421364985
Recall :  0.7357778941854374
FI Score :  0.8810359724174082
Area under the Curve :  0.9691562691951737


In [39]:
#accuracy metrics on test data (Logistic Regression, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
test_prob = lr.predict_proba(X_test_ngram)[:,1]
test_pred = lr.predict(X_test_ngram)

accuracy_metrics(y_test_ngram, test_pred, test_prob)

Confusion Matrix :
[[21306   674]
 [ 1303  2774]]
Accuracy Score : 0.9241278735080785
Precision :  0.8045243619489559
Recall :  0.6804022565611969
FI Score :  0.8464687288977981
Area under the Curve :  0.9417165202249778


### Logistic Regression on TF-IDF

In [40]:
#logistic regression on tf-idf
# Iteration limit set to 1000 and random_state fixed; all other settings are the defaults.
lr = LogisticRegression(max_iter = 1000, random_state = 1)
lr.fit(X_train_tf_idf, y_train_tf_idf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
#accuracy metrics on train data (Logistic Regression, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
train_prob = lr.predict_proba(X_train_tf_idf)[:,1]
train_pred = lr.predict(X_train_tf_idf)

accuracy_metrics(y_train_tf_idf, train_pred, train_prob)

Confusion Matrix :
[[50431   822]
 [ 3268  6277]]
Accuracy Score : 0.9327280502648113
Precision :  0.8842090435272574
Recall :  0.6576217915138816
FI Score :  0.8576478027954744
Area under the Curve :  0.9662158155287479


In [42]:
#accuracy metrics on test data (Logistic Regression, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
test_prob = lr.predict_proba(X_test_tf_idf)[:,1]
test_pred = lr.predict(X_test_tf_idf)

accuracy_metrics(y_test_tf_idf, test_pred, test_prob)

Confusion Matrix :
[[21544   436]
 [ 1545  2532]]
Accuracy Score : 0.9239743638945389
Precision :  0.8530997304582211
Recall :  0.6210448859455482
FI Score :  0.8374264200932436
Area under the Curve :  0.9565904786008553


### Logistic Regression on Word2Vec

In [43]:
#logistic regression on word2vec
# Iteration limit set to 1000 and random_state fixed; all other settings are the defaults.
lr = LogisticRegression(max_iter = 1000, random_state = 1)
lr.fit(X_train_word2vec, y_train_word2vec)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
#accuracy metrics on train data (Logistic Regression, word2vec); [:,1] keeps the probability of label 1 (negative review)
train_prob = lr.predict_proba(X_train_word2vec)[:,1]
train_pred = lr.predict(X_train_word2vec)

accuracy_metrics(y_train_word2vec, train_pred, train_prob)

Confusion Matrix :
[[49693  1560]
 [ 3977  5568]]
Accuracy Score : 0.9089279252606993
Precision :  0.7811447811447811
Recall :  0.5833420639078052
FI Score :  0.8075670814107918
Area under the Curve :  0.9330691876759605


In [45]:
#accuracy metrics on test data (Logistic Regression, word2vec); [:,1] keeps the probability of label 1 (negative review)
test_prob = lr.predict_proba(X_test_word2vec)[:,1]
test_pred = lr.predict(X_test_word2vec)

accuracy_metrics(y_test_word2vec, test_pred, test_prob)

Confusion Matrix :
[[21267   713]
 [ 1707  2370]]
Accuracy Score : 0.9071266838085735
Precision :  0.768731754784301
Recall :  0.5813097866077999
FI Score :  0.8040891831575567
Area under the Curve :  0.9328244197291314


## Decision Tree

### Decision Tree on Bag of Words

In [46]:
#decision tree on bag of words
# Single tree with a fixed random_state; every leaf must contain at least 30 training samples.
dt = DecisionTreeClassifier(min_samples_leaf = 30, random_state = 1)
dt.fit(X_train_ngram, y_train_ngram)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [47]:
#accuracy metrics on train data (Decision Tree, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
train_prob = dt.predict_proba(X_train_ngram)[:,1]
train_pred = dt.predict(X_train_ngram)

accuracy_metrics(y_train_ngram, train_pred, train_prob)

Confusion Matrix :
[[49453  1800]
 [ 5017  4528]]
Accuracy Score : 0.8878746011381954
Precision :  0.7155499367888748
Recall :  0.4743844944997381
FI Score :  0.7530243753109165
Area under the Curve :  0.9062161019088973


In [48]:
#accuracy metrics on test data (Decision Tree, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
test_prob = dt.predict_proba(X_test_ngram)[:,1]
test_pred = dt.predict(X_test_ngram)

accuracy_metrics(y_test_ngram, test_pred, test_prob)

Confusion Matrix :
[[21092   888]
 [ 2312  1765]]
Accuracy Score : 0.8771923091683617
Precision :  0.6652845834903882
Recall :  0.43291636006867795
FI Score :  0.7270038285154072
Area under the Curve :  0.8490123694852256


### Decision Tree on TF-IDF

In [49]:
#decision tree on tf-idf
# Single tree with a fixed random_state; every leaf must contain at least 30 training samples.
dt = DecisionTreeClassifier(min_samples_leaf = 30, random_state = 1)
dt.fit(X_train_tf_idf, y_train_tf_idf)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [50]:
#accuracy metrics on train data (Decision Tree, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
train_prob = dt.predict_proba(X_train_tf_idf)[:,1]
train_pred = dt.predict(X_train_tf_idf)

accuracy_metrics(y_train_tf_idf, train_pred, train_prob)

Confusion Matrix :
[[49503  1750]
 [ 4903  4642]]
Accuracy Score : 0.8905720582913912
Precision :  0.7262202753441802
Recall :  0.4863279203771608
FI Score :  0.759788526202852
Area under the Curve :  0.9202115723806358


In [51]:
#accuracy metrics on test data (Decision Tree, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
test_prob = dt.predict_proba(X_test_tf_idf)[:,1]
test_pred = dt.predict(X_test_tf_idf)

accuracy_metrics(y_test_tf_idf, test_pred, test_prob)

Confusion Matrix :
[[20999   981]
 [ 2325  1752]]
Accuracy Score : 0.8731243044095637
Precision :  0.6410537870472008
Recall :  0.4297277409860191
FI Score :  0.7207818780382123
Area under the Curve :  0.8329765693297563


### Decision Tree on Word2Vec

In [52]:
#decision tree on word2vec
# Single tree with a fixed random_state; every leaf must contain at least 50 training samples
# (the Bag of Words and TF-IDF trees use 30).
dt = DecisionTreeClassifier(min_samples_leaf = 50, random_state = 1)
dt.fit(X_train_word2vec, y_train_word2vec)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=50, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [53]:
#accuracy metrics on train data (Decision Tree, word2vec); [:,1] keeps the probability of label 1 (negative review)
train_prob = dt.predict_proba(X_train_word2vec)[:,1]
train_pred = dt.predict(X_train_word2vec)

accuracy_metrics(y_train_word2vec, train_pred, train_prob)

Confusion Matrix :
[[49607  1646]
 [ 4863  4682]]
Accuracy Score : 0.8929405572551729
Precision :  0.7398862199747156
Recall :  0.4905185961236249
FI Score :  0.7641830216955781
Area under the Curve :  0.9279673672170383


In [54]:
#accuracy metrics on test data (Decision Tree, word2vec); [:,1] keeps the probability of label 1 (negative review)
test_prob = dt.predict_proba(X_test_word2vec)[:,1]
test_pred = dt.predict(X_test_word2vec)

accuracy_metrics(y_test_word2vec, test_pred, test_prob)

Confusion Matrix :
[[20972  1008]
 [ 2454  1623]]
Accuracy Score : 0.8671374294815213
Precision :  0.6168757126567845
Recall :  0.3980868285504047
FI Score :  0.703827195494878
Area under the Curve :  0.8322509726883963


## Random Forest

### Random Forest on Bag of Words

In [55]:
#random forest on bag of words
# 200 trees, at least 20 training samples per leaf, 3 parallel jobs, fixed random_state.
rf = RandomForestClassifier(n_estimators = 200, min_samples_leaf = 20, n_jobs = 3, random_state = 1)
rf.fit(X_train_ngram, y_train_ngram)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=3,
                       oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [56]:
#accuracy metrics on train data (Random Forest, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
train_prob = rf.predict_proba(X_train_ngram)[:,1]
train_pred = rf.predict(X_train_ngram)

accuracy_metrics(y_train_ngram, train_pred, train_prob)

Confusion Matrix :
[[51190    63]
 [ 7861  1684]]
Accuracy Score : 0.8696667653541235
Precision :  0.9639381797366915
Recall :  0.17642744892613935
FI Score :  0.6132132139422937
Area under the Curve :  0.937857848068626


In [57]:
#accuracy metrics on test data (Random Forest, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
test_prob = rf.predict_proba(X_test_ngram)[:,1]
test_pred = rf.predict(X_test_ngram)

accuracy_metrics(y_test_ngram, test_pred, test_prob)

Confusion Matrix :
[[21945    35]
 [ 3424   653]]
Accuracy Score : 0.867252561691676
Precision :  0.9491279069767442
Recall :  0.16016678930586215
FI Score :  0.6005142808096615
Area under the Curve :  0.9225754655100418


### Random Forest on TF-IDF

In [58]:
#random forest on tf-idf
# 200 trees, at least 20 training samples per leaf, 3 parallel jobs, fixed random_state.
rf = RandomForestClassifier(n_estimators = 200, min_samples_leaf = 20, n_jobs = 3, random_state = 1)
rf.fit(X_train_tf_idf, y_train_tf_idf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=3,
                       oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [59]:
#accuracy metrics on train data (Random Forest, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
train_prob = rf.predict_proba(X_train_tf_idf)[:,1]
train_pred = rf.predict(X_train_tf_idf)

accuracy_metrics(y_train_tf_idf, train_pred, train_prob)

Confusion Matrix :
[[51167    86]
 [ 7227  2318]]
Accuracy Score : 0.879716438040725
Precision :  0.9642262895174709
Recall :  0.2428496595075956
FI Score :  0.6606432033489271
Area under the Curve :  0.9532842636652772


In [60]:
#accuracy metrics on test data (Random Forest, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
test_prob = rf.predict_proba(X_test_tf_idf)[:,1]
test_pred = rf.predict(X_test_tf_idf)

accuracy_metrics(y_test_tf_idf, test_pred, test_prob)

Confusion Matrix :
[[21923    57]
 [ 3237   840]]
Accuracy Score : 0.8735848332501823
Precision :  0.9364548494983278
Recall :  0.20603384841795438
FI Score :  0.6339396853455533
Area under the Curve :  0.9241737253948836


### Random Forest on Word2Vec

In [61]:
#random forest on word2vec
# 200 trees, at least 40 training samples per leaf (the Bag of Words and TF-IDF forests use 20),
# 3 parallel jobs, fixed random_state.
rf = RandomForestClassifier(n_estimators = 200, min_samples_leaf = 40, n_jobs = 3, random_state = 1)
rf.fit(X_train_word2vec, y_train_word2vec)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=40, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=3,
                       oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [62]:
#accuracy metrics on train data (Random Forest, word2vec); [:,1] keeps the probability of label 1 (negative review)
train_prob = rf.predict_proba(X_train_word2vec)[:,1]
train_pred = rf.predict(X_train_word2vec)

accuracy_metrics(y_train_word2vec, train_pred, train_prob)

Confusion Matrix :
[[50941   312]
 [ 6018  3527]]
Accuracy Score : 0.8958847330504293
Precision :  0.9187288356342798
Recall :  0.36951283394447354
FI Score :  0.7342754677459087
Area under the Curve :  0.9580670002201612


In [63]:
#accuracy metrics on test data (Random Forest, word2vec); [:,1] keeps the probability of label 1 (negative review)
test_prob = rf.predict_proba(X_test_word2vec)[:,1]
test_pred = rf.predict(X_test_word2vec)

accuracy_metrics(y_test_word2vec, test_pred, test_prob)

Confusion Matrix :
[[21750   230]
 [ 2882  1195]]
Accuracy Score : 0.8805695206662317
Precision :  0.8385964912280702
Recall :  0.29310767721363745
FI Score :  0.6838117860015048
Area under the Curve :  0.9145330348034189


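Logistic regression gives the best overall performance: it achieves the highest test accuracy, F1 score and AUC of the four methods, and both its precision and recall are comparatively high, whereas the other methods trade one heavily for the other (e.g. Random Forest has very high precision but very low recall).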

## Applying SMOTE Algorithm for Handling Class Imbalance

SMOTE on Bag of Words

In [64]:
#ngram data - smote algorithm for handling data imbalance
# Only the training split is oversampled; the test split keeps its original class balance.
print("Before OverSampling, Proportion of Label '1': {}".format(np.mean(y_train_ngram)))

sm = SMOTE(k_neighbors = 20, random_state = 1)
# fit_resample is the supported name of this method; fit_sample was its deprecated alias.
X_train_ngram_smote, y_train_ngram_smote = sm.fit_resample(X_train_ngram, y_train_ngram)

print("After OverSampling, Proportion of Label '1': {}".format(np.mean(y_train_ngram_smote)))

Before OverSampling, Proportion of Label '1': 0.15699529589789138
After OverSampling, Proportion of Label '1': 0.5


SMOTE on TF-IDF

In [65]:
#tf-idf data - smote algorithm for handling data imbalance
# Only the training split is oversampled; the test split keeps its original class balance.
print("Before OverSampling, Proportion of Label '1': {}".format(np.mean(y_train_tf_idf)))

sm = SMOTE(k_neighbors = 20, random_state = 1)
# fit_resample is the supported name of this method; fit_sample was its deprecated alias.
X_train_tf_idf_smote, y_train_tf_idf_smote = sm.fit_resample(X_train_tf_idf, y_train_tf_idf)

print("After OverSampling, Proportion of Label '1': {}".format(np.mean(y_train_tf_idf_smote)))

Before OverSampling, Proportion of Label '1': 0.15699529589789138
After OverSampling, Proportion of Label '1': 0.5


SMOTE on Word2Vec

In [66]:
#word2vec data - smote algorithm for handling data imbalance
# Only the training split is oversampled; the test split keeps its original class balance.
print("Before OverSampling, Proportion of Label '1': {}".format(np.mean(y_train_word2vec)))

sm = SMOTE(k_neighbors = 20, random_state = 1)
# fit_resample is the supported name of this method; fit_sample was its deprecated alias.
X_train_word2vec_smote, y_train_word2vec_smote = sm.fit_resample(X_train_word2vec, y_train_word2vec)

print("After OverSampling, Proportion of Label '1': {}".format(np.mean(y_train_word2vec_smote)))

Before OverSampling, Proportion of Label '1': 0.15699529589789138
After OverSampling, Proportion of Label '1': 0.5


### Logistic Regression on SMOTE Oversampled Bag of Words Data

In [67]:
#logistic regression on bag of words (smote)
# Same model settings as the non-SMOTE run, but fitted on the oversampled training data.
lr = LogisticRegression(max_iter = 1000, random_state = 1)
lr.fit(X_train_ngram_smote, y_train_ngram_smote)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
#accuracy metrics on train data (SMOTE-trained Logistic Regression, Bag of Words)
# Scored on the original, non-oversampled training split; [:,1] keeps the probability of label 1 (negative review).
train_prob = lr.predict_proba(X_train_ngram)[:,1]
train_pred = lr.predict(X_train_ngram)

accuracy_metrics(y_train_ngram, train_pred, train_prob)

Confusion Matrix :
[[47968  3285]
 [ 1769  7776]]
Accuracy Score : 0.9168722655350505
Precision :  0.7030105777054516
Recall :  0.8146673651126244
FI Score :  0.8523435363482039
Area under the Curve :  0.9514724421400438


In [69]:
#accuracy metrics on test data (SMOTE-trained Logistic Regression, Bag of Words); [:,1] keeps the probability of label 1 (negative review)
test_prob = lr.predict_proba(X_test_ngram)[:,1]
test_pred = lr.predict(X_test_ngram)

accuracy_metrics(y_test_ngram, test_pred, test_prob)

Confusion Matrix :
[[20318  1662]
 [  942  3135]]
Accuracy Score : 0.9000652415857543
Precision :  0.6535334584115072
Recall :  0.7689477557027226
FI Score :  0.823168234405951
Area under the Curve :  0.9181385490365961


### Logistic Regression on SMOTE Oversampled TF-IDF Data

In [70]:
#logistic regression on tf-idf (smote)
# Same model settings as the non-SMOTE run, but fitted on the oversampled training data.
lr = LogisticRegression(max_iter = 1000, random_state = 1)
lr.fit(X_train_tf_idf_smote, y_train_tf_idf_smote)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
#accuracy metrics on train data (SMOTE-trained Logistic Regression, TF-IDF)
# Scored on the original, non-oversampled training split; [:,1] keeps the probability of label 1 (negative review).
train_prob = lr.predict_proba(X_train_tf_idf)[:,1]
train_pred = lr.predict(X_train_tf_idf)

accuracy_metrics(y_train_tf_idf, train_pred, train_prob)

Confusion Matrix :
[[47056  4197]
 [  975  8570]]
Accuracy Score : 0.9149314122175072
Precision :  0.6712618469491658
Recall :  0.8978522786799371
FI Score :  0.8580517502087988
Area under the Curve :  0.9688861029453647


In [72]:
#accuracy metrics on test data (SMOTE-trained Logistic Regression, TF-IDF); [:,1] keeps the probability of label 1 (negative review)
test_prob = lr.predict_proba(X_test_tf_idf)[:,1]
test_pred = lr.predict(X_test_tf_idf)

accuracy_metrics(y_test_tf_idf, test_pred, test_prob)

Confusion Matrix :
[[20066  1914]
 [  580  3497]]
Accuracy Score : 0.9042867559580918
Precision :  0.6462761042321198
Recall :  0.857738533235222
FI Score :  0.8393163806634101
Area under the Curve :  0.9564549059360719


### Logistic Regression on SMOTE Oversampled Word2Vec Data

In [73]:
#logistic regression on word2vec (smote)
# Same model settings as the non-SMOTE run, but fitted on the oversampled training data.
lr = LogisticRegression(max_iter = 1000, random_state = 1)
lr.fit(X_train_word2vec_smote, y_train_word2vec_smote)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [74]:
#accuracy metrics on train data (SMOTE-trained Logistic Regression, word2vec)
# Scored on the original, non-oversampled training split; [:,1] keeps the probability of label 1 (negative review).
train_prob = lr.predict_proba(X_train_word2vec)[:,1]
train_pred = lr.predict(X_train_word2vec)

accuracy_metrics(y_train_word2vec, train_pred, train_prob)

Confusion Matrix :
[[44304  6949]
 [ 1408  8137]]
Accuracy Score : 0.8625448205533077
Precision :  0.5393742542754872
Recall :  0.852488213724463
FI Score :  0.787263186816549
Area under the Curve :  0.9331626935543218


In [75]:
#accuracy metrics on test data (SMOTE-trained Logistic Regression, word2vec); [:,1] keeps the probability of label 1 (negative review)
test_prob = lr.predict_proba(X_test_word2vec)[:,1]
test_pred = lr.predict(X_test_word2vec)

accuracy_metrics(y_test_word2vec, test_pred, test_prob)

Confusion Matrix :
[[18982  2998]
 [  588  3489]]
Accuracy Score : 0.8623786314617953
Precision :  0.5378449206104516
Recall :  0.8557763061074319
FI Score :  0.7871197960878874
Area under the Curve :  0.9326510956177299


We can see that applying the SMOTE algorithm has boosted the recall of the logistic regression model by a good margin. Recall, our focus metric, has increased significantly for all three feature sets, at the cost of lower precision and accuracy.

Overall logistic regression model trained on oversampled tf-idf data performs the best.