# Importing libraries

In [2]:
# For data processing and handling
import numpy as np
import pandas as pd

# For nlp data pre-processing
import re
import nltk

#For Data Scaling
from sklearn.preprocessing import MinMaxScaler

# Candidate Model library import
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Proposed model library (voting stack using svc + LogRegression + Decision Tree)
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# For model evaluation
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
import time

np.random.seed(5678)

# The Dataset
We will use the imbd 50k dataset. This [50000 x 2] has 1 column of review and the second is the label.
The dataset holds 25k positive labelled and 25k negative labelled rows.

We assessed this dataset is large enough to train on and get interesting results on figuring the candidate algorithms have been checked to perform well on high dimensional and sparse matrices (something nlp algorithms eventually work on).

In [3]:
dataset = pd.read_csv('https://airafique.com/yeet/imbd_ds.csv')

In [4]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Lets see about length and word count of each review

In [5]:
dataset['review'].str.count('\s+').agg('mean'),dataset['review'].str.len().agg('mean')

(230.15824, 1309.43102)

We have 1310 characters and 230 words per review on average. This is a lot to work with.

In [6]:
dataset['sentiment'].loc[dataset['sentiment'] == "positive"] = 1
dataset['sentiment'].loc[dataset['sentiment'] == "negative"] = 0

# Data preprocessing

We have a large set of information to work with. Our dataset length range from 3 to roughly 2500 words. NLP tasks require us to vectorize the text information for easier processing by the computer but we need to feed a constant sized matrix to the computer too. With current information, our computer will tire out.
Fortunately, we won't do something so crazy and infact we will clean our data first. The steps we will undertake are as follows

- **Remove html tags** : our reviews have some html tags that are of no use in the long run and we are better of removing them.
- **only consider alphabets** : since we need to work with words, using only alphabets will be helpful here so we effectively remove punctations and numbers.
- **convert text to lower case** : useful for processing and later parsing of inputs. Its a useful part in removing "noise" in the data.
- **Perform word Tokenization** : Break down related words as they mean the same in learning. E.g caring, cares are at root just care. This is lemmatization which we use here.
- **remove stopwords** : Words like "the, and, or, be.. etc" that are of no significant impact are removed. This shortens our corpus and keeps relevant enough words for training.
- **Vectorize the words** : The words are converted into numeric representations as numeric manipulations are easier to work on for the machine learning algorithms.

In [7]:
from bs4 import BeautifulSoup
import html as ihtml

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
corpus = []
for i in range(0, len(dataset)):
    review = BeautifulSoup(ihtml.unescape(dataset['review'][i]), "html.parser").text
    review = re.sub(r"\s+", " ", review)
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    lemma = WordNetLemmatizer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [lemma.lemmatize(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus[3]

'basically family little boy jake think zombie closet parent fighting time movie slower soap opera suddenly jake decides become rambo kill zombie ok first going make film must decide thriller drama drama movie watchable parent divorcing arguing like real life jake closet totally ruin film expected see boogeyman similar movie instead watched drama meaningless thriller spot well playing parent descent dialog shot jake ignore'

# Similarity check
 After cleaning the data, we perform a similarity analysis.
 We have 120 words on average now. We will use the count vectorizer function but prior to that, we want to see how post processing, our sentences are like.

We wish to check similarity between sentences.
Since 50000 x 50000 will take along time
We select 25 rows at random and check similarity against a threshold of more than 50% (this operation took 27 minutes to complete so one can extrapolate how much time it would have taken for 50000x50000 operations.)

In [33]:
from difflib import SequenceMatcher
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [48]:
import random
random_select= random.sample(range(0, 50000), 25)

In [51]:
print(random_select)

[47165, 48677, 25206, 37723, 14633, 49515, 2191, 11189, 25209, 27060, 17814, 17985, 12452, 619, 25456, 21442, 20594, 22355, 25912, 39137, 40524, 18815, 44014, 45749, 12447]


In [49]:
collect =0
for i in random_select:
  for j in range(0,50000):
    if similar(corpus[i], corpus[j]) > 0.5:
      collect +=1
print('Done')
print(collect)

Done
26


In [60]:
print('Combinations were',25*50000)
print('Threshold passes found',collect)
print('%age similar',collect/ (25*500))

Combinations were 1250000
Threshold passes found 26
%age similar 0.00208


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=800)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values
y = np.array(y, dtype=np.int64)

In [25]:
# Out of 944/1000 834/888  748/800
(pd.DataFrame(X) == 0).astype(int).sum(axis=1).agg('mean')

748.24868

In [26]:
scaler = MinMaxScaler((0,1))
X = scaler.fit_transform(X)

In [27]:
pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,790,791,792,793,794,795,796,797,798,799
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.008743,0.01497,0.004457,0.004398,0.00594,0.008993,0.032153,0.011005,0.020911,0.008697,...,0.006988,0.008557,0.01037,0.010326,0.026494,0.008577,0.013695,0.00406,0.014092,0.001289
std,0.04159,0.064195,0.032569,0.023896,0.027764,0.056147,0.068664,0.040236,0.048657,0.041012,...,0.030093,0.041668,0.044457,0.042966,0.062906,0.039702,0.04814,0.031028,0.04814,0.015683
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [76]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=800)
pca_model.fit(X)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9607661495342142


In [77]:
X = pca_model.transform(X)
X.shape

(50000, 800)

# The Models
We will use 4 algorithms namely
- Gaussian Naive Bayes
- Random Forest Classification
- Support Vector Classification
- XGBoost

And measure the performance against our proposed model which has following features
- A voting classifier of 3 models
  - Support Vector Classification
  - Decision Tree Classification
  - Logistic REgression Classification
- Using hard voting criteria

In [29]:
skf = StratifiedKFold(n_splits=10) 

gnb_model = GaussianNB()
rfc_model = RandomForestClassifier()
svc_model = SVC(kernel='linear', gamma='auto',max_iter=2500)
xgb_model = xgb.XGBClassifier(objective="binary:logistic")

#Proposed model

svc_part = LinearSVC()
logr_part = LogisticRegression(solver='lbfgs',max_iter=1000)
dtc_part = DecisionTreeClassifier()

proposed_model = VotingClassifier(estimators=[('svc_part', svc_part), ('dtc_part', dtc_part), ('logr_part', logr_part)], voting='hard')

In [30]:
def evaluate_model_training(crossvalid,model,X,y):
  time_record = []
  accuracy_stratified = []
  fscore_stratified = []

  for train_index, test_index in skf.split(X, y): 
      X_train_fold, X_test_fold = X[train_index], X[test_index] 
      y_train_fold, y_test_fold = y[train_index], y[test_index]
      
      #time diff calculation with traning
      init = time.time() 
      model.fit(X_train_fold, y_train_fold) 
      time_record.append(time.time()-init)

      #accuracy appening
      accuracy_stratified.append(model.score(X_test_fold, y_test_fold))
      
      #f1 score appening
      fscore_stratified.append(f1_score(y_test_fold, model.predict(X_test_fold)))
  return time_record, accuracy_stratified, fscore_stratified


def report_generator(df):
  df = df.append(df.agg(['mean', 'std']))
  return df

In [31]:
gnb_time,gnb_accuracy,gnb_fscore = evaluate_model_training(skf,gnb_model,X,y)

svc_time,svc_accuracy,svc_fscore = evaluate_model_training(skf,svc_model,X,y)

rfc_time,rfc_accuracy,rfc_fscore = evaluate_model_training(skf,rfc_model,X,y)

xgb_time,xgb_accuracy,xgb_fscore = evaluate_model_training(skf,xgb_model,X,y)

proposed_time,proposed_accuracy,proposed_fscore = evaluate_model_training(skf,proposed_model,X,y)



In [32]:
pd.set_option('precision',10)

In [33]:
# Convert the lsts into dataframe and appending their mean,std
accuracy_dataFrame = pd.DataFrame(
    {'gnb': gnb_accuracy,
     'svc': svc_accuracy,
     'rfc': rfc_accuracy,
     'xgb': xgb_accuracy,
     'proposed':proposed_accuracy
    })

fscore_dataFrame = pd.DataFrame(
    {'gnb': gnb_fscore,
     'svc': svc_fscore,
     'rfc': rfc_fscore,
     'xgb': xgb_fscore,
     'proposed':proposed_fscore
    })

time_dataFrame = pd.DataFrame(
    {'gnb': gnb_time,
     'svc': svc_time,
     'rfc': rfc_time,
     'xgb': xgb_time,
     'proposed':proposed_time
    })

time_report = report_generator(time_dataFrame)
accuracy_report = report_generator(accuracy_dataFrame)
fscore_report = report_generator(fscore_dataFrame)

In [34]:
accuracy_report

Unnamed: 0,gnb,svc,rfc,xgb,proposed
0,0.8096,0.5574,0.8268,0.8096,0.8612
1,0.8066,0.5652,0.8302,0.8126,0.8618
2,0.8082,0.5352,0.8288,0.8,0.8536
3,0.8108,0.5332,0.828,0.8066,0.8516
4,0.8038,0.5442,0.8114,0.7906,0.8472
5,0.804,0.542,0.8264,0.7986,0.861
6,0.8064,0.537,0.831,0.7996,0.8576
7,0.8068,0.531,0.8306,0.802,0.8576
8,0.8094,0.5498,0.824,0.796,0.8588
9,0.807,0.5548,0.8238,0.807,0.862


In [84]:
!pip install xlsxwriter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlsxwriter
  Downloading XlsxWriter-3.0.3-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 5.2 MB/s 
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.0.3


In [35]:
writer = pd.ExcelWriter('pandas_multiple.xlsx', engine='xlsxwriter')

# Write each dataframe to a different worksheet.
time_report.to_excel(writer, sheet_name='TimeRecord')
accuracy_report.to_excel(writer, sheet_name='Accuracy')
fscore_report.to_excel(writer, sheet_name='Fscore')

# Close the Pandas Excel writer and output the Excel file.
writer.close()

In [None]:

from sklearn.model_selection import GridSearchCV

parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
grid_search = GridSearchCV(estimator =svc_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = StratifiedKFold(10),
                           n_jobs = -1)
grid_search = grid_search.fit(X, y)


KeyboardInterrupt: ignored