# Importing libraries

In [None]:
# For data processing and handling
import numpy as np
import pandas as pd

# For nlp data pre-processing
import re
import nltk

#For Data Scaling
from sklearn.preprocessing import MinMaxScaler

# Candidate Model library import
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Proposed model library (voting stack using svc + LogRegression + Decision Tree)
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# For model evaluation
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
import time

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(5678)

# The Dataset
We will use the IMDB 50k dataset. This [50000 x 2] dataset has one column holding the review text and a second column holding the label.
The dataset holds 25k positive-labelled and 25k negative-labelled rows.

We judged this dataset to be large enough to train on and to yield interesting results, given that the candidate algorithms have been checked to perform well on high-dimensional, sparse matrices (which is what NLP algorithms eventually work on).

In [None]:
# Load the reviews CSV into a DataFrame (the path is relative).
dataset = pd.read_csv('Data/imbd_ds.csv')

In [None]:
# Preview the first rows of the dataset.
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Inspect the value at row position 0, column position 0.
dataset.iloc[0,0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

Let's look at the average length and word count of the reviews.

In [None]:
# Mean number of whitespace runs (a proxy for the word count) and mean
# character length per review. The pattern is a raw string so that "\s"
# reaches the regex engine unchanged instead of being parsed as an (invalid)
# Python string escape sequence.
dataset['review'].str.count(r'\s+').agg('mean'), dataset['review'].str.len().agg('mean')

(230.15824, 1309.43102)

# Data preprocessing

We have a large set of information to work with. The reviews in our dataset range in length from 3 to roughly 2500 words. NLP tasks require us to vectorize the text so that the computer can process it more easily, but we also need to feed the computer a constant-sized matrix. With the text as it currently stands, our computer would tire out.
Fortunately, we won't do something so crazy; in fact, we will clean our data first. The steps we will undertake are as follows:

- **Remove HTML tags** : our reviews contain some HTML tags that are of no use in the long run, and we are better off removing them.
- **Only consider alphabetic characters** : since we need to work with words, keeping only letters is helpful here, so we effectively remove punctuation and numbers.
- **Convert text to lower case** : useful for processing and later parsing of inputs. It is a useful part of removing "noise" in the data.
- **Perform word tokenization and lemmatization** : Split the text into words and reduce related words to their common root, as they mean the same for learning. E.g. caring and cares are at root just care. This is lemmatization, which we use here.
- **Remove stopwords** : Words like "the, and, or, be, etc." that have no significant impact are removed. This shortens our corpus and keeps the words relevant enough for training.
- **Vectorize the words** : The words are converted into numeric representations, as numeric data is easier for the machine learning algorithms to work on.

We have 1310 characters and 230 words per review on average. This is a lot to work with.

In [None]:
# Encode the sentiment labels as integers: "positive" -> 1, "negative" -> 0.
# The column is replaced in a single assignment on ``dataset`` itself; the
# chained form ``dataset[col].loc[mask] = value`` may write to a temporary
# copy and leave ``dataset`` unchanged.
# NOTE(review): any label other than these two would become NaN - the
# dataset is described as holding only positive/negative rows; confirm.
dataset['sentiment'] = dataset['sentiment'].map({"positive": 1, "negative": 0})

In [None]:
from bs4 import BeautifulSoup
import html as ihtml

# Download the NLTK packages named below; the stop-word list and the WordNet
# lemmatizer imported afterwards are used by the text-cleaning loop.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Build the cleaned corpus: one normalised, lemmatized string per review.

# Loop invariants are created once, not once per review (or once per word):
# the lemmatizer and the stop-word collection, the latter also as a set so
# that each membership test is a constant-time lookup.
lemma = WordNetLemmatizer()
all_stopwords = stopwords.words('english')
# 'not' is taken out of the stop-word list, so it is kept in the reviews.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
for raw_review in dataset['review']:
    # Decode HTML entities, then parse the markup and keep only its text.
    review = BeautifulSoup(ihtml.unescape(raw_review), "html.parser").text
    review = re.sub(r"\s+", " ", review)
    # Replace every character that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    # Lemmatize every word that is not a stop word and re-join with spaces.
    review = ' '.join(
        lemma.lemmatize(word) for word in review.split() if word not in stopword_set
    )
    corpus.append(review)

In [None]:
# Inspect the first cleaned review.
corpus[0]

'one reviewer mentioned watching oz episode hooked right exactly happened first thing struck oz brutality unflinching scene violence set right word go trust not show faint hearted timid show pull punch regard drug sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy not high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence not violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experience w

In [None]:
# Average character length and average word count of the cleaned reviews.
# The row count is taken from the corpus itself instead of being hard-coded.
collect_len = sum(len(cleaned_review) for cleaned_review in corpus)
collect_words = sum(len(re.findall(r'\w+', cleaned_review)) for cleaned_review in corpus)

print('Post processing average lenth of row is ',collect_len/len(corpus))
print('Post processing average words in row is ',collect_words/len(corpus))

Post processing average lenth of row is  805.58712
Post processing average words in row is  119.3565


We have reduced the average review to ~120 words and ~805 characters.

In [None]:
# Concatenate every cleaned review into one string for vocabulary counting.
# The reviews are joined with a space: appending them back to back would fuse
# the last word of one review with the first word of the next into a single
# bogus token. Joining also covers the whole corpus without a hard-coded size.
str_total = " ".join(corpus)

In [None]:
def word_count(str):
    """Count how many times each whitespace-separated word occurs.

    Args:
        str: The text to analyse. The parameter name shadows the builtin; it
            is kept unchanged so that existing callers keep working.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        keyed in order of first appearance. Empty or whitespace-only text
        yields an empty dict.
    """
    from collections import Counter

    # Counter does the tallying; convert back to the plain dict callers expect.
    return dict(Counter(str.split()))
# Word frequencies over the whole concatenated corpus.
string_collect =word_count(str_total)

In [None]:
# Number of distinct words in the frequency table (len() replaces the manual
# key-counting loop).
col = len(string_collect)

print(col)

134282


In [None]:
# Report the number of distinct words in the frequency table.
print('Words that have been used atleast 1 time ',len(string_collect.keys()))

Words that have been used atleast 1 time  134282


In [None]:
# Distinct words that occur exactly once, and their share of the vocabulary.
# The count is computed once and reused for the ratio.
single_use_count = sum(1 for v in string_collect.values() if v == 1)
single_use_count, single_use_count / len(string_collect)

(76297, 0.5681848646877466)

In [None]:
# Distinct words that occur fewer than 1000 times, and their share of the
# vocabulary. The count is computed once and reused for the ratio.
rare_word_count = sum(1 for v in string_collect.values() if v < 1000)
rare_word_count, rare_word_count / len(string_collect)

(133311, 0.9927689489283746)

Roughly 57% of the distinct words across all the rows occur only once.
At this stage, even after the useless words have been trimmed out, we still have a fairly large corpus to work with. As such, we have to put a cap on vectorization.

After experimenting with different numbers and some discussion, a vocabulary size of 1000 was deemed appropriate for the machine learning tasks.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based features; max_features=1000 caps the number of columns.
cv = CountVectorizer(max_features=1000)
term_matrix = cv.fit_transform(corpus)
X = term_matrix.toarray()

# Labels are read from the last column of the dataset as 64-bit integers.
y = np.array(dataset.iloc[:, -1].values, dtype=np.int64)

In [None]:
# Average number of zero-valued entries per row of the feature matrix.
zero_mask = pd.DataFrame(X) == 0
zeros_per_row = zero_mask.astype(int).sum(axis=1)
print('We now have ', zeros_per_row.agg('mean'), ' sparse results')

We now have  944.3755  sparse results


In [None]:
# Rescale every feature column into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, before any cross-validation
# split is made.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

In [None]:
# Summary statistics of every scaled feature column.
pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.00575,0.008743,0.01497,0.004457,0.004398,0.00594,0.008993,0.032153,0.011005,0.020911,...,0.010326,0.00549,0.005065,0.026494,0.008577,0.013695,0.00406,0.014092,0.002909,0.001289
std,0.041042,0.04159,0.064195,0.032569,0.023896,0.027764,0.056147,0.068664,0.040236,0.048657,...,0.042966,0.039495,0.039378,0.062906,0.039702,0.04814,0.031028,0.04814,0.022815,0.015683
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
from sklearn.decomposition import PCA

# Fit an 800-component PCA on the scaled features, then report the total of
# the per-component explained-variance ratios.
pca_model = PCA(n_components=800)
pca_model.fit(X)
retained_variance = sum(pca_model.explained_variance_ratio_)
print("Sum of variance ratios: ", retained_variance)

Sum of variance ratios:  0.9607661495342142


In [None]:
# Project the features onto the fitted principal components and show the
# resulting shape.
X = pca_model.transform(X)
X.shape

(50000, 800)

# The Models
We will use four algorithms, namely:
- Gaussian Naive Bayes
- Random Forest Classification
- Support Vector Classification
- XGBoost

We then measure their performance against our proposed model, which has the following features:
- A voting classifier of 3 models
  - Support Vector Classification
  - Decision Tree Classification
  - Logistic Regression Classification
- Using hard voting criteria

In [None]:
# 10-fold stratified splitter; the same instance is passed to every
# evaluation call below.
skf = StratifiedKFold(n_splits=10) 

# Candidate models, all created with library defaults unless stated.
gnb_model = GaussianNB()
rfc_model = RandomForestClassifier()
# NOTE(review): max_iter=2500 caps the solver iterations; presumably this is
# why the recorded SVC accuracy is far below the other models - confirm that
# the solver actually converges within this limit.
# NOTE(review): gamma looks unused with kernel='linear' - verify in the SVC docs.
svc_model = SVC(kernel='linear', gamma='auto',max_iter=2500)
xgb_model = xgb.XGBClassifier(objective="binary:logistic")

# Proposed model: a hard-voting ensemble of a linear SVC, a decision tree
# and a logistic regression.

svc_part = LinearSVC()
logr_part = LogisticRegression(solver='lbfgs',max_iter=1000)
dtc_part = DecisionTreeClassifier()
proposed_model = VotingClassifier(estimators=[('svc_part', svc_part), ('dtc_part', dtc_part), ('logr_part', logr_part)], voting='hard')

In [None]:
def evaluate_model_training(crossvalid,model,X,y):
  """Cross-validate ``model`` and record per-fold fit time, accuracy and F1.

  Args:
    crossvalid: Splitter exposing ``split(X, y)`` that yields
      ``(train_index, test_index)`` pairs.
    model: Estimator exposing ``fit``, ``score`` and ``predict``; it is
      refitted on every fold.
    X: Feature matrix, indexable with the index arrays the splitter yields.
    y: Label vector aligned with ``X`` and indexable the same way.

  Returns:
    A tuple ``(time_record, accuracy_stratified, fscore_stratified)`` of
    three lists with one entry per fold: seconds spent in ``fit``, the value
    of ``model.score`` on the held-out fold, and the F1 score of the model's
    predictions on the held-out fold.
  """
  time_record = []
  accuracy_stratified = []
  fscore_stratified = []

  # Split with the splitter that was passed in, not a module-level global.
  for train_index, test_index in crossvalid.split(X, y):
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Time only the fit call; perf_counter is the clock meant for measuring
    # short intervals.
    init = time.perf_counter()
    model.fit(X_train_fold, y_train_fold)
    time_record.append(time.perf_counter() - init)

    # Score on the held-out fold.
    accuracy_stratified.append(model.score(X_test_fold, y_test_fold))

    # F1 score of the predictions on the held-out fold.
    fscore_stratified.append(f1_score(y_test_fold, model.predict(X_test_fold)))
  return time_record, accuracy_stratified, fscore_stratified


def report_generator(df):
  """Return ``df`` with two summary rows, ``mean`` and ``std``, appended.

  Args:
    df: DataFrame whose columns can be aggregated with ``mean`` and ``std``.

  Returns:
    A new DataFrame holding the original rows followed by one row labelled
    ``'mean'`` and one labelled ``'std'``. ``df`` itself is not modified.
  """
  # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
  # and yields the same row-wise result.
  return pd.concat([df, df.agg(['mean', 'std'])])

In [None]:
# Run the same cross-validated evaluation for each candidate model and for
# the proposed model, keeping the per-fold fit times, accuracies and F1
# scores of each.
gnb_time,gnb_accuracy,gnb_fscore = evaluate_model_training(skf,gnb_model,X,y)

svc_time,svc_accuracy,svc_fscore = evaluate_model_training(skf,svc_model,X,y)

rfc_time,rfc_accuracy,rfc_fscore = evaluate_model_training(skf,rfc_model,X,y)

xgb_time,xgb_accuracy,xgb_fscore = evaluate_model_training(skf,xgb_model,X,y)

proposed_time,proposed_accuracy,proposed_fscore = evaluate_model_training(skf,proposed_model,X,y)



In [None]:
# Show 10 decimal places when displaying DataFrames. The fully qualified
# option name is used because the bare 'precision' pattern is ambiguous in
# newer pandas releases, where it matches more than one option and raises.
pd.set_option('display.precision',10)

In [None]:
# Collect the per-fold results of every model into one DataFrame per metric
# (one column per model), then append the mean/std summary rows to each.
model_labels = ('gnb', 'svc', 'rfc', 'xgb', 'proposed')

accuracy_dataFrame = pd.DataFrame(dict(zip(
    model_labels,
    (gnb_accuracy, svc_accuracy, rfc_accuracy, xgb_accuracy, proposed_accuracy),
)))

fscore_dataFrame = pd.DataFrame(dict(zip(
    model_labels,
    (gnb_fscore, svc_fscore, rfc_fscore, xgb_fscore, proposed_fscore),
)))

time_dataFrame = pd.DataFrame(dict(zip(
    model_labels,
    (gnb_time, svc_time, rfc_time, xgb_time, proposed_time),
)))

time_report = report_generator(time_dataFrame)
accuracy_report = report_generator(accuracy_dataFrame)
fscore_report = report_generator(fscore_dataFrame)

In [None]:
# Display the accuracy report.
accuracy_report

Unnamed: 0,gnb,svc,rfc,xgb,proposed
0,0.8084,0.5742,0.8364,0.8096,0.8682
1,0.8088,0.543,0.8332,0.8144,0.8694
2,0.8094,0.5594,0.8326,0.8026,0.8576
3,0.8156,0.5706,0.8366,0.8088,0.8588
4,0.807,0.5462,0.8212,0.7938,0.8526
5,0.806,0.5502,0.8294,0.8044,0.8652
6,0.8098,0.5494,0.832,0.8032,0.8642
7,0.811,0.5628,0.8314,0.8064,0.8614
8,0.8104,0.5438,0.825,0.7974,0.8602
9,0.8102,0.576,0.832,0.8102,0.8648


In [None]:
!pip install xlsxwriter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlsxwriter
  Downloading XlsxWriter-3.0.3-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 5.2 MB/s 
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.0.3


In [None]:
# Write each report to its own worksheet of a single Excel workbook.
# The writer is used as a context manager so that it is closed (and the file
# written out) when the block ends, including when one of the writes raises.
with pd.ExcelWriter('pandas_multiple.xlsx', engine='xlsxwriter') as writer:
    time_report.to_excel(writer, sheet_name='TimeRecord')
    accuracy_report.to_excel(writer, sheet_name='Accuracy')
    fscore_report.to_excel(writer, sheet_name='Fscore')