# Modelling

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
import inflect
import re
import spacy
from nltk import SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize

import string

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

import pickle

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

## Data Ingestion

In [2]:
df = pd.read_csv(r'../data/reviews_sentiment_batminton.csv')

df

Unnamed: 0.1,Unnamed: 0,Review,Ratings
0,0,"Nice product Nice product, good quality, but p...",4
1,1,Don't waste your money They didn't supplied Yo...,1
2,2,Did not meet expectations Worst product. Damag...,1
3,3,"Fair Quite O. K. , but nowadays the quality o...",3
4,4,Over priced Over pricedJust â?¹620 ..from reta...,1
...,...,...,...
7009,8503,Yones Mavis 350 Blue cap Wrost and duplicate p...,1
7010,8504,For Mavis350 Received product intact and seale...,5
7011,8505,Very Good Delivered before time but price is h...,3
7012,8506,Don't waste your money up to the mark but same...,4


In [3]:
def getSentiment(x):
    if x<=2: return 'Negative'
    elif x==3: return 'Neutral'
    else: return 'Positive'
    
df['Sentiment']=df['Ratings'].apply(getSentiment)

df

Unnamed: 0.1,Unnamed: 0,Review,Ratings,Sentiment
0,0,"Nice product Nice product, good quality, but p...",4,Positive
1,1,Don't waste your money They didn't supplied Yo...,1,Negative
2,2,Did not meet expectations Worst product. Damag...,1,Negative
3,3,"Fair Quite O. K. , but nowadays the quality o...",3,Neutral
4,4,Over priced Over pricedJust â?¹620 ..from reta...,1,Negative
...,...,...,...,...
7009,8503,Yones Mavis 350 Blue cap Wrost and duplicate p...,1,Negative
7010,8504,For Mavis350 Received product intact and seale...,5,Positive
7011,8505,Very Good Delivered before time but price is h...,3,Neutral
7012,8506,Don't waste your money up to the mark but same...,4,Positive


In [4]:
df['Sentiment']=df['Sentiment'].map({'Neutral':1,'Positive':1,'Negative':0})

df

Unnamed: 0.1,Unnamed: 0,Review,Ratings,Sentiment
0,0,"Nice product Nice product, good quality, but p...",4,1
1,1,Don't waste your money They didn't supplied Yo...,1,0
2,2,Did not meet expectations Worst product. Damag...,1,0
3,3,"Fair Quite O. K. , but nowadays the quality o...",3,1
4,4,Over priced Over pricedJust â?¹620 ..from reta...,1,0
...,...,...,...,...
7009,8503,Yones Mavis 350 Blue cap Wrost and duplicate p...,1,0
7010,8504,For Mavis350 Received product intact and seale...,5,1
7011,8505,Very Good Delivered before time but price is h...,3,1
7012,8506,Don't waste your money up to the mark but same...,4,1


In [5]:
df = df[["Unnamed: 0","Review","Sentiment"]]

df

Unnamed: 0.1,Unnamed: 0,Review,Sentiment
0,0,"Nice product Nice product, good quality, but p...",1
1,1,Don't waste your money They didn't supplied Yo...,0
2,2,Did not meet expectations Worst product. Damag...,0
3,3,"Fair Quite O. K. , but nowadays the quality o...",1
4,4,Over priced Over pricedJust â?¹620 ..from reta...,0
...,...,...,...
7009,8503,Yones Mavis 350 Blue cap Wrost and duplicate p...,0
7010,8504,For Mavis350 Received product intact and seale...,1
7011,8505,Very Good Delivered before time but price is h...,1
7012,8506,Don't waste your money up to the mark but same...,1


## Text Pre processing

In [6]:
def text_preprocessing(corpus,flag):
    
    p = inflect.engine()
    corpus = re.sub(r'\d+',lambda x: p.number_to_words(x.group(0)),corpus)
    
    corpus = re.sub('[^a-zA-Z]',' ',corpus)
    
    corpus = corpus.lower()
    
    corpus = ' '.join(corpus.split())

    words = word_tokenize(corpus)
    if flag == "stemming":

        stemmer = SnowballStemmer(language='english')
        return ' '.join(stemmer.stem(word) for word in words if word not in set(nltk.corpus.stopwords.words('english')))
    else:

        lemmatizer = WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in set(nltk.corpus.stopwords.words('english')))


In [7]:
df['Review'] = df['Review'].apply(lambda x: text_preprocessing(x,flag="stemming"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Review'] = df['Review'].apply(lambda x: text_preprocessing(x,flag="stemming"))


In [8]:
nlp=spacy.load('en_core_web_lg')
df['Vector']=df['Review'].apply(lambda x: nlp(x).vector)
df

Unnamed: 0.1,Unnamed: 0,Review,Sentiment,Vector
0,0,nice product nice product good qualiti price r...,1,"[-0.29536363, 0.5639642, -1.9529978, -0.389275..."
1,1,wast money suppli yonex mavi three hundr fifti...,0,"[0.2179594, 0.46288133, -0.9466747, -0.2688799..."
2,2,meet expect worst product damag shuttlecock pa...,0,"[-0.84949654, 0.2578784, -1.1397167, -0.498824..."
3,3,fair quit k nowaday qualiti cork like three fi...,1,"[-1.1681322, 0.5521747, -0.39572743, 0.8393060..."
4,4,price pricedjust six hundr twenti retail under...,0,"[-1.0531131, 0.26580384, -1.0410473, 0.6594077..."
...,...,...,...,...
7009,8503,yone mavi three hundr fifti blue cap wrost dup...,0,"[-1.6141624, 0.3796603, -0.7565399, 0.20961499..."
7010,8504,mavisthre hundr fifti receiv product intact se...,1,"[-0.4803, 0.16311428, -0.0070071393, 0.0004014..."
7011,8505,good deliv time price high marketread,1,"[0.21146166, 0.31128332, -1.8140117, 0.9644416..."
7012,8506,wast money mark avail market less price read,1,"[-0.93028754, 0.44709933, -1.73231, 0.125628, ..."


## Split the data

In [9]:
X_train,X_test,y_train,y_test=train_test_split(df['Vector'],df['Sentiment'],test_size=0.2)
print(X_train.shape,X_test.shape)

(5611,) (1403,)


In [10]:
X_train_stack = np.stack(X_train)
X_test_stack = np.stack(X_test)
print(X_train_stack.shape,X_test_stack.shape)

(5611, 300) (1403, 300)


In [11]:
scaler = MinMaxScaler()
X_train_stack_sc = scaler.fit_transform(X_train_stack)
X_test_stack_sc = scaler.transform(X_test_stack)

In [12]:
with open(r'../models/min_max_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [13]:
with open(r'../models/min_max_scaler.pkl', 'rb') as file:
    scaler=pickle.load(file)
    y=scaler.transform([X_test_stack[0]])
print(y)

[[0.75759013 0.26176122 0.48488131 0.56352651 0.7014109  0.62483931
  0.44528753 0.64725192 0.49259311 0.35775628 0.5213206  0.62727857
  0.45489421 0.41639194 0.34748546 0.70390153 0.44839317 0.45655401
  0.5862108  0.36052058 0.43060324 0.53044358 0.62257395 0.45435448
  0.65704474 0.7466632  0.53955647 0.72052152 0.48876661 0.52913311
  0.47542664 0.21597212 0.31481454 0.55800376 0.47434434 0.54729333
  0.31338841 0.41455962 0.52104571 0.60307672 0.5375095  0.57530567
  0.28544863 0.30503977 0.43769573 0.64506862 0.46029383 0.33296116
  0.4063288  0.51816417 0.66625375 0.56514776 0.48927745 0.5078314
  0.51026715 0.486644   0.39026724 0.53608043 0.16805363 0.52162225
  0.53307392 0.45274156 0.70222118 0.73816125 0.55988556 0.5724099
  0.59469585 0.22030105 0.39868069 0.43443944 0.53745515 0.41733604
  0.3603473  0.54401162 0.68501388 0.58614126 0.29232838 0.66717576
  0.39751319 0.48939855 0.46292669 0.24496159 0.55872923 0.51837882
  0.51447435 0.58473592 0.48263468 0.57081011 0.58

## Model Building

In [14]:
model = MultinomialNB()
model.fit(X_train_stack_sc, y_train)

In [15]:
y_pred = model.predict(X_test_stack_sc)
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [16]:
print('Accuracy Score:',accuracy_score(y_test, y_pred))

Accuracy Score: 0.8545972915181753


In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       204
           1       0.85      1.00      0.92      1199

    accuracy                           0.85      1403
   macro avg       0.43      0.50      0.46      1403
weighted avg       0.73      0.85      0.79      1403



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Saving the model

In [18]:
with open(r'../models/naive_bayes_model.pkl', 'wb') as file:
    pickle.dump(model, file)

## Testing the model

In [19]:
with open(r'../models/naive_bayes_model.pkl', 'rb') as file:
    model = pickle.load(file)
    y=model.predict([X_test_stack_sc[0]])
print(y)

[1]


In [20]:
pipelines = {
    'naive_bayes': Pipeline([
        ('classifier', MultinomialNB())
    ]),
    'decision_tree': Pipeline([
        ('classifier', DecisionTreeClassifier())
    ]),
    'logistic_regression': Pipeline([
        ('classifier', LogisticRegression())
    ])
}

In [21]:
param_grids = {
    'naive_bayes': [
        {
            'classifier__alpha' : [1, 10]
        }
    ],
    'decision_tree': [
        {
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'logistic_regression': [
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
        }
    ]
}


In [22]:
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='f1', 
                               return_train_score=True,
                               verbose=1
                              )
    
    grid_search.fit(X_train_stack_sc, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test_stack_sc, y_test))

********** naive_bayes **********
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Score on Test Data:  0.921598770176787
********** decision_tree **********
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Score on Test Data:  0.926558157999206
********** logistic_regression **********
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Score on Test Data:  0.8772869254796966




In [23]:
grid_search.best_estimator_

Pipeline(steps=[('classifier',
                 LogisticRegression(C=10, class_weight='balanced', l1_ratio=0.6,
                                    penalty='elasticnet', solver='saga'))])

In [24]:
best_model = grid_search.best_estimator_
with open(r'../models/logistic_regression.pkl', 'wb') as file:
    pickle.dump(best_model, file)