In [1027]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [1028]:
df = pd.read_csv("./data/amazon_alexa.tsv", delimiter="\t", quoting=3)
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"""Sometimes while playing a game, you can answe...",1
3,5,31-Jul-18,Charcoal Fabric,"""I have had a lot of fun with this thing. My 4...",1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [1029]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3149 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [1030]:
df[df["verified_reviews"].isna()]

Unnamed: 0,rating,date,variation,verified_reviews,feedback
473,2,29-Jun-18,White,,0


In [1031]:
df["verified_reviews"] = df["verified_reviews"].fillna("")

Building a corpus out of the "verified_reviews" feature by performing the following:

1. Replacing non-alphabet characters with spaces
2. Converting all the characters to lower case
3. Using Porter Stemmer algorithm to convert the words to their root form

In [1032]:
from nltk import PorterStemmer # Porter Stemmer is an algorithm used to reduce English words to their root form by removing suffixes
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

porter_stemmer = PorterStemmer()

corpus = []

for i in range(len(df)):
    review = re.sub("[^a-zA-Z]", " ", df.iloc[i]["verified_reviews"])
    review = review.lower()
    review = review.split()
    review = [ porter_stemmer.stem(word) if word not in STOPWORDS else word for word in review ]
    review = " ".join(review)
    corpus.append(review)

len(corpus)

[nltk_data] Downloading package stopwords to /home/light/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3150

In [1033]:
df["feedback"].value_counts()

feedback
1    2893
0     257
Name: count, dtype: int64

In [1034]:
df["feedback"].value_counts().loc[1] / df["feedback"].value_counts().loc[0]

np.float64(11.256809338521402)

Creating a bag of words

In [1035]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer()

In [1036]:
X = tfidv.fit_transform(corpus)
y = df["feedback"].values

In [1037]:
X.shape, y.shape

((3150, 2889), (3150,))

In [1038]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2205, 2889), (945, 2889), (2205,), (945,))

In [1039]:
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

We need to adjust the class weights because there is an imbalance

In [1040]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier(n_estimators=1000, class_weight="balanced", random_state=42)
rfc_model.fit(X_train_scaled, y_train)

In [1041]:
from sklearn.metrics import classification_report

y_pred = rfc_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.36      0.46        88
           1       0.94      0.98      0.96       857

    accuracy                           0.92       945
   macro avg       0.78      0.67      0.71       945
weighted avg       0.91      0.92      0.91       945



In [1042]:
from sklearn.ensemble import AdaBoostClassifier

ada_model = AdaBoostClassifier(n_estimators=1000, random_state=42)
ada_model.fit(X_train_scaled, y_train)

In [1043]:
y_pred = ada_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.30      0.42        88
           1       0.93      0.99      0.96       857

    accuracy                           0.92       945
   macro avg       0.84      0.64      0.69       945
weighted avg       0.91      0.92      0.91       945



In [1044]:
from sklearn.ensemble import GradientBoostingClassifier

gbc_model = GradientBoostingClassifier(n_estimators=1000, random_state=42)
gbc_model.fit(X_train_scaled, y_train)

In [1045]:
y_pred = gbc_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.51      0.62        88
           1       0.95      0.98      0.97       857

    accuracy                           0.94       945
   macro avg       0.86      0.75      0.79       945
weighted avg       0.94      0.94      0.94       945



In [1046]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(alpha=1, reg_lambda=100, n_estimators=1000, seed=42)
xgb_model.fit(X_train_scaled, y_train)

In [1047]:
y_pred = xgb_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.45      0.59        88
           1       0.95      0.99      0.97       857

    accuracy                           0.94       945
   macro avg       0.89      0.72      0.78       945
weighted avg       0.94      0.94      0.93       945



GradientBoostingClassifier gives us the best results. So we are saving that model along with the vectorizer and the scaler that was used.

In [None]:
import pickle
import os

os.makedirs("./models", exist_ok=True)

with open("./models/vectorizer.pkl", "wb") as f:
    pickle.dump(tfidv, f)

with open("./models/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("./models/gradient_boosting_classifier.pkl", "wb") as f:
    pickle.dump(gbc_model, f)

Making some predictions with the model

In [1049]:
def predict(text):
    input = tfidv.transform([text])
    scaled_input = scaler.transform(input)
    prob = gbc_model.predict_proba(scaled_input)[0]
    prediction = "positive" if prob[1] > prob[0] else "negative"
    return (prediction, float(prob.max().round(2)))

In [1050]:
predict("this is awesome!")

('positive', 0.95)

In [1051]:
predict("poor quality")

('negative', 0.99)

In [1052]:
predict("not working at times")

('negative', 0.99)

In [1053]:
predict("could have been better")

('positive', 0.98)