# best practices 

# 1 - preprocessing and cleaning

# 2 - train test split

# 3 - BOW , TF-IDF  , Word2vec

# 4 - model selection and hyperparameter tuning

In [1]:
import pandas as pd

In [2]:
df= pd.read_csv('all_kindle_review.csv')

In [3]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [4]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'asin', 'helpful', 'rating', 'reviewText',
       'reviewTime', 'reviewerID', 'reviewerName', 'summary',
       'unixReviewTime'],
      dtype='object')

In [5]:
df[['reviewText','rating']]

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4
...,...,...
11995,Valentine cupid is a vampire- Jena and Ian ano...,4
11996,I have read all seven books in this series. Ap...,5
11997,This book really just wasn't my cuppa. The si...,3
11998,"tried to use it to charge my kindle, it didn't...",1


In [6]:
df.shape

(12000, 11)

In [7]:
df.isnull

<bound method DataFrame.isnull of        Unnamed: 0.1  Unnamed: 0        asin  helpful  rating  \
0                 0       11539  B0033UV8HI  [8, 10]       3   
1                 1        5957  B002HJV4DE   [1, 1]       5   
2                 2        9146  B002ZG96I4   [0, 0]       3   
3                 3        7038  B002QHWOEU   [1, 3]       3   
4                 4        1776  B001A06VJ8   [0, 1]       4   
...             ...         ...         ...      ...     ...   
11995         11995        2183  B001DUGORO   [0, 0]       4   
11996         11996        6272  B002JCSFSQ   [2, 2]       5   
11997         11997       12483  B0035N1V7K   [0, 1]       3   
11998         11998        3640  B001W1XT40   [1, 2]       1   
11999         11999       11398  B003370JUS   [5, 6]       3   

                                              reviewText   reviewTime  \
0      Jace Rankin may be short, but he's nothing to ...   09 2, 2010   
1      Great short read.  I didn't want to put it d

In [8]:
df.isnull().sum()

Unnamed: 0.1       0
Unnamed: 0         0
asin               0
helpful            0
rating             0
reviewText         0
reviewTime         0
reviewerID         0
reviewerName      38
summary            2
unixReviewTime     0
dtype: int64

In [9]:
df['rating'].value_counts()

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

# preprocessing and cleaning

# positive review is = 1 and negative review is = 0

In [10]:
df['rating']

0        3
1        5
2        3
3        3
4        4
        ..
11995    4
11996    5
11997    3
11998    1
11999    3
Name: rating, Length: 12000, dtype: int64

In [11]:
df['rating'] = df['rating'].apply(lambda x:0 if  x<3 else 1)

df['rating'].value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

In [12]:
print(df['rating'].unique())

[1 0]


## preprocessing


In [13]:

df['reviewText'] = df['reviewText'].str.lower()

## cleaning



In [14]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
import re

In [16]:
## removing special characters

df['reviewText'] = df['reviewText'].str.replace(r'[^a-z A-z 0-9\s]', '', regex=True)

## removing extra spaces
df['reviewText'] = df['reviewText'].str.replace(r'\s+', ' ', regex=True)

## removing leading and trailing spaces
df['reviewText'] = df['reviewText'].str.strip()

## removing stop words

stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# remove urls
df['reviewText'] = df['reviewText'].str.replace(r'http\S+|www\S+|https\S+', '', regex=True)

# remove html tags
df['reviewText'] = df['reviewText'].str.replace(r'<.*?>', '', regex=True)

In [17]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",1,jace rankin may short hes nothing mess man hau...,"09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",1,great short read didnt want put read one sitti...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",1,ill start saying first four books wasnt expect...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",1,aggie angela lansbury carries pocketbooks inst...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",1,expect type book library pleased find price right,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [18]:
df['reviewText']

0        jace rankin may short hes nothing mess man hau...
1        great short read didnt want put read one sitti...
2        ill start saying first four books wasnt expect...
3        aggie angela lansbury carries pocketbooks inst...
4        expect type book library pleased find price right
                               ...                        
11995    valentine cupid vampire jena ian another vampi...
11996    read seven books series apocalypticadventure o...
11997    book really wasnt cuppa situation man capturin...
11998    tried use charge kindle didnt even register ch...
11999    taking instruction look often hidden world sex...
Name: reviewText, Length: 12000, dtype: object

# lemmatizer

In [19]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [20]:
def lemmatize_text(text):
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

In [21]:
df['reviewText'] = df['reviewText'].apply(lambda x: lemmatize_text(x))

In [22]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",1,jace rankin may short he nothing mess man haul...,"09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",1,great short read didnt want put read one sitti...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",1,ill start saying first four book wasnt expecti...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",1,aggie angela lansbury carry pocketbook instead...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",1,expect type book library pleased find price right,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [23]:
# train test split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df['reviewText'], df['rating'], test_size=0.2, random_state=42)

#  ---------------------  TF-IDF  ----------------

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

x_train_tfidf = tfidf.fit_transform(x_train).toarray()
x_test_tfidf = tfidf.transform(x_test).toarray()

# naive bayes classifier for sparse data

In [25]:
from sklearn.naive_bayes import GaussianNB

# nb_model_bow = GaussianNB().fit(x_train_bow, y_train)

nb_model_tfidf = GaussianNB().fit(x_train_tfidf, y_train)

In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
y_pred_tfidf = nb_model_tfidf.predict(x_test_tfidf)

# performance mertics

# # acc of bow < acc of tf-idf

In [28]:
print("Accuracy for TF-IDF model:", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report for TF-IDF model:\n", classification_report(y_test, y_pred_tfidf))
print("Confusion Matrix for TF-IDF model:\n", confusion_matrix(y_test, y_pred_tfidf))


Accuracy for TF-IDF model: 0.5791666666666667
Classification Report for TF-IDF model:
               precision    recall  f1-score   support

           0       0.41      0.60      0.49       803
           1       0.74      0.57      0.64      1597

    accuracy                           0.58      2400
   macro avg       0.58      0.58      0.57      2400
weighted avg       0.63      0.58      0.59      2400

Confusion Matrix for TF-IDF model:
 [[483 320]
 [690 907]]


# deployment 

In [29]:
# ...after training and evaluating your model...

import pickle

# Save the TF-IDF vectorizer and the trained model
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('nb_model_tfidf.pkl', 'wb') as f:
    pickle.dump(nb_model_tfidf, f)