#Steps Needs To Be Followed-
# Preprocessing and cleaning
# Train Test Split
# BOW,TFIDF,WORD2VEC
# Train ML Algorithm

In [4]:
import pandas as pd
df=pd.read_csv('all_kindle_review .csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [6]:
df=df[['reviewText','rating']]
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [7]:
df.shape

(12000, 2)

In [8]:
df.isnull().sum()

Unnamed: 0,0
reviewText,0
rating,0


In [10]:
df['rating'].unique()

array([3, 5, 4, 2, 1])

In [11]:
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,3000
4,3000
3,2000
2,2000
1,2000


#Preprocessing and Cleaning

In [14]:
#positive review is 1 and negative review is 0
df['rating']=df['rating'].apply(lambda x:0 if x <3 else 1)

In [15]:
df['rating'].unique()

array([1, 0])

In [16]:
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,8000
0,4000


In [18]:
#lower all the cases

df['reviewText']=df['reviewText'].str.lower()

In [19]:
df.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [21]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
from bs4 import BeautifulSoup

In [24]:
# REMOVING ALL THE SPECIAL CHARACTERS
df['reviewText']= df['reviewText'].apply(lambda x: re.sub('[^a-zA-Z0-9]+','',x))

#REMOVE the STOPWORDS
df['reviewText']=df['reviewText'].apply(lambda x: " ".join([y for y in x.split() if y not in stopwords.words('english')]))

#REMOVE THE URL
df['reviewText']=df['reviewText'].apply(lambda x:re.sub('r https?://\S+|www\.\S+','',str(x)))

#REMOVE HTML TAGS
df['reviewText']=df['reviewText'].apply(lambda x:BeautifulSoup(x,'lxml').get_text())

#REMOVE ADDITIONAL SPACES
df['reviewText']=df['reviewText'].apply(lambda x:" ".join(x.split()))

In [25]:
df.head()

Unnamed: 0,reviewText,rating
0,jacerankinmaybeshortbuthesnothingtomesswithast...,1
1,greatshortreadididntwanttoputitdownsoireadital...,1
2,illstartbysayingthisisthefirstoffourbookssoiwa...,1
3,aggieisangelalansburywhocarriespocketbooksinst...,1
4,ididnotexpectthistypeofbooktobeinlibrarywasple...,1


In [32]:
##Lematizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [28]:
lematizer = WordNetLemmatizer()

In [29]:
def lematize_words(text):
  return " ".join([lematizer.lemmatize(word) for word in text.split()])

In [33]:
df['reviewText']=df['reviewText'].apply(lambda x: lematize_words(x))

In [34]:
df.head()

Unnamed: 0,reviewText,rating
0,jacerankinmaybeshortbuthesnothingtomesswithast...,1
1,greatshortreadididntwanttoputitdownsoireadital...,1
2,illstartbysayingthisisthefirstoffourbookssoiwa...,1
3,aggieisangelalansburywhocarriespocketbooksinst...,1
4,ididnotexpectthistypeofbooktobeinlibrarywasple...,1


Train Test Split

In [35]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(df['reviewText'],df['rating'],test_size=0.20)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow = bow.fit_transform(X_train).toarray()
X_test_bow = bow.transform(X_test)

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test)

In [39]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [40]:
X_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
from sklearn.naive_bayes import GaussianNB
NBModel_bow= GaussianNB().fit(X_train_bow,y_train)
NBModel_tfidf= GaussianNB().fit(X_train_tfidf,y_train)

In [44]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [52]:
X_test_bow=X_test_bow.toarray()
X_test_tfidf = X_test_tfidf.toarray()
y_pred_bow = NBModel_bow.predict(X_test_bow)
y_pred_tfidf= NBModel_tfidf.predict(X_test_tfidf)

In [53]:
print('BOW accuracy:',accuracy_score(y_test,y_pred_bow))

BOW accuracy: 0.325


In [54]:
confusion_matrix(y_test,y_pred_bow)

array([[ 779,    0],
       [1620,    1]])

In [55]:
print('TFIDF accuracy:',accuracy_score(y_test,y_pred_tfidf))

TFIDF accuracy: 0.325


In [56]:
confusion_matrix(y_test,y_pred_tfidf)

array([[ 779,    0],
       [1620,    1]])

Word2Vec

In [57]:
!pip install gensim



In [60]:
import gensim
from gensim.models import word2vec , keyedvectors

In [61]:
import gensim.downloader as api

wv= api.load('word2vec-google-news-300')
vec_king = wv['king']



In [64]:
texts=df['reviewText']
texts

Unnamed: 0,reviewText
0,jacerankinmaybeshortbuthesnothingtomesswithast...
1,greatshortreadididntwanttoputitdownsoireadital...
2,illstartbysayingthisisthefirstoffourbookssoiwa...
3,aggieisangelalansburywhocarriespocketbooksinst...
4,ididnotexpectthistypeofbooktobeinlibrarywasple...
...,...
11995,valentinecupidisavampirejenaandiananothervampi...
11996,ihavereadallsevenbooksinthisseriesapocalyptica...
11997,thisbookreallyjustwasntmycuppathesituationofam...
11998,triedtouseittochargemykindleitdidntevenregiste...


In [65]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Convert text data to Word2Vec vectors
def text_to_vector(text ,model):
    words = text.split()
    word_vectors = []
    for word in words:
        if word in model:
            word_vectors.append(model[word])

    # Return the average of all word vectors in the text
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X = np.array([text_to_vector(text, wv) for text in texts])
y = np.array(df['rating'])

# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 4. Train a classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 5. Make predictions and evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.67
