Importing Libraries AND Dataset

In [2]:
import pandas as pd
import re
import nltk
import gensim
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
data=pd.read_csv("all_kindle_review.csv")

In [4]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [5]:
data=data[['reviewText','rating']]

In [6]:
data

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4
...,...,...
11995,Valentine cupid is a vampire- Jena and Ian ano...,4
11996,I have read all seven books in this series. Ap...,5
11997,This book really just wasn't my cuppa. The si...,3
11998,"tried to use it to charge my kindle, it didn't...",1


In [7]:
data.shape

(12000, 2)

In [8]:
data['rating'].unique()

array([3, 5, 4, 2, 1], dtype=int64)

Preprocessing AND Cleaning

In [9]:
data['rating']=data['rating'].apply(lambda X:0 if X<3 else 1)

In [10]:
data['rating'].nunique()

2

In [11]:
data['rating'].value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

In [12]:
##Lower all Cases
data['reviewText']=data['reviewText'].str.lower()

In [13]:
data['reviewText'].head()

0    jace rankin may be short, but he's nothing to ...
1    great short read.  i didn't want to put it dow...
2    i'll start by saying this is the first of four...
3    aggie is angela lansbury who carries pocketboo...
4    i did not expect this type of book to be in li...
Name: reviewText, dtype: object

In [14]:
## Removing special characters
data['reviewText']=data['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))

In [15]:
## Remove the stopswords
data['reviewText']=data['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))

In [15]:
## Remove url 
data['reviewText']=data['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))


In [16]:
## Remove html tags
data['reviewText']=data['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

  data['reviewText']=data['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())


In [17]:
## Remove any additional spaces
data['reviewText']=data['reviewText'].apply(lambda x: " ".join(x.split()))

In [18]:
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may be short but hes nothing to me...,1
1,great short read i didnt want to put it down s...,1
2,ill start by saying this is the first of four ...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [19]:
#Lemmatizer
lemmatizer=WordNetLemmatizer()

In [20]:
def lemmatize_words (text):
    return " ".join([lemmatizer.lemmatize (word) for word in text.split()])

In [21]:
data['reviewText']=data['reviewText'].apply(lambda X:lemmatize_words(X))

Train-Test Split

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(data['reviewText'],data['rating'],test_size=0.2)

Bag of WORDS

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform (X_train).toarray()
X_test_bow=bow.transform (X_test).toarray()

In [36]:
X_test_bow.shape

(2400, 35671)

TF-IDF

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
X_train_tfidf=tfidf.fit_transform(X_train).toarray()
X_test_tfidf=tfidf.transform(X_test).toarray()

In [38]:
X_train_tfidf.shape

(9600, 35671)

Word2Vec

In [39]:
from gensim.models import Word2Vec
sentences = X_train.tolist()
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

word2vec_model.save("word2vec.model")


In [54]:
def get_text_vector(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

In [57]:
X_train_vectors = np.array([get_text_vector(text, word2vec_model) for text in X_train])
X_test_vectors = np.array([get_text_vector(text, word2vec_model) for text in X_test])


Building Model

In [62]:
from sklearn.naive_bayes import GaussianNB
#BOW MODEL
nb_model_bow=GaussianNB().fit(X_train_bow, y_train)  
#TF-IDF MODEL
nb_model_tfidf=GaussianNB().fit(X_train_tfidf,y_train)  

In [None]:
#Word2Vec Model
nb_model_wordd2Vecf=GaussianNB().fit(X_train_vectors,y_train)

Testing Model & Making Predictions


In [66]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [67]:
y_pred_bow=nb_model_bow.predict(X_test_bow)

In [68]:
y_pred_tfidf=nb_model_tfidf.predict(X_test_tfidf)

In [69]:
y_pred_word2Vec=nb_model_wordd2Vecf.predict(X_test_vectors)

In [70]:
#Using BOW Model
confusion_matrix(y_test,y_pred_bow) 

array([[514, 281],
       [711, 894]], dtype=int64)

In [71]:
print("BOW accuracy: ",accuracy_score(y_test,y_pred_bow))

BOW accuracy:  0.5866666666666667


In [72]:
#Using TF-IDF Model
confusion_matrix(y_test,y_pred_tfidf)

array([[499, 296],
       [690, 915]], dtype=int64)

In [73]:
print("TF-IDF accuracy: ",accuracy_score(y_test,y_pred_tfidf))

TF-IDF accuracy:  0.5891666666666666


In [74]:
#Using Word2Vec Model
confusion_matrix(y_test,y_pred_word2Vec)

array([[ 397,  398],
       [ 570, 1035]], dtype=int64)

In [75]:
print("Word2Vec accuracy: ",accuracy_score(y_test,y_pred_word2Vec))

Word2Vec accuracy:  0.5966666666666667
