## Steps
## 1.Preprocessing
## 2.Train Test Split
## 3.BOW,TFIDF,WordVec
## 4.Train ML Algorithms

In [2]:
##Load the dataset
import pandas as pd
data=pd.read_csv('all_kindle_review.csv')

In [3]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [4]:
data=data[['reviewText','rating']]

In [5]:
data.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [6]:
data.shape

(12000, 2)

In [7]:
## missing values
data.isnull().sum()

Unnamed: 0,0
reviewText,0
rating,0


In [8]:
## unique rating
data['rating'].unique()

array([3, 5, 4, 2, 1])

In [9]:
data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,3000
4,3000
3,2000
2,2000
1,2000


In [10]:
## preprocessing and cleaning
## positive review is 1 and negative review is 0
data['rating']=data['rating'].apply(lambda x:0 if x<3 else 1)

In [11]:
data

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",1
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,1
3,Aggie is Angela Lansbury who carries pocketboo...,1
4,I did not expect this type of book to be in li...,1
...,...,...
11995,Valentine cupid is a vampire- Jena and Ian ano...,1
11996,I have read all seven books in this series. Ap...,1
11997,This book really just wasn't my cuppa. The si...,1
11998,"tried to use it to charge my kindle, it didn't...",0


In [12]:
data['rating'].unique()

array([1, 0])

In [13]:
data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,8000
0,4000


In [14]:
## Lower all the cases
data['reviewText']=data['reviewText'].str.lower()
data.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [15]:
data['reviewText']

Unnamed: 0,reviewText
0,"jace rankin may be short, but he's nothing to ..."
1,great short read. i didn't want to put it dow...
2,i'll start by saying this is the first of four...
3,aggie is angela lansbury who carries pocketboo...
4,i did not expect this type of book to be in li...
...,...
11995,valentine cupid is a vampire- jena and ian ano...
11996,i have read all seven books in this series. ap...
11997,this book really just wasn't my cuppa. the si...
11998,"tried to use it to charge my kindle, it didn't..."


In [16]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from bs4 import BeautifulSoup


In [18]:
!pip install lxml



In [19]:
## Removing special characters
data['reviewText']=data['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))
## Remove the stopswords
data['reviewText']=data['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
## Remove url
data['reviewText']=data['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
## Remove html tags
data['reviewText']=data['reviewText'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
## Remove any additional spaces
data['reviewText']=data['reviewText'].apply(lambda x: " ".join(x.split()))

  data['reviewText']=data['reviewText'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


In [20]:
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1


In [21]:
## Lemmatizer
from nltk.stem import WordNetLemmatizer

In [23]:
import nltk

In [22]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [24]:
lemmatizer=WordNetLemmatizer()

In [25]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [26]:
data['reviewText']=data['reviewText'].apply(lambda x:lemmatize_words(x))

In [27]:
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four book wasnt expecti...,1
3,aggie angela lansbury carry pocketbook instead...,1
4,expect type book library pleased find price right,1


In [28]:
## Train Test Split

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test= train_test_split(data['reviewText'],data['rating'],
                                              test_size=0.20)


In [29]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
X_train_tfidf=tfidf.fit_transform(X_train).toarray()
X_test_tfidf=tfidf.transform(X_test).toarray()


In [31]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
X_train_tfidf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.17164468, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [33]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow=GaussianNB().fit(X_train_bow,Y_train)
nb_model_tfidf=GaussianNB().fit(X_train_tfidf,Y_train)

In [34]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [35]:
y_pred_bow=nb_model_bow.predict(X_test_bow)

In [36]:
y_pred_tfidf=nb_model_bow.predict(X_test_tfidf)


In [37]:
confusion_matrix(Y_test,y_pred_bow)

array([[503, 284],
       [728, 885]])

In [38]:
print("BOW accuracy: ",accuracy_score(Y_test,y_pred_bow))

BOW accuracy:  0.5783333333333334


In [39]:

confusion_matrix(Y_test,y_pred_tfidf)

array([[497, 290],
       [721, 892]])

In [40]:
print("TFIDF accuracy: ",accuracy_score(Y_test,y_pred_tfidf))

TFIDF accuracy:  0.57875


In [41]:
!pip install gensim



In [42]:
import gensim
from gensim.models import Word2Vec

In [43]:
from gensim.models import KeyedVectors

In [44]:
import gensim.downloader as api

In [51]:
import nltk

In [52]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [86]:
messages=pd.read_csv('all_kindle_review.csv')
messages=messages[['reviewText','rating']]
messages

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4
...,...,...
11995,Valentine cupid is a vampire- Jena and Ian ano...,4
11996,I have read all seven books in this series. Ap...,5
11997,This book really just wasn't my cuppa. The si...,3
11998,"tried to use it to charge my kindle, it didn't...",1


In [87]:
messages['rating']=messages['rating'].apply(lambda x:0 if x<3 else 1)
messages

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",1
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,1
3,Aggie is Angela Lansbury who carries pocketboo...,1
4,I did not expect this type of book to be in li...,1
...,...,...
11995,Valentine cupid is a vampire- Jena and Ian ano...,1
11996,I have read all seven books in this series. Ap...,1
11997,This book really just wasn't my cuppa. The si...,1
11998,"tried to use it to charge my kindle, it didn't...",0


In [88]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [90]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-z]',' ',messages['reviewText'][i])
    review=review.lower()
    review=review.split()
    review=[lemmatizer.lemmatize(word) for word in review]
    review=' '.join(review)
    corpus.append(review)

In [91]:
corpus

['jace rankin may be short but he s nothing to mess with a the man who wa just hauled out of the saloon by the undertaker know now he s a famous bounty hunter in oregon in the s who when he shot the man in the saloon just finished a year long quest to avenge his sister s murder and is now trying to figure out what to do next when the snotty nosed farm boy he just rescued from a gang of bully offer him money to kill a man who forced him off his ranch he reluctantly agrees to bring the man to justice but not to kill him outright but first he need to tell his sister s widower the news kyla kyle springer bailey ha been riding the trail and sleeping on the ground for the past month while trying to find jace she want revenge on the man who killed her husband and took her ranch amongst other crime and she s not so keen on the detour jace want to take but she realizes she s out of option so she hide behind her boy persona a best she can and try to keep pace when a confrontation along the way g

In [45]:
wv=api.load('word2vec-google-news-300')
vec_king= wv['king']
vec_king



array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [47]:
model=gensim.models.Word2Vec(data['reviewText'])



In [48]:
model

<gensim.models.word2vec.Word2Vec at 0x7dd2e3beead0>

In [92]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [93]:
words=[]
for sent in corpus:
    sent_token= sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))


In [94]:
words

[['jace',
  'rankin',
  'may',
  'be',
  'short',
  'but',
  'he',
  'nothing',
  'to',
  'mess',
  'with',
  'the',
  'man',
  'who',
  'wa',
  'just',
  'hauled',
  'out',
  'of',
  'the',
  'saloon',
  'by',
  'the',
  'undertaker',
  'know',
  'now',
  'he',
  'famous',
  'bounty',
  'hunter',
  'in',
  'oregon',
  'in',
  'the',
  'who',
  'when',
  'he',
  'shot',
  'the',
  'man',
  'in',
  'the',
  'saloon',
  'just',
  'finished',
  'year',
  'long',
  'quest',
  'to',
  'avenge',
  'his',
  'sister',
  'murder',
  'and',
  'is',
  'now',
  'trying',
  'to',
  'figure',
  'out',
  'what',
  'to',
  'do',
  'next',
  'when',
  'the',
  'snotty',
  'nosed',
  'farm',
  'boy',
  'he',
  'just',
  'rescued',
  'from',
  'gang',
  'of',
  'bully',
  'offer',
  'him',
  'money',
  'to',
  'kill',
  'man',
  'who',
  'forced',
  'him',
  'off',
  'his',
  'ranch',
  'he',
  'reluctantly',
  'agrees',
  'to',
  'bring',
  'the',
  'man',
  'to',
  'justice',
  'but',
  'not',
  'to',


In [95]:
model.corpus_count

12000

In [96]:
model.epochs

5

In [97]:
import gensim

In [98]:
model=gensim.models.Word2Vec(words)

In [99]:
## To Get all the vocabulary
model.wv.index_to_key

['the',
 'and',
 'to',
 'of',
 'it',
 'is',
 'this',
 'wa',
 'in',
 'that',
 'book',
 'for',
 'but',
 'story',
 'her',
 'with',
 'not',
 'she',
 'read',
 'you',
 'he',
 'have',
 'on',
 'be',
 'one',
 'are',
 'so',
 'character',
 'his',
 'just',
 'they',
 'like',
 'all',
 'there',
 'more',
 'me',
 'at',
 'my',
 'if',
 'good',
 'what',
 'about',
 'from',
 'an',
 'would',
 'out',
 'had',
 'ha',
 'love',
 'really',
 'very',
 'or',
 'when',
 'time',
 'by',
 'get',
 'author',
 'who',
 'up',
 'can',
 'were',
 'will',
 'some',
 'their',
 'no',
 'series',
 'them',
 'other',
 'well',
 'reading',
 'him',
 'much',
 'first',
 'into',
 'been',
 'even',
 'because',
 'how',
 'too',
 'short',
 'only',
 'did',
 'way',
 'didn',
 'great',
 'could',
 'know',
 'sex',
 'make',
 'little',
 'than',
 'don',
 'two',
 'then',
 'after',
 'thing',
 'do',
 'want',
 'plot',
 'think',
 'romance',
 'also',
 'find',
 'end',
 'which',
 'life',
 'see',
 'scene',
 'go',
 'enjoyed',
 'your',
 'never',
 'these',
 'written',


In [100]:
model.corpus_count

12000

In [101]:
model.epochs

5

In [102]:
model.wv.similar_by_word('kid')

[('teenager', 0.6982584595680237),
 ('child', 0.6870685815811157),
 ('bend', 0.6583506464958191),
 ('people', 0.648858368396759),
 ('baby', 0.6474354863166809),
 ('granted', 0.6446402668952942),
 ('jc', 0.6344547271728516),
 ('waited', 0.6343960762023926),
 ('alessandro', 0.6290537714958191),
 ('brain', 0.6235471963882446)]

In [103]:
model.wv['good'].shape

(100,)

In [104]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)

    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [105]:
!pip install tqdm



In [134]:
from tqdm import tqdm
import numpy as np

In [107]:
x=[]
for i in tqdm(range(len(words))):
    x.append(avg_word2vec(words[i]))

100%|██████████| 12000/12000 [00:46<00:00, 255.42it/s]


In [135]:
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.175522,0.075967,0.060273,-0.264168,0.776397,-0.372262,0.006357,0.474191,-0.325784,-0.392439,...,-0.019636,0.031671,0.150821,0.134938,0.020593,0.072794,-0.503172,0.311192,-0.417920,False
1,-0.353321,-0.339727,0.102601,0.061321,0.292578,-0.514194,-0.038665,0.246847,-0.803798,-0.541652,...,-0.174636,-0.106104,0.199153,-0.564100,0.007091,0.523488,-0.624697,0.511118,-0.415850,False
2,-0.174737,-0.076678,0.053160,-0.003060,0.326648,-0.436775,0.005656,0.397828,-0.610238,-0.450054,...,0.034778,0.303397,0.207329,-0.272792,-0.029267,0.256062,-0.774440,0.476534,-0.238877,False
3,-0.222068,-0.189298,-0.074381,0.044628,0.131755,-0.574464,-0.000209,0.340829,-0.473672,-0.310142,...,0.220643,0.247183,0.057670,-0.170481,-0.000367,0.211237,-0.482491,0.228438,0.023496,False
4,-0.454596,-0.136975,0.529760,-0.108510,0.019276,-0.490436,-0.117820,0.216996,-0.654606,-0.804452,...,0.076251,-0.162059,-0.128166,-0.527879,-0.069342,0.449739,-0.891806,0.725854,-0.264955,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,-0.239778,0.240336,-0.061046,-0.256718,0.268214,-0.397051,-0.005791,0.510242,-0.728223,-0.556155,...,0.186895,0.164245,0.681880,-0.064435,0.372224,-0.256378,-0.700966,0.294138,0.124051,False
11996,-0.158973,-0.328031,0.250516,0.144368,-0.194275,-0.674748,0.088973,0.162764,-0.809933,-0.651185,...,0.056078,0.097204,0.175135,-0.605552,-0.068619,0.419316,-0.785566,0.364891,-0.169870,False
11997,-0.227321,0.005228,0.032327,-0.172349,0.551698,-0.374667,-0.171445,0.512819,-0.516502,-0.547864,...,-0.227889,-0.080267,0.147221,-0.199495,-0.114244,0.389031,-0.542654,0.353247,-0.516062,False
11998,-0.003611,-0.132320,0.086119,-0.270567,0.721483,-0.257051,0.031319,0.434470,-0.522878,-0.729370,...,-0.374900,-0.078390,-0.012165,-0.141421,-0.016736,0.515916,-0.681900,0.510977,-0.569828,True


In [136]:
len(x)

12000

In [137]:
## independent features
X_new=np.array(x,dtype=object)


In [138]:
X_new.shape

(12000, 101)

In [139]:
X_new[0].shape

(101,)

In [140]:
import pandas as pd
import numpy as np

# Assuming 'x' is a list of arrays or DataFrames
df_list = [pd.DataFrame(x[i].reshape(1, -1)) for i in range(len(x))]

# Concatenate all the DataFrames in the list into a single DataFrame
df = pd.concat(df_list, ignore_index=True)

print(df)


AttributeError: 'Series' object has no attribute 'reshape'

In [176]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.175522,0.075967,0.060273,-0.264168,0.776397,-0.372262,0.006357,0.474191,-0.325784,-0.392439,...,-0.019636,0.031671,0.150821,0.134938,0.020593,0.072794,-0.503172,0.311192,-0.41792,False
1,-0.353321,-0.339727,0.102601,0.061321,0.292578,-0.514194,-0.038665,0.246847,-0.803798,-0.541652,...,-0.174636,-0.106104,0.199153,-0.5641,0.007091,0.523488,-0.624697,0.511118,-0.41585,False
2,-0.174737,-0.076678,0.05316,-0.00306,0.326648,-0.436775,0.005656,0.397828,-0.610238,-0.450054,...,0.034778,0.303397,0.207329,-0.272792,-0.029267,0.256062,-0.77444,0.476534,-0.238877,False
3,-0.222068,-0.189298,-0.074381,0.044628,0.131755,-0.574464,-0.000209,0.340829,-0.473672,-0.310142,...,0.220643,0.247183,0.05767,-0.170481,-0.000367,0.211237,-0.482491,0.228438,0.023496,False
4,-0.454596,-0.136975,0.52976,-0.10851,0.019276,-0.490436,-0.11782,0.216996,-0.654606,-0.804452,...,0.076251,-0.162059,-0.128166,-0.527879,-0.069342,0.449739,-0.891806,0.725854,-0.264955,False


In [177]:
df['Output']=y
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.175522,0.075967,0.060273,-0.264168,0.776397,-0.372262,0.006357,0.474191,-0.325784,-0.392439,...,-0.019636,0.031671,0.150821,0.134938,0.020593,0.072794,-0.503172,0.311192,-0.41792,False
1,-0.353321,-0.339727,0.102601,0.061321,0.292578,-0.514194,-0.038665,0.246847,-0.803798,-0.541652,...,-0.174636,-0.106104,0.199153,-0.5641,0.007091,0.523488,-0.624697,0.511118,-0.41585,False
2,-0.174737,-0.076678,0.05316,-0.00306,0.326648,-0.436775,0.005656,0.397828,-0.610238,-0.450054,...,0.034778,0.303397,0.207329,-0.272792,-0.029267,0.256062,-0.77444,0.476534,-0.238877,False
3,-0.222068,-0.189298,-0.074381,0.044628,0.131755,-0.574464,-0.000209,0.340829,-0.473672,-0.310142,...,0.220643,0.247183,0.05767,-0.170481,-0.000367,0.211237,-0.482491,0.228438,0.023496,False
4,-0.454596,-0.136975,0.52976,-0.10851,0.019276,-0.490436,-0.11782,0.216996,-0.654606,-0.804452,...,0.076251,-0.162059,-0.128166,-0.527879,-0.069342,0.449739,-0.891806,0.725854,-0.264955,False


In [178]:
x=df

In [179]:

x.isnull().sum()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
96,0
97,0
98,0
99,0


In [180]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.20)
X_train.columns= X_train.columns.astype(str)
X_test.columns= X_test.columns.astype(str)

In [181]:
y_train= y_train.astype(int)
y_train

array([1, 1, 0, ..., 0, 0, 1])

In [182]:
from sklearn.naive_bayes import GaussianNB
word2vec=GaussianNB().fit(X_train,y_train)

In [183]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [184]:
y_pred=word2vec.predict(X_test)


In [185]:
confusion_matrix(y_test,y_pred)

array([[1595,    0],
       [   0,  805]])

In [186]:
print("word2vec accuracy: ",accuracy_score(y_test,y_pred))

word2vec accuracy:  1.0


In [187]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      1595
        True       1.00      1.00      1.00       805

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400

