In [29]:
import pandas as pd
import gensim

### Reading and Exploring the Dataset
The dataset used here is a subset of Amazon reviews from the Sports and Outdoors category. The data is stored as a JSON file (one review per line) and can be read using pandas.

In [30]:
# Raw string keeps the Windows backslashes literal: '\D' and '\S' are not valid
# escape sequences and raise a SyntaxWarning on Python 3.12+.
# lines=True parses the file as one JSON record per line.
df = pd.read_json(r'C:\DS practice\Sports_and_Outdoors_5.json', lines=True)

In [31]:
# Preview the first rows to check which columns were parsed from the JSON file.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [32]:
# (number of rows, number of columns) of the loaded reviews frame.
df.shape

(296337, 9)

In [33]:
# The raw review text is the only column used to train the embeddings.
df["reviewText"]

0         This came in on time and I am veru happy with ...
1         I had a factory Glock tool that I was using fo...
2         If you don't have a 3/32 punch or would like t...
3         This works no better than any 3/32 punch you w...
4         I purchased this thinking maybe I need a speci...
                                ...                        
296332    This is a water bottle done right. It is a ver...
296333    If you're looking for an insulated water bottl...
296334    This Hydracentials Sporty 25 OZ, double insula...
296335    As usual I received this item free in exchange...
296336    Hydracentials insulated 25 oz water bottle.Thi...
Name: reviewText, Length: 296337, dtype: object

### Simple Preprocessing & Tokenization
The first step in any data science task is to clean the data. For NLP, this typically means converting all words to lower case, trimming whitespace, and removing punctuation. We do the same here.

We could additionally remove stop words such as 'and', 'or', 'is', 'the', 'a', 'an' and reduce words to their root forms (e.g. 'running' to 'run'); neither step is applied in this notebook.

In [34]:
# Turn every review into a list of lower-cased word tokens (one list per row).
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)

In [35]:
# Inspect the tokenized reviews: each entry is now a list of tokens.
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [36]:
# Show the full token list for the review labelled 0 (explicit label-based lookup).
print(review_text.loc[0])

['this', 'came', 'in', 'on', 'time', 'and', 'am', 'veru', 'happy', 'with', 'it', 'haved', 'used', 'it', 'already', 'and', 'it', 'makes', 'taking', 'out', 'the', 'pins', 'in', 'my', 'glock', 'very', 'easy']


### Training the Word2Vec Model
Train the model on the reviews. Use a window of size 10, i.e. up to 10 words before and 10 words after the current word are used as context. Words that occur fewer than 2 times in the whole corpus are ignored; this is configured with the `min_count` parameter.

`workers` sets how many CPU threads are used for training.

#### Initialize the model

In [37]:
# No corpus is passed to the constructor, so the vocabulary is built and the
# model is trained explicitly in the following cells.
#   window    - maximum distance between the current word and a context word
#   min_count - words with a total frequency below this value are ignored
#   workers   - number of worker threads used for training
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

#### Build Vocabulary

In [38]:
# Scan the tokenized reviews to build the vocabulary; progress_per controls
# how often progress is reported while scanning.
model.build_vocab(review_text, progress_per=1000)

In [39]:
# Corpus size stored on the model; passed to train() as total_examples below.
model.corpus_count

296337

In [40]:
# Epoch count stored on the model (not set in the constructor, so this is the
# library default); reused by train() below.
model.epochs

5

#### Train the word2vec model

In [41]:
# Train on the same corpus that was used to build the vocabulary, reusing the
# corpus size and epoch count already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91342866, 121496535)

#### Save the model

In [42]:
# Persist the trained model to disk. A raw string is used because '\D' and
# '\S' are not valid escape sequences and raise a SyntaxWarning on Python 3.12+.
model.save(r'C:\DS practice\Sports_and_Outdoors.model')

In [43]:
# Ten nearest neighbours of 'bad' in the learned embedding space.
model.wv.most_similar('bad', topn=10)

[('terrible', 0.6914101243019104),
 ('shabby', 0.6467660069465637),
 ('horrible', 0.6392822265625),
 ('funny', 0.5390329360961914),
 ('greatest', 0.5303934812545776),
 ('upset', 0.5227863192558289),
 ('good', 0.5181275010108948),
 ('lame', 0.5169687271118164),
 ('lazy', 0.5017289519309998),
 ('bummed', 0.5004640817642212)]

In [44]:
# Ten nearest neighbours of 'good' in the learned embedding space.
model.wv.most_similar('good', topn=10)

[('decent', 0.8494853377342224),
 ('great', 0.7792516946792603),
 ('nice', 0.7117820382118225),
 ('fantastic', 0.6944197416305542),
 ('excellent', 0.6228534579277039),
 ('superb', 0.6107070446014404),
 ('terrific', 0.6100025177001953),
 ('reasonable', 0.6050522923469543),
 ('awesome', 0.5880100131034851),
 ('neat', 0.5714215040206909)]

In [45]:
# Similarity score between the vectors of an antonym pair.
model.wv.similarity('good', 'bad')

0.51812756

In [46]:
# Similarity score between the vectors of 'first' and 'last'.
model.wv.similarity('first', 'last')

0.27099377