In [3]:
#!pip install --upgrade pip
#!pip install --upgrade gensim
#!pip install --upgrade python-Levenshtein

In [4]:
from pathlib import Path

import gensim
import pandas as pd

# Reading and Exploring the dataset
The dataset used here is a subset of Amazon reviews from the Cell Phones and Accessories category. The data is stored as a JSON Lines file (one JSON object per line) and can be read using pandas.
Link to the dataset is [The Amazon review dataset](https://snap.stanford.edu/data/web-Amazon.html)
- Select `Cell_Phones_&_Accessories.txt.gz` file
- The latest version of the file can be obtained from [https://nijianmo.github.io/amazon/index.html](https://nijianmo.github.io/amazon/index.html) -- pick the "small" subset with [1,128,437 reviews](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz). This notebook reads the decompressed `Cell_Phones_and_Accessories_5.json` from `../datasets/`.

In [16]:
# The file holds one JSON-encoded review per line, hence lines=True.
reviews_path = "../datasets/Cell_Phones_and_Accessories_5.json"
df = pd.read_json(reviews_path, lines=True)

In [17]:
# Preview the first five reviews.
df.head(n=5)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"08 4, 2014",A24E3SXTC62LJI,7508492919,{'Color:': ' Bling'},Claudia Valdivia,Looks even better in person. Be careful to not...,Can't stop won't stop looking at it,1407110400,,
1,5,True,"02 12, 2014",A269FLZCB4GIPV,7508492919,,sarah ponce,When you don't want to spend a whole lot of ca...,1,1392163200,,
2,3,True,"02 8, 2014",AB6CHQWHZW4TV,7508492919,,Kai,"so the case came on time, i love the design. I...",Its okay,1391817600,,
3,2,True,"02 4, 2014",A1M117A53LEI8,7508492919,,Sharon Williams,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,CASE,1391472000,,
4,4,True,"02 3, 2014",A272DUT8M88ZS8,7508492919,,Bella Rodriguez,"I liked it because it was cute, but the studs ...",Cute!,1391385600,,


In [18]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1128437, 12)

## Dropping samples with a missing (NaN) reviewText

In [19]:
# Keep only the rows that actually have review text. `subset` is passed as a
# list because older pandas releases do not accept a bare column label here.
review_text_dropped_na = df.dropna(subset=["reviewText"])

In [20]:
# Dimensions after dropping rows with a missing reviewText, as (rows, columns).
review_text_dropped_na.shape

(1127672, 12)

## Simple preprocessing and tokenization
The first step in any data science task is to clean the data. For NLP, this typically means converting all words to lower case, trimming whitespace, and removing punctuation.

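Additionally, we can remove stop words such as 'and', 'or', 'is', 'the', 'a', 'an' and reduce words to their root forms, e.g. 'running' to 'run'. Note that `gensim.utils.simple_preprocess`, used below, does neither of these: it lower-cases and tokenizes the text and drops very short tokens, as the output shows.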

In [22]:
# Tokenize every review into a list of lower-cased word tokens.
raw_reviews = review_text_dropped_na["reviewText"]
review_text = raw_reviews.apply(gensim.utils.simple_preprocess)

In [23]:
# Preview the token lists produced for the first five reviews.
review_text.head(n=5)

0    [looks, even, better, in, person, be, careful,...
1    [when, you, don, want, to, spend, whole, lot, ...
2    [so, the, case, came, on, time, love, the, des...
3    [don, care, for, it, gave, it, as, gift, and, ...
4    [liked, it, because, it, was, cute, but, the, ...
Name: reviewText, dtype: object

In [25]:
# The untouched review text for the same five rows, for comparison with the tokens.
review_text_dropped_na["reviewText"].head(n=5)

0    Looks even better in person. Be careful to not...
1    When you don't want to spend a whole lot of ca...
2    so the case came on time, i love the design. I...
3    DON'T CARE FOR IT.  GAVE IT AS A GIFT AND THEY...
4    I liked it because it was cute, but the studs ...
Name: reviewText, dtype: object

## Training the Word2Vec Model
Train the model on the reviews. Use a window of size 10, i.e. up to 10 words before the current word and 10 words after it. Words that occur fewer than 2 times in the whole corpus are ignored; configure this using the `min_count` parameter.

`workers` defines how many CPU threads are used for training.

### Initialize the model

In [26]:
# Create an untrained Word2Vec model; the vocabulary and the weights are
# filled in by the following cells.
model = gensim.models.Word2Vec(
    workers=4,    # number of training threads
    min_count=2,  # ignore words seen fewer than 2 times in the corpus
    window=10,    # context words considered on each side of the current word
)

### Build the Vocabulary

In [27]:
# Scan the tokenized reviews to build the model's vocabulary;
# progress_per=1000 controls how often progress is logged during the scan.
model.build_vocab(review_text, progress_per=1000)

### Now, train the word2vec model

In [28]:
# Train on the tokenized reviews. The number of examples and the number of
# epochs are both read back from the model's own attributes.
model.train(
    review_text,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

(195367884, 266797320)

### Save the model

In [30]:
# Create the output directory first so that saving cannot fail (and throw
# away the training run) when the notebook is run on a fresh checkout.
model_path = Path("../model_files/word2vec-amazon-cell-accessories-reviews-small.model")
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save(str(model_path))

### Finding similar words and similarity between words
- GENSIM documentation on [Word2vec embeddings](https://radimrehurek.com/gensim/models/word2vec.html)

In [31]:
# The ten words whose vectors are closest to "bad", with their similarity scores.
model.wv.most_similar("bad", topn=10)

[('terrible', 0.7024766206741333),
 ('shabby', 0.6336917877197266),
 ('horrible', 0.6236634254455566),
 ('disappointing', 0.5862223505973816),
 ('good', 0.580049991607666),
 ('awful', 0.5702888369560242),
 ('meh', 0.5614873766899109),
 ('lousy', 0.5559502243995667),
 ('poor', 0.5554750561714172),
 ('funny', 0.5525959730148315)]

In [32]:
# Similarity score between the vectors of two near-synonyms.
model.wv.similarity("cheap", "inexpensive")

0.46054313

In [33]:
# Similarity score between the vectors of two positive adjectives.
model.wv.similarity("great", "good")

0.80240625