In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gensim
import nltk
import spacy


In [2]:
nlp = spacy.load("en_core_web_sm")


# Reading the data

In [3]:
sportReviews = pd.read_json("/kaggle/input/sports-and-outdoor-review-dataset/Sports_and_Outdoors_5.json",lines = True)
sportReviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [4]:
sportReviews['reviewText'][0]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

In [5]:
sportReviews.shape

(296337, 9)

# Simple Preprocessing & Tokenization
The first thing to do for any data science task is to clean the data. For NLP, we apply various processing like converting all the words to lower case, trimming spaces, removing punctuations. This is something we will do over here too.

Additionally, we can also remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and convert words to their root forms like 'running' to 'run'

In [7]:
# All data set (reviewText column)
texts = sportReviews.reviewText.astype(str).tolist()  # ensure strings

# SAmple of the  data set (reviewText column)
sample_texts = sportReviews.reviewText.astype(str).head(1000).tolist()


def spacy_preprocess_pipe(texts):
    for doc in nlp.pipe(texts, batch_size=50):
        yield [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

In [None]:
# Test for a sample of the data set
sport_reviews = list(spacy_preprocess_pipe(sample_texts))


In [None]:
sport_reviews[0]

In [8]:
# Now for the full data set
sport_reviews = list(spacy_preprocess_pipe(texts))


In [9]:
sport_reviews[1]

['factory',
 'Glock',
 'tool',
 'Glock',
 'lose',
 'need',
 'Ghost',
 'product',
 'prior',
 'know',
 'reliable',
 'decide',
 'order',
 'sure',
 'good',
 'factory',
 'tool']

# Training the Word2Vec Model


In [10]:
# Train the model for reviews. Use a window of size 10 i.e. 10 words before the present word and 10 words ahead.
# A sentence with at least 2 words should only be considered, configure this using min_count parameter.
# Workers define how many CPU threads to be used

model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

# Build Vocabulary


In [11]:
model.build_vocab(sport_reviews,progress_per = 1000)

# Train the Word2Vec Model


In [12]:
model.train(sport_reviews,total_examples = model.corpus_count,epochs = model.epochs)

(50014404, 53943080)

# Finding Similar Words and Similarity between words


In [13]:
model.wv.most_similar("awful")


[('horrible', 0.8240437507629395),
 ('terrible', 0.8192796111106873),
 ('horrid', 0.5935847163200378),
 ('bad', 0.5927548408508301),
 ('strange', 0.5917527079582214),
 ('piss', 0.5854366421699524),
 ('blame', 0.5849658250808716),
 ('suck', 0.5796614289283752),
 ('joke', 0.5777196884155273),
 ('funny', 0.5724768042564392)]

In [14]:
model.wv.similarity(w1="good", w2="great")


0.7816763

In [18]:
model.wv.similarity(w1="great", w2="unfortunate")


-0.0785992

In [28]:
model.wv.similarity(w1="cheap", w2="inexpensive")


0.5550592