In [9]:
import pandas as pd 
DATASET = "amazon_reviews_us_Grocery_v1_00.tsv"
import logging
import warnings
import sys
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
logging.disable(sys.maxsize)



In [10]:
dataset = pd.read_table(DATASET, error_bad_lines=False, header=0, warn_bad_lines=False) 
# there are some malformed entries the dataset... let's ignore those for now


In [11]:
### Let's see what the dataset looks like... 
dataset.head()


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,42521656,R26MV8D0KG6QI6,B000SAQCWC,159713740,"The Cravings Place Chocolate Chunk Cookie Mix,...",Grocery,5,0.0,0.0,N,Y,Using these for years - love them.,"As a family allergic to wheat, dairy, eggs, nu...",2015-08-31
1,US,12049833,R1OF8GP57AQ1A0,B00509LVIQ,138680402,"Mauna Loa Macadamias, 11 Ounce Packages",Grocery,5,0.0,0.0,N,Y,Wonderful,"My favorite nut. Creamy, crunchy, salty, and ...",2015-08-31
2,US,107642,R3VDC1QB6MC4ZZ,B00KHXESLC,252021703,Organic Matcha Green Tea Powder - 100% Pure Ma...,Grocery,5,0.0,0.0,N,N,Five Stars,This green tea tastes so good! My girlfriend l...,2015-08-31
3,US,6042304,R12FA3DCF8F9ER,B000F8JIIC,752728342,15oz Raspberry Lyons Designer Dessert Syrup Sauce,Grocery,5,0.0,0.0,N,Y,Five Stars,I love Melissa's brand but this is a great sec...,2015-08-31
4,US,18123821,RTWHVNV6X4CNJ,B004ZWR9RQ,552138758,"Stride Spark Kinetic Fruit Sugar Free Gum, 14-...",Grocery,5,0.0,0.0,N,Y,Five Stars,good,2015-08-31


In [12]:
#lots of nan values... :( 
dataset.isnull().sum()

marketplace           0
customer_id           0
review_id             0
product_id            0
product_parent        0
product_title         0
product_category      0
star_rating          22
helpful_votes        23
total_votes          23
vine                 23
verified_purchase    23
review_headline      34
review_body          88
review_date          65
dtype: int64

In [13]:
# Lots of spammy duplicate reviews...might be worth checking if the same person is duplicating the reviews across multiple products

# Additions, because maybe same review_body is not duplicate i.e. "Good" as review.
x = dataset[dataset['review_body'].duplicated() == True]
x = x[dataset['customer_id'].duplicated() == True] # ADDED THIS
x = x[dataset['product_id'].duplicated() == True] # ADDED THIS
print(len(x))
x.head()

75144


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2323,US,32433311,R3DPQJFH05T0HS,B00OZYNGUS,228095196,Viva Naturals - The FINEST Raw Organic Chia Se...,Grocery,5,0.0,0.0,N,Y,Great product.,I've been using this to help keep me from snac...,2015-08-31
2330,US,130585,R3IOACXWPYTRUP,B00HZ7HSTW,885171844,Beantown Roasters K Cups Variety Packs,Grocery,5,0.0,1.0,N,Y,The Office like it!,These were purchased for an office and they lo...,2015-08-31
2401,US,14100648,RA8F0DF5OJH3J,B000WV0RW8,653213046,Healthworks Chia Seeds 3lb 6lb Parent,Grocery,5,0.0,0.0,N,Y,GREAT! Thanks So much and God bless,GREAT! Thanks So much and God bless! &#60;&#...,2015-08-31
2436,US,43554935,RK0TC9FRNUL55,B00HQ3ZPJA,220133938,Chia Seeds,Grocery,5,0.0,0.0,N,Y,Five Stars,As advertised.,2015-08-31
2541,US,168472,R2WFNGP0E0ZLXW,B006ZMYLKC,322601427,Keurig,Grocery,5,0.0,0.0,N,Y,Five Stars,Satisfied,2015-08-31


In [14]:
#remove NaNs from the dataset  for now...
dataset = dataset.dropna()

In [15]:
def filter_heuristic(row):
    '''
    determine if a row is a negative review based on its score. Potentially extend it to include sentiment analysis 
    and presence of specific negative words 
    '''
    return row["star_rating"].astype(int) <=3

In [16]:
negative_rows = dataset.loc[filter_heuristic]
negative_rows.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
5,US,23649464,RIG9AWFOGRDVO,B00AL6QBZ6,681475449,Herr's Popcorn Hot Cheese 1 Oz (Pack of 30),Grocery,2,1.0,1.0,N,Y,Not Happy,The popcorn was stale.,2015-08-31
9,US,19624355,R1ODXB3C9UP3NL,B00J074W94,2499702,"Orgain Organic Plant Based Protein Powder, Pac...",Grocery,1,1.0,3.0,N,N,Disgusting now and difficult on digestion,Used to be a decent product. Disgusting now a...,2015-08-31
17,US,22765168,R3T6TTD2IN0EFZ,B00XDXMLL2,971154239,"Skippy Creamy Peanut Butter, with Salted Caram...",Grocery,1,4.0,4.0,N,N,"1 Out Of 5 Of My Co-Workers Thought It Was ""Okay""",I bought this from a local super market on a w...,2015-08-31
23,US,35636887,R9MISLBRG08FX,B00DBSFXUA,294404974,"Keebler Town House Pita Crackers, 9.5 Ounce",Grocery,1,0.0,0.0,N,Y,pita crackers,not craze about these. nothing really wrong wi...,2015-08-31
26,US,12650237,R2A9O8CWZ1PP74,B0083GJKR2,868929824,"Eclipse Sugar Free Gum, Spearmint, 120 Piece B...",Grocery,3,0.0,0.0,N,Y,Three Stars,it's gum..,2015-08-31


In [17]:
sorted_negative_rows = negative_rows.sort_values("product_id")
sorted_negative_rows.head() # ADDITION JUST TO CHECK

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
1975011,US,14434517,R25ATZCV5FBJMV,805470867,518591127,Communion Bread Pack of 500,Grocery,2.0,3.0,4.0,N,Y,The only thing my church has ever complained a...,I'm the pastor of a church plant and the only ...,2012-10-16
1365978,US,47145,RUI9CRY6RZNRB,805470867,518591127,Communion Bread Pack of 500,Grocery,1.0,0.0,0.0,N,Y,I HATE IT,what is that thing<br />it doesn't crunch like...,2014-02-21
1961839,US,37403535,R37YYRMOE7K2TB,805470867,518591127,Communion Bread Pack of 500,Grocery,1.0,1.0,1.0,N,Y,Communion bread,"I purchased 3 boxes, two of them the experatio...",2012-11-11
78777,US,41394716,R2QAW3WQRYVQF5,805470867,518591127,Communion Bread Pack of 500,Grocery,3.0,0.0,0.0,N,Y,Not what expected,"This came in a flimsy cardboard box, squished....",2015-08-04
1845770,US,11312642,R3M9ORU1I9L2C3,805470867,518591127,Communion Bread Pack of 500,Grocery,2.0,0.0,0.0,N,Y,Gross,These things taste nothing like the communion ...,2013-02-18


In [18]:
# def generate_wordcloud(text):
#     wordcloud = WordCloud(width = 5000, height = 5000, random_state=1, background_color='salmon', colormap='Pastel1',
#                           collocations=False, stopwords = STOPWORDS).generate(text)
#     plt.imshow(wordcloud)

In [19]:
# generate a naive word cloud without any preprocessing....
# generate_wordcloud(" ".join(sorted_negative_rows["review_body"]))

In [20]:
dataset.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [21]:
reviews = dataset["review_headline"]
print(reviews.head())
print(sorted_negative_rows["review_headline"])

0    Using these for years - love them.
1                             Wonderful
2                            Five Stars
3                            Five Stars
4                            Five Stars
Name: review_headline, dtype: object
1975011    The only thing my church has ever complained a...
1365978                                            I HATE IT
1961839                                      Communion bread
78777                                      Not what expected
1845770                                                Gross
                                 ...                        
30387                          i hv to say I LOVE THIS CHIPS
1269                                           not very good
18038      Gross Chemical Taste. Nothing like Pumpkin Pie...
22590                                               One Star
2307691                                       Pretty Rough!!
Name: review_headline, Length: 445115, dtype: object


In [22]:
# if not nltk.find('corpora/wordnet'): # may need to download it if it's not in your pc
#     nltk.download('wordnet')



import pip
pip.main(['install', 'nltk'])
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('went')) # Default is noun
print(lemmatizer.lemmatize('went', 'v')) # If "v", it works with verbs.# If needed later, I will spend some time to find the part of speech of each word, and use lemmatizer accordingly

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/navyazaveri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


went
go


In [23]:
import pip 
pip.main(["install", "gensim"])
from gensim.parsing.preprocessing import STOPWORDS
import gensim

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [24]:
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(WordNetLemmatizer().lemmatize(token, 'v')) # for now, lemmatizing only verbs
    return result

In [25]:
for review in reviews[:5]:
    print("Removed stopwords and lemmatized words")
    print(review + " becomes -->", preprocess(review))
# Here, I notice that the amount of starts is getting removed. I don't think it's bad because a review being "1 star"
# does not necessarily mean the product is faulty. I can either further remove all the word "stars" from the texts, 
# Or try to figure out how to keep numbers, if it's possible, on lemmatizing

Removed stopwords and lemmatized words
Using these for years - love them. becomes --> ['years', 'love']
Removed stopwords and lemmatized words
Wonderful becomes --> ['wonderful']
Removed stopwords and lemmatized words
Five Stars becomes --> ['star']
Removed stopwords and lemmatized words
Five Stars becomes --> ['star']
Removed stopwords and lemmatized words
Five Stars becomes --> ['star']


In [26]:
### I will run this on the first 1000 reviews because for now, because it will take long for more
less_reviews = reviews[:1000]
processed = []
for review in less_reviews:
    processed.append(preprocess(review))
print(processed) # All the words after applying lemmatization on the first 1000 reviews

[['years', 'love'], ['wonderful'], ['star'], ['star'], ['star'], ['happy'], ['star'], ['star'], ['great', 'taste'], ['disgust', 'difficult', 'digestion'], ['like', 'soy', 'sauce', 'll', 'like'], ['star'], ['star'], ['star'], ['star'], ['excellent'], ['yummy'], ['workers', 'think', 'okay'], ['delicious', 'tea', 'easy'], ['best', 'coconut', 'oil'], ['star'], ['star'], ['good', 'variety'], ['pita', 'crackers'], ['star'], ['nice', 'big', 'bottle'], ['star'], ['star'], ['star'], ['star'], ['love'], ['star'], ['fresh', 'great', 'tomato', 'base', 'meal'], ['star'], ['goodie', 'downunder'], [], ['far', 'like'], ['star'], ['instructions', 'sure'], ['lifesavers', 'eons', 'trust', 'quality', 'good', 'taste', 'mint'], ['recommend'], ['star'], ['star'], ['star'], ['wonderful', 'watermelon'], ['great', 'product'], ['good', 'flavor'], ['tasteless'], ['tea', 'great', 'enjoy', 'flavor'], ['favorite', 'tea'], ['good'], ['taste', 'great'], ['organic', 'brand', 'better', 'taste'], ['great', 'import', 'are

In [27]:
dictionary = gensim.corpora.Dictionary(processed) # construct word<->id mappings - it does it in alphabetical order
print(dictionary) # In the first 1000 reviews, we have 787 unique words after applying lemmatization in every word

Dictionary(787 unique tokens: ['love', 'years', 'wonderful', 'star', 'happy']...)


In [28]:
bow_corpus = [dictionary.doc2bow(review) for review in processed]
# Each entry in bow_corpus lists how many times each word id exists. 

print(bow_corpus) # bow_corpus shows the id mapping for each word, and the total occurences in the dict of that word
# in that document

[[(0, 1), (1, 1)], [(2, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(4, 1)], [(3, 1)], [(3, 1)], [(5, 1), (6, 1)], [(7, 1), (8, 1), (9, 1)], [(10, 2), (11, 1), (12, 1), (13, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(14, 1)], [(15, 1)], [(16, 1), (17, 1), (18, 1)], [(19, 1), (20, 1), (21, 1)], [(22, 1), (23, 1), (24, 1)], [(3, 1)], [(3, 1)], [(25, 1), (26, 1)], [(27, 1), (28, 1)], [(3, 1)], [(29, 1), (30, 1), (31, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(0, 1)], [(3, 1)], [(5, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(3, 1)], [(36, 1), (37, 1)], [], [(10, 1), (38, 1)], [(3, 1)], [(39, 1), (40, 1)], [(6, 1), (25, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)], [(46, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(2, 1), (47, 1)], [(5, 1), (48, 1)], [(25, 1), (49, 1)], [(50, 1)], [(5, 1), (21, 1), (49, 1), (51, 1)], [(21, 1), (52, 1)], [(25, 1)], [(5, 1), (6, 1)], [(6, 1), (53, 1), (54, 1), (55, 1)], [(5, 1), (56, 1), (57, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(3, 1)], [(19, 

In [29]:
# By changing review_num, we can view the amount of appearances of that word in that specific review
review_num = 0
bow_doc_x = bow_corpus[review_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 0 ("love") appears 1 time(s).
Word 1 ("years") appears 1 time(s).


In [30]:
# Here, we give the bag of words we have (for now I have used first 1000 reviews), how many topics we want to get,
# the dictionary of the id <-> word mappings and how many passes(epochs) to perform, and it creates the model.
lda_model =  gensim.models.LdaModel(bow_corpus, 
                                   num_topics=20, 
                                   id2word=dictionary,                                    
                                   passes=10,
                                   ) 

In [31]:
for idx, topic in lda_model.print_topics(-1): # The words occuring in each class, and the weight given for that
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

# I have noticed that every time I run this, the results below are different. This can either be good or bad.
# Some times when I run it, there is a class with the most weighted word being "dissapoint". If we want results that
# are the same every time, we can use a seed for reproducibility. Let me know! 

Topic: 0 
Words: 0.051*"good" + 0.028*"ok" + 0.027*"flavor" + 0.025*"energy" + 0.020*"game" + 0.020*"amazingly" + 0.020*"great" + 0.020*"amaze" + 0.020*"delicious" + 0.020*"package"


Topic: 1 
Words: 0.047*"oil" + 0.034*"like" + 0.034*"love" + 0.034*"nice" + 0.032*"coconut" + 0.028*"taste" + 0.028*"free" + 0.021*"gluten" + 0.014*"season" + 0.014*"have"


Topic: 2 
Words: 0.082*"flavor" + 0.068*"good" + 0.042*"great" + 0.028*"instant" + 0.027*"pretty" + 0.021*"coffee" + 0.014*"strong" + 0.014*"black" + 0.014*"matcha" + 0.014*"fav"


Topic: 3 
Words: 0.066*"sweet" + 0.032*"best" + 0.026*"sugar" + 0.024*"great" + 0.021*"pink" + 0.021*"party" + 0.011*"excellent" + 0.011*"color" + 0.011*"yes" + 0.011*"fake"


Topic: 4 
Words: 0.180*"good" + 0.043*"tea" + 0.022*"stuff" + 0.017*"brand" + 0.016*"make" + 0.016*"like" + 0.011*"time" + 0.011*"flavor" + 0.011*"gochujang" + 0.011*"gf"


Topic: 5 
Words: 0.169*"great" + 0.075*"taste" + 0.036*"tea" + 0.030*"thank" + 0.024*"coffee" + 0.023*"price" + 

In [32]:
# Here, and on the next cell, I run this on a review that was not used for training to find out how we can get results
# on unseen data
unseen_document = reviews[1001]
print(unseen_document)
preprocess(unseen_document)

Kirkland Coffee is a terrific daily brand coffee.


['kirkland', 'coffee', 'terrific', 'daily', 'brand', 'coffee']

In [33]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

In [34]:
lda_model[bow_vector] # How likely each class is. Because the review is small, some classes have very low probabilities.
# I think that is good! Because there is a high probability to exist in the rest. But again, the data I have trained
# is coarse for now, so no guaranteed results (yet).

[(0, 0.010000301),
 (1, 0.010000301),
 (2, 0.010000301),
 (3, 0.010000301),
 (4, 0.20999382),
 (5, 0.010000301),
 (6, 0.010000301),
 (7, 0.010000301),
 (8, 0.3995623),
 (9, 0.010000301),
 (10, 0.010000301),
 (11, 0.010000301),
 (12, 0.010000301),
 (13, 0.010000301),
 (14, 0.010000301),
 (15, 0.22043876),
 (16, 0.010000301),
 (17, 0.010000301),
 (18, 0.010000301),
 (19, 0.010000301)]

In [35]:
# Sorts the most probable classes and gives some details on weights, words and probabilities
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Index: {}\nScore: {}\t Topic: {}".format(index, score, lda_model.print_topic(index, 5)))

Index: 8
Score: 0.3994898498058319	 Topic: 0.149*"best" + 0.091*"coffee" + 0.032*"great" + 0.029*"flavor" + 0.028*"happy"
Index: 15
Score: 0.22051124274730682	 Topic: 0.075*"excellent" + 0.064*"great" + 0.052*"taste" + 0.043*"price" + 0.031*"fresh"
Index: 4
Score: 0.20999382436275482	 Topic: 0.180*"good" + 0.043*"tea" + 0.022*"stuff" + 0.017*"brand" + 0.016*"make"
Index: 0
Score: 0.010000300593674183	 Topic: 0.051*"good" + 0.028*"ok" + 0.027*"flavor" + 0.025*"energy" + 0.020*"game"
Index: 1
Score: 0.010000300593674183	 Topic: 0.047*"oil" + 0.034*"like" + 0.034*"love" + 0.034*"nice" + 0.032*"coconut"
Index: 2
Score: 0.010000300593674183	 Topic: 0.082*"flavor" + 0.068*"good" + 0.042*"great" + 0.028*"instant" + 0.027*"pretty"
Index: 3
Score: 0.010000300593674183	 Topic: 0.066*"sweet" + 0.032*"best" + 0.026*"sugar" + 0.024*"great" + 0.021*"pink"
Index: 5
Score: 0.010000300593674183	 Topic: 0.169*"great" + 0.075*"taste" + 0.036*"tea" + 0.030*"thank" + 0.024*"coffee"
Index: 6
Score: 0.010000

In [45]:
import pip 
pip.main(["install", "pyLDAvis==3.2.2"])
import pyLDAvis
import os 
import pickle 
import pyLDAvis
import pyLDAvis.gensim

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

LDAvis_prepared

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


hello
