In [1]:
import pandas as pd 
DATASET = "amazon_reviews_us_Grocery_v1_00.tsv"
import logging
import warnings
import sys
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import wordnet
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
logging.disable(sys.maxsize)



In [2]:
dataset = pd.read_table(DATASET, error_bad_lines=False, header=0, warn_bad_lines=False) 
# there are some malformed entries the dataset... let's ignore those for now


In [None]:
synonyms = []
for subject in subjects:
    for syn in wordnet.synsets(subject):
        for l in syn.lemmas():
            synonyms.append(l.name())

print(set(synonyms))


In [9]:
### Let's see what the dataset looks like... 
dataset.head()


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,42521656,R26MV8D0KG6QI6,B000SAQCWC,159713740,"The Cravings Place Chocolate Chunk Cookie Mix,...",Grocery,5,0.0,0.0,N,Y,Using these for years - love them.,"As a family allergic to wheat, dairy, eggs, nu...",2015-08-31
1,US,12049833,R1OF8GP57AQ1A0,B00509LVIQ,138680402,"Mauna Loa Macadamias, 11 Ounce Packages",Grocery,5,0.0,0.0,N,Y,Wonderful,"My favorite nut. Creamy, crunchy, salty, and ...",2015-08-31
2,US,107642,R3VDC1QB6MC4ZZ,B00KHXESLC,252021703,Organic Matcha Green Tea Powder - 100% Pure Ma...,Grocery,5,0.0,0.0,N,N,Five Stars,This green tea tastes so good! My girlfriend l...,2015-08-31
3,US,6042304,R12FA3DCF8F9ER,B000F8JIIC,752728342,15oz Raspberry Lyons Designer Dessert Syrup Sauce,Grocery,5,0.0,0.0,N,Y,Five Stars,I love Melissa's brand but this is a great sec...,2015-08-31
4,US,18123821,RTWHVNV6X4CNJ,B004ZWR9RQ,552138758,"Stride Spark Kinetic Fruit Sugar Free Gum, 14-...",Grocery,5,0.0,0.0,N,Y,Five Stars,good,2015-08-31


In [10]:
#lots of nan values... :( 
dataset.isnull().sum()

marketplace           0
customer_id           0
review_id             0
product_id            0
product_parent        0
product_title         0
product_category      0
star_rating          22
helpful_votes        23
total_votes          23
vine                 23
verified_purchase    23
review_headline      34
review_body          88
review_date          65
dtype: int64

In [3]:
# Lots of spammy duplicate reviews...might be worth checking if the same person is duplicating the reviews across multiple products


x = dataset[dataset['review_body'].duplicated() == True]
x.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
130,US,21294263,R3KT8X23DG3WCI,B00ZPV2070,22340539,Wolfgang Puck Coffee Keurig K-Cups - Vanilla F...,Grocery,5,0.0,0.0,N,Y,Great coffee,Delicious,2015-08-31
186,US,41355087,R8AE9OXZMTORH,B00KDW19WS,76193812,Nagaraya Cracker Nuts Assorted Bundle 4-Pack: ...,Grocery,5,0.0,0.0,N,Y,Five Stars,Yummy,2015-08-31
208,US,51044872,R3QZBU3F214CF1,B00N2FVHWU,928026267,Green Mountain Coffee Wicked Winter Blend Keur...,Grocery,5,1.0,1.0,N,Y,Five Stars,Great coffee,2015-08-31
270,US,48880662,RRRDCQSQ66LE,B008TMIO2M,81982020,"Nutella, Hazelnut Spread with Cocoa",Grocery,4,0.0,0.0,N,Y,Four Stars,good,2015-08-31
273,US,14100648,RGH9G1J3M0ZW6,B005P0U5BO,936604878,Envirokidz Organic Bar Gf Crspyrice Brry Org,Grocery,5,0.0,0.0,N,Y,GREAT! Thanks So much and God bless,GREAT! Thanks So much and God bless! &#60;&#...,2015-08-31


In [4]:
#remove NaNs from the dataset  for now...
dataset = dataset.dropna()

In [5]:

def filter_heuristic(row):
    '''
    determine if a row is a negative review based on its score. Potentially extend it to include sentiment analysis 
    and presence of specific negative words 
    '''
    return row["star_rating"].astype(int) <=3

In [7]:
negative_rows = dataset.loc[filter_heuristic]
negative_rows.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
5,US,23649464,RIG9AWFOGRDVO,B00AL6QBZ6,681475449,Herr's Popcorn Hot Cheese 1 Oz (Pack of 30),Grocery,2,1.0,1.0,N,Y,Not Happy,The popcorn was stale.,2015-08-31
9,US,19624355,R1ODXB3C9UP3NL,B00J074W94,2499702,"Orgain Organic Plant Based Protein Powder, Pac...",Grocery,1,1.0,3.0,N,N,Disgusting now and difficult on digestion,Used to be a decent product. Disgusting now a...,2015-08-31
17,US,22765168,R3T6TTD2IN0EFZ,B00XDXMLL2,971154239,"Skippy Creamy Peanut Butter, with Salted Caram...",Grocery,1,4.0,4.0,N,N,"1 Out Of 5 Of My Co-Workers Thought It Was ""Okay""",I bought this from a local super market on a w...,2015-08-31
23,US,35636887,R9MISLBRG08FX,B00DBSFXUA,294404974,"Keebler Town House Pita Crackers, 9.5 Ounce",Grocery,1,0.0,0.0,N,Y,pita crackers,not craze about these. nothing really wrong wi...,2015-08-31
26,US,12650237,R2A9O8CWZ1PP74,B0083GJKR2,868929824,"Eclipse Sugar Free Gum, Spearmint, 120 Piece B...",Grocery,3,0.0,0.0,N,Y,Three Stars,it's gum..,2015-08-31


In [40]:
bad_words = [
    "stinky",
    "moldy",
    "mouldy",
    "rotted",
    "rotten",
    "stale"
    "stinking",
    "icky",
    "curdled",
    "danger",
    "dangerous",
    "avoid",
    "ammonia",
    "sick"
]

def filter_bad_words(row):
    '''
    determine if a row is a negative review based on its words.
    '''
    return row["review_headline"].str.contains("\\b(%s)\\b" % '|'.join(bad_words))


In [41]:
negative_rows = negative_rows.loc[filter_bad_words]

704


In [42]:
#nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
test_scores = sia.polarity_scores("That sandwidch was disgusting!")
print(test_scores)

def sentiment_heuristic(row):
    return sia.polarity_scores(row)['compound']

negative_rows['sentiment_score'] = negative_rows["review_body"].apply(sentiment_heuristic)

{'neg': 0.552, 'neu': 0.448, 'pos': 0.0, 'compound': -0.5707}


In [63]:
pd.set_option('display.max_colwidth', None)
rows = negative_rows.head(n=4)['review_body']
for row in rows:
    print(row)
    print(sia.polarity_scores(row)['compound'])
    print("")

I got this by mistake, somehow.  there seems to be a regular yellow candy style AND then there is this red stripe variety.  I had ordered the yellow plain kind but they shipped this, instead.  I tried two pieces.  could not even finish either one.  tasted like wax or something very flavorless!  I would not buy this.  in fact, I did not, but for some reason, amazon shipped this to me.
0.7263

Probably would have been good, but my nuts were rancid.
-0.775

I am Turkish and I've never had any problems with Marmarabirlik olives in my whole life.  HOWEVER, this is the first time I've ordered them online and I have to say there were the worst olives I've ever tasted.  They were rotten!  My Mom even commented that these olives are bad.  Maybe I got a bad batch because this company makes one of the very best Turkish olives on the market.  I'm extremely disappointed with this online order.
-0.9069

The taste is okay....Nothing spectacular....It tastes like it was made with rotted anchovies....<

In [64]:
negative_rows = negative_rows[negative_rows['sentiment_score'] < 0]

In [68]:
sorted_negative_rows = negative_rows.sort_values("product_id")
negative_rows.head(n=8)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,sentiment_score
8911,US,37307498,RZP5EWKOBX5KH,B009L9TMBY,696947556,"Deluxe Mixed Nuts Raw, 2Lbs",Grocery,2,1.0,1.0,N,Y,Ixnay on the rotten nuts.,"Probably would have been good, but my nuts were rancid.",2015-08-28,-0.775
22258,US,47973491,R1NOLST0E8IATT,B0036ZIF9Y,923729241,Gemlik Type Black Olives LUX – 1.1lb (500g),Grocery,1,2.0,2.0,N,Y,Mine were rotten and tasted horrible...,"I am Turkish and I've never had any problems with Marmarabirlik olives in my whole life. HOWEVER, this is the first time I've ordered them online and I have to say there were the worst olives I've ever tasted. They were rotten! My Mom even commented that these olives are bad. Maybe I got a bad batch because this company makes one of the very best Turkish olives on the market. I'm extremely disappointed with this online order.",2015-08-24,-0.9069
35555,US,30293525,R1FRDE0OIA546B,B00TCOAXR0,915238907,Chung Jung One Anchovy Sauce Blue,Grocery,1,3.0,9.0,N,Y,.It tastes like it was made with rotted anchovies....,The taste is okay....Nothing spectacular....It tastes like it was made with rotted anchovies....<br />The main problem: I braised something using this sauce and my apartment stunk like rotted fish for days....<br />The cheap plastic bottle does not make this product more enticing.....,2015-08-19,-0.0772
36783,US,50106447,R2KIR9GSCGNQI6,B002N723Q2,479009963,"Numi Organic Black Tea, Loose Leaf Tea",Grocery,1,1.0,3.0,N,Y,"Good taste, but with rotten flavor","Good taste, but absolutely no flavor, both dry and brewed. Contacted Numi Customer Service. Got careless response, They sent me a couple of tea bags which were much worse than the loose Emperor's Puerh tea. After a month or so it smells rotten. Still taste is OK",2015-08-19,-0.9272
44033,US,23565135,R1GGRVYJDHN2GV,B00BUPEOKG,393268588,Muffin Town Gluten Free Snack 'N Loaves (12 Per Box),Grocery,1,0.0,0.0,N,N,Very disappointed that 5 of the loaves were moldy,This is my second time buying this product. Very disappointed that 5 of the loaves were moldy. The mold was just visible on the bottom of the loaves. I wont buy them again for health reasons.,2015-08-17,-0.5256
54550,US,22365141,RGSXU84MSLDY7,B006GL6N9E,776164341,Fruit Basket Delight with Cheese and Nuts,Grocery,2,0.0,0.0,N,Y,rotten fruit,I sent this to my Mom for Mothers Day and the fruit was rotten! It made me feel terrible.,2015-08-12,-0.7712
70811,US,13861214,R1X6047VQJ7JQ7,B00T77QT5M,290213412,"Barnana Organic Chewy Banana Bites Variety Pack, 12 Count",Grocery,1,0.0,2.0,N,Y,These taste like rotten moldy bananas,"These taste like rotten moldy bananas. I was trying to find a product that came close to the banana chunks that the Philippine brand made since I can't seem to find them anymore, but these aren't even edible much less comparible.<br />Unfortunately it won't let me give no star.",2015-08-07,-0.4939
73105,US,37337835,R3SV31LUR8QP5,B00FIQLAPA,264430861,"Ice Breakers Berry Sours Original Sours, 1.5-Ounce Pucks (Pack of 8)",Grocery,1,1.0,1.0,N,Y,Not as pictured! Definitely avoid this item!,"Fraud, pure and simple. Item delivered was a Chinese food product, with no content information in English. That includes contents and expiration dates. Everything is in Chinese. If I had known this, I never would have ordered this, and Amazon.com needs to protect customers from this outrage. DO NOT ORDER THIS PRODUCT FROM AMAZON.COM!",2015-08-06,-0.8209
