In [4]:
import pandas as pd 
DATASET = "amazon_reviews_us_Grocery_v1_00.tsv"
import logging
import warnings
import sys
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
logging.disable(sys.maxsize)



In [6]:
dataset = pd.read_table(DATASET, error_bad_lines=False, header=0, warn_bad_lines=False) 
# there are some malformed entries the dataset... let's ignore those for now


In [8]:
### Let's see what the dataset looks like... 
dataset.head()


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,42521656,R26MV8D0KG6QI6,B000SAQCWC,159713740,"The Cravings Place Chocolate Chunk Cookie Mix,...",Grocery,5,0.0,0.0,N,Y,Using these for years - love them.,"As a family allergic to wheat, dairy, eggs, nu...",2015-08-31
1,US,12049833,R1OF8GP57AQ1A0,B00509LVIQ,138680402,"Mauna Loa Macadamias, 11 Ounce Packages",Grocery,5,0.0,0.0,N,Y,Wonderful,"My favorite nut. Creamy, crunchy, salty, and ...",2015-08-31
2,US,107642,R3VDC1QB6MC4ZZ,B00KHXESLC,252021703,Organic Matcha Green Tea Powder - 100% Pure Ma...,Grocery,5,0.0,0.0,N,N,Five Stars,This green tea tastes so good! My girlfriend l...,2015-08-31
3,US,6042304,R12FA3DCF8F9ER,B000F8JIIC,752728342,15oz Raspberry Lyons Designer Dessert Syrup Sauce,Grocery,5,0.0,0.0,N,Y,Five Stars,I love Melissa's brand but this is a great sec...,2015-08-31
4,US,18123821,RTWHVNV6X4CNJ,B004ZWR9RQ,552138758,"Stride Spark Kinetic Fruit Sugar Free Gum, 14-...",Grocery,5,0.0,0.0,N,Y,Five Stars,good,2015-08-31


In [10]:
#lots of nan values... :( 
dataset.isnull().sum()

marketplace           0
customer_id           0
review_id             0
product_id            0
product_parent        0
product_title         0
product_category      0
star_rating          22
helpful_votes        23
total_votes          23
vine                 23
verified_purchase    23
review_headline      34
review_body          88
review_date          65
dtype: int64

In [12]:
# Lots of spammy duplicate reviews...might be worth checking if the same person is duplicating the reviews across multiple products

# Additions, because maybe same review_body is not duplicate i.e. "Good" as review.
x = dataset[dataset['review_body'].duplicated() == True]
x = x[dataset['customer_id'].duplicated() == True] # ADDED THIS
x = x[dataset['product_id'].duplicated() == True] # ADDED THIS
print(len(x))
x.head()

75122


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2323,US,32433311,R3DPQJFH05T0HS,B00OZYNGUS,228095196,Viva Naturals - The FINEST Raw Organic Chia Se...,Grocery,5,0.0,0.0,N,Y,Great product.,I've been using this to help keep me from snac...,2015-08-31
2330,US,130585,R3IOACXWPYTRUP,B00HZ7HSTW,885171844,Beantown Roasters K Cups Variety Packs,Grocery,5,0.0,1.0,N,Y,The Office like it!,These were purchased for an office and they lo...,2015-08-31
2401,US,14100648,RA8F0DF5OJH3J,B000WV0RW8,653213046,Healthworks Chia Seeds 3lb 6lb Parent,Grocery,5,0.0,0.0,N,Y,GREAT! Thanks So much and God bless,GREAT! Thanks So much and God bless! &#60;&#...,2015-08-31
2436,US,43554935,RK0TC9FRNUL55,B00HQ3ZPJA,220133938,Chia Seeds,Grocery,5,0.0,0.0,N,Y,Five Stars,As advertised.,2015-08-31
2541,US,168472,R2WFNGP0E0ZLXW,B006ZMYLKC,322601427,Keurig,Grocery,5,0.0,0.0,N,Y,Five Stars,Satisfied,2015-08-31


In [14]:
#remove NaNs from the dataset  for now...
dataset = dataset.dropna()

In [16]:
def filter_heuristic(row):
    '''
    determine if a row is a negative review based on its score. Potentially extend it to include sentiment analysis 
    and presence of specific negative words 
    '''
    return row["star_rating"].astype(int) <=3

In [18]:
negative_rows = dataset.loc[filter_heuristic]
negative_rows.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
5,US,23649464,RIG9AWFOGRDVO,B00AL6QBZ6,681475449,Herr's Popcorn Hot Cheese 1 Oz (Pack of 30),Grocery,2,1.0,1.0,N,Y,Not Happy,The popcorn was stale.,2015-08-31
9,US,19624355,R1ODXB3C9UP3NL,B00J074W94,2499702,"Orgain Organic Plant Based Protein Powder, Pac...",Grocery,1,1.0,3.0,N,N,Disgusting now and difficult on digestion,Used to be a decent product. Disgusting now a...,2015-08-31
17,US,22765168,R3T6TTD2IN0EFZ,B00XDXMLL2,971154239,"Skippy Creamy Peanut Butter, with Salted Caram...",Grocery,1,4.0,4.0,N,N,"1 Out Of 5 Of My Co-Workers Thought It Was ""Okay""",I bought this from a local super market on a w...,2015-08-31
23,US,35636887,R9MISLBRG08FX,B00DBSFXUA,294404974,"Keebler Town House Pita Crackers, 9.5 Ounce",Grocery,1,0.0,0.0,N,Y,pita crackers,not craze about these. nothing really wrong wi...,2015-08-31
26,US,12650237,R2A9O8CWZ1PP74,B0083GJKR2,868929824,"Eclipse Sugar Free Gum, Spearmint, 120 Piece B...",Grocery,3,0.0,0.0,N,Y,Three Stars,it's gum..,2015-08-31


In [20]:
sorted_negative_rows = negative_rows.sort_values("product_id")
sorted_negative_rows.head() # ADDITION JUST TO CHECK

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
1975011,US,14434517,R25ATZCV5FBJMV,805470867,518591127,Communion Bread Pack of 500,Grocery,2.0,3.0,4.0,N,Y,The only thing my church has ever complained a...,I'm the pastor of a church plant and the only ...,2012-10-16
1365978,US,47145,RUI9CRY6RZNRB,805470867,518591127,Communion Bread Pack of 500,Grocery,1.0,0.0,0.0,N,Y,I HATE IT,what is that thing<br />it doesn't crunch like...,2014-02-21
1961839,US,37403535,R37YYRMOE7K2TB,805470867,518591127,Communion Bread Pack of 500,Grocery,1.0,1.0,1.0,N,Y,Communion bread,"I purchased 3 boxes, two of them the experatio...",2012-11-11
78777,US,41394716,R2QAW3WQRYVQF5,805470867,518591127,Communion Bread Pack of 500,Grocery,3.0,0.0,0.0,N,Y,Not what expected,"This came in a flimsy cardboard box, squished....",2015-08-04
1845770,US,11312642,R3M9ORU1I9L2C3,805470867,518591127,Communion Bread Pack of 500,Grocery,2.0,0.0,0.0,N,Y,Gross,These things taste nothing like the communion ...,2013-02-18


In [19]:
def generate_wordcloud(text):
    wordcloud = WordCloud(width = 5000, height = 5000, random_state=1, background_color='salmon', colormap='Pastel1',
                          collocations=False, stopwords = STOPWORDS).generate(text)
    plt.imshow(wordcloud)

In [22]:
# generate a naive word cloud without any preprocessing....
# generate_wordcloud(" ".join(sorted_negative_rows["review_body"]))

In [92]:
sorted_rows = dataset.sort_values("product_id")
sorted_rows["avg_rating"] = ""
sorted_rows.head(5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,avg_rating
115213,US,886579,R18D9WDJHHZNWA,268956898,674048074,Delgada coffee infused with Chaga 28 Sachets p...,Grocery,5,7.0,8.0,N,N,You will lose weight and feel great..,This coffee has been a life saver for me.. I s...,2015-07-22,
1569777,US,41921418,R1QUUN8C0ZF7C7,657745316,31867226,100 Percent All Natural Vanilla Extract,Grocery,5,1.0,1.0,N,N,Best vanilla I've ever had,"No sugar, no GMO garbage, no fillers that come...",2013-10-11,
1299951,US,20405919,R105NG5PE8CMHK,681727810,680395208,Beemster Gouda - Aged 18/24 Months - App. 1.5 Lbs,Grocery,5,4.0,4.0,N,Y,"This cheese is so good, just wish it didn't co...","This cheese is so good, just wish it didn't co...",2014-04-01,
1941826,US,36034133,R26D6HKBPP95HB,700026444,597586824,Pure Darjeeling Tea: Loose Leaf,Grocery,5,2.0,2.0,N,N,Terrific Tea!,"This is my absolute, undisputed favorite tea r...",2012-12-06,
715823,US,50510678,R440L1BV7SZ18,786960159,449320317,Axis and Allies 1942 Second Edition: A Wwii St...,Grocery,5,0.0,0.0,N,Y,Five Stars,1st class,2015-01-05,


In [93]:
starRating = sorted_rows.iloc[0]["star_rating"]
productId = sorted_rows.iloc[0]["product_id"]
print(len(sorted_rows))

i = 0

while i < len(sorted_rows):
    productId = sorted_rows.iloc[i]["product_id"]
    sum = 0
    skip = 1

    sum += int(sorted_rows.iloc[i]["star_rating"])

    sameProduct = True

    while(sameProduct):
        if((i + skip) == len(sorted_rows)):
            sameProduct = False
        elif(sorted_rows.iloc[i + skip]["product_id"] == productId):
            sum += int(sorted_rows.iloc[i + skip]["star_rating"])
            skip += 1
        else:
            sameProduct = False

    avgRating = round((sum / skip), 2)

    for j in range(skip):
        sorted_rows.iat[(i + j),15] = avgRating
    
    i += 1
    i += (skip - 1)

sorted_rows.head(25)


2393238


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,avg_rating
115213,US,886579,R18D9WDJHHZNWA,268956898,674048074,Delgada coffee infused with Chaga 28 Sachets p...,Grocery,5.0,7.0,8.0,N,N,You will lose weight and feel great..,This coffee has been a life saver for me.. I s...,2015-07-22,5.0
1569777,US,41921418,R1QUUN8C0ZF7C7,657745316,31867226,100 Percent All Natural Vanilla Extract,Grocery,5.0,1.0,1.0,N,N,Best vanilla I've ever had,"No sugar, no GMO garbage, no fillers that come...",2013-10-11,5.0
1299951,US,20405919,R105NG5PE8CMHK,681727810,680395208,Beemster Gouda - Aged 18/24 Months - App. 1.5 Lbs,Grocery,5.0,4.0,4.0,N,Y,"This cheese is so good, just wish it didn't co...","This cheese is so good, just wish it didn't co...",2014-04-01,5.0
1941826,US,36034133,R26D6HKBPP95HB,700026444,597586824,Pure Darjeeling Tea: Loose Leaf,Grocery,5.0,2.0,2.0,N,N,Terrific Tea!,"This is my absolute, undisputed favorite tea r...",2012-12-06,5.0
715823,US,50510678,R440L1BV7SZ18,786960159,449320317,Axis and Allies 1942 Second Edition: A Wwii St...,Grocery,5.0,0.0,0.0,N,Y,Five Stars,1st class,2015-01-05,5.0
1395096,US,41650959,R30F1NHZYFDBLE,805470867,518591127,Communion Bread Pack of 500,Grocery,1.0,0.0,0.0,N,Y,Communion bread,The taste & texture of this product is terribl...,2014-02-05,3.55
1713269,US,10574871,R3V4ZX64YU2EB3,805470867,518591127,Communion Bread Pack of 500,Grocery,5.0,0.0,0.0,N,Y,Broadman products are excellent,I have been very pleased with this product...a...,2013-06-08,3.55
1738286,US,11425554,R263F2Q3CR0JJG,805470867,518591127,Communion Bread Pack of 500,Grocery,5.0,0.0,2.0,N,Y,WORSHIP,The communion bread you take to let my Lord kn...,2013-05-18,3.55
85156,US,21557738,RQQGEZP7Y51CA,805470867,518591127,Communion Bread Pack of 500,Grocery,5.0,0.0,0.0,N,Y,Five Stars,what I needed,2015-08-02,3.55
1539819,US,44727551,R1RQYKPG2BBZPW,805470867,518591127,Communion Bread Pack of 500,Grocery,5.0,0.0,0.0,N,Y,The package was well recieved and we enjoyed i...,Awesome packing and easy to use. Made it so ea...,2013-11-04,3.55


In [94]:
sorted_rows.tail(50)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,avg_rating
3601,US,33372973,R35IN4UTJS0T4A,B013Z565IC,860421107,Powdered Eggs Dried Egg Mix for Scrambled Eggs...,Grocery,4,6.0,7.0,N,Y,"This a bulk commodity, and the product is ship...","Note:<br />This a bulk commodity, and the prod...",2015-08-30,4.0
21230,US,11562559,R17NG8LRJ62RZK,B013Z5P7LS,375327014,Bumble Bee Chicken Salad with Crackers 3.25oz....,Grocery,5,0.0,0.0,N,N,Great product! Handy for when you are on the go!,Great value for a great product. Good for work...,2015-08-24,5.0
244,US,30680101,R2P9HK6G9RRNKE,B0140324OW,331127238,M&M's Pecan Pie Limited Edition Fall Milk Choc...,Grocery,4,9.0,9.0,N,N,"No pecan pie flavor at all, but still amazingl...","Delicious! This is by far, one of the better M...",2015-08-31,4.0
26875,US,20208193,R3H13HPLSV4ZIC,B014070UZI,26819128,Awesome Snacks Gift Box Care Package Bundle (3...,Grocery,5,1.0,1.0,N,N,Convenient,I loved my order. Very good and convenient for...,2015-08-22,5.0
10536,US,40719186,R934HAMEJB074,B014070UZI,26819128,Awesome Snacks Gift Box Care Package Bundle (3...,Grocery,5,1.0,1.0,N,N,College snack pack,I sent to my son for a college cake package. H...,2015-08-28,5.0
30387,US,15083467,R31DICRUJ1MP8E,B0141GYWZM,901014761,Jans Mixed Roots Sea Salt Chips - All Natural ...,Grocery,2,3.0,4.0,N,N,i hv to say I LOVE THIS CHIPS,i hv to say I LOVE THIS CHIPS! I once got it f...,2015-08-21,2.0
1269,US,20523034,RGLOABU68EMJ3,B0141KONH0,37823474,Marions Kitchen Meal Kit Pad Thai,Grocery,2,4.0,5.0,N,N,not very good,"The box and presentation looked good , but the...",2015-08-31,2.0
2442,US,38681294,R1T5HKBUSNJ2Q5,B0142KDXRU,700904394,STEVE'S PALEOGOODS > Paleo & Gluten Free > Sau...,Grocery,5,0.0,0.0,N,N,Tasty!,Excellent sauce for grilled chicken (and an ex...,2015-08-31,5.0
803,US,35058835,R2K2VLYP2961X5,B0143M04XI,968540398,Gator Queen Liz Louisiana Bayou Seasoning Eliz...,Grocery,5,1.0,1.0,N,Y,We love the zingy flavor of this Bayou Seasoning,We love the zingy flavor of this Bayou Seasoni...,2015-08-31,5.0
13458,US,39546970,R1ON886BFBSB29,B0143M04XI,968540398,Gator Queen Liz Louisiana Bayou Seasoning Eliz...,Grocery,5,1.0,1.0,N,Y,yumm!,The smell is amazing! The taste of this wonder...,2015-08-27,5.0
