### Useful links:
- [Dataset documentation](https://nijianmo.github.io/amazon/index.html)
- [Complete Metadata files](http://deepyeti.ucsd.edu/jianmo/amazon/index.html)
- [Pandas reference sheet](https://ds100.org/sp21/resources/assets/exams/sp20/sp20_checkpoint_reference_sheet.pdf)
- [Data-200 Google Doc](https://docs.google.com/document/d/19HWODy5kpWoUB7BEKEmKLbRnK8MC1fBmRat_WP7vfNc/edit)
- [Grad Project Guidelines](https://ds100.org/sp21/grad_proj/gradproject/)
- [Git repo](https://github.com/alexander-zw/data200-proj)

In [27]:
import os
import numpy as np
import pandas as pd
import json
import gzip
import urllib.request
from urllib.request import urlopen
import seaborn as sns

In [28]:
url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty_5.json.gz"
filename = 'All_Beauty_5.json.gz'
if not os.path.exists(filename):
    urllib.request.urlretrieve(url,filename)


In [29]:
### load the data

data = []
with gzip.open(filename) as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of reviews
print(len(data))

# first row of the list
print(data[0])

5269
{'overall': 5.0, 'verified': True, 'reviewTime': '09 1, 2016', 'reviewerID': 'A3CIUOJXQ5VDQ2', 'asin': 'B0000530HU', 'style': {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice Blue'}, 'reviewerName': 'Shelly F', 'reviewText': 'As advertised. Reasonably priced', 'summary': 'Five Stars', 'unixReviewTime': 1472688000}


#### Convert to dataframe:

In [30]:
reviews = pd.DataFrame.from_dict(data)
reviews.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


In [31]:
# Check score-wise values
reviews[(reviews['overall'] == 5)]

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3,
5,5.0,False,"05 16, 2010",A24HQ2N7332W7W,B00006L9LC,{'Size:': ' 366'},Kindle Customer Joyce Wilson,"If you know the scent of Diva, you'll LOVE thi...",Diva is Heavenly,1273968000,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5264,5.0,True,"08 23, 2018",AUX122XW8ONG6,B01DLR9IDI,{'Design:': ' ETA-C6B5F7C374'},Amzon Customer,I have genetic undereye darkness. Ive accepted...,Pretty Sweet!!!,1534982400,2,
5265,5.0,True,"08 23, 2018",AUX122XW8ONG6,B01DLR9IDI,{'Design:': ' ETA-7117EE3788'},Amzon Customer,I absolutely love this eye gel.,As advertised,1534982400,,
5266,5.0,True,"08 23, 2018",AUX122XW8ONG6,B01DLR9IDI,{'Design:': ' ETA-BF66BD2F87'},Amzon Customer,The eye gel is easy to apply and I use it morn...,I'm very happy with,1534982400,,
5267,5.0,True,"08 23, 2018",AUX122XW8ONG6,B01DLR9IDI,{'Design:': ' ETA-03868FCB34'},Amzon Customer,Ok this eye gel is good stuff.,Does it's thing!,1534982400,,


#### Column labels:
- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- vote - helpful votes of the review
- style - a disctionary of the product metadata, e.g., "Format" is "Hardcover"
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)
- image - images that users post after they have received the product

#### Checking 5-core:
A 5-core dataset contains only those users with at least 5 reviews.

In [32]:
reviews.groupby(by="reviewerID").size().sort_values()

reviewerID
A24HQ2N7332W7W     4
A105A034ZG9EHO     5
A3COAV45SLM4LY     5
A3CPIVUW77AK6K     5
A3CZ890UHC8HHZ     5
                  ..
A11QGZ39A7ZF0X    10
AUX122XW8ONG6     11
A3NFZN1GS1RKR9    11
A1UQBFCERIP7VJ    12
AKJHHD5VEH7VG     12
Length: 991, dtype: int64

#### Import metadata:

In [33]:
url = "http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/meta_All_Beauty.json.gz"
filename = 'Meta_All_Beauty_5.json.gz'


if not os.path.exists(filename):
    urllib.request.urlretrieve(url,filename)

In [34]:
### load the data

data = []
with gzip.open(filename) as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print(len(data))

# first row of the list
print(data[0])

32892
{'category': [], 'tech1': '', 'description': ["Loud 'N Clear Personal Sound Amplifier allows you to turn up the volume on what people around you are saying, listen at the level you want without disturbing others, hear a pin drop from across the room."], 'fit': '', 'title': "Loud 'N Clear&trade; Personal Sound Amplifier", 'also_buy': [], 'image': [], 'tech2': '', 'brand': 'idea village', 'feature': [], 'rank': '2,938,573 in Beauty & Personal Care (', 'also_view': [], 'details': {'ASIN: ': '6546546450'}, 'main_cat': 'All Beauty', 'similar_item': '', 'date': '', 'price': '', 'asin': '6546546450'}


#### Convert to dataframe:

In [35]:
metadata = pd.DataFrame.from_dict(data)
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,image,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",[],,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],[https://images-na.ssl-images-amazon.com/image...,,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],[https://images-na.ssl-images-amazon.com/image...,,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790


In [36]:
metadata['asin'].value_counts().sort_values()

B01EIPJYRC    1
B018SMKROU    1
B001NZRCIY    1
B015V7MDXI    1
B004PC43RU    1
             ..
B0002F1HJ6    2
B0001BOPZO    2
B00027D5H6    2
B0000532ZN    2
B00028NL6U    2
Name: asin, Length: 32488, dtype: int64

#### Merging the reviews and metadata on `asin`:

In [37]:
df = reviews.merge(metadata,how="left",left_on = "asin",right_on = "asin")

In [38]:
df.shape

(5767, 29)

In [39]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,...,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,...,,Aqua Velva,[],"65,003 in Beauty & Personal Care (","[B01I9TIY1U, B07L1PZCS7, B01N12C89Y, B01I9TINT...",{'  Product Dimensions: ': '3 x 4 x 5 ...,All Beauty,,,
1,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,...,,Aqua Velva,[],"65,003 in Beauty & Personal Care (","[B01I9TIY1U, B07L1PZCS7, B01N12C89Y, B01I9TINT...",{'  Product Dimensions: ': '3 x 4 x 5 ...,All Beauty,,,
2,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,...,,Aqua Velva,[],"65,003 in Beauty & Personal Care (","[B01I9TIY1U, B07L1PZCS7, B01N12C89Y, B01I9TINT...",{'  Product Dimensions: ': '3 x 4 x 5 ...,All Beauty,,,
3,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,...,,Aqua Velva,[],"65,003 in Beauty & Personal Care (","[B01I9TIY1U, B07L1PZCS7, B01N12C89Y, B01I9TINT...",{'  Product Dimensions: ': '3 x 4 x 5 ...,All Beauty,,,
4,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,...,,Aqua Velva,[],"65,003 in Beauty & Personal Care (","[B01I9TIY1U, B07L1PZCS7, B01N12C89Y, B01I9TINT...",{'  Product Dimensions: ': '3 x 4 x 5 ...,All Beauty,,,


#### Column labels:
- asin - ID of the product, e.g. 0000031852
- title - name of the product
- feature - bullet-point format features of the product
- description - description of the product
- price - price in US dollars (at time of crawl)
- image - url of the product image
- related - related products (also bought, also viewed, bought together, buy after viewing)
- salesRank - sales rank information
- brand - brand name
- categories - list of categories the product belongs to
- tech1 - the first technical detail table of the product
- tech2 - the second technical detail table of the product
- similar - similar product table

#### We can clean the data a little: 
- Change `overall` column name to `rating`
- `asin` to `productid`
- Extract `gift_amount` from `style`
- Extract `rank#` from `rank`
- Rempve rows with no information on the price either from `price` column or from `gift_amount` as both should be the same
- `price` missing? Change dataset?

In [40]:
list_of_top_25 = df.brand.value_counts().to_frame().iloc[:25].reset_index()['index']

In [41]:
top_25_brands = df[df.brand.isin(list_of_top_25)]

In [42]:
top_25_brands.groupby(['brand','verified']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,overall,unixReviewTime
brand,verified,Unnamed: 2_level_1,Unnamed: 3_level_1
,False,4.25,1447404000.0
,True,4.666667,1475206000.0
AXE PW,False,3.0,1215965000.0
AXE PW,True,4.0,1423786000.0
Aqua Velva,False,5.0,1304381000.0
Aqua Velva,True,3.666667,1411286000.0
Avalon,False,4.962963,1432978000.0
Avalon,True,4.740319,1451692000.0
Bath & Body Works,False,4.969388,1424803000.0
Bath & Body Works,True,4.831039,1440729000.0


In [50]:
df.groupby(['verified']).mean()['overall'].to_frame()

Unnamed: 0_level_0,overall
verified,Unnamed: 1_level_1
False,4.607656
True,4.80665


In [51]:
# avg rating by whether product is verified

sns.barplot(data = df.groupby(['verified']).mean()['overall'].to_frame(), y = 'overall', x = 'verified')

ValueError: Could not interpret input 'verified'

In [44]:
def words_in_texts(words, texts):
    '''
    Args:
        words (list): words to find
        texts (Series): strings to search in
    
    Returns:
        NumPy array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words.
    '''
    import numpy as np
    indicator_array = 1 * np.array([texts.str.contains(word) for word in words]).T
    return indicator_array

good_words = ['good', 'great', 'incredible', 'wonderful', 'amazing']
bad_words = ['bad','broke','not']
other_words = ['surprise']
good = words_in_texts(good_words, df['reviewText'])
bad = words_in_texts(bad_words, df['reviewText'])
other = words_in_texts(other_words, df['reviewText'])