In [1]:
import ast
import gzip
import io
import json
import os
import urllib.request

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import tokenize

# Amazon Review Data

Data downloaded from http://jmcauley.ucsd.edu/data/amazon/links.html (this notebook uses the `reviews_Office_Products_5.json.gz` file).

R. He, J. McAuley. Modeling the visual evolution of fashion trends with one-class collaborative filtering. WWW, 2016


J. McAuley, C. Targett, J. Shi, A. van den Hengel. Image-based recommendations on styles and substitutes. SIGIR, 2015

## Read data into pandas dataframe

In [2]:
# Location of the gzipped reviews file. Set the AMAZON_REVIEWS_PATH environment
# variable to point at your own copy instead of editing this cell; when it is
# not set, the original local path below is used unchanged.
path = os.environ.get(
    'AMAZON_REVIEWS_PATH',
    r'/Users/annielane/Documents/MIDS/W266/w266-final-project/data/raw/amazon/reviews_Office_Products_5.json.gz',
)

### Parsing function

In [3]:
def parse(path):
    """Yield the decoded value of each non-blank line of a gzip file.

    Every line is tried as strict JSON first; a line that is not valid JSON
    is then read as a Python literal with ``ast.literal_eval``. Neither
    decoder executes code, unlike the ``eval`` call this replaces, so a
    crafted line in the data file cannot run arbitrary Python.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the gzip-compressed file, one record per line.

    Yields
    ------
    object
        The decoded value of each line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a line is neither valid JSON nor a valid Python literal.
    """
    # The context manager closes the handle once the generator is exhausted,
    # closed, or fails part-way through.
    with gzip.open(path, 'rb') as handle:
        for raw_line in handle:
            text = raw_line.decode('utf-8').strip()
            if not text:
                continue  # tolerate blank / trailing empty lines
            try:
                record = json.loads(text)
            except ValueError:
                # Not strict JSON (e.g. single-quoted Python dict syntax).
                record = ast.literal_eval(text)
            yield record

### Build a DataFrame from the parsed records

In [4]:
def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Rows are labelled 0, 1, 2, ... in the order the records are yielded, and
    the ``reviewTime`` column is renamed to ``reviewDate``.
    """
    # Key each record by its position so orient='index' turns it into a row.
    records_by_row = dict(enumerate(parse(path)))
    reviews = pd.DataFrame.from_dict(records_by_row, orient='index')
    return reviews.rename(columns={'reviewTime': 'reviewDate'})

### Load the reviews into a DataFrame

In [5]:
# Load the whole reviews file into memory, then preview the first five rows
df = getDF(path)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewDate
0,A32T2H8150OJLU,B00000JBLH,ARH,"[3, 4]","I bought my first HP12C in about 1984 or so, a...",5.0,"A solid performer, and long time friend",1094169600,"09 3, 2004"
1,A3MAFS04ZABRGO,B00000JBLH,"Let it Be ""Alan""","[7, 9]",WHY THIS BELATED REVIEW? I feel very obliged t...,5.0,"Price of GOLD is up, so don't bury the golden ...",1197676800,"12 15, 2007"
2,A1F1A0QQP2XVH5,B00000JBLH,Mark B,"[3, 3]",I have an HP 48GX that has been kicking for mo...,2.0,"Good functionality, but not durable like old HPs",1293840000,"01 1, 2011"
3,A49R5DBXXQDE5,B00000JBLH,R. D Johnson,"[7, 8]",I've started doing more finance stuff recently...,5.0,One of the last of an almost extinct species,1145404800,"04 19, 2006"
4,A2XRMQA6PJ5ZJ8,B00000JBLH,Roger J. Buffington,"[0, 0]",For simple calculations and discounted cash fl...,5.0,Still the best,1375574400,"08 4, 2013"


In [6]:
# Array of the distinct product IDs (asin values) present in the reviews
products = df['asin'].unique()

In [7]:
# Number of distinct products (unique asin values)
len(products)

2420

In [8]:
# Rows (reviews) per asin, sorted from most to least frequent
df['asin'].value_counts()

B0010T3QT2    311
B0039N7ELS    227
B0027CTFBO    205
B0039N3QFQ    186
B002K9M6OW    185
B000MFHX3U    185
B0039N3QO2    182
B002K9GOPE    182
B002K9IHJK    180
B003FHBPRM    178
B002K9PIKG    176
B000WU4H5C    176
B002K9XU0Q    175
B002K9M7MS    173
B0035FX5MC    173
B000XSPEPA    167
B002ECFIDG    163
B008DF54N2    156
B0009F3P3U    153
B001PV36V8    151
B003FHF98S    146
B008I213WQ    144
B0026ICM1E    142
B0000538AC    141
B00G411O8G    131
B002M7VZXA    127
B002NU5ND4    123
B004YGBIVQ    122
B004ZKXC2O    122
B004O49F7M    121
             ... 
B001F0RI1I      5
B00D4OUFF4      5
B00450N5BM      5
B000KJRRBQ      5
B0006IEV60      5
B00004Z47L      5
B001E641P4      5
B0041OWIRO      5
B001S2PKP4      5
B001A4X296      5
B0036D5XGO      5
B004JQPEBQ      5
B001S2PKFY      5
B003VNIL90      5
B000V7N810      5
B002BA5WK0      5
B004H4EBHI      5
B004XLBL96      5
B001B09BX0      5
B0049FVB2I      5
B0006OKKN2      5
B0000C9ZJY      5
B00135HSI8      5
B0006HVU8S      5
B006LTTHNA

### Look at particular product example

In [9]:
# Five Star Flex Hybrid NoteBinder = B003O3F2R6
# Select by exact equality: str.contains() performs a regex/substring search,
# so it would also match any asin that merely embeds this ID, and it produces
# NaN (not a usable boolean mask) for missing asin values.
df_example = df.loc[df['asin'] == 'B003O3F2R6']
df_example.shape

(41, 9)

In [10]:
# Summary statistics of the 'overall' column for this product
# (presumably the review's star rating — confirm against the dataset docs)
df_example['overall'].describe()

count    41.000000
mean      4.365854
std       0.766684
min       2.000000
25%       4.000000
50%       5.000000
75%       5.000000
max       5.000000
Name: overall, dtype: float64

In [11]:
# Stats on length of reviews, measured in whitespace-separated words.
# .str.len() on the split lists counts the tokens per review and, unlike
# apply(len), yields NaN instead of raising when a review text is missing.
df_example['reviewText'].str.split().str.len().describe()

count      41.000000
mean      248.707317
std       249.345167
min        43.000000
25%        96.000000
50%       167.000000
75%       269.000000
max      1401.000000
Name: reviewText, dtype: float64

In [13]:
# Preview the first five rows of the single-product subset
df_example.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewDate
29549,A4QXTN3K865NR,B003O3F2R6,"aar! ""aar!""","[1, 1]","The manufacturer, Mead describes the one inch ...",5.0,Best Notebook/Binder Ever,1401235200,"05 28, 2014"
29550,AD0WUBKBO21KK,B003O3F2R6,Amazon Customer,"[1, 1]","I like the nylon spine on this binder, which e...",2.0,Seemed great until bits started breaking off....,1401580800,"06 1, 2014"
29551,A3T4OHEAHOJ700,B003O3F2R6,B. Colonna,"[1, 1]",It is hard to get excited over a binder or not...,4.0,"Half A Notebook, Half A Binder = A Fun, But A ...",1403827200,"06 27, 2014"
29552,A2FLJCJ3SW0TUY,B003O3F2R6,"Brie ""Brie""","[1, 1]","I'm reviewing the blue hybrid notebinder, whic...",4.0,"Love the design, but costly.",1403308800,"06 21, 2014"
29553,AYNAH993VDECT,B003O3F2R6,bsg2004,"[1, 1]",This is a smart idea for a hybrid notepad-bind...,5.0,Versatile and Practical with one flaw (that ma...,1401494400,"05 31, 2014"


In [12]:
# Full text of the first review in the subset (iloc is positional, so this is
# the first row regardless of its index label)
df_example['reviewText'].iloc[0]

'The manufacturer, Mead describes the one inch binder as a &#34;Flex Hybrid NoteBinder is the next generation in portable organization. This unique innovation acts like a notebook and works like a binder, offering both flexible capacity and carrying convenience. Outfitted with the patented TechLock fixture, its rings open and close easily for quick customization&#34;. No doubt this is the neatest &#34;Hybrid&#34; Binder ever.  First the easy to open binder Techlock rings are easy to open, are plastic and have not misaligned or broken.  What we like about this notebook is the fact it works like a binder, folds flat as it does not have a stiff board, but a soft nylon edge.  It comes with 80 sheets of both ruled and graph type (quad ruled) paper, 5 plastic tabbed, see through dividers that hold documents without having to punch holes in.  In addition, each divider has a 4&#34; X 7&#34; flapped card pocket.  Overall this is the slickest Notebook/Binder we come across - highly recommended w