In [8]:
import gzip
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import urllib.request
import io
from nltk import tokenize
import time

# Amazon Review Data

Data downloaded from http://jmcauley.ucsd.edu/data/amazon/links.html

R. He, J. McAuley. Modeling the visual evolution of fashion trends with one-class collaborative filtering. WWW, 2016


J. McAuley, C. Targett, J. Shi, A. van den Hengel. Image-based recommendations on styles and substitutes. SIGIR, 2015

## Read data into pandas dataframe

In [132]:
#path = '/home/amkabatznick/w266-final-project/data/raw/amazon/reviews_Office_Products_5.json.gz'
path = '/home/amkabatznick/w266-final-project/data/raw/amazon/reviews_Musical_Instruments_5.json.gz'
product = 'Musical_Instruments'


### Parsing function

In [10]:
def parse(path): 
    g = gzip.open(path, 'rb') 
    for l in g: 
        yield eval(l) 

### Read into dataframe

In [11]:
def getDF(path): 
    i = 0 
    df = {} 
    for d in parse(path): 
        df[i] = d 
        i += 1
    return pd.DataFrame.from_dict(df, orient='index').rename(columns={'reviewTime': 'reviewDate'})

#### Read Musical Instruments reviews into dataframe

In [12]:
df_product = getDF(path)
df_product.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewDate
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [13]:
products = df_product.asin.unique()

### Create helper functions to find all features

In [14]:
def find_all_features():
    #Used to Time the function
    start = time.time()
    last = start
    #What the output array will look like
    columns=['features','sentences','product_id','review_helpful']
    all_features = pd.DataFrame(columns=columns)
    #loop through all reviews
    for i in df_product.index:
        if i % 1000==0:
            print('Currently Parsing Review {:,}'.format(i))
            end = time.time()
            hours, rem = divmod(end-start, 3600)
            minutes, seconds = divmod(rem, 60)
            print("The total code has been running for: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
            
            hours, rem = divmod(end-last, 3600)
            minutes, seconds = divmod(rem, 60)
            print("The last 1,000 took: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
            last = end
             
        sentence = tokenize.sent_tokenize(df_product.reviewText[i])
        product_features = find_nouns(sentence)
        product_features['product_id'] = df_product.asin[i]
        product_features['review_helpful'] = df_product.helpful[i][0]
        all_features = all_features.append(product_features,ignore_index=True)
    
    return all_features

In [15]:
def find_nouns(sentences):
    features = pd.DataFrame(columns=['features','sentences'])
    #nouns = ['NN','NNS','NNP','	NNPS']
    nouns = ['NN','NNS']
    for i in range(len(sentences)):
        tags = nltk.pos_tag(nltk.word_tokenize(sentences[i]))
        for word in tags:
            if word[1] in nouns:
                features = features.append(pd.DataFrame(data=[[word[0],sentences[i]]],columns=['features','sentences']),ignore_index=True)
                break
    return features

### Find all Features

In [16]:
all_features = find_all_features()

Currently Parsing Review 0
The total code has been running for: 00:00:00.01
The last 1,000 took: 00:00:00.01
Currently Parsing Review 1,000
The total code has been running for: 00:00:16.84
The last 1,000 took: 00:00:16.83
Currently Parsing Review 2,000
The total code has been running for: 00:00:33.97
The last 1,000 took: 00:00:17.14
Currently Parsing Review 3,000
The total code has been running for: 00:00:52.70
The last 1,000 took: 00:00:18.72
Currently Parsing Review 4,000
The total code has been running for: 00:01:12.47
The last 1,000 took: 00:00:19.78
Currently Parsing Review 5,000
The total code has been running for: 00:01:37.41
The last 1,000 took: 00:00:24.94
Currently Parsing Review 6,000
The total code has been running for: 00:02:06.74
The last 1,000 took: 00:00:29.33
Currently Parsing Review 7,000
The total code has been running for: 00:02:38.31
The last 1,000 took: 00:00:31.57
Currently Parsing Review 8,000
The total code has been running for: 00:03:15.04
The last 1,000 took:

### Find all features that have a number of reviews great than 5

In [17]:
#grouped_features = all_features.groupby(['features','product_id'])
#distinct_grouped_features = grouped_features.filter(lambda x: x.count()['review_helpful'] > 5)

In [18]:
#np.mean(distinct_grouped_features.groupby(['features','product_id'])['review_helpful'].count())

10.43286219081272

In [19]:
#distinct_grouped_features.groupby(['features','product_id'])['review_helpful'].max()

features   product_id
accurate   B003VWKPHC      0
adapter    B0002GZM00      0
adapters   B000RNB720      1
amp        B0002CZV82    142
           B0002D0096      8
           B000B6DHAS     38
           B000B6DHB2    130
           B000MVYOZY     13
           B000VOBU9U      2
           B001J1JRN0     15
           B001L8IJ0I     27
           B0027M30Z6     32
           B002GHBZ4U      6
           B002GYWBIM      3
           B002GYWBJ6     17
           B003D3OCD2      9
           B003S3S0DU     22
           B0042EZH6W    190
           B004ISK48S     74
           B006Y2FUW6     17
           B008BTTQI4     27
anchors    B0002GLCRC      0
bass       B0002OS9FC      6
           B000KIPUQG      8
batteries  B003VWJ2K8      1
battery    B003VWJ2K8     36
           B005FKF1PY     20
bench      B000GUR8V8      0
board      B001G3TJXO      5
           B001RNOHHG      3
                        ... 
tuner      B004XNK7AI      2
           B004Z17008    290
           B005FKF1PY

In [22]:
grouped_features_10 = all_features.groupby(['features','product_id'])
distinct_grouped_features_10 = grouped_features_10.filter(lambda x: x.count()['review_helpful'] > 10)
distinct_grouped_features_10.to_csv('/home/amkabatznick/w266-final-project/data/processed/amazon/{}/reviews_data.csv'.format(product))
#distinct_grouped_features_10.groupby(['features','product_id'])['review_helpful'].count()

In [23]:
distinct_grouped_features_10.groupby(['features','product_id'])['review_helpful'].count()

features  product_id
amp       B000B6DHB2    15
          B001L8IJ0I    11
          B002GYWBIM    15
          B003D3OCD2    16
          B0042EZH6W    37
          B004ISK48S    13
          B006Y2FUW6    14
cable     B000068NW5    19
          B0002CZZW4    13
          B0002GMGYA    24
          B000PO30QM    28
          B000VJJQUU    11
cables    B0002GMGYA    12
capo      B0002CZSJY    15
          B0002E2KPC    17
          B0007GGUGA    15
          B000B6FBA2    13
          B000Y7Q2C4    11
          B0026RB0G8    11
          B008BPI2HE    25
          B00GTSM8FW    23
capos     B0002E2KPC    11
case      B0002D01K4    17
          B0002FO9QY    17
clip      B003VWKPHC    12
cloth     B0002GYW4C    15
cutter    B0002E1G5C    12
delay     B0049Z4UQ2    11
display   B003VWJ2K8    15
          B003VWKPHC    15
                        ..
strings   B0002PBS6S    12
          B000A2HOB6    14
          B000A6ASSS    20
          B003B01QL8    14
          B003NJF1G8    14
       

In [131]:
distinct_features = distinct_grouped_features_10.groupby(['features','product_id']
                                                        )['review_helpful'].size().reset_index()[['features',
                                                                                                  'product_id']]

for i in distinct_features.index:
    features = distinct_features['features'].loc[i]
    product_id = distinct_features['product_id'].loc[i]
    all_sentences = distinct_grouped_features_10[(distinct_grouped_features_10.product_id==product_id) & 
                        (distinct_grouped_features_10.features==features)]

    MaxSentences = ['']
    for i in range(3):
        current_max = all_sentences.review_helpful.astype(float).idxmax(axis=0)
        MaxSentences.append(all_sentences.loc[current_max]['sentences'])
        all_sentences.drop(current_max,inplace=True)

    text_of_review = ' '.join(all_sentences.sentences[:100].values)
    text_of_review = text_of_review+'\n\n@highlight\n\n'.join(MaxSentences)
    with open("/home/amkabatznick/w266-final-project/data/processed/amazon/{0:}/{1:}_{2:}.txt".format(product,product_id,features), "w") as text_file:
        text_file.write(text_of_review)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
