In [148]:
import gzip
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import urllib.request
import io
from nltk import tokenize
import time

# Amazon Review Data

Data downloaded from http://jmcauley.ucsd.edu/data/amazon/links.html

R. He, J. McAuley. Modeling the visual evolution of fashion trends with one-class collaborative filtering. WWW, 2016


J. McAuley, C. Targett, J. Shi, A. van den Hengel. Image-based recommendations on styles and substitutes. SIGIR, 2015

## Read data into pandas dataframe

In [149]:
# Product category to process. `path` is derived from `product` so that the raw
# input file and the processed-output locations (which are built from `product`)
# can never point at different categories.
product = 'Office_Products'
#product = 'Musical_Instruments'
path = '/home/amkabatznick/w266-final-project/data/raw/amazon/reviews_{}_5.json.gz'.format(product)


### Parsing function

In [150]:
def parse(path):
    """Yield one parsed record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python/JSON-style literal
    (typically a dict describing one review).

    Parameters
    ----------
    path : str
        Location of the ``.gz`` file to read.

    Yields
    ------
    object
        The literal parsed from each line.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain literal.
    """
    # Local import so the cell does not depend on an edit to the import cell.
    import ast

    # The context manager guarantees the gzip handle is closed, even if the
    # consumer stops iterating early or a line fails to parse.
    with gzip.open(path, 'rb') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # tolerate blank / trailing empty lines
            # SECURITY: the original used eval(), which executes arbitrary code
            # found in the data file. literal_eval only accepts literals
            # (dicts, lists, strings, numbers, ...) and rejects everything else.
            yield ast.literal_eval(raw_line.decode('utf-8'))

### Read into dataframe

In [151]:
def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are read and those
    numbers become the row index. The ``reviewTime`` column is renamed to
    ``reviewDate``.
    """
    numbered_records = dict(enumerate(parse(path)))
    frame = pd.DataFrame.from_dict(numbered_records, orient='index')
    return frame.rename(columns={'reviewTime': 'reviewDate'})

#### Read the selected product's reviews into a dataframe

In [152]:
# Load the gzipped review file at `path` into a dataframe (one row per line of the file).
df_product = getDF(path)
# Preview the first rows to check the columns that were loaded.
df_product.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewDate
0,A32T2H8150OJLU,B00000JBLH,ARH,"[3, 4]","I bought my first HP12C in about 1984 or so, a...",5.0,"A solid performer, and long time friend",1094169600,"09 3, 2004"
1,A3MAFS04ZABRGO,B00000JBLH,"Let it Be ""Alan""","[7, 9]",WHY THIS BELATED REVIEW? I feel very obliged t...,5.0,"Price of GOLD is up, so don't bury the golden ...",1197676800,"12 15, 2007"
2,A1F1A0QQP2XVH5,B00000JBLH,Mark B,"[3, 3]",I have an HP 48GX that has been kicking for mo...,2.0,"Good functionality, but not durable like old HPs",1293840000,"01 1, 2011"
3,A49R5DBXXQDE5,B00000JBLH,R. D Johnson,"[7, 8]",I've started doing more finance stuff recently...,5.0,One of the last of an almost extinct species,1145404800,"04 19, 2006"
4,A2XRMQA6PJ5ZJ8,B00000JBLH,Roger J. Buffington,"[0, 0]",For simple calculations and discounted cash fl...,5.0,Still the best,1375574400,"08 4, 2013"


In [138]:
# Distinct product identifiers (the `asin` column) that appear in the reviews.
products = df_product['asin'].unique()

### Create helper functions to find all features

In [139]:
def find_all_features(reviews=None):
    """Extract one noun "feature" per sentence from every review.

    Each review's text is split into sentences, ``find_nouns`` picks the
    feature word for each sentence, and the resulting rows are labelled with
    the review's product id and its helpful-vote count (the first element of
    the review's ``helpful`` entry).

    Parameters
    ----------
    reviews : pandas.DataFrame, optional
        Frame with ``reviewText``, ``asin`` and ``helpful`` columns. Defaults
        to the notebook-level ``df_product`` so existing no-argument calls
        keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``features``, ``sentences``, ``product_id`` and
        ``review_helpful`` with a fresh 0..n-1 index.
    """
    if reviews is None:
        reviews = df_product

    # Layout of the returned frame.
    columns = ['features', 'sentences', 'product_id', 'review_helpful']

    # Used to time the function.
    start = time.time()
    last = start

    # Per-review frames are collected and concatenated once at the end.
    # Growing a DataFrame with .append() inside the loop copies every earlier
    # row on each iteration (quadratic), and DataFrame.append no longer exists
    # in pandas >= 2.0.
    per_review = []

    review_rows = zip(reviews.index, reviews['reviewText'], reviews['asin'], reviews['helpful'])
    for i, review_text, asin, helpful in review_rows:
        if i % 1000 == 0:
            print('Currently Parsing Review {:,}'.format(i))
            end = time.time()
            hours, rem = divmod(end - start, 3600)
            minutes, seconds = divmod(rem, 60)
            print("The total code has been running for: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

            hours, rem = divmod(end - last, 3600)
            minutes, seconds = divmod(rem, 60)
            print("The last 1,000 took: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
            last = end

        sentences = tokenize.sent_tokenize(review_text)
        product_features = find_nouns(sentences)
        if product_features.empty:
            continue  # nothing extracted from this review
        product_features['product_id'] = asin
        product_features['review_helpful'] = helpful[0]
        per_review.append(product_features)

    if not per_review:
        # pd.concat refuses an empty list; return the empty layout instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(per_review, ignore_index=True)[columns]

In [140]:
def find_nouns(sentences):
    """Return the first common noun of each sentence.

    Every sentence is tokenized and part-of-speech tagged with NLTK; the first
    token tagged ``NN`` or ``NNS`` becomes that sentence's feature. Sentences
    without such a token contribute no row.

    Parameters
    ----------
    sentences : sequence of str
        Sentences to scan.

    Returns
    -------
    pandas.DataFrame
        Columns ``features`` (the noun) and ``sentences`` (the sentence it
        came from), one row per sentence that contained a noun.
    """
    columns = ['features', 'sentences']
    # Only common-noun tags are used; proper-noun tags (NNP, NNPS) are not included.
    noun_tags = {'NN', 'NNS'}

    # Rows are gathered in a plain list and the frame is built once:
    # DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
    rows = []
    for sentence in sentences:
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        # Only the first noun in the sentence is kept.
        first_noun = next((word for word, tag in tagged if tag in noun_tags), None)
        if first_noun is not None:
            rows.append((first_noun, sentence))
    return pd.DataFrame(rows, columns=columns)

### Find all Features

In [154]:
# Reuse the features CSV saved by a previous run when it exists; otherwise
# extract the features from the reviews. The location is built from `product`
# so it follows the category selected at the top of the notebook.
features_cache = '/home/amkabatznick/w266-final-project/data/processed/amazon/{}/reviews_data.csv'.format(product)
try:
    # index_col=0: the CSV is saved together with its index, so the first
    # column is that index. Reading it back as the index stops an extra
    # "Unnamed: 0" column from piling up on every save/load round trip.
    all_features = pd.read_csv(features_cache, index_col=0)
except (OSError, pd.errors.EmptyDataError):
    # Only a missing/unreadable or empty cache triggers recomputation; a bare
    # `except:` would also swallow KeyboardInterrupt and genuine bugs.
    all_features = find_all_features()

### Find all features that have more than 10 reviews

In [142]:
#grouped_features = all_features.groupby(['features','product_id'])
#distinct_grouped_features = grouped_features.filter(lambda x: x.count()['review_helpful'] > 5)

In [143]:
#np.mean(distinct_grouped_features.groupby(['features','product_id'])['review_helpful'].count())

In [144]:
#distinct_grouped_features.groupby(['features','product_id'])['review_helpful'].max()

In [155]:
# Keep only the (feature, product) groups that have more than
# `min_mentions` non-null `review_helpful` values, then save the result.
min_mentions = 10
grouped_features_10 = all_features.groupby(['features','product_id'])
distinct_grouped_features_10 = grouped_features_10.filter(
    lambda group: group['review_helpful'].count() > min_mentions
)
filtered_csv = '/home/amkabatznick/w266-final-project/data/processed/amazon/{}/reviews_data.csv'.format(product)
distinct_grouped_features_10.to_csv(filtered_csv)

In [156]:
# Number of non-null `review_helpful` values per (feature, product) pair.
(
    distinct_grouped_features_10
    .groupby(['features','product_id'])['review_helpful']
    .count()
)

features    product_id
*           B00EO58Z5O     12
***         B004H3XKR6     12
CD          B002M7W19C     17
Notes       B002XYO84U     14
            B0035FX5MC     12
            B004VJSFG2     12
            B0063C5NMY     13
Page        B004VJSG2A     14
Post-its    B003FHBPRM     15
Sharpies    B00BEYXGNY     14
WINNER      B00671E4B2     12
address     B002K9GOPE     21
            B002K9PIKG     28
            B002V1H5UY     12
adhesive    B0010T3QT2     17
            B002ECFIDG     12
            B002K9IHJK     14
            B0039N7ELS     22
            B004YGBIVQ     16
air         B00934G9RC     13
angle       B00BXCDOH0     13
anything    B0039N3QO2     12
            B008DF54N2     14
app         B00ATM1N9Q     28
apple       B004YQZPVA     11
applicator  B004O49F7M     26
area        B004412E8W     13
arm         B009NSJ8J4     17
            B00HUTA3Z6     13
arms        B009NSGO3M     11
                         ... 
wireless    B0091UBCAW     14
            B0091

In [168]:
def _story_text(group, max_sentences=100, n_highlights=3):
    """Build the text of one story file for a (feature, product) group.

    The body is the group's first `max_sentences` sentences joined by spaces.
    It is followed by one "@highlight" section per highlight, where the
    highlights are the sentences of the `n_highlights` rows with the largest
    `review_helpful` values.
    """
    body = ' '.join(group['sentences'][:max_sentences].values)
    highlights = group.sort_values('review_helpful').tail(n_highlights)['sentences'].values
    # Each highlight gets its own marker. The previous code called
    # '\n\n@highlight\n\n'.join(...) on an already-joined *string*, which put
    # the marker between every single character and none before the first
    # highlight.
    return body + ''.join('\n\n@highlight\n\n' + sentence for sentence in highlights)


# Group once and iterate over the groups instead of re-scanning the whole
# frame with two boolean masks for every (feature, product) pair.
feature_groups = distinct_grouped_features_10.groupby(['features','product_id'])
distinct_features = feature_groups.size().reset_index()[['features','product_id']]

for (features, product_id), all_sentences in feature_groups:
    text_of_review = _story_text(all_sentences)
    # A '/' inside a feature word would be read as a directory separator and
    # make open() fail, so it is replaced in the file name only.
    safe_feature = str(features).replace('/', '_')
    story_path = "/home/amkabatznick/w266-final-project/data/processed/amazon/{0:}/{1:}_{2:}.txt".format(product, product_id, safe_feature)
    # Explicit encoding so non-ASCII review text is written the same way on every platform.
    with open(story_path, "w", encoding="utf-8") as text_file:
        text_file.write(text_of_review)