In [290]:
import gzip
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import urllib.request
import io
from nltk import tokenize
import time
import os

# Amazon Review Data

Data downloaded from http://jmcauley.ucsd.edu/data/amazon/links.html

R. He, J. McAuley. Modeling the visual evolution of fashion trends with one-class collaborative filtering. WWW, 2016


J. McAuley, C. Targett, J. Shi, A. van den Hengel. Image-based recommendations on styles and substitutes. SIGIR, 2015

## Read data into pandas dataframe

In [291]:
# Input selection: exactly one `path` line should be active at a time.
# The alternatives are kept as comments so switching category is a one-line
# change; `product_group` names the output sub-directory used further down
# and must be switched together with `path`.
#path = '/home/amkabatznick/w266-final-project/data/raw/amazon/reviews_Office_Products_5.json.gz'
#path = '/home/amkabatznick/w266-final-project/data/raw/amazon/reviews_Musical_Instruments_5.json.gz'
path = '/home/amkabatznick/w266-final-project/data/raw/amazon/reviews_Sports_and_Outdoors_5.json.gz'
#product_group = 'Office_Products'
product_group = 'Sports_Outdoors'


### Parsing function

In [292]:
def parse(path):
    """Lazily yield one record per non-blank line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the gzip file; each line holds one Python/JSON-style
        literal (for the review dumps, a dict).

    Yields
    ------
    object
        The value obtained by evaluating a line.

    Notes
    -----
    The file is opened in a ``with`` block so the handle is released when the
    generator is exhausted or closed, instead of leaking until garbage
    collection.
    """
    with gzip.open(path, 'rb') as review_file:
        for line in review_file:
            # A blank line (e.g. trailing newline) is not a record; eval would
            # raise SyntaxError on it.
            if not line.strip():
                continue
            # SECURITY: eval() executes whatever the line contains. Only use
            # this on files from a trusted source. It is kept because some
            # dumps use Python-literal syntax (single quotes) that json.loads
            # rejects.
            yield eval(line)

### Read into dataframe

In [293]:
def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, which becomes
    the row index. The ``reviewTime`` column is exposed as ``reviewDate``.
    """
    # Number the records as they stream out of the parser.
    records_by_row = dict(enumerate(parse(path)))
    reviews = pd.DataFrame.from_dict(records_by_row, orient='index')
    return reviews.rename(columns={'reviewTime': 'reviewDate'})

#### Read Product Reviews into dataframe

In [294]:
# Load every review from the selected file into one DataFrame. This name is
# read as a module-level global by parse_all_products() further down.
df_product_group = getDF(path)
# Preview the first rows only; the full frame is too large to display.
df_product_group.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewDate
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5.0,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5.0,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4.0,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4.0,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4.0,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


### Create helper functions to find all features

In [295]:
def find_nouns(sentences):
    """Pick the first common noun of each sentence as that sentence's feature.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to scan, e.g. the output of ``sent_tokenize``.

    Returns
    -------
    pandas.DataFrame
        Columns ``features`` (the noun) and ``sentences`` (the sentence it
        came from), one row per sentence that contains a noun, indexed
        0..n-1. Sentences without any noun contribute no row.
    """
    # Only common nouns count as features; proper nouns (NNP/NNPS) are
    # deliberately left out.
    noun_tags = ('NN', 'NNS')

    rows = []
    for sentence in sentences:
        # Tag each token of the sentence with its part of speech.
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        first_noun = next((word for word, tag in tagged if tag in noun_tags), None)
        if first_noun is not None:
            rows.append((first_noun, sentence))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row-by-row is quadratic.
    return pd.DataFrame(rows, columns=['features', 'sentences'])

In [296]:
def find_all_product_features(product_dataframe):
    """Extract the features of every review and consolidate them in one frame.

    Parameters
    ----------
    product_dataframe : pandas.DataFrame
        Reviews to process. The columns ``reviewText``, ``asin`` and
        ``helpful`` are read; ``helpful`` must be indexable because its first
        element is used as the helpfulness score (presumably the helpful-vote
        count - confirm against the dataset description).

    Returns
    -------
    pandas.DataFrame
        Columns ``features``, ``sentences``, ``product_id`` and
        ``review_helpful``; one row per sentence in which ``find_nouns``
        found a noun, indexed 0..n-1. Empty (with those columns) when there
        is nothing to report.

    Notes
    -----
    For inputs of more than 1,000 reviews, progress and timing are printed
    every 1,000 reviews. The counter is the review's position in the frame,
    not its index label, so the report is also meaningful for a filtered
    frame whose labels are sparse.
    """
    columns = ['features', 'sentences', 'product_id', 'review_helpful']
    total_reviews = len(product_dataframe)
    if total_reviews == 0:
        return pd.DataFrame(columns=columns)

    def _hms(elapsed):
        # Render a duration in seconds as HH:MM:SS.ss.
        hours, rem = divmod(elapsed, 3600)
        minutes, seconds = divmod(rem, 60)
        return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)

    # Used to time the function
    start = time.time()
    last = start

    per_review_frames = []
    # Iterating the columns directly avoids label lookups, which break when
    # the index holds duplicate labels.
    review_rows = zip(product_dataframe['reviewText'],
                      product_dataframe['asin'],
                      product_dataframe['helpful'])
    for position, (review_text, asin, helpful) in enumerate(review_rows):
        # Prints time for larger datasets
        if total_reviews > 1000 and position % 1000 == 0:
            print('Currently Parsing Review {:,}'.format(position))
            now = time.time()
            print("The total code has been running for: " + _hms(now - start))
            print("The last 1,000 took: " + _hms(now - last))
            last = now

        # Split the review into sentences, then find each sentence's feature.
        sentences = tokenize.sent_tokenize(review_text)
        product_features = find_nouns(sentences)
        if product_features.empty:
            continue

        # Attach the product id and the helpfulness score to every row.
        product_features['product_id'] = asin
        product_features['review_helpful'] = helpful[0]
        per_review_frames.append(product_features)

    if not per_review_frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # appending inside the loop is quadratic.
    return pd.concat(per_review_frames, ignore_index=True)[columns]

In [297]:
def find_product_features_over_10(product_id, product_features):
    """Keep only the rows whose feature occurs more than 10 times.

    ``product_features`` is grouped by its ``features`` column; a group
    survives when it has strictly more than 10 non-null ``review_helpful``
    values. Surviving rows keep their original order and index.

    ``product_id`` is accepted for signature compatibility and is not used.
    """
    def _has_more_than_ten(feature_rows):
        # count() ignores missing values, so only scored rows are counted.
        return feature_rows['review_helpful'].count() > 10

    return product_features.groupby(['features']).filter(_has_more_than_ten)

In [298]:
def generate_feature_text(product_id, distinct_grouped_product_features_10, output_dir=None):
    """Write one text file per product/feature pairing.

    Parameters
    ----------
    product_id : str
        Product identifier; used as the file-name prefix.
    distinct_grouped_product_features_10 : pandas.DataFrame
        Rows with at least the columns ``features``, ``sentences`` and
        ``review_helpful``.
    output_dir : str, optional
        Directory that receives the files. Defaults to the project's
        processed-data directory for the module-level ``product_group``.

    Notes
    -----
    Each file ``<product_id>_<feature>.txt`` holds the first 100 sentences of
    the feature joined by spaces, followed by the sentences of the three rows
    with the highest ``review_helpful`` (ascending), each introduced by an
    ``@highlight`` marker. Existing files are left untouched.

    Path separators inside a feature token (e.g. "w/o") are replaced with
    ``_`` so the token cannot point into a non-existent sub-directory. Two
    features that differ only by such a character therefore share a file
    name, and the first one written wins.
    """
    if output_dir is None:
        output_dir = "/home/amkabatznick/w266-final-project/data/processed/amazon/{0:}".format(product_group)

    highlight_separator = '\n\n@highlight\n\n'

    # groupby yields each distinct (non-null) feature together with its rows,
    # in the rows' original order.
    for feature, feature_rows in distinct_grouped_product_features_10.groupby('features'):
        safe_feature = str(feature).replace('/', '_')
        if os.sep != '/':
            safe_feature = safe_feature.replace(os.sep, '_')
        text_file_path = "{0:}/{1:}_{2:}.txt".format(output_dir, product_id, safe_feature)

        # See if the file already exists. If it does continue, else write the file.
        if os.path.isfile(text_file_path):
            continue

        # The three most helpful sentences become the summary. With ties, or
        # when nothing was rated helpful, these are simply the last three
        # rows of the sort.
        top_sentences = feature_rows.sort_values('review_helpful')['sentences'].iloc[-3:]

        # Body: the first 100 sentences or fewer.
        text_of_review = ' '.join(feature_rows['sentences'].iloc[:100])
        # Summary: every highlight sentence is preceded by the marker.
        text_of_review += ''.join(highlight_separator + sentence for sentence in top_sentences)

        os.makedirs(output_dir, exist_ok=True)
        # Explicit encoding: review text is not guaranteed to be ASCII and the
        # platform default encoding may not be able to represent it.
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(text_of_review)

In [305]:
def parse_all_products():
    '''Does all product/feature parsing for all reviews and writes these to file.

    Reads the module-level ``df_product_group`` (all reviews) and
    ``product_group`` (output sub-directory name). For each distinct ``asin``:

    * if the product's CSV already exists it is loaded and only the text
      files are (re)generated;
    * otherwise the product's features are extracted, filtered to features
      occurring more than 10 times, written to CSV and turned into text files.

    Progress and timing are printed roughly once per percent of the products.
    Returns nothing.
    '''
    #Used to Time the function
    start = time.time()
    last = start

    def _hms(elapsed):
        # Render a duration in seconds as HH:MM:SS.ss.
        hours, rem = divmod(elapsed, 3600)
        minutes, seconds = divmod(rem, 60)
        return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)

    #Find all distinct products
    products = df_product_group.asin.unique()
    total_products = len(products)

    #Find how many products equate to 1 percent of the data. With fewer than
    #50 products the rounded value is 0, which would make the modulo below
    #raise ZeroDivisionError, so report at least every product.
    one_percent = max(1, round(total_products / 100))

    for i, product in enumerate(products):
        #Used to track what percentage of the data we are through and show how long things are taking
        if i % one_percent == 0:
            #Computed from the true total so it stays correct when one_percent was clamped
            print('Processed {:.1f}% of the Data'.format(100 * i / total_products))
            end = time.time()
            print("The total code has been running for: " + _hms(end - start))
            print("The last 1 percent took: " + _hms(end - last))
            last = end

        #Check if a given product dataset already exists
        csv_path = '/home/amkabatznick/w266-final-project/data/processed/amazon/{}/{}_reviews_data.csv'.format(product_group,product)
        if os.path.isfile(csv_path):
            #If it exists then see if we need to generate the text files
            all_product_features = pd.read_csv(csv_path)
            generate_feature_text(product, all_product_features)
            continue

        #If the file does not exist for the product then generate the product/feature file
        df_product = df_product_group[df_product_group.asin == product]
        if len(df_product) > 1000:
            print('This product has {} reviews'.format(len(df_product)))

        product_features = find_all_product_features(df_product)
        if product_features.empty:
            continue

        #If product features exist, see if any feature occurs more than 10 times
        all_product_features = find_product_features_over_10(product, product_features)
        if all_product_features.empty:
            continue

        #Write the surviving product/feature rows to CSV and generate the text files
        all_product_features.to_csv(csv_path)
        generate_feature_text(product, all_product_features)

In [307]:
# Run the full pipeline over every product in df_product_group. This is the
# expensive cell: the captured output below shows more than 1h 24m elapsed at
# 81% of the products.
parse_all_products()

Processed 0.0% of the Data
The total code has been running for: 00:00:00.03
The last 1 perecent took: 00:00:00.03
Processed 1.0% of the Data
The total code has been running for: 00:01:00.43
The last 1 perecent took: 00:01:00.39
Processed 2.0% of the Data
The total code has been running for: 00:02:02.69
The last 1 perecent took: 00:01:02.26
Processed 3.0% of the Data
The total code has been running for: 00:03:03.63
The last 1 perecent took: 00:01:00.94
Processed 4.0% of the Data
The total code has been running for: 00:04:06.19
The last 1 perecent took: 00:01:02.56
Processed 5.0% of the Data
The total code has been running for: 00:05:05.89
The last 1 perecent took: 00:00:59.71
Processed 6.0% of the Data
The total code has been running for: 00:06:01.43
The last 1 perecent took: 00:00:55.54
Processed 7.0% of the Data
The total code has been running for: 00:07:12.01
The last 1 perecent took: 00:01:10.57
Processed 8.0% of the Data
The total code has been running for: 00:08:12.86
The last 1 p

Processed 73.0% of the Data
The total code has been running for: 01:16:47.25
The last 1 perecent took: 00:00:56.92
Processed 74.0% of the Data
The total code has been running for: 01:18:00.53
The last 1 perecent took: 00:01:13.28
Processed 75.0% of the Data
The total code has been running for: 01:18:51.08
The last 1 perecent took: 00:00:50.55
Processed 76.0% of the Data
The total code has been running for: 01:19:46.95
The last 1 perecent took: 00:00:55.87
Processed 77.0% of the Data
The total code has been running for: 01:20:34.08
The last 1 perecent took: 00:00:47.13
Processed 78.0% of the Data
The total code has been running for: 01:21:27.64
The last 1 perecent took: 00:00:53.56
Processed 79.0% of the Data
The total code has been running for: 01:22:31.31
The last 1 perecent took: 00:01:03.67
Processed 80.0% of the Data
The total code has been running for: 01:23:17.92
The last 1 perecent took: 00:00:46.61
Processed 81.0% of the Data
The total code has been running for: 01:24:08.27
The

In [300]:
#try:
#    all_features = pd.read_csv('/home/amkabatznick/w266-final-project/data/processed/amazon/{}/reviews_data.csv'.format(product))
#except:
#    all_features = find_all_features()

In [301]:
#grouped_features_10 = all_features.groupby(['features','product_id'])
#distinct_grouped_features_10 = grouped_features_10.filter(lambda x: x.count()['review_helpful'] > 10)

#distinct_grouped_features_10.to_csv('/home/amkabatznick/w266-final-project/data/processed/amazon/{}/reviews_data.csv'.format(product))


#distinct_grouped_features_10.groupby(['features','product_id'])['review_helpful'].count()

In [302]:
#distinct_grouped_features_10.groupby(['features','product_id'])['review_helpful'].count()

In [303]:
#distinct_features = distinct_grouped_features_10.groupby(['features','product_id']
#                                                        )['review_helpful'].size().reset_index()[['features',
#                                                                                                  'product_id']]

#for i in distinct_features.index:
    #features = distinct_features['features'].loc[i]
    
    #product_id = distinct_features['product_id'].loc[i]
    
    #all_sentences = distinct_grouped_features_10[(distinct_grouped_features_10.product_id==product_id) & 
    #                    (distinct_grouped_features_10.features==features)]

    #MaxSentences = ['']
    #for i in range(3):
        #current_max = all_sentences.review_helpful.astype(float).idxmax(axis=0)
        #MaxSentences.append(all_sentences.loc[current_max]['sentences'])
        #all_sentences.drop(current_max,inplace=True)

    #MaxSentences = all_sentences.sort_values('review_helpful')[-3:]['sentences'].values
    
    #text_of_review = ' '.join(all_sentences.sentences[:100].values)
    #text_of_review = text_of_review+'\n\n@highlight\n\n'.join(np.insert(MaxSentences,0,''))
    #with open("/home/amkabatznick/w266-final-project/data/processed/amazon/{0:}/{1:}_{2:}.txt".format(product,product_id,features), "w") as text_file:
    #    text_file.write(text_of_review)