In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist, BigramCollocationFinder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
import gzip
import json

### Objective:
Guide is: [article](https://towardsdatascience.com/sentiment-analysis-and-product-recommendation-on-amazons-electronics-dataset-reviews-part-1-6b340de660c2)
Set up a 'dev doc' 

Sourcing the metadata will help immensely. Which can be our next step together.

Goal is to predict the rating based on the words in a review. I used the article above to get these questions, most of which will not be able to be answered until we have the metadata too. 
* Usefulness on large volume of reviews
* Rating vs number of reviews
* Rating vs proportion of reviews
* Helpful proportion vs Number of reviews
* Rating vs helpfulness ratio
* Top 20 most reviewed products
* Bottom 20 reviewed products
* Positive and negative words
* World cloud for different ratings, brand name etc

The commands `parse()` and `getDF()` comes from where I got the data. They were provided to help with ease of access. The rest are what I came up with to process the data to be able to be used. In this case, the data is text.

In [3]:
def parse(path):
    '''
    Helper function that parses the gzip file and allows us to put it into a Pandas DataFrame
    '''
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    '''
    Main function that is formatting our data. Uses parse() as a helper function and 
    creates a DataFrame from our gzip that holds the data. 
    '''
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

def clean_text(text):
    '''
    Cycles through every row in a series and deletes any of the following symbols: ,.'?!''\n
    This helps get rid of unnecessary data that clogs up our useful information. 
    '''
    cleaned_text = []
    for line in text:
            for symbol in ",.'?!''\n":
                line = line.replace(symbol, "").lower()
            cleaned_text.append(line)
    return cleaned_text

def getContractions(series):
    '''
    Cycles through every row in a series and and returns a set of all contractions found. 
    '''
    contr = set()
    for row in series:
        for word in row.split(' '):
            if "'" in word:
                contr.add(word)
    return contr

def process_text(text):
    '''
    Cycles through every word in an article and if the word is not in my stopwords list, it is then set to lowercase
    and added to my proccessed text. 
    '''
    stopwords_list = stopwords.words('english') + list(string.punctuation) + ["''", '""', '...', '``', "'s", "n't"]
    return [w.lower() for w in word_tokenize(text) if w.lower() not in stopwords_list]

def process_data(data):
    '''
    Helper function to more easily call process_text() for those who don't regularly use map()
    '''
    return list(map(process_text, data))

First, I like taking a good look at the data. Calling both `df.head()` and `df.info()` is a good habit to get into. Although you get most of the info you need with `df.info()`, it is always important to take a look at what each data point looks like. 

In [4]:
df = getDF('Appliances.json.gz')
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,2.0,False,"11 27, 2013",A3NHUQ33CFH3VM,1118461304,{'Format:': ' Hardcover'},Greeny,Not one thing in this book seemed an obvious o...,Clear on what leads to innovation,1385510400,
1,5.0,,False,"11 1, 2013",A3SK6VNBQDNBJE,1118461304,{'Format:': ' Kindle Edition'},Leif C. Ulstrup,I have enjoyed Dr. Alan Gregerman's weekly blo...,Becoming more innovative by opening yourself t...,1383264000,
2,5.0,,False,"10 10, 2013",A3SOFHUR27FO3K,1118461304,{'Format:': ' Hardcover'},Harry Gilbert Miller III,Alan Gregerman believes that innovation comes ...,The World from Different Perspectives,1381363200,
3,5.0,,False,"10 9, 2013",A1HOG1PYCAE157,1118461304,{'Format:': ' Hardcover'},Rebecca Ripley,"Alan Gregerman is a smart, funny, entertaining...",Strangers are Your New Best Friends,1381276800,
4,5.0,10.0,False,"09 7, 2013",A26JGAM6GZMM4V,1118461304,{'Format:': ' Hardcover'},Robert Morris,"As I began to read this book, I was again remi...","How and why it is imperative to engage, learn ...",1378512000,


In [5]:
meta = getDF('meta_Appliances.json.gz')
meta.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,"[Appliances, Refrigerators, Freezers & Ice Mak...","class=""a-keyvalue prodDetTable"" role=""present...",[],,Tupperware Freezer Square Round Container Set ...,[],,Tupperware,[Each 3-pc. set includes two 7/8-cup/200 mL an...,"[>#39,745 in Appliances (See top 100)]",[],{},Appliances,,"November 19, 2008",,7301113188,[],[]
1,"[Appliances, Refrigerators, Freezers & Ice Mak...","class=""a-keyvalue prodDetTable"" role=""present...",[2 X Tupperware Pure & Fresh Unique Covered Co...,,2 X Tupperware Pure &amp; Fresh Unique Covered...,[],,Tupperware,[2 X Tupperware Pure & Fresh Unique Covered Co...,"[>#6,118 in Appliances (See top 100)]",[B004RUGHJW],{},Appliances,,"June 5, 2016",$3.62,7861850250,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
2,"[Appliances, Parts &amp; Accessories]",,[],,The Cigar - Moments of Pleasure,[],,The Cigar Book,[],"[>#1,861,816 in Home &amp; Kitchen (See Top 10...","[B01HCAVSLK, 1632206579]",{},Amazon Home,,,$150.26,8792559360,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
3,"[Appliances, Parts & Accessories]","class=""a-keyvalue prodDetTable"" role=""present...","[Multi purpost descaler, especially suited to ...",,Caraselle 2X 50G Appliance Descalene,[],,Caraselle,[],"[>#1,654,505 in Tools & Home Improvement (See ...",[],{},Tools & Home Improvement,,"December 17, 2014",.a-box-inner{background-color:#fff}#alohaBuyBo...,9792954481,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,"[Appliances, Parts & Accessories, Range Parts ...","class=""a-keyvalue prodDetTable"" role=""present...",[Full gauge and size beveled-edge; furnished w...,,EATON Wiring 39CH-SP-L Arrow Hart 1-Gang Chrom...,[],,EATON Wiring,[Returns will not be honored on this closeout ...,"[>#3,066,990 in Tools & Home Improvement (See ...",[],{},Tools & Home Improvement,,"January 16, 2007",$3.43,B00002N5EL,[],[]


In [6]:
df = pd.merge(df, meta, on='asin')
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,...,feature,rank,also_view,details,main_cat,similar_item,date,price,imageURL,imageURLHighRes
0,5.0,,True,"03 20, 2015",A3SHVDMM83IHJ4,B00002N7IL,{'Size:': ' Pack of 1'},steve crumpler,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
1,5.0,,True,"03 20, 2015",A3SHVDMM83IHJ4,B00002N7IL,{'Size:': ' Pack of 1'},steve crumpler,Just what I needed for my electric range. Matc...,Matched pigtail cord and works great.,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
2,5.0,,True,"03 26, 2018",A3TIWHNJXMSIU7,B00002N7IL,{'Size:': ' Pack of 1'},Torpex,this particular type is perfect and easily ada...,this particular type is perfect and easily ada...,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
3,5.0,,True,"03 26, 2018",A3TIWHNJXMSIU7,B00002N7IL,{'Size:': ' Pack of 1'},Torpex,this particular type is perfect and easily ada...,this particular type is perfect and easily ada...,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
4,5.0,,True,"03 20, 2018",AVP16JFIT6LPL,B00002N7IL,{'Size:': ' Pack of 1'},RAFAEL FERNANDEZ,excellent,Five Stars,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 615747 entries, 0 to 615746
Data columns (total 30 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   overall          615747 non-null  float64
 1   vote             66580 non-null   object 
 2   verified         615747 non-null  bool   
 3   reviewTime       615747 non-null  object 
 4   reviewerID       615747 non-null  object 
 5   asin             615747 non-null  object 
 6   style            144064 non-null  object 
 7   reviewerName     615732 non-null  object 
 8   reviewText       615422 non-null  object 
 9   summary          615619 non-null  object 
 10  unixReviewTime   615747 non-null  int64  
 11  image            9402 non-null    object 
 12  category         615747 non-null  object 
 13  tech1            615747 non-null  object 
 14  description      615747 non-null  object 
 15  fit              615747 non-null  object 
 16  title            615747 non-null  obje

Here I use `df.value_counts()` to better understand the information above. Doing so allowed me to find that the reason `vote` column so many nulls is because if someone has 0 votes, they put in nothing. Same with `reviewerName`, they had a few missing so I was able to fill it in with a replacement value. 

In [8]:
# results = {}
# for column in df.columns:
#     results[column] = df[column].value_counts()
# results

This next cell is my basic cleaning cell. I used the information I found previously to decide what to get rid of and how to clean it for better use. First, I filled in the null values in the columns I wanted to keep. Next, I dropped the columns I decided weren't very useful. And finally I drop the rows that have any null values left. I call `df.info()` to verify that I have a nice clean dataset ready for some processing. 

Note: You may notice that everything is `inplace`, this edits the DataFrame as is instead of needing to reasign to a different variable. 

In [9]:
to_drop = ['image', 'style']
df.vote.fillna(0, inplace=True)
df.reviewerName.fillna('Amazon Customer', inplace=True)
df.drop(to_drop, axis=1, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(subset=['reviewText', 'reviewerID'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579384 entries, 0 to 615746
Data columns (total 28 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   overall          579384 non-null  float64
 1   vote             579384 non-null  object 
 2   verified         579384 non-null  bool   
 3   reviewTime       579384 non-null  object 
 4   reviewerID       579384 non-null  object 
 5   asin             579384 non-null  object 
 6   reviewerName     579384 non-null  object 
 7   reviewText       579384 non-null  object 
 8   summary          579384 non-null  object 
 9   unixReviewTime   579384 non-null  int64  
 10  category         579384 non-null  object 
 11  tech1            579384 non-null  object 
 12  description      579384 non-null  object 
 13  fit              579384 non-null  object 
 14  title            579384 non-null  object 
 15  also_buy         579384 non-null  object 
 16  tech2            579384 non-null  obje

Here we call our cleaning and processing functions to tokenize all of our articles. We end up with a list of lists. Each list representing an individual article. First we want to reattach this proccessed data to our DataFrame and do some exploratory data analysis. Being able to find out which words are occuring most with good and bad reviews without the clog of stopwords.
Next step would be attaching a normalized frequency of each word in the entire dataset. 

In [10]:
cleaned_text = clean_text(df.reviewText)
processed_data = process_data(cleaned_text)

In [11]:
processed_data[:6]

[['needed',
  'electric',
  'range',
  'matched',
  'pigtail',
  'cord',
  'works',
  'great'],
 ['particular',
  'type',
  'perfect',
  'easily',
  'adapts',
  'almost',
  'box',
  'mount',
  'item',
  'high',
  'quality',
  'suppose'],
 ['excellent'],
 ['yea', 'baby'],
 ['well', 'made', 'works', 'well'],
 ['great', 'buy']]

Now that we have our text processed, we want to replace our unprocessed reviewText with our new data.

In [12]:
df['reviewText'] = processed_data
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,...,feature,rank,also_view,details,main_cat,similar_item,date,price,imageURL,imageURLHighRes
0,5.0,0,True,"03 20, 2015",A3SHVDMM83IHJ4,B00002N7IL,steve crumpler,"[needed, electric, range, matched, pigtail, co...",Matched pigtail cord and works great.,1426809600,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
2,5.0,0,True,"03 26, 2018",A3TIWHNJXMSIU7,B00002N7IL,Torpex,"[particular, type, perfect, easily, adapts, al...",this particular type is perfect and easily ada...,1522022400,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
4,5.0,0,True,"03 20, 2018",AVP16JFIT6LPL,B00002N7IL,RAFAEL FERNANDEZ,[excellent],Five Stars,1521504000,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
6,5.0,0,True,"10 14, 2017",A2J2T9LX2HC947,B00002N7IL,John R. Smith,"[yea, baby]",Five Stars,1507939200,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]
8,5.0,0,True,"07 31, 2017",AJN26F4KMWD59,B00002N7IL,Jerry Beckwith,"[well, made, works, well]",Five Stars,1501459200,...,"[Fits all wallplates with 2.15"" diameter cente...","[>#411,187 in Tools & Home Improvement (See to...","[B00002N7HY, B074Q2KZGV, B000FPCL2K, B000FP8LK...",{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2004",,[],[]


Now before we go ahead and do some fun stuff with the data that we have cleaned up, we need to save it as a local file so that we don't have to do the same work everytime we launch our jupyter notebook.

In [13]:
df.to_csv('processed_Appliances.csv', index=False)

Simple cell that checks how many unique words are in our dataset. 

In [None]:
total_vocab = set()
for article in processed_data:
    total_vocab.update(article)
len(total_vocab)

Here we have our first step in figuring out the significance of each word. It is just a count so we don't have a lot of context. 

In [None]:
review_concat = []
for review in processed_data:
    review_concat += review
review_freqdist = FreqDist(review_concat)
total_word_count = sum(review_freqdist.values())


In [None]:
review_freqdist.most_common(200)


To add the context, we normalize the frequency and divide each count by the total number of words and create a dictionary of the 200 most common. 

In [None]:
normalized_freqs = {}
for word in review_freqdist.most_common(200):
    normalized_freq = word[1]/total_word_count
    normalized_freqs[word[0]] = normalized_freq
normalized_freqs

Here we are just having a bit of fun figuring out the bigrams, or which pairs of words are found together often.

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(review_concat)
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:50]

More fun where we check the PMI or the pointwise mutual information. This tells us pairs of words that (for the most part) only ever occur next to eachother. 

In [None]:
pmi_finder = BigramCollocationFinder.from_words(review_concat)
pmi_finder.apply_freq_filter(5)
review_pmi_scored = pmi_finder.score_ngrams(bigram_measures.pmi)
review_pmi_scored[:50]