In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist, BigramCollocationFinder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
import gzip
import json

### Objective:
Guide is: [article](https://towardsdatascience.com/sentiment-analysis-and-product-recommendation-on-amazons-electronics-dataset-reviews-part-1-6b340de660c2)
Set up a 'dev doc' 

Sourcing the metadata will help immensely, and it can be our next step together.

The goal is to predict the rating based on the words in a review. I used the article above to come up with these questions, most of which cannot be answered until we also have the metadata. 
* Usefulness on large volume of reviews
* Rating vs number of reviews
* Rating vs proportion of reviews
* Helpful proportion vs Number of reviews
* Rating vs helpfulness ratio
* Top 20 most reviewed products
* Bottom 20 reviewed products
* Positive and negative words
* Word cloud for different ratings, brand name, etc.

The functions `parse()` and `getDF()` come from the same place I got the data; they were provided to make the data easier to access. The rest are what I came up with to process the data so that it can be used. In this case, the data is text.

In [2]:
def parse(path):
    '''
    Helper generator that reads a gzip file line by line and yields each line decoded as JSON,
    so the records can be put into a Pandas DataFrame.

    path: location of a gzip-compressed file holding one JSON document per line.
    Yields: the object returned by json.loads() for each line, in file order.
    Raises: whatever gzip.open() or json.loads() raise for an unreadable file or a malformed line.
    '''
    # The context manager closes the file handle once the generator is exhausted
    # (or is closed/garbage-collected early) instead of leaving it open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    '''
    Main loading function: collects every record yielded by parse(path) and
    returns them as a single DataFrame.

    Each record is keyed by its 0-based position in the file, and those keys
    become the row index of the returned DataFrame.
    '''
    # enumerate() supplies the running row number that becomes the index label.
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

def clean_text(text):
    '''
    Cleans every line of an iterable of strings and returns the cleaned lines as a list.

    For each line, the symbols , . ' ? ! and the newline character are deleted and the
    result is converted to lowercase. Any line that contains a square bracket
    ('[' or ']') is left out of the result entirely.
    '''
    # Translation table that maps each unwanted symbol to "delete".
    strip_table = str.maketrans('', '', ",.'?!''\n")
    return [
        line.translate(strip_table).lower()
        for line in text
        if '[' not in line and ']' not in line
    ]

def getContractions(series):
    '''
    Returns the set of all space-separated words, across every row of the series,
    that contain an apostrophe.
    '''
    return {
        token
        for entry in series
        for token in entry.split(' ')
        if "'" in token
    }

# Filled on first use by _stopword_set(); None means "not built yet".
_STOPWORD_CACHE = None


def _stopword_set():
    '''
    Builds (once) and returns the set of tokens that process_text() throws away:
    NLTK's English stopwords, every character in string.punctuation, and a few
    tokenizer artifacts such as quote marks, "'s" and "n't".
    '''
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # Built a single time instead of once per review; a frozenset gives
        # constant-time membership checks instead of scanning a list per token.
        _STOPWORD_CACHE = frozenset(
            stopwords.words('english')
            + list(string.punctuation)
            + ["''", '""', '...', '``', "'s", "n't"]
        )
    return _STOPWORD_CACHE


def process_text(text):
    '''
    Tokenizes a single text with word_tokenize(), lowercases every token, and returns
    the list of lowercased tokens that are not in the stopword set.

    text: the string to tokenize.
    Returns: list of lowercase tokens, in their original order.
    '''
    excluded = _stopword_set()
    # Lowercase each token once, then filter on the lowercased form.
    lowered = (w.lower() for w in word_tokenize(text))
    return [w for w in lowered if w not in excluded]

def process_data(data):
    '''
    Convenience wrapper: runs process_text() on every item of data and returns
    the results as a list, one entry per item.
    '''
    return [process_text(entry) for entry in data]

First, I like taking a good look at the data. Calling both `df.head()` and `df.info()` is a good habit to get into. Although you get most of the info you need with `df.info()`, it is always important to take a look at what each data point looks like. 

In [3]:
# Load every record of the gzip file into a DataFrame (the path is relative to the working directory).
df = getDF('Magazine_Subscriptions.json.gz')
# Preview the first rows to see what individual records look like.
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,image
0,5.0,9.0,False,"11 8, 2001",AH2IFH762VY5U,B00005N7P0,ted sedlmayr,"for computer enthusiast, MaxPC is a welcome si...","AVID READER SINCE ""boot"" WAS THE NAME",1005177600,,
1,5.0,9.0,False,"10 31, 2001",AOSFI0JEYU4XM,B00005N7P0,Amazon Customer,Thank god this is not a Ziff Davis publication...,The straight scoop,1004486400,,
2,3.0,14.0,False,"03 24, 2007",A3JPFWKS83R49V,B00005N7OJ,Bryan Carey,Antiques Magazine is a publication made for an...,"Antiques Magazine is Good, but not for Everyone",1174694400,{'Format:': ' Print Magazine'},
3,5.0,13.0,False,"11 10, 2006",A19FKU6JZQ2ECJ,B00005N7OJ,Patricia L. Porada,This beautiful magazine is in itself a work of...,THE DISCERNING READER,1163116800,{'Format:': ' Print Magazine'},
4,5.0,,True,"07 14, 2014",A25MDGOMZ2GALN,B00005N7P0,Alvey,A great read every issue.,Five Stars,1405296000,,


In [4]:
# Column dtypes and non-null counts, to spot which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89689 entries, 0 to 89688
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         89689 non-null  float64
 1   vote            24103 non-null  object 
 2   verified        89689 non-null  bool   
 3   reviewTime      89689 non-null  object 
 4   reviewerID      89689 non-null  object 
 5   asin            89689 non-null  object 
 6   reviewerName    89687 non-null  object 
 7   reviewText      89656 non-null  object 
 8   summary         89670 non-null  object 
 9   unixReviewTime  89689 non-null  int64  
 10  style           51398 non-null  object 
 11  image           135 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 8.3+ MB


Here I call `value_counts()` on every column to better understand the information above. Doing so allowed me to find that the reason the `vote` column has so many nulls is that, if a review has 0 votes, nothing was put in. The same goes for `reviewerName`: a few values were missing, so I was able to fill them in with a replacement value. 

In [5]:
# Map each column name to the frequency table of the values in that column.
results = {column: df[column].value_counts() for column in df.columns}
results

{'overall': 5.0    53790
 4.0    12676
 1.0    11029
 3.0     6971
 2.0     5223
 Name: overall, dtype: int64,
 'vote': 2      6318
 3      3657
 4      2403
 5      1754
 6      1321
        ... 
 304       1
 497       1
 179       1
 229       1
 275       1
 Name: vote, Length: 287, dtype: int64,
 'verified': True     58654
 False    31035
 Name: verified, dtype: int64,
 'reviewTime': 02 20, 2015    146
 03 29, 2016    131
 03 25, 2014    127
 05 30, 2015    114
 03 1, 2016     113
               ... 
 06 1, 2018       1
 07 19, 2005      1
 12 14, 2003      1
 09 25, 2004      1
 08 5, 2006       1
 Name: reviewTime, Length: 5804, dtype: int64,
 'reviewerID': A3JPFWKS83R49V    55
 A2OTUWUSH49XIN    26
 AEMZRE6QYVQBS     25
 A3GA09FYFKL4EY    24
 A30H2335OM7RD6    22
                   ..
 A1GWF8D9X20TXN     1
 A2RAHU8Z0HVB23     1
 A1YNCRREU71LZ4     1
 A2PEWUSS3T0PTR     1
 AVWGN0X5ZSTF6      1
 Name: reviewerID, Length: 72098, dtype: int64,
 'asin': B00005NIOH    1718
 B00005N7Q

This next cell is my basic cleaning cell. I used the information I found previously to decide what to get rid of and how to clean it for better use. First, I filled in the null values in the columns I wanted to keep. Next, I dropped the columns I decided weren't very useful. And finally I drop the rows that have any null values left. I call `df.info()` to verify that I have a nice clean dataset ready for some processing. 

Note: You may notice that everything is `inplace`; this edits the DataFrame as is, instead of needing to reassign it to a different variable. 

In [6]:
# Columns that are not used in the rest of the analysis.
to_drop = ['image', 'style']
# Fill the nulls in the columns we keep. This is done on the DataFrame itself rather than
# on `df.vote` / `df.reviewerName`: an in-place call on a column pulled out of the frame is
# chained assignment, which newer pandas versions warn about and which no longer updates
# `df` once Copy-on-Write is enabled.
df.fillna({'vote': 0, 'reviewerName': 'Amazon Customer'}, inplace=True)
# errors='ignore' lets this cell be re-run after the columns have already been dropped.
df.drop(columns=to_drop, inplace=True, errors='ignore')
# Remove the rows that still contain a null in any remaining column.
df.dropna(inplace=True)
# Verify that no nulls are left.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89640 entries, 0 to 89688
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         89640 non-null  float64
 1   vote            89640 non-null  object 
 2   verified        89640 non-null  bool   
 3   reviewTime      89640 non-null  object 
 4   reviewerID      89640 non-null  object 
 5   asin            89640 non-null  object 
 6   reviewerName    89640 non-null  object 
 7   reviewText      89640 non-null  object 
 8   summary         89640 non-null  object 
 9   unixReviewTime  89640 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(7)
memory usage: 6.9+ MB


Here we call our cleaning and processing functions to tokenize all of our articles. We end up with a list of lists, each inner list representing an individual article. First we want to reattach this processed data to our DataFrame and do some exploratory data analysis, so that we can find out which words occur most often in good and bad reviews without the clutter of stopwords.
The next step would be attaching a normalized frequency of each word in the entire dataset. 

In [7]:
# Run process_data() over the reviewText column; the result is kept as processed_data for the cells below.
processed_data = process_data(df.reviewText)

In [8]:
# Peek at the first five entries of the processed data.
processed_data[:5]

[['computer',
  'enthusiast',
  'maxpc',
  'welcome',
  'sight',
  'mailbox',
  'remember',
  'years',
  'savorying',
  'every',
  'page',
  'boot',
  'called',
  'beginning',
  'still',
  'obcessed',
  'pc',
  'anyone',
  'advanced',
  'users',
  'beginners',
  'looking',
  'knowledge',
  'profit',
  'every',
  'issue',
  'maxpc',
  'icing',
  'cake',
  'subscription',
  'comes',
  'cd-rom',
  'packed',
  'demos',
  'utilities',
  'useful',
  'apps',
  'helpful',
  'blessed',
  'broadband',
  'connections',
  'discovered',
  'community',
  'hardware',
  'enthusiast',
  'web',
  'sites',
  'maxpc',
  'formerly',
  'boot',
  'really',
  'informative',
  'source',
  'computing',
  'news',
  'articles',
  'day',
  'consider',
  'subscription',
  'worth',
  '10',
  'subscriptions',
  'computing',
  'mags',
  'ca',
  'wait',
  'merge',
  'dvd',
  'media',
  'maybe',
  'end',
  'offering',
  'info',
  'divx',
  'codecs',
  'encoding',
  'movies',
  'best',
  'bang',
  'buck',
  'audio',
  'v

Simple cell that checks how many unique words are in our dataset. 

In [9]:
# Gather every distinct token that appears anywhere in the processed data.
total_vocab = {token for tokens in processed_data for token in tokens}
# Size of the vocabulary.
len(total_vocab)

60904

Here we have our first step in figuring out the significance of each word. It is just a count so we don't have a lot of context. 

In [10]:
# Flatten the per-review token lists into one long token stream.
review_concat = [token for tokens in processed_data for token in tokens]
# Count how many times each token occurs in that stream.
review_freqdist = FreqDist(review_concat)
# Total number of tokens counted, used later to normalize the counts.
total_word_count = sum(review_freqdist.values())


In [11]:
# The 200 most frequent tokens together with their raw counts.
review_freqdist.most_common(200)


[('magazine', 68522),
 ('great', 23089),
 ('articles', 17853),
 ('love', 16787),
 ('like', 16606),
 ('subscription', 16316),
 ('read', 16114),
 ('good', 15479),
 ('issue', 14658),
 ('one', 14144),
 ('get', 11490),
 ('would', 11000),
 ('years', 9425),
 ('magazines', 9207),
 ('time', 8920),
 ('reading', 8689),
 ('kindle', 8425),
 ('new', 8362),
 ('always', 7878),
 ('many', 7860),
 ('really', 7811),
 ('every', 7458),
 ('year', 7319),
 ('much', 7287),
 ('recipes', 7213),
 ('well', 7155),
 ('issues', 7017),
 ('first', 6923),
 ('also', 6747),
 ('price', 6744),
 ('find', 6044),
 ('amazon', 6014),
 ("'ve", 5994),
 ('information', 5866),
 ('even', 5817),
 ("'m", 5745),
 ('enjoy', 5643),
 ('best', 5563),
 ('interesting', 5481),
 ('still', 5140),
 ('month', 5032),
 ('want', 4804),
 ('received', 4792),
 ('ideas', 4738),
 ('got', 4713),
 ('ads', 4682),
 ('cover', 4620),
 ('people', 4600),
 ('look', 4504),
 ('gift', 4402),
 ('content', 4370),
 ('know', 4296),
 ('never', 4263),
 ('stories', 4251),
 (

To add context, we normalize the frequencies: we divide each count by the total number of words, and create a dictionary of the 200 most common words. 

In [12]:
# Share of the whole token stream taken up by each of the 200 most common tokens.
normalized_freqs = {
    token: count / total_word_count
    for token, count in review_freqdist.most_common(200)
}
normalized_freqs

{'magazine': 0.03424760918480874,
 'great': 0.011539987864744885,
 'articles': 0.00892301110265886,
 'love': 0.008390219424205135,
 'like': 0.008299754795874813,
 'subscription': 0.008154811468715733,
 'read': 0.008053850944280787,
 'good': 0.007736475038260041,
 'issue': 0.007326135481026918,
 'one': 0.007069235928751858,
 'get': 0.005742754582958063,
 'would': 0.005497850340516858,
 'years': 0.004710658132670126,
 'magazines': 0.00460170073501261,
 'time': 0.004458256821582761,
 'reading': 0.004342801964431907,
 'kindle': 0.0042108535562595025,
 'new': 0.004179365867945633,
 'always': 0.003937460452962891,
 'many': 0.0039284639705875,
 'really': 0.0039039735463433796,
 'every': 0.0037275425308704296,
 'year': 0.003658069694749353,
 'much': 0.003642075948304213,
 'recipes': 0.003605090409649827,
 'well': 0.003576101744218011,
 'issues': 0.003507128712673345,
 'first': 0.0034601470824907462,
 'also': 0.0033721814770424764,
 'price': 0.0033706820633132447,
 'find': 0.003020818859825808,

Here we are just having a bit of fun figuring out the bigrams, or which pairs of words are found together often.

In [13]:
# Object that provides the association measures (raw_freq here) used to score bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()
# NOTE(review): review_concat is every review's tokens joined end to end, so a bigram
# can presumably straddle the boundary between two reviews — confirm this is acceptable.
finder = BigramCollocationFinder.from_words(review_concat)
# Score every bigram by its raw frequency.
scored = finder.score_ngrams(bigram_measures.raw_freq)
# Show the first 50 scored bigrams (the displayed output runs from highest to lowest score).
scored[:50]

[(('great', 'magazine'), 0.002374571542526872),
 (('love', 'magazine'), 0.0023006004652180996),
 (('first', 'issue'), 0.0012075278566080663),
 (('magazine', 'great'), 0.001160046421849057),
 (('good', 'magazine'), 0.000874658008718591),
 (('look', 'forward'), 0.0008686603538016636),
 (('every', 'month'), 0.0007856927941175001),
 (('every', 'issue'), 0.000783193771235447),
 (('like', 'magazine'), 0.0006752359827307523),
 (('magazine', 'love'), 0.0006077623649153181),
 (('cover', 'cover'), 0.0006057631466096756),
 (('years', 'ago'), 0.0005917686184701782),
 (('kindle', 'fire'), 0.0005787736994835019),
 (('great', 'articles'), 0.0005467862065932221),
 (('year', 'old'), 0.0005432875745583477),
 (('well', 'written'), 0.0005332914830301352),
 (('great', 'price'), 0.0005192969548906377),
 (('magazine', 'good'), 0.0005187971503142271),
 (('interesting', 'articles'), 0.0005008041855634447),
 (('highly', 'recommend'), 0.000489808484882411),
 (('many', 'years'), 0.00048531024369471537),
 (('read'

More fun, where we check the PMI, or pointwise mutual information. This tells us which pairs of words (for the most part) only ever occur next to each other. 

In [14]:
# A second finder over the same token stream; presumably kept separate so the
# frequency filter below does not alter `finder` — TODO confirm.
pmi_finder = BigramCollocationFinder.from_words(review_concat)
# Discard bigrams that occur fewer than 5 times before scoring.
pmi_finder.apply_freq_filter(5)
# Score the remaining bigrams by pointwise mutual information.
review_pmi_scored = pmi_finder.score_ngrams(bigram_measures.pmi)
# Show the first 50 scored bigrams.
review_pmi_scored[:50]

[(('dame', 'edna'), 18.347170052112418),
 (('settimana', 'enigmistica'), 18.347170052112418),
 (('bells', 'whistles'), 18.12477763077597),
 (('miley', 'cyrus'), 18.12477763077597),
 (('humpty', 'dumpty'), 18.124777630775966),
 (('a-section', 'a-spacing-small'), 17.932132552833572),
 (('a-spacing-small', 'a-spacing-top-mini'), 17.932132552833572),
 (('a-spacing-top-mini', 'video-block'), 17.932132552833572),
 (('fons', 'porter'), 17.932132552833572),
 (('gwyneth', 'paltrow'), 17.932132552833572),
 (('roz', 'chast'), 17.932132552833572),
 (('video-block', '/div'), 17.932132552833572),
 (('bona', 'fide'), 17.902385209439522),
 (('christina', 'aguilera'), 17.861743224942174),
 (('sofia', 'vergara'), 17.861743224942174),
 (('movers', 'shakers'), 17.76220755139126),
 (('suze', 'orman'), 17.76220755139126),
 (('puerto', 'rico'), 17.739487474891174),
 (('di', 'giacomo'), 17.61020445794621),
 (('div', 'id='), 17.61020445794621),
 (('fareed', 'zakaria'), 17.61020445794621),
 (('tai', 'chi'), 17.