In [13]:
import numpy as np
import time
import pandas as pd
import json

print(time.strftime("%H:%M:%S", time.gmtime()))

# Import amazon reviews dataset 

filepath = './Clothing_Shoes_and_Jewelry_5.json'

data = []
with open(filepath
            , 'r') as f:
        for line in f:
            data.append(json.loads(line))

df = pd.DataFrame(data)


df["length_review"] = df["reviewText"].apply(lambda x: len(x.split()))
df["length_summary"] = df["summary"].apply(lambda x: len(x.split()))

print(df["reviewText"][0])
df.head()

# SUbset 10000 reviews for faster processing
df2  = df.sample(10000)
df2.shape




04:12:33
This is a great tutu and at a really great price. It doesn't look cheap at all. I'm so glad I looked on Amazon and found such an affordable tutu that isn't made poorly. A++


(10000, 11)

In [14]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nanchen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
# Run Vader on the reviews
analyzer = SentimentIntensityAnalyzer()

def vader_polarity(text):
    return analyzer.polarity_scores(text)['compound']
# Calculate the breakdown of the sentiment
def vader_positive(text):
    return analyzer.polarity_scores(text)['pos']
def vader_neutral(text):
    return analyzer.polarity_scores(text)['neu']
def vader_negative(text):
    return analyzer.polarity_scores(text)['neg']

df2['vader_polarity'] = df2['reviewText'].apply(vader_polarity)
df2['vader_positive'] = df2['reviewText'].apply(vader_positive)
df2['vader_neutral'] = df2['reviewText'].apply(vader_neutral)
df2['vader_negative'] = df2['reviewText'].apply(vader_negative)

# textblob
def textblob_polarity(text):
    return TextBlob(text).sentiment.polarity
def textblob_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

df2['textblob_polarity'] = df2['reviewText'].apply(textblob_polarity)
df2['textblob_subjectivity'] = df2['reviewText'].apply(textblob_subjectivity)

df2.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,length_review,length_summary,vader_polarity,vader_positive,vader_neutral,vader_negative,textblob_polarity,textblob_subjectivity
92548,AIFPCDUD7B14B,B002Z7EOGW,susan,"[0, 0]","These arrived quickly, and fit perfectly! My h...",5.0,mens Cudas Water Shoe,1390176000,"01 20, 2014",25,4,0.9244,0.401,0.599,0.0,0.361905,0.632143
190510,A1TVGGVRMO5LMZ,B007L6GBZ4,Sandy,"[0, 0]",Bought for my son and I think for short term t...,4.0,ok,1369872000,"05 30, 2013",36,1,0.8338,0.228,0.772,0.0,0.225,0.427778
172026,ATOSQ6RK52IX7,B006DU3EM2,doctor,"[0, 0]","I read all reviews for this product, and I ord...",3.0,Converse Chuck Taylor All Star Shoes (M9166) L...,1380758400,"10 3, 2013",65,14,0.4939,0.115,0.851,0.034,0.025,0.415
13822,A13ITPS2P2ZRK6,B00085FIPO,Shannon R.,"[0, 0]",I was having such a hard time finding shorts t...,4.0,Great product for the money,1339718400,"06 15, 2012",95,5,0.5023,0.059,0.928,0.013,0.133712,0.389286
129289,A2G66KTJIE9FI4,B004IYZKWW,Debby,"[1, 1]",I wear a size 7 and these were too short and t...,2.0,smaller than size shown,1341014400,"06 30, 2012",48,4,0.7184,0.125,0.875,0.0,0.094898,0.35102


In [17]:
# run correlation between vader and textblob
df2[['vader_polarity', 'textblob_polarity', 'textblob_subjectivity']].corr()

Unnamed: 0,vader_polarity,textblob_polarity,textblob_subjectivity
vader_polarity,1.0,0.48244,0.207044
textblob_polarity,0.48244,1.0,0.45196
textblob_subjectivity,0.207044,0.45196,1.0


In [18]:

# Run ANOVA to see if the polarity is different for different ratings

from scipy import stats

stats.f_oneway(df2['vader_polarity'][df2['overall'] == 1],
                df2['vader_polarity'][df2['overall'] == 2],
                df2['vader_polarity'][df2['overall'] == 3],
                df2['vader_polarity'][df2['overall'] == 4],
                df2['vader_polarity'][df2['overall'] == 5])

stats.f_oneway(df2['textblob_polarity'][df2['overall'] == 1],
                df2['textblob_polarity'][df2['overall'] == 2],
                df2['textblob_polarity'][df2['overall'] == 3],
                df2['textblob_polarity'][df2['overall'] == 4],
                df2['textblob_polarity'][df2['overall'] == 5])

F_onewayResult(statistic=565.0513472950851, pvalue=0.0)

In [19]:
# Describe() polarity scores by rating
df2.groupby('overall').describe()[['vader_polarity', 'textblob_polarity']]

Unnamed: 0_level_0,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
overall,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1.0,385.0,-0.004888,0.601752,-0.9839,-0.5709,0.0,0.5499,0.9869,385.0,0.006029,0.21988,-0.75,-0.103571,0.025002,0.136667,0.7375
2.0,532.0,0.200063,0.574307,-0.972,-0.25385,0.283,0.7163,0.9926,532.0,0.079853,0.17541,-0.766667,-0.01777,0.087727,0.180864,0.633333
3.0,1111.0,0.417149,0.51273,-0.9318,0.05185,0.5859,0.8428,0.9973,1111.0,0.15021,0.189286,-0.78,0.035635,0.138095,0.254356,1.0
4.0,2095.0,0.689848,0.361603,-0.9085,0.5942,0.8402,0.9278,0.9993,2095.0,0.22833,0.173584,-0.4,0.115994,0.208333,0.333333,1.0
5.0,5877.0,0.796263,0.274763,-0.8873,0.7717,0.894,0.9501,0.9995,5877.0,0.325113,0.19659,-0.875,0.192262,0.310571,0.447222,1.0
