In [2]:
import numpy as np
import time
import pandas as pd
import json

# Timestamp (UTC, HH:MM:SS) printed when this cell runs.
print(time.strftime("%H:%M:%S", time.gmtime()))

# Import amazon reviews dataset (one JSON object per line).
filepath = './Clothing_Shoes_and_Jewelry_5.json'

# Sampling parameters. The fixed seed makes the subset, and every statistic
# computed from it in later cells, identical on each Restart & Run All.
SAMPLE_SIZE = 10000
RANDOM_SEED = 42

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (NOTE(review): assumes the dump is UTF-8 -- confirm).
# Blank lines are skipped because json.loads('') raises.
with open(filepath, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

# Whitespace-delimited word counts of the review body and of its summary.
# The .str accessors yield NaN for a missing text instead of raising.
df["length_review"] = df["reviewText"].str.split().str.len()
df["length_summary"] = df["summary"].str.split().str.len()

print(df["reviewText"][0])

# Subset 10000 reviews for faster processing; capped at the number of rows
# available so a smaller input file does not make sample() raise.
df2 = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
df2.shape




18:10:37
This is a great tutu and at a really great price. It doesn't look cheap at all. I'm so glad I looked on Amazon and found such an affordable tutu that isn't made poorly. A++


(10000, 11)

In [3]:
import nltk
import ssl

# Work around SSL certificate errors when downloading NLTK data: if this Python
# build exposes an unverified-context factory, install it as the default.
# NOTE(review): this replaces ssl's default HTTPS context factory for the whole
# kernel session, so certificate verification stays off for any later HTTPS
# request that relies on that default, not only for the download below.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the VADER lexicon package through NLTK's downloader.
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nanchen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Run Vader on the reviews
analyzer = SentimentIntensityAnalyzer()


def _vader_score(text, key):
    """Return the `key` entry of the VADER polarity-score dict for `text`."""
    scores = analyzer.polarity_scores(text)
    return scores[key]


def vader_polarity(text):
    """Return VADER's overall 'compound' score for `text`."""
    return _vader_score(text, 'compound')


# Calculate the breakdown of the sentiment
def vader_positive(text):
    """Return VADER's 'pos' component for `text`."""
    return _vader_score(text, 'pos')


def vader_neutral(text):
    """Return VADER's 'neu' component for `text`."""
    return _vader_score(text, 'neu')


def vader_negative(text):
    """Return VADER's 'neg' component for `text`."""
    return _vader_score(text, 'neg')

# Score each review once and reuse the resulting dict for all four columns, so
# the analyzer runs once per review rather than once per output column.
vader_scores = [analyzer.polarity_scores(text) for text in df2['reviewText']]

# Calculate the sum of all the lexicon ratings and takes values from -1 to 1.
# The closer to 1, the more positive the text is.
df2['vader_polarity'] = [scores['compound'] for scores in vader_scores]
# The proportion of the text that falls into positive, neutral, and negative categories, respectively.
df2['vader_positive'] = [scores['pos'] for scores in vader_scores]
df2['vader_neutral'] = [scores['neu'] for scores in vader_scores]
df2['vader_negative'] = [scores['neg'] for scores in vader_scores]

# textblob
def textblob_polarity(text):
    """Return the polarity component of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


def textblob_subjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Analyse each review once and reuse the result for both columns, so TextBlob
# runs once per review rather than once per output column.
textblob_sentiments = [TextBlob(text).sentiment for text in df2['reviewText']]

# Returns a polarity score between -1 and 1 where 1 means positive statement and -1 means a negative statement.
df2['textblob_polarity'] = [sentiment.polarity for sentiment in textblob_sentiments]
# A subjectivity score between 0 and 1. Subjectivity quantifies the amount of personal
# opinion and factual information contained in the text. The higher subjectivity means that
# the text contains personal opinion rather than factual information.
df2['textblob_subjectivity'] = [sentiment.subjectivity for sentiment in textblob_sentiments]

df2.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,length_review,length_summary,vader_polarity,vader_positive,vader_neutral,vader_negative,textblob_polarity,textblob_subjectivity
164610,A35ZDE63E9JPX0,B005QNSPD0,David G,"[0, 0]",You need to train your feet to use these. Fol...,5.0,Heed the warning,1369180800,"05 22, 2013",34,3,0.0,0.161,0.654,0.185,0.8,0.75
1673,A21U4DR8M6I9QN,B0000ASDJH,"K. J ""justine""","[2, 2]","I buy a bunch at a time, they last a year of ...",5.0,love them,1378166400,"09 3, 2013",23,2,0.296,0.124,0.81,0.067,-0.145833,0.304167
154167,A1BHX28BBBJ3UW,B005CT0HCA,Gina,"[0, 0]",It was love at first wearing. These shoes are ...,5.0,My new favorite flats,1363564800,"03 18, 2013",32,4,0.9628,0.42,0.58,0.0,0.4025,0.595833
214365,AODBT2BR9O842,B008KRWGUM,A. Davis,"[0, 1]",I needed something casual for the upcoming sum...,5.0,Great summer sandals,1394928000,"03 16, 2014",78,3,0.9841,0.344,0.656,0.0,0.255208,0.686111
229019,A4020GK4YX8ZH,B009L4E37M,"MASTERMAT ""MATSTRAZZ""","[0, 0]","Huge T shirt guy, design most of my own, and h...",5.0,BESTEST,1371859200,"06 22, 2013",34,1,0.9153,0.305,0.622,0.073,0.56,0.75


In [5]:
# Run correlation between VADER and TextBlob.
# The result is a correlation matrix holding the pairwise correlation
# coefficients between the three score columns listed below.
# Coefficients range from -1 to 1:
#   * close to 1  -> strong positive correlation (as one variable increases,
#                    the other tends to increase);
#   * close to -1 -> strong negative correlation (as one variable increases,
#                    the other tends to decrease);
#   * close to 0  -> no correlation.
# If 'vader_polarity' and 'textblob_polarity' have a high positive correlation,
# both methods tend to agree on the sentiment of the reviews, which is useful
# for validating the consistency of the sentiment analysis methods used.
sentiment_columns = ['vader_polarity', 'textblob_polarity', 'textblob_subjectivity']
df2[sentiment_columns].corr()

Unnamed: 0,vader_polarity,textblob_polarity,textblob_subjectivity
vader_polarity,1.0,0.473401,0.215395
textblob_polarity,0.473401,1.0,0.46364
textblob_subjectivity,0.215395,0.46364,1.0


The correlation between vader_polarity and textblob_polarity is approximately 0.47. This indicates a moderate positive correlation between the sentiment scores calculated by VADER and TextBlob. In other words, when the sentiment score from VADER increases, the sentiment score from TextBlob also tends to increase, but the two scores are far from moving in lockstep.

The correlation between vader_polarity and textblob_subjectivity is approximately 0.22. This indicates a weak positive correlation: there is a slight tendency for the subjectivity score from TextBlob to increase as the sentiment score from VADER increases, but this relationship is not very strong.

The correlation between textblob_polarity and textblob_subjectivity is approximately 0.46. This indicates a moderate positive correlation: there is a moderate tendency for the subjectivity score from TextBlob to increase as the sentiment score from TextBlob increases.

The above result shows how these sentiment and subjectivity scores relate to each other. For example, the moderate positive correlation between vader_polarity and textblob_polarity suggests that these two methods are somewhat consistent in their sentiment scoring. However, the weak correlation between vader_polarity and textblob_subjectivity suggests that the sentiment score from VADER doesn’t strongly relate to the subjectivity score from TextBlob.

In [6]:

# Run ANOVA to see if the polarity is different for different ratings.
# One-way ANOVA determines whether there are any statistically significant
# differences between the means of three or more independent groups; here the
# groups are the VADER polarity scores of the reviews at each star rating 1-5.
from scipy import stats

stats.f_oneway(
    *(df2.loc[df2['overall'] == stars, 'vader_polarity'] for stars in range(1, 6))
)



F_onewayResult(statistic=887.6537635940534, pvalue=0.0)

In [7]:
# Same one-way ANOVA, this time on the TextBlob polarity scores of the reviews
# at each star rating 1-5.
stats.f_oneway(
    *(df2.loc[df2['overall'] == stars, 'textblob_polarity'] for stars in range(1, 6))
)

F_onewayResult(statistic=551.1465894999486, pvalue=0.0)

The ANOVA test is showing that there is a statistically significant difference in the sentiment scores (both VADER and TextBlob) between the different rating groups.

statistic=887.65 (VADER) / statistic=551.15 (TextBlob): This is the F-statistic, the value an ANOVA test produces. It represents the ratio of the variation between the group means to the variation within the groups. The larger this value is, the more likely it is that the differences between the group means are statistically significant.

pvalue=0.0: The p-value is the probability of getting a result at least as extreme as the one that was actually observed, given that the null hypothesis is true. In this case, the null hypothesis is that there’s no difference in the mean sentiment scores between the different rating groups. The reported value of 0.0 does not mean the probability is literally zero; it is simply too small to be represented as a floating-point number. It indicates that the likelihood of the observed differences arising by chance (assuming the null hypothesis is true) is extremely low. Therefore, you would reject the null hypothesis and conclude that there is a statistically significant difference in the mean sentiment scores between the different rating groups.

So, in this case, the ANOVA results, with F-statistics of about 887.65 (VADER) and 551.15 (TextBlob) and p-values of 0.0, indicate a statistically significant difference in polarity scores across different overall ratings for both VADER and TextBlob. This suggests that the sentiment expressed in reviews varies significantly with the star ratings given by customers.


In [8]:
# Describe() polarity scores by rating.
# The two score columns are selected before describe() so summary statistics
# are computed only for them instead of for every numeric column in the frame;
# the resulting table is the same.
df2.groupby('overall')[['vader_polarity', 'textblob_polarity']].describe()

Unnamed: 0_level_0,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,vader_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity,textblob_polarity
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
overall,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1.0,421.0,-0.027843,0.604262,-0.9934,-0.5926,-0.017,0.5118,0.9966,421.0,0.014128,0.236167,-1.0,-0.107143,0.008929,0.15303,0.75
2.0,560.0,0.222996,0.558643,-0.9407,-0.25,0.3471,0.72065,0.9977,560.0,0.086079,0.191643,-1.0,-0.01433,0.086369,0.196706,1.0
3.0,1069.0,0.439957,0.497928,-0.9527,0.1531,0.5927,0.8462,0.9981,1069.0,0.148718,0.173998,-0.65,0.045455,0.1375,0.25,0.8
4.0,2096.0,0.689216,0.361208,-0.9706,0.5927,0.83075,0.927225,0.9995,2096.0,0.22823,0.185303,-0.6,0.107896,0.213854,0.327865,1.0
5.0,5854.0,0.795509,0.276393,-0.9715,0.77135,0.8957,0.9498,0.9997,5854.0,0.322229,0.192159,-0.5,0.193238,0.306583,0.443312,1.0


The descriptive statistics for VADER and TextBlob polarity scores, grouped by overall ratings, provide further insights:

The columns “min”, “25%”, “50%”, “75%”, and “max” represent different percentiles and extremes in the distribution of polarity scores. These statistics help in sentiment analysis tasks by providing a comprehensive view of how sentiments are distributed across different ratings.

Min: The lowest polarity score, indicating the most negative sentiment within the group.
25%: The lower quartile; 25% of the scores in the group fall below this value.
50%: The median, indicating the middle sentiment score; half of the scores fall below it.
75%: The upper quartile; 75% of the scores in the group fall below this value.
Max: The highest polarity score, indicating the most positive sentiment within the group.

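1-Star Ratings: The mean polarity scores are close to zero (about -0.03 for VADER and 0.01 for TextBlob) and are the lowest of any rating group. Only VADER’s mean is actually negative, so on average both tools score 1-star reviews as roughly neutral rather than clearly negative, even though these reviews are the least positive group.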

2 to 5-Star Ratings: Positive mean polarity scores indicate that reviews with higher star ratings express more positive sentiments. This trend is consistent across both VADER and TextBlob, with the mean polarity scores increasing as the star ratings increase: higher ratings correspond to higher positive mean polarities.

Standard Deviation: The standard deviation for VADER polarity scores tends to decrease with higher ratings, suggesting more consistency in positive sentiment expression among higher-rated reviews. TextBlob’s standard deviation does not show a consistent trend.


These results confirm that sentiment scores are correlated with the star ratings given by customers. For instance, a high VADER polarity score is likely to correspond to a high star rating, which can be used to predict customer satisfaction levels. Similarly, TextBlob’s polarity and subjectivity scores can provide additional layers of understanding about the reviews’ sentiments and how much personal opinion they contain.

