In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

import string, nltk
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [11]:
# translation table that deletes every ASCII punctuation character
table = str.maketrans('', '', string.punctuation)


def clean(text):
    """Split a review into lower-cased word tokens with punctuation removed.

    Parameters
    ----------
    text : object
        The review text. It is passed through ``str()`` first, so non-string
        values are tokenized by their string representation.

    Returns
    -------
    list of str
        Whitespace-separated tokens, stripped of ``string.punctuation``
        characters and lower-cased. Tokens that consisted only of
        punctuation (e.g. ``-`` or ``!!!``) are dropped, so ``len()`` of
        the result counts actual words.
    """
    # strip punctuation first, then lower-case, token by token
    stripped = (word.translate(table).lower() for word in str(text).split())
    # a token made only of punctuation is now '', which is not a word
    return [token for token in stripped if token]

# single VADER analyzer created once at module level; analyze() below calls its polarity_scores()
analyzer = SentimentIntensityAnalyzer()

def analyze(sentences, scorer=None):
    """Average the per-sentence polarity scores of one review.

    Parameters
    ----------
    sentences : iterable of str
        The review, already split into sentences.
    scorer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a dict
        with the keys 'neg', 'neu', 'pos' and 'compound'. When omitted, the
        module-level ``analyzer`` is used.

    Returns
    -------
    dict
        Keys 'neg', 'neu', 'pos' (means rounded to 3 decimals) and
        'compound' (mean rounded to 4 decimals), in that order. When
        ``sentences`` is empty every value is 0.0, instead of the
        ``KeyError`` the DataFrame-based version raised.
    """
    if scorer is None:
        scorer = analyzer

    # one score dict per sentence; plain Python avoids building DataFrames per review
    scores = [scorer.polarity_scores(sentence) for sentence in sentences]
    if not scores:
        # nothing to average (e.g. empty review text)
        return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}

    # rounding precision per category; insertion order fixes the key order of the result
    decimals = {'neg': 3, 'neu': 3, 'pos': 3, 'compound': 4}
    return {
        key: np.around(np.mean([score[key] for score in scores]), decimals=places)
        for key, places in decimals.items()
    }

In [12]:
# test code before processing whole review dataframe
# reads the file two rows at a time and stops after the third chunk
file='../input/yelp-restaurant-reviews-2021/restaurantrev.csv'
processed = []  # finished chunks; concatenated once after the loop
for num, chunk in enumerate(pd.read_csv(file, parse_dates=['date'], chunksize=2)):
    # word tokens are only needed to count the words per review
    chunk['clean_text'] = chunk['text'].apply(clean)
    chunk['text_length'] = chunk['clean_text'].apply(len)

    # sentence-level sentiment, averaged per review by analyze()
    chunk['sentences'] = chunk['text'].apply(sent_tokenize)
    chunk['sentiment'] = chunk['sentences'].apply(analyze)

    # expand the score dicts into neg/neu/pos/compound columns aligned on the chunk index
    scores = pd.DataFrame(chunk['sentiment'].tolist(), index=chunk.index)
    chunk = chunk.join(scores)
    chunk = chunk.drop(['sentences', 'sentiment','clean_text'], axis=1)

    processed.append(chunk)
    if num == 2:
        break

# a single concat avoids re-copying the accumulated frame on every iteration
reviews = pd.concat(processed) if processed else pd.DataFrame()
print(reviews)

In [9]:
# spot-check the row labelled 0: raw text next to its averaged compound score
first_text = reviews.loc[0, 'text']
first_compound = reviews.loc[0, 'compound']
print('original:\n', first_text)
print('sentiment:', first_compound)
reviews.info()


In [13]:
# process 4 and 5 star reviews
# the file is read in chunks of 500000 rows; each chunk is filtered and scored before the next is read
file='../input/yelp-restaurant-reviews-2021/restaurantrev.csv'
processed = []  # finished chunks; concatenated once after the loop
for chunk in pd.read_csv(file, parse_dates=['date'], chunksize=500000):
    print('process chunk...')
    # keep only 4 and 5 star reviews and discard the id column
    chunk = chunk.loc[chunk['stars'] >= 4].drop('review_id', axis=1)

    # word tokens are only needed to count the words per review
    chunk['clean_text'] = chunk['text'].apply(clean)
    chunk['text_length'] = chunk['clean_text'].apply(len)

    # sentence-level sentiment, averaged per review by analyze()
    # (the previous call to `analyze2` referenced a function this notebook never defines)
    chunk['sentences'] = chunk['text'].apply(sent_tokenize)
    chunk['sentiment'] = chunk['sentences'].apply(analyze)

    # expand the score dicts into neg/neu/pos/compound columns aligned on the chunk index
    scores = pd.DataFrame(chunk['sentiment'].tolist(), index=chunk.index)
    chunk = chunk.join(scores)
    chunk = chunk.drop(['sentences', 'sentiment','clean_text'], axis=1)

    processed.append(chunk)

# a single concat avoids re-copying the accumulated frame on every iteration
reviews = pd.concat(processed) if processed else pd.DataFrame()

reviews.head()

In [14]:
# column dtypes, non-null counts and memory usage of the processed reviews
reviews.info()

In [15]:
# write the processed reviews to the working directory
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by default;
# pass index=False if that column is not wanted when the file is read back
reviews.to_csv('fourplusstarsrev.csv')

<a href="./fourplusstarsrev.csv"> Download File </a>

In [35]:
# boolean row masks by star rating, reused by the cells below
fourstar = reviews['stars'].eq(4)  # 4 star reviews
fivestar = reviews['stars'].eq(5)  # 5 star reviews
fourfive = reviews['stars'].ge(4)  # 4 & 5 star reviews

# distribution of review length (word count) per star rating
rated = reviews.loc[fourfive, ['stars', 'text_length']]
sns.boxplot(x=rated['stars'], y=rated['text_length'])
plt.savefig('revlength')

In [17]:
# preview the first rows of the 4 star reviews
reviews[fourstar].head()

In [18]:
# preview the first rows of the 5 star reviews
reviews[fivestar].head()

In [19]:
# some review length outliers: text of reviews whose text_length
# (number of tokens returned by clean()) is 800 or more
reviews.loc[reviews['text_length']>=800,'text']

In [33]:
# VADER compound sentiment: one histogram per star rating plus a side-by-side boxplot
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

panels = (
    (axes[0], fourstar, '4 star reviews:\nCompound Sentiment'),
    (axes[1], fivestar, '5 star reviews:\nCompound Sentiment'),
)
for ax, mask, title in panels:
    ax.hist(reviews.loc[mask, 'compound'])
    ax.set_title(title)

rated = reviews.loc[fourfive, ['stars', 'compound']]
sns.boxplot(x=rated['stars'], y=rated['compound'], ax=axes[2])
fig.savefig('compsent')

In [31]:
# VADER positive sentiment: one histogram per star rating plus a side-by-side boxplot
fig, (ax_four, ax_five, ax_box) = plt.subplots(1, 3, figsize=(15, 5))

pos_four = reviews.loc[fourstar, 'pos']
ax_four.hist(pos_four)
ax_four.set_title('4 star reviews:\nPositive Sentiment')

pos_five = reviews.loc[fivestar, 'pos']
ax_five.hist(pos_five)
ax_five.set_title('5 star reviews:\nPositive Sentiment')

rated = reviews.loc[fourfive, ['stars', 'pos']]
sns.boxplot(x=rated['stars'], y=rated['pos'], ax=ax_box)
fig.savefig('possent')

In [29]:
# VADER neutral sentiment: one histogram per star rating plus a side-by-side boxplot
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
hist_four, hist_five, box_ax = axes

for ax, mask, title in zip(
    (hist_four, hist_five),
    (fourstar, fivestar),
    ('4 star reviews:\nNeutral Sentiment', '5 star reviews:\nNeutral Sentiment'),
):
    ax.hist(reviews.loc[mask, 'neu'])
    ax.set_title(title)

rated = reviews.loc[fourfive, ['stars', 'neu']]
sns.boxplot(x=rated['stars'], y=rated['neu'], ax=box_ax)
# NOTE(review): this figure is saved as .pdf while the other figures are saved without an extension — confirm that is intended
fig.savefig('neusent.pdf')

In [23]:
# summary statistics of the processed reviews
reviews.describe()

In [24]:
# summary statistics of the neutral score for 4 star reviews
reviews.loc[fourstar,'neu'].describe()

In [25]:
# summary statistics of the neutral score for 5 star reviews
reviews.loc[fivestar,'neu'].describe()

In [26]:
# cohen's d test for effect size of the neutral score: 4 star vs 5 star reviews
# 0.2 small, 0.5 moderate, 0.8 large
# the inputs are computed from the data instead of being copied by hand from
# describe() output, so they cannot go stale when the reviews change
neu_four = reviews.loc[fourstar, 'neu']
neu_five = reviews.loc[fivestar, 'neu']
# means
m1 = neu_four.mean()
m2 = neu_five.mean()
# variances (sample variance, ddof=1 — the square of the std that describe() reports)
v1 = neu_four.var()
v2 = neu_five.var()
# sample sizes (non-missing values, the same rows mean() and var() use)
n1 = neu_four.count()
n2 = neu_five.count()

# difference of means divided by the pooled standard deviation
d = (m1 - m2)/np.sqrt(((n1-1)*v1 + (n2-1)*v2)/(n1+n2-2))
d

In [27]:
from scipy.stats import ttest_ind

# two-sample t-test without the equal-variance assumption:
# neutral score of 4 star reviews vs 5 star reviews
neu_four = reviews.loc[fourstar, 'neu']
neu_five = reviews.loc[fivestar, 'neu']
ttest_ind(neu_four, neu_five, equal_var=False)