In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import gc
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from scipy.stats import ttest_ind

In [3]:
# Translation table that deletes every ASCII punctuation character
# (each punctuation code point is mapped to None).
table = str.maketrans(dict.fromkeys(string.punctuation))

# English stop words shipped with NLTK
stop_words = stopwords.words('english')

# First letter of a POS tag -> WordNet part-of-speech constant
pos_dict = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}

# function to clean text
def clean(text, punct_table=None):
    """Tokenize a review into lower-cased words with punctuation stripped.

    Parameters
    ----------
    text : object
        Raw review text. Non-string values are converted with ``str``;
        ``None`` and float NaN (a missing review) yield an empty list
        instead of the literal token ``'nan'``.
    punct_table : dict, optional
        Translation table passed to ``str.translate``. Defaults to the
        module-level ``table``.

    Returns
    -------
    list of str
        Whitespace-separated tokens, punctuation removed and lower-cased.
        Tokens that become empty once punctuation is removed (e.g. a
        stand-alone ``-`` or ``...``) are dropped so that ``len`` of the
        result counts real words only.
    """
    # A missing value must not turn into the word "nan" (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return []
    if punct_table is None:
        punct_table = table
    stripped = (word.translate(punct_table).lower() for word in str(text).split())
    return [word for word in stripped if word]

def prep(words):
    """Drop stop words and non-alphabetic tokens, then POS-tag what is left.

    Parameters
    ----------
    words : list of str
        Tokens to process.

    Returns
    -------
    list of tuple
        ``(word, wordnet_pos)`` pairs. ``wordnet_pos`` is looked up in
        ``pos_dict`` by the first letter of the tag returned by ``pos_tag``
        and is ``None`` when that letter has no entry.
    """
    # Set membership is O(1) per word; the stop-word list would be scanned
    # linearly for every token.
    stop_set = set(stop_words)
    kept = [word for word in words if word not in stop_set and word.isalpha()]
    # Collapse the tagger's tags into the coarse categories in pos_dict.
    return [(word, pos_dict.get(tag[0])) for word, tag in pos_tag(kept)]

# Single shared WordNet lemmatizer instance, used by lemmatize() below.
wnlemmatizer = WordNetLemmatizer()
def lemmatize(tagged_words):
    """Lemmatize tagged words and join them into one space-separated string.

    Parameters
    ----------
    tagged_words : iterable of (str, pos)
        ``(word, pos)`` pairs. Words whose ``pos`` is falsy (e.g. ``None``)
        are kept unchanged; the others go through
        ``wnlemmatizer.lemmatize(word, pos=pos)``.

    Returns
    -------
    str
        Every lemma prefixed by a single space, so a non-empty result
        starts with ``' '``; an empty input gives ``''``.
    """
    lemmas = (
        wnlemmatizer.lemmatize(word, pos=tag) if tag else word
        for word, tag in tagged_words
    )
    # One join instead of repeated concatenation; also avoids a local
    # variable named `string` shadowing the imported `string` module.
    return ''.join(' ' + lemma for lemma in lemmas)

# One VADER analyzer instance, reused for every review.
analyzer = SentimentIntensityAnalyzer()


def analyze(lemma):
    """Return the VADER polarity scores computed for the text ``lemma``.

    The result of ``analyzer.polarity_scores`` is passed through untouched;
    later cells expand it into columns such as 'neu', 'pos' and 'compound'.
    """
    return analyzer.polarity_scores(lemma)

In [4]:
# test code before processing whole review dataframe
# Runs the full pipeline on the first 3 chunks of 10 rows and keeps every
# intermediate column so each stage can be inspected.
file='../input/yelp-restaurant-reviews-2021/restaurantrev.csv'
sample_chunks = []
for num, chunk in enumerate(pd.read_csv(file, parse_dates=['date'], chunksize=10)):
    chunk['clean_text'] = chunk['text'].apply(clean)
    chunk['text_length'] = chunk['clean_text'].apply(len)
    chunk['tagged_text'] = chunk['clean_text'].apply(prep)
    chunk['lemma_text'] = chunk['tagged_text'].apply(lemmatize)
    # Expand the per-review score dicts into columns in one shot; building a
    # DataFrame from the list of dicts avoids creating one Series per row.
    sentiment = chunk['lemma_text'].apply(analyze)
    scores = pd.DataFrame(sentiment.tolist(), index=chunk.index)
    sample_chunks.append(chunk.join(scores))
    if num == 2:
        break

# Concatenate once at the end instead of re-copying a growing frame per chunk.
reviews = pd.concat(sample_chunks) if sample_chunks else pd.DataFrame()

reviews.head()

In [5]:
# Inspect every pipeline stage for the review stored under index label 0.
row = 0
print('original:\n', reviews.loc[row, 'text'])
print('cleaned:\n', reviews.loc[row, 'clean_text'])
print('review length:', len(reviews.loc[row, 'clean_text']))
print('tagged:\n', reviews.loc[row, 'tagged_text'])
print('lemma-ed:\n', reviews.loc[row, 'lemma_text'])
print('sentiment:', reviews.loc[row, 'compound'])
reviews.info()


In [6]:
# process 4 and 5 star reviews
# The CSV is streamed in 500k-row chunks; each chunk is filtered to stars >= 4,
# cleaned, lemmatized and scored, and the pieces are concatenated once at the end.
file='../input/yelp-restaurant-reviews-2021/restaurantrev.csv'
processed_chunks = []
for chunk in pd.read_csv(file, parse_dates=['date'], chunksize=500000):
    chunk = chunk[chunk['stars']>=4]
    chunk = chunk.drop('review_id', axis=1)
    chunk['clean_text'] = chunk['text'].apply(clean)
    # word count is taken before stop-word removal / lemmatization
    chunk['text_length'] = chunk['clean_text'].apply(len)
    chunk['clean_text'] = chunk['clean_text'].apply(prep)
    chunk['clean_text'] = chunk['clean_text'].apply(lemmatize)
    # Expand the score dicts into columns via a list of dicts; this replaces
    # apply(pd.Series), which builds one Series object per review.
    sentiment = chunk['clean_text'].apply(analyze)
    scores = pd.DataFrame(sentiment.tolist(), index=chunk.index)
    processed_chunks.append(chunk.join(scores))

# A single concat avoids re-copying the accumulated frame on every chunk.
reviews = pd.concat(processed_chunks) if processed_chunks else pd.DataFrame()

reviews.head()

In [7]:
# Summary (columns, dtypes, non-null counts) of the processed 4 and 5 star reviews
reviews.info()

In [12]:
# Persist the processed 4 and 5 star reviews (relative path -> current working directory)
reviews.to_csv('fourplusstarsrev.csv')

<a href="./fourplusstarsrev.csv"> Download File </a>

In [17]:
# Preview the 4 star reviews. The condition is written inline because the
# `fourstar` mask is only defined in a later cell; referencing it here raises
# NameError on a fresh kernel.
reviews[reviews['stars'] == 4].head()

In [18]:
# Preview the 5 star reviews. The condition is written inline because the
# `fivestar` mask is only defined in a later cell; referencing it here raises
# NameError on a fresh kernel.
reviews[reviews['stars'] == 5].head()

In [26]:
# Boolean row masks over `reviews`, reused by the plotting and statistics cells
fourstar = reviews['stars'].eq(4)   # 4 star reviews
fivestar = reviews['stars'].eq(5)   # 5 star reviews
fourfive = reviews['stars'].ge(4)   # 4 & 5 star reviews

# Distribution of review length (word count) per star rating
sns.boxplot(data=reviews.loc[fourfive], x='stars', y='text_length')

In [27]:
# Review-length outliers: raw text of every review whose text_length is at least 800
reviews.loc[reviews['text_length']>=800,'text']

In [28]:
# take a look at text of a long, outlier review
# NOTE(review): 4055 is a hard-coded index label, presumably picked from the
# outlier listing above — confirm it is still present after re-processing.
reviews.loc[4055,'text']

In [20]:
# VADER compound sentiment: one histogram per star rating plus a side-by-side boxplot
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

hist_panels = [
    (axs[0], fourstar, '4 star reviews:\nCompound Sentiment'),
    (axs[1], fivestar, '5 star reviews:\nCompound Sentiment'),
]
for ax, mask, title in hist_panels:
    ax.hist(reviews.loc[mask, 'compound'])
    ax.set_title(title)

sns.boxplot(data=reviews.loc[fourfive], x='stars', y='compound', ax=axs[2])


In [21]:
# VADER positive sentiment: one histogram per star rating plus a side-by-side boxplot
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

hist_panels = [
    (axs[0], fourstar, '4 star reviews:\nPositive Sentiment'),
    (axs[1], fivestar, '5 star reviews:\nPositive Sentiment'),
]
for ax, mask, title in hist_panels:
    ax.hist(reviews.loc[mask, 'pos'])
    ax.set_title(title)

sns.boxplot(data=reviews.loc[fourfive], x='stars', y='pos', ax=axs[2])


In [13]:
# VADER neutral sentiment: one histogram per star rating plus a side-by-side boxplot
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

hist_panels = [
    (axs[0], fourstar, '4 star reviews:\nNeutral Sentiment'),
    (axs[1], fivestar, '5 star reviews:\nNeutral Sentiment'),
]
for ax, mask, title in hist_panels:
    ax.hist(reviews.loc[mask, 'neu'])
    ax.set_title(title)

sns.boxplot(data=reviews.loc[fourfive], x='stars', y='neu', ax=axs[2])


In [14]:
# Summary statistics for the numeric columns of the 4 and 5 star reviews
reviews.describe()

In [16]:
# Summary statistics of the neutral sentiment score for 4 star reviews
reviews.loc[fourstar,'neu'].describe()

In [24]:
# Summary statistics of the neutral sentiment score for 5 star reviews
reviews.loc[fivestar,'neu'].describe()

In [3]:
# cohen's d test for effect size
# 0.2 small, 0.5 moderate, 0.8 large
def cohens_d(mean1, std1, n1, mean2, std2, n2):
    """Cohen's d for two independent groups, using the pooled standard deviation.

    Parameters
    ----------
    mean1, mean2 : float
        Group means.
    std1, std2 : float
        Group standard deviations.
    n1, n2 : int
        Group sizes; ``n1 + n2`` must exceed 2.

    Returns
    -------
    float
        ``(mean1 - mean2)`` divided by the square root of the pooled variance
        ``((n1-1)*std1**2 + (n2-1)*std2**2) / (n1+n2-2)``.
    """
    pooled_var = ((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)
    return (mean1 - mean2) / np.sqrt(pooled_var)


# NOTE(review): these look like the mean / std / count of the 'neu' column
# copied from the describe() outputs above (group 1 = 4 star, group 2 = 5 star)
# — confirm, and refresh them if the data is re-processed.
m1 = 0.6104529
m2 = 0.5653222
s1 = 0.1181829
s2 = 0.1277508
v1 = s1**2
v2 = s2**2
n1 = 1460390
n2 = 2246079
d = cohens_d(m1, s1, n1, m2, s2, n2)
d

In [1]:
# t-test between neutral sentiment of four and five star reviews.
# equal_var=False: the two groups are not assumed to share a variance.
# (ttest_ind is imported with the other dependencies in the import cell.)
ttest_ind(reviews.loc[fourstar,'neu'], reviews.loc[fivestar,'neu'], equal_var=False)

In [None]:
# process 1 star reviews
# Same pipeline as for the 4 and 5 star reviews, restricted to stars == 1.
file='../input/yelp-restaurant-reviews-2021/restaurantrev.csv'
onestar_chunks = []
for chunk in pd.read_csv(file, parse_dates=['date'], chunksize=500000):
    chunk = chunk[chunk['stars']==1]
    chunk = chunk.drop('review_id', axis=1)
    chunk['clean_text'] = chunk['text'].apply(clean)
    # word count is taken before stop-word removal / lemmatization
    chunk['text_length'] = chunk['clean_text'].apply(len)
    chunk['clean_text'] = chunk['clean_text'].apply(prep)
    chunk['clean_text'] = chunk['clean_text'].apply(lemmatize)
    # Expand the score dicts into columns via a list of dicts; this replaces
    # apply(pd.Series), which builds one Series object per review.
    sentiment = chunk['clean_text'].apply(analyze)
    scores = pd.DataFrame(sentiment.tolist(), index=chunk.index)
    onestar_chunks.append(chunk.join(scores))

# A single concat avoids re-copying the accumulated frame on every chunk.
onestarreviews = pd.concat(onestar_chunks) if onestar_chunks else pd.DataFrame()

onestarreviews.head()

In [None]:
# Summary statistics for the numeric columns of the 1 star reviews
onestarreviews.describe()

In [None]:
# The 1 star rows live in `onestarreviews`; `reviews` was filtered to
# stars >= 4, so a mask built on it selects nothing and the histogram is empty.
onestar = (onestarreviews['stars']==1) # 1 star reviews

fig, axs = plt.subplots(1,3,figsize=(15,5))

axs[0].hist(onestarreviews.loc[onestar,'neu'])
axs[0].set_title('1 star reviews:\nNeutral Sentiment')


In [None]:
# t-test between neutral sentiment of one and five star reviews
# The 1 star rows are selected with a condition built on `onestarreviews`
# itself: a boolean mask derived from `reviews` carries a different index and
# cannot be aligned with `onestarreviews`.
ttest_ind(
    onestarreviews.loc[onestarreviews['stars']==1,'neu'],
    reviews.loc[fivestar,'neu'],
    equal_var=False,
)