In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Read the per-user table; 'yelping_since' is parsed to datetime at load time.
userdf = pd.read_csv(
    'users.csv',
    parse_dates=['yelping_since'],
)
userdf.head()

In [29]:
# Print pandas' concise summary of the user table (columns, dtypes, non-null counts).
userdf.info()

In [30]:
# Descriptive statistics of the user table, using pandas' default column selection.
userdf.describe()

In [31]:
# Inspect the rows whose recorded review_count is exactly zero.
userdf.loc[userdf['review_count'].eq(0)]

In [32]:
# An average star rating is present for these users, so their true review
# count is probably at least 1 rather than 0.
# Boolean mask: True where review_count is zero before the overwrite below.
zerorev = userdf['review_count'].eq(0)
# Overwrite the zero counts with 1, then show the affected rows.
userdf.loc[zerorev, 'review_count'] = 1
userdf.loc[zerorev]

In [33]:
# Headline totals for the user table (review_count as currently stored in userdf).
n_reviews_total = userdf['review_count'].sum()
n_users_total = userdf.shape[0]
print(f'total reviews: {n_reviews_total}')
print(f'total users: {n_users_total}')

In [34]:
# Violin plot of reviews per user, drawn on an explicit Axes so the figure
# carries a title and axis label like the other plots in this notebook.
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(x=userdf['review_count'], ax=ax)
ax.set_title('Distribution of Review Count per User')
ax.set_xlabel('review_count');

In [35]:
# Add per-user tenure and review-rate columns.
REFERENCE_YEAR = 2022  # year against which membership length is measured

# Guarded so the cell can be re-run after 'yelping_since' has already been dropped.
if 'yelping_since' in userdf.columns:
    userdf['start_year'] = userdf['yelping_since'].dt.year  # year of 'yelping_since'
    userdf = userdf.drop(columns='yelping_since')

# A user whose start_year equals REFERENCE_YEAR would get 0 years of membership,
# making reviews/year a division by zero (inf). Count any tenure shorter than
# one year as one full year instead.
userdf['years_member'] = (REFERENCE_YEAR - userdf['start_year']).clip(lower=1)
userdf['reviews/year'] = userdf['review_count'] / userdf['years_member']  # reviews per year of membership
userdf.head()

In [36]:
# Distribution of review_count on a log10 scale:
# plain histogram on the left, cumulative normalized step histogram on the right.
log_review_count = np.log10(userdf['review_count'])

fig, axs = plt.subplots(1, 2, figsize=(10, 5))
hist_ax, cumulative_ax = axs

hist_ax.hist(log_review_count)
hist_ax.set_title('log10 review count')
hist_ax.set_xlabel('log(review_count)')

cumulative_ax.hist(log_review_count, histtype='step', density=True, cumulative=True)
cumulative_ax.set_title('Cumulative')
cumulative_ax.set_xlabel('log(review_count)')

In [37]:
# almost 60% of people have 10 reviews or fewer
fewrev = userdf['review_count'].le(10)        # users with at most 10 reviews
single_review = userdf['review_count'].eq(1)  # users with exactly one review

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: average_stars grouped by review_count for the <=10-review users.
few_review_users = userdf.loc[fewrev, ['review_count', 'average_stars']]
sns.boxplot(x=few_review_users['review_count'], y=few_review_users['average_stars'], ax=axs[0])
axs[0].set_title('Users <=10 reviews: \n Distribution of Average Rating')

# Right panel: average_stars of the users who have exactly one review.
sns.histplot(x=userdf.loc[single_review, 'average_stars'], ax=axs[1])
axs[1].set_title('Users with One Review: \n Distribution of Rating')


In [38]:
# why do people start yelping? generally to share a really good or bad experience
# ~50% of users with one review rate the business 4+ stars
# ~25% rate business 1 star
# a minority of users review only to share bad experiences (their average rating stays below 2 stars even as review count increases)

In [39]:
# small fraction of users have 100+ reviews
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
rate_col = 'reviews/year'

# Panel 0: log10 yearly review rate of users with at least 100 reviews in total.
manyrev = userdf['review_count'].ge(100)
manyrevperyr = np.log10(userdf.loc[manyrev, rate_col])

sns.boxplot(x=manyrevperyr, ax=axs[0])
axs[0].set_title('Users >=100 reviews: \n log10 Reviews per Year')
axs[0].set_xlabel('log10(Reviews/Year)')

# Panel 1: active users, i.e. between 10 and 30 reviews per year (both bounds inclusive).
activeuser = userdf[rate_col].between(10, 30)
active_users = userdf.loc[activeuser, [rate_col, 'average_stars']]
sns.kdeplot(x=active_users[rate_col], y=active_users['average_stars'], fill=True, ax=axs[1])
axs[1].set_title('Users 10-30 Reviews/Year: \n Distribution of Average Rating')
axs[1].set_xlabel('Reviews/Year')

# Panel 2: possible bots/fake reviewers, i.e. at least 100 reviews per year.
botrevperyr = userdf[rate_col].ge(100)
bot_users = userdf.loc[botrevperyr, [rate_col, 'average_stars']]
sns.kdeplot(x=np.log10(bot_users[rate_col]), y=bot_users['average_stars'], fill=True, ax=axs[2])
axs[2].set_title('Users >= 100 Reviews/Year: \n Distribution of Average Rating')
axs[2].set_xlabel('log10(Reviews/Year)')

In [40]:
# Descriptive statistics for the active-user subset (10-30 reviews/year mask).
userdf.loc[activeuser].describe()

In [41]:
# Descriptive statistics for the potential bot/fake-reviewer subset (>=100 reviews/year mask).
userdf.loc[botrevperyr].describe()

In [42]:
# Read the restaurant review table; the 'date' column is parsed to datetime at load time.
reviewdf = pd.read_csv(
    'restaurantrev.csv',
    parse_dates=['date'],
)
reviewdf.head()

In [43]:
# Drop the review_id column from the review table.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because the column has already been removed from reviewdf.
reviewdf = reviewdf.drop(columns=['review_id'], errors='ignore')
reviewdf.info()

In [44]:
# look at reviews of users with >=100 reviews/year
# ~80k reviews only comprise a small fraction of 5.5M restaurant reviews
bot_user_ids = userdf.loc[botrevperyr, 'user_id']
botrev = reviewdf[reviewdf['user_id'].isin(bot_user_ids)]
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
botrev.info()
botrev.head()

In [45]:
# active reviewers with 10-30 reviews/year
# almost 1M reviews comprises almost 20% of 5.5M restaurant reviews
active_user_ids = userdf.loc[activeuser, 'user_id']
activerev = reviewdf[reviewdf['user_id'].isin(active_user_ids)]
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
activerev.info()
activerev.head()

In [46]:
# Star-rating histograms for three reviewer groups, side by side.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
few_ax, active_ax, bot_ax = axs

# Reviews written by users selected by the fewrev mask (<=10 reviews).
few_review_user_ids = userdf.loc[fewrev, 'user_id']
normrev = reviewdf[reviewdf['user_id'].isin(few_review_user_ids)]

few_ax.hist(x=normrev['stars'])
few_ax.set_title('Rating Distribution:\n Users with <=10 reviews')
few_ax.set_xlabel('Stars')

active_ax.hist(x=activerev['stars'])
active_ax.set_title('Rating Distribution:\n Users with 10-30 reviews/yr')
active_ax.set_xlabel('Stars')

bot_ax.hist(x=botrev['stars'])
bot_ax.set_title('Rating Distribution:\n Users with >=100 reviews/yr')
bot_ax.set_xlabel('Stars')


In [47]:
# Load the restaurant table.
restaurantdf = pd.read_csv('restaurants.csv')

# number of potential bots in reviews
bot_reviewer_ids = botrev['user_id'].unique()
print('# unique users with >=100 reviews/year in reviews:', len(bot_reviewer_ids))

# summary of restaurants reviewed by potential bots
reviewed_by_bots = restaurantdf['business_id'].isin(botrev['business_id'])
botrest = restaurantdf[reviewed_by_bots]
botrest.describe()

In [48]:
# Descriptive statistics over all restaurants in restaurantdf, for comparison
# with a subset summary such as botrest.describe().
restaurantdf.describe()