In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
from data_analysis.utils import user
from data.paths.parquet_paths import USER, REVIEW

In [2]:
user_df = pd.read_parquet(str(USER))
review_df = pd.read_parquet(str(REVIEW))

In [3]:
user_df = user.prepare_user_df(user_df)
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   user_id             10000 non-null  object        
 1   name                10000 non-null  object        
 2   review_count        10000 non-null  int32         
 3   yelping_since       10000 non-null  datetime64[ns]
 4   friends             10000 non-null  object        
 5   useful              10000 non-null  int32         
 6   funny               10000 non-null  int32         
 7   cool                10000 non-null  int32         
 8   fans                10000 non-null  int32         
 9   elite               0 non-null      object        
 10  average_stars       10000 non-null  float32       
 11  compliment_hot      10000 non-null  int32         
 12  compliment_more     10000 non-null  int32         
 13  compliment_profile  10000 non-null  int32      

In [4]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    100000 non-null  object 
 1   user_id      100000 non-null  object 
 2   business_id  100000 non-null  object 
 3   stars        100000 non-null  float32
 4   date         100000 non-null  object 
 5   text         100000 non-null  object 
 6   useful       100000 non-null  int32  
 7   funny        100000 non-null  int32  
 8   cool         100000 non-null  int32  
dtypes: float32(1), int32(3), object(5)
memory usage: 5.3+ MB


In [5]:
user_columns = [
    'user_id',
    'name',
    'review_count',
    'yelping_since',
    # 'useful',
    # 'funny',
    # 'cool',
    'fans',
    'average_stars'
]


review_user_df = review_df.join(user_df[user_columns].set_index('user_id'), on='user_id')
review_user_df['date'] = pd.to_datetime(review_user_df['date'])

In [6]:
review_user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   review_id      100000 non-null  object        
 1   user_id        100000 non-null  object        
 2   business_id    100000 non-null  object        
 3   stars          100000 non-null  float32       
 4   date           100000 non-null  datetime64[ns]
 5   text           100000 non-null  object        
 6   useful         100000 non-null  int32         
 7   funny          100000 non-null  int32         
 8   cool           100000 non-null  int32         
 9   name           8279 non-null    object        
 10  review_count   8279 non-null    float64       
 11  yelping_since  8279 non-null    datetime64[ns]
 12  fans           8279 non-null    float64       
 13  average_stars  8279 non-null    float32       
dtypes: datetime64[ns](2), float32(2), float64(2), int32(3

In [7]:
review_user_df = review_user_df.loc[~review_user_df['name'].isna()].copy()

In [8]:
review_user_df.sample(2)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,name,review_count,yelping_since,fans,average_stars
22011,gmntFOx-fo2Dmq6kOstdhA,2HoYHTUjWfmY3vHruQlDhw,cTJjTKz2huGZ-ElScC2pSw,4.0,2011-11-02,"I really enjoyed Windsor! Very cool, urban atm...",5,1,3,Danielle,150.0,2009-04-12 22:39:13,10.0,4.17
56623,yIflNJy5-XsVoW6XCoNQew,j6wLUT0ZXi-x0otelYIFpA,hoTpqmlus91I6B9MjKmdXw,2.0,2013-05-31,I was hungry. I was desperate. I prepared a lu...,1,0,0,Elaine,996.0,2010-06-07 15:38:54,59.0,3.59


In [9]:
youth_threshold = timedelta(days=60)
early_reviews = review_user_df[review_user_df.date - review_user_df.yelping_since < youth_threshold]

early_reviews[['date', 'yelping_since', 'stars', 'user_id']].sample(10)

early_reviews = early_reviews.groupby('user_id').agg(list)
early_reviews.reset_index(inplace=True)
early_reviews['count'] = early_reviews['name'].apply(lambda x: len(x))
early_reviews[early_reviews['count'] >= 3].sample(3)

Unnamed: 0,user_id,review_id,business_id,stars,date,text,useful,funny,cool,name,review_count,yelping_since,fans,average_stars,count
104,9xCdyIxzfV6J-Out6XR2FA,"[S2JWNGvtc1UUyMDzLzsmCQ, ePp7YVfjmfiZ9-1mf_Dwj...","[UkqdcwYv6_c-yW1JPCKlUA, r84VYMFrJIgs3C_yi96Ux...","[3.0, 5.0, 4.0]","[2006-08-08 00:00:00, 2006-08-09 00:00:00, 200...",[Straight outta NY. Get decent deli and all t...,"[0, 4, 1]","[1, 0, 0]","[1, 3, 1]","[Josh, Josh, Josh]","[96.0, 96.0, 96.0]","[2006-08-05 22:09:03, 2006-08-05 22:09:03, 200...","[15.0, 15.0, 15.0]","[3.2200000286102295, 3.2200000286102295, 3.220...",3
82,7UBDZmeF101orAc72KeofQ,"[_YvTJMCGwr4Q0J5tfgxTig, LPSefImkbSoeZqqUZM_HD...","[1HD5iUUfVJDbfEBIn9yVhw, ibkwcLZwaJ1pnPJVaI_-u...","[3.0, 3.0, 4.0]","[2009-03-08 00:00:00, 2009-03-08 00:00:00, 200...","[Since this place is next to Lee Lee market, I...","[0, 0, 1]","[0, 0, 1]","[0, 0, 1]","[Akiko, Akiko, Akiko]","[348.0, 348.0, 348.0]","[2009-03-07 23:04:13, 2009-03-07 23:04:13, 200...","[22.0, 22.0, 22.0]","[3.5399999618530273, 3.5399999618530273, 3.539...",3
491,ppATz6PAs8pGi4VcfIrUSw,"[p1moq5KpzsXPYMI-AKYOdA, RG0zVmb6Y0J8Vxol7nbi5...","[sbPuQIxzYsEBWXyzMipIxg, VnOGWODJ7ZREYij0rpB-X...","[3.0, 5.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0]","[2009-10-25 00:00:00, 2009-10-29 00:00:00, 200...",[Damnit! They piss me off because the pizza i...,"[0, 1, 0, 4, 1, 0, 0, 0]","[1, 0, 0, 2, 0, 0, 0, 0]","[0, 0, 0, 5, 0, 0, 0, 0]","[Lindsay, Lindsay, Lindsay, Lindsay, Lindsay, ...","[485.0, 485.0, 485.0, 485.0, 485.0, 485.0, 485...","[2009-09-11 19:46:53, 2009-09-11 19:46:53, 200...","[35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0]","[3.9000000953674316, 3.9000000953674316, 3.900...",8


In [10]:
review_bursts = review_user_df.groupby(['user_id', 'date']).agg(list)
review_bursts.reset_index(inplace=True)
review_bursts['count'] = review_bursts['name'].apply(lambda x: len(x))
review_bursts[review_bursts['count'] >= 3].sample(3)

Unnamed: 0,user_id,date,review_id,business_id,stars,text,useful,funny,cool,name,review_count,yelping_since,fans,average_stars,count
735,5Kbn1FjXmS7Jypz3_ybt_Q,2006-01-14,"[_mKKcQEcClgg69pLql-2CQ, gWWX-bdEpQXqUkVAbdn6H...","[VgGO13Fx7dQj4zg3IuVDLw, Dw4qOCZYT3ge2lGvXmet7...","[2.0, 4.0, 3.0, 4.0]",[Definitely a mixed crowd as stated before. D...,"[0, 0, 0, 2]","[0, 0, 0, 1]","[0, 0, 0, 1]","[Kelly, Kelly, Kelly, Kelly]","[91.0, 91.0, 91.0, 91.0]","[2006-01-14 04:33:56, 2006-01-14 04:33:56, 200...","[10.0, 10.0, 10.0, 10.0]","[4.03000020980835, 4.03000020980835, 4.0300002...",4
4937,c8qFkI_VusWo0xZvkjfBWQ,2005-05-26,"[aG8DkGSVcvIchm7mjDjzYw, euU8avFp6j1noN70fulyl...","[ODTI9W7fHMKNWZ3g8VcFUg, gn8cZOxQIZNUeiHcJ0WxM...","[3.0, 3.0, 3.0, 5.0, 3.0]",[I thought the food here to be a little on the...,"[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0]","[1, 0, 1, 0, 0]","[matthew, matthew, matthew, matthew, matthew]","[366.0, 366.0, 366.0, 366.0, 366.0]","[2005-05-14 16:32:28, 2005-05-14 16:32:28, 200...","[21.0, 21.0, 21.0, 21.0, 21.0]","[3.8299999237060547, 3.8299999237060547, 3.829...",5
100,-pXs08gJq9ExIk275YLvPg,2008-12-09,"[oYy5PYC8seNfkJkVWxG7Iw, yCaEi3qWZcy22JxB6GaHH...","[6z8niWBqVsjyenVG5_LtDw, DftK-qzebIR829HozNYfg...","[4.0, 4.0, 4.0, 4.0]",[the opera house is a really fun venue and con...,"[0, 1, 3, 1]","[0, 0, 0, 2]","[0, 0, 0, 1]","[Jimi, Jimi, Jimi, Jimi]","[392.0, 392.0, 392.0, 392.0]","[2008-10-30 22:55:06, 2008-10-30 22:55:06, 200...","[18.0, 18.0, 18.0, 18.0]","[3.8399999141693115, 3.8399999141693115, 3.839...",4


In [11]:
# ~14 years * ~50 weeks
bins_count = 700
review_user_df['date_bin'] = pd.cut(review_user_df['date'], bins=bins_count, labels=range(bins_count))

review_bursts_binned = review_user_df.groupby(['user_id', 'date_bin']).agg(list)
review_bursts_binned.reset_index(inplace=True)
review_bursts_binned = review_bursts_binned.dropna()
review_bursts_binned['count'] = review_bursts_binned['name'].apply(lambda x: len(x))
review_bursts_binned[review_bursts_binned['count'] >= 3].sample(3)

Unnamed: 0,user_id,date_bin,review_id,business_id,stars,date,text,useful,funny,cool,name,review_count,yelping_since,fans,average_stars,count
2051311,benfF2qIwxDz7TCeF1XWIA,311,"[jsPyjDJcX4QwwDuN0asQ0A, dqJdyOJTyVzDMD46ITV2G...","[k7VQD0pbdFN0Ju0yNVrtZQ, 7L7frgS9I4KeTkqPXh7w2...","[2.0, 5.0, 2.0]","[2011-03-23 00:00:00, 2011-03-23 00:00:00, 201...",[the waitresses are salty.\nthe food is medioc...,"[2, 3, 1]","[1, 0, 1]","[0, 1, 0]","[Tom, Tom, Tom]","[60.0, 60.0, 60.0]","[2011-03-23 00:21:53, 2011-03-23 00:21:53, 201...","[5.0, 5.0, 5.0]","[4.099999904632568, 4.099999904632568, 4.09999...",3
522450,97TjZvvXtVZY91NEPZXZBQ,250,"[w69YNsDzSABAFXrVqny4BQ, gK0JHvw84IS-BJYCvnV2A...","[LpJvitnw7vU5UW-3KE9kjA, XZbuPXdyA0ZtTu3AzqtQh...","[4.0, 3.0, 4.0, 5.0]","[2010-01-11 00:00:00, 2010-01-09 00:00:00, 201...",[If you had to pick one spot to be held hostag...,"[0, 2, 1, 2]","[0, 0, 1, 2]","[0, 0, 1, 3]","[E.C., E.C., E.C., E.C.]","[50.0, 50.0, 50.0, 50.0]","[2010-01-09 16:10:29, 2010-01-09 16:10:29, 201...","[4.0, 4.0, 4.0, 4.0]","[3.9000000953674316, 3.9000000953674316, 3.900...",4
1836914,Yej5B4nd8PqpHMQcmCTDrg,114,"[AqajRN1ywUS2d8drNaCjVg, O_71qjvZBew4v6VsuIMPP...","[dPGs5b0N9MarZjVgQVelGQ, s4xr81NFv82F8ltxLdrj0...","[2.0, 4.0, 3.0]","[2007-05-23 00:00:00, 2007-05-24 00:00:00, 200...",[So I've admitted in the Biltmore location's r...,"[2, 1, 2]","[4, 0, 2]","[1, 0, 0]","[Stevey, Stevey, Stevey]","[553.0, 553.0, 553.0]","[2006-09-14 17:37:12, 2006-09-14 17:37:12, 200...","[63.0, 63.0, 63.0]","[3.7300000190734863, 3.7300000190734863, 3.730...",3


In [12]:
early_reviewers = set(early_reviews['user_id'])
oneday_burst_reviewers = set(review_bursts['user_id'])
oneweek_burst_reviewers = set(review_bursts_binned['user_id'])

print(f'1d / 1w: {len(oneday_burst_reviewers.intersection(oneweek_burst_reviewers))}')
print(f'early / 1w: {len(early_reviewers.intersection(oneweek_burst_reviewers))}')
print(f'early / 1d: {len(early_reviewers.intersection(oneday_burst_reviewers))}')

1d / 1w: 4692
early / 1w: 558
early / 1d: 558


In [81]:
# mark users as review_bursters (let's peek early and daily)

review_bursters = set(pd.concat([early_reviews[early_reviews['count'] >= 3]['user_id'], 
                             review_bursts['user_id']])
                             .drop_duplicates())

user_df['is_burster'] = user_df['user_id'].apply(lambda x: x in review_bursters)
user_df[['user_id', 'is_burster']].sample(10)

Unnamed: 0,user_id,is_burster
2149,63iMAf0pB9au28uOteas3Q,True
6645,IO7Xq9WUMZBhkRrEDKvnIg,False
5443,1FAKs3CAQQLBgdB23GeDIQ,False
8753,QOlY5FYliVVvkgqgtjdP_w,True
3202,sN0zLCtwP9JVcsrPB86G7Q,True
99,6s-g2vFu12OemhiK3FJuOQ,False
7445,o_1beq3LiW_JH_rgUNBoFA,True
6316,WJa0xdQQTa4WxDSc66flJw,False
522,c3JovNdFoqK7-gw-rTG8CQ,True
9422,33v8c5ftrKuDGTw1ZQ2toQ,True


In [None]:
# create regression model on bursters

# user_reg_ready = 