In [None]:
# -----------------------------
# 1. Import libraries and files
# -----------------------------

# Import libraries
import pandas as pd

# Input locations (relative to the notebook's working directory)
business_csv = '../datasets/business3.csv'
review_parquet = "../datasets/parquet_file/review_filtered.parquet"

# Business table, read from CSV
df_b = pd.read_csv(business_csv)

# Review table, read from Parquet
df_r = pd.read_parquet(review_parquet)

In [2]:
# -----------------------------
# 2. Classify store_status
# -----------------------------

# Working table for the classification: only the columns the rules below need.
# .copy() makes df_status independent of df_b, so adding columns to it later
# can never write into (or raise SettingWithCopyWarning about) a slice of df_b.
df_status = df_b[['business_id', 'name', 'is_open', 'stars', 'review_count']].copy()

In [3]:
# Ensure date column is in datetime format (a no-op when it already is)
df_r['date'] = pd.to_datetime(df_r['date'])

# Earliest review date per business_id: exactly one row per business_id
first_review_dates = (
    df_r
    .groupby('business_id', as_index=False)
    .agg(first_review_date=('date', 'min'))
)

# Merge with df_status to include earliest review date.
# - A first_review_date column left over from a previous run of this cell is
#   dropped first; otherwise a re-run would produce first_review_date_x / _y
#   and the column name used by the classification step would disappear.
# - how='left' keeps businesses without any review; their date is NaT.
# - validate guards the one-row-per-business assumption on the right side.
df_status = (
    df_status
    .drop(columns='first_review_date', errors='ignore')
    .merge(first_review_dates, on='business_id', how='left', validate='many_to_one')
)

# Check results
df_status.head()

Unnamed: 0,business_id,name,is_open,stars,review_count,first_review_date
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,1,4.0,80,2008-03-09 00:36:56
1,il_Ro8jwPlHresjw9EGmBg,Denny's,1,2.5,28,2014-03-16 13:59:31
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,0,4.5,100,2009-12-03 00:38:28
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,1,4.0,245,2017-11-25 02:26:49
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1,4.5,205,2013-12-10 22:13:17


In [4]:
def classify_store_status(df, cutoff_date, min_review, rating_threshold):
    """Return a store_status label for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``is_open``, ``review_count``, ``stars`` and
        ``first_review_date``.
    cutoff_date : datetime-like
        Open stores whose first review is strictly before this date are
        ``open_old``; on or after it they are ``open_new``.
    min_review : int or float
        Closed stores with ``review_count`` below this value are ``close_real``.
    rating_threshold : float
        Closed stores with ``stars`` at or below this value are ``close_real``.

    Returns
    -------
    pandas.Series
        Labels indexed like ``df`` and named ``store_status``. Values are one of
        ``close_real``, ``close_external``, ``open_old``, ``open_new`` or
        ``unknown``. A row stays ``unknown`` when ``is_open`` is neither 0 nor 1,
        or when the store is open but ``first_review_date`` is missing (both
        date comparisons are False for NaT).
    """
    closed = df['is_open'] == 0
    opened = df['is_open'] == 1

    # Either condition alone is enough for a closed store to count as close_real
    few_reviews_or_low_rating = (
        (df['review_count'] < min_review) | (df['stars'] <= rating_threshold)
    )

    status = pd.Series('unknown', index=df.index, name='store_status')

    # The four masks are mutually exclusive, so assignment order does not matter
    status.loc[closed & few_reviews_or_low_rating] = 'close_real'
    status.loc[closed & ~few_reviews_or_low_rating] = 'close_external'
    status.loc[opened & (df['first_review_date'] < cutoff_date)] = 'open_old'
    status.loc[opened & (df['first_review_date'] >= cutoff_date)] = 'open_new'
    return status


# Define cutoff conditions
cutoff_date = pd.to_datetime("2019-01-19")  # boundary between open_old and open_new
min_review = 50  # closed stores with fewer reviews than this are close_real

# 70th percentile of the stars column, used as the rating threshold
rating_threshold = df_status['stars'].quantile(0.7)

# Assign store_status based on the conditions above
df_status['store_status'] = classify_store_status(
    df_status,
    cutoff_date=cutoff_date,
    min_review=min_review,
    rating_threshold=rating_threshold,
)

# Check results
df_status['store_status'].value_counts()


store_status
open_old          12588
close_real         5041
open_new           1392
close_external      513
Name: count, dtype: int64

In [5]:
# Number of businesses per store_status label, most frequent first
df_status.loc[:, 'store_status'].value_counts()

store_status
open_old          12588
close_real         5041
open_new           1392
close_external      513
Name: count, dtype: int64

In [6]:
# Merge store_status from df_status into df_b based on business_id.
# A store_status column left over from a previous run of this cell is dropped
# first; otherwise a re-run would yield store_status_x / store_status_y and
# those suffixed columns would end up in the exported file.
df_b = (
    df_b
    .drop(columns='store_status', errors='ignore')
    .merge(df_status[['business_id', 'store_status']], on='business_id', how='left')
)

In [7]:
# Preview the first five rows of the enriched business table
df_b.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,stability_score,loyalty_score,reliability_score,store_status
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,62.925875,56.239278,open_old
1,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227.0,39.637133,-86.127217,2.5,28,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","American (Traditional), Restaurants, Diners, B...","{'Friday': '6:0-22:0', 'Monday': '6:0-22:0', '...",26.924359,51.449885,41.83536,open_old
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771.0,27.916116,-82.760461,4.5,100,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Food, Delis, Italian, Bakeries, Restaurants","{'Friday': '10:0-20:0', 'Monday': '10:0-18:0',...",60.376124,64.533041,40.716215,close_external
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106.0,39.953949,-75.143226,4.0,245,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Sushi Bars, Restaurants, Japanese","{'Friday': '13:30-23:0', 'Monday': None, 'Satu...",65.223712,62.774337,41.372722,open_old
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147.0,39.943223,-75.162568,4.5,205,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Korean, Restaurants","{'Friday': '11:30-20:30', 'Monday': '11:30-20:...",74.454651,62.12183,39.758111,open_old


In [8]:
# Persist the enriched business table; the row index is not written
output_csv = '../datasets/business4.csv'
df_b.to_csv(output_csv, index=False)