In [155]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

import numpy as np
import json

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:

# Input CSVs (the paths look like a Colab Google Drive mount).
business_csv = "/content/drive/MyDrive/ids_data/nv_business.csv"
checkin_csv = "/content/drive/MyDrive/ids_data/nv_checkin.csv"

business_df, checkin_df = (pd.read_csv(csv_path) for csv_path in (business_csv, checkin_csv))

# --- cleaning business.csv ---
# Normalise city names: lower-case, trim surrounding whitespace, drop commas.
business_df["city"] = (
    business_df["city"]
    .str.lower()
    .str.strip()
    .str.replace(',', '')
)

# Spellings that are kept exactly as they are.
canonical_cities = [
    'reno', 'sparks', 'fernley', 'virginia city', 'spanish springs',
    'sun valley', 'verdi', 'washoe valley', 'vc highlands', 'cold springs',
    'washoe', 'mount laurel', 'carson city', 'mccarran', 'new washoe city',
    'stead',
]
# Variant spellings that are folded into one of the canonical names.
city_aliases = {
    'reno ap': 'reno',
    'reno nevada': 'reno',
    'nevada': 'reno',
    'reno city': 'reno',
    'south reno': 'reno',
    'reno sparks': 'sparks',
}
city_mapping = {**{name: name for name in canonical_cities}, **city_aliases}

# Any city that is not a key of city_mapping becomes NaN here.
business_df['city'] = business_df['city'].map(city_mapping)

business_df["categories"] = business_df["categories"].str.lower()

# One row per business_id with the number of check-in rows it has.
checkins_per_business = checkin_df.groupby('business_id').size()
checkin_counts = checkins_per_business.reset_index(name='checkin_count')

In [None]:
# merge count with business.csv
# pd.merge defaults to an inner join, so only business_ids present in both
# frames are kept.
merged_df = pd.merge(business_df, checkin_counts, on='business_id')
print(merged_df.describe())
print(merged_df.columns)
print(len(merged_df))

# Rows missing any of the model inputs are dropped.
merged_df = merged_df.dropna(subset=['city', 'categories', 'hours'])
print(len(merged_df))

# drop unnecessary columns
# The explicit .copy() makes this an independent frame, so assigning new
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
merged_df = merged_df[['city', 'categories', 'hours', 'checkin_count']].copy()
print(merged_df.describe())


        postal_code     latitude    longitude        stars  review_count  \
count   6568.000000  6571.000000  6571.000000  6571.000000   6571.000000   
mean   89482.012485    39.510522  -119.781366     3.700807     60.390808   
std     1015.009297     0.048285     0.556086     0.950229    128.638930   
min     8054.000000    39.301508  -120.026076     1.000000      5.000000   
25%    89501.000000    39.484533  -119.811256     3.000000     10.000000   
50%    89502.000000    39.519103  -119.788540     4.000000     22.000000   
75%    89511.000000    39.532832  -119.758821     4.500000     56.000000   
max    98501.000000    39.980331   -74.858983     5.000000   3345.000000   

           is_open  checkin_count  
count  6571.000000    6571.000000  
mean      0.800030     131.598082  
std       0.400008     419.942677  
min       0.000000       1.000000  
25%       1.000000       6.000000  
50%       1.000000      23.000000  
75%       1.000000      92.000000  
max       1.000000   13420.

In [None]:
# preview of the vocabularies that will be one-hot encoded
cities_list = merged_df['city'].unique()

# split every comma-separated category string, flatten to one label per row,
# trim the labels, then de-duplicate
category_tokens = merged_df['categories'].str.split(',').explode()
categories_list = category_tokens.str.strip().unique()

for vocabulary in (cities_list, categories_list):
    print(len(vocabulary))
print(cities_list)

16
995
['reno' 'fernley' 'sparks' 'virginia city' 'verdi' 'spanish springs'
 'sun valley' 'washoe valley' 'vc highlands' 'cold springs' 'washoe'
 'mount laurel' 'carson city' 'mccarran' 'new washoe city' 'stead']


In [None]:
# create one-hot encoding for categories
# each comma-separated category string becomes a list of trimmed labels
split_categories = merged_df['categories'].str.split(',')
merged_df['categories_lst'] = split_categories.apply(lambda parts: [part.strip() for part in parts])

mlb = MultiLabelBinarizer()
categories_encoded = mlb.fit_transform(merged_df['categories_lst'])
categories_mlb = mlb.classes_

# sanity check: how often each value occurs in the indicator matrix, and its shape
unique, counts = np.unique(categories_encoded, return_counts=True)
print({value: count for value, count in zip(unique, counts)})
print(categories_encoded.shape)

# combine categories encoding with merged_df (same index, so rows line up)
categories_df = pd.DataFrame(categories_encoded, index=merged_df.index, columns=categories_mlb)
merged_df = pd.concat([merged_df, categories_df], axis=1)
merged_df[categories_mlb] = merged_df[categories_mlb].fillna(0)

print(merged_df.columns)

{0: 6507183, 1: 29967}
(6570, 995)
Index(['city', 'categories', 'hours', 'checkin_count', 'categories_lst',
       '& probates', '3d printing', 'acai bowls', 'accessories', 'accountants',
       ...
       'wine bars', 'wine tasting classes', 'wine tasting room', 'wine tours',
       'wineries', 'women's clothing', 'wraps', 'yelp events', 'yoga', 'zoos'],
      dtype='object', length=1000)


In [None]:
# create one-hot encoding for cities: one indicator column per distinct city,
# appended to the right of the existing columns
cities_encoded = pd.get_dummies(merged_df['city'])
merged_df = pd.concat((merged_df, cities_encoded), axis='columns')
print(merged_df.columns)

Index(['city', 'categories', 'hours', 'checkin_count', 'categories_lst',
       '& probates', '3d printing', 'acai bowls', 'accessories', 'accountants',
       ...
       'reno', 'spanish springs', 'sparks', 'stead', 'sun valley',
       'vc highlands', 'verdi', 'virginia city', 'washoe', 'washoe valley'],
      dtype='object', length=1016)


In [None]:
# encode open hours by total #hours per week

# parse format: a:b-c:d
def parse_hours(hours):
    """Return the length, in hours, of one opening interval.

    Args:
        hours: String of the form ``"H:M-H:M"``, e.g. ``"9:0-17:30"``.

    Returns:
        The duration in hours. ``0`` when ``hours`` contains no ``-``.
        An interval whose end time is earlier than its start time is treated
        as ending on the following day.

    Raises:
        ValueError: if either side of the ``-`` is not ``hour:minute`` with
            integer parts.
    """
    if '-' not in hours:
        return 0
    start, end = hours.split('-')
    start_hour, start_min = map(int, start.split(':'))
    end_hour, end_min = map(int, end.split(':'))
    # handle cases like 17:0-3:0; comparing (hour, minute) pairs also covers
    # same-hour wraps such as 22:30-22:15, which would otherwise come out negative
    if (end_hour, end_min) < (start_hour, start_min):
        end_hour += 24
    return (end_hour - start_hour) + (end_min - start_min) / 60

def parse_weekly_open_hours(open_hours):
    """Sum the opening hours over every day listed in an ``hours`` value.

    Args:
        open_hours: Either the literal ``"Unknown"`` or a dict-like string
            mapping day names to ``"H:M-H:M"`` intervals; single quotes are
            swapped for double quotes before it is parsed as JSON.

    Returns:
        The sum of ``parse_hours`` over all days, or ``0`` when the value is
        ``"Unknown"`` or cannot be decoded as JSON.
    """
    if open_hours == "Unknown":
        return 0
    json_text = open_hours.replace("'", '"')
    try:
        schedule = json.loads(json_text)
    except json.JSONDecodeError:
        return 0
    weekly_total = 0
    for day in schedule:
        weekly_total += parse_hours(schedule[day])
    return weekly_total

# weekly opening hours per business; parse_weekly_open_hours yields 0 for
# "Unknown" or undecodable values
weekly_hours = merged_df['hours'].map(parse_weekly_open_hours)
merged_df['total_hours_week'] = weekly_hours
print(merged_df['total_hours_week'].describe())

# number of businesses whose weekly total came out as exactly 0
zero_hours_rows = merged_df.loc[merged_df['total_hours_week'] == 0]
print(len(zero_hours_rows))


count    6570.000000
mean       44.395797
std        30.359341
min         0.000000
25%        24.625000
50%        45.000000
75%        63.000000
max       167.883333
Name: total_hours_week, dtype: float64
1358


  merged_df['total_hours_week'] = merged_df['hours'].apply(parse_weekly_open_hours)


In [None]:
# A significant share of businesses has no usable opening hours (weekly total
# of 0), so those entries are estimated with the mean of the businesses whose
# total is positive.
known_hours = merged_df.loc[merged_df['total_hours_week'] > 0, 'total_hours_week']
mean_hours = known_hours.mean()
print(mean_hours)

merged_df['total_hours_week'] = merged_df['total_hours_week'].replace(0.0, mean_hours)

# count the rows that still have a non-positive total after the replacement
remaining_non_positive = merged_df[merged_df['total_hours_week'] <= 0]
print(len(remaining_non_positive))
print(merged_df['total_hours_week'].describe())

55.96323548222051
0
count    6570.000000
mean       55.963235
std        20.200637
min         0.250000
25%        44.000000
50%        55.963235
75%        63.000000
max       167.883333
Name: total_hours_week, dtype: float64


In [None]:
# prepare training data
print(merged_df.columns)

# the raw text columns are removed; the target and the encoded columns remain
raw_columns = ['city', 'categories', 'hours', 'categories_lst']
train_df = merged_df.drop(raw_columns, axis=1)
print(train_df.columns)

Index(['city', 'categories', 'hours', 'checkin_count', 'categories_lst',
       '& probates', '3d printing', 'acai bowls', 'accessories', 'accountants',
       ...
       'spanish springs', 'sparks', 'stead', 'sun valley', 'vc highlands',
       'verdi', 'virginia city', 'washoe', 'washoe valley',
       'total_hours_week'],
      dtype='object', length=1017)
Index(['checkin_count', '& probates', '3d printing', 'acai bowls',
       'accessories', 'accountants', 'acne treatment', 'active life',
       'acupuncture', 'addiction medicine',
       ...
       'spanish springs', 'sparks', 'stead', 'sun valley', 'vc highlands',
       'verdi', 'virginia city', 'washoe', 'washoe valley',
       'total_hours_week'],
      dtype='object', length=1013)


In [None]:
# train
# Features are every column except the target.
X = train_df.drop(['checkin_count'], axis=1)
y = train_df['checkin_count']
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                         max_depth=5, alpha=10, n_estimators=100)

# Fit on log(1 + count); log1p is the numerically accurate form of log(y + 1).
y_train_transformed = np.log1p(y_train)
model.fit(X_train, y_train_transformed)
model.save_model('regressor_exp.json')

# Undo the transform (expm1(x) == exp(x) - 1) so the error is in check-in counts.
preds = np.expm1(model.predict(X_test))
# mean_squared_error's `squared=False` keyword is deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE: {rmse}")

Index(['& probates', '3d printing', 'acai bowls', 'accessories', 'accountants',
       'acne treatment', 'active life', 'acupuncture', 'addiction medicine',
       'adult',
       ...
       'spanish springs', 'sparks', 'stead', 'sun valley', 'vc highlands',
       'verdi', 'virginia city', 'washoe', 'washoe valley',
       'total_hours_week'],
      dtype='object', length=1012)
RMSE: 252.44399835929175


In [None]:
# Test prediction
# Feature vector: Category(995) + City(16) + Open hours(1)

# Index-based selections; only the alternatives noted next to open_ctgr and
# open_city below would use them.
ctgr_selection = [2, 4, 11]
city_selection = 12

cities_list_real = cities_encoded.columns.tolist()
categories_mlb_list = categories_mlb.tolist()

open_ctgr = ['food trucks']   # alternative: categories_mlb[ctgr_selection]
open_city = 'reno'            # alternative: cities_list_real[city_selection]
print(open_ctgr, open_city, sep='\n')

# Write both vocabularies out as JSON lists.
for json_name, labels in (('categories.json', categories_mlb_list),
                          ('cities.json', cities_list_real)):
    with open(json_name, 'w') as f:
        json.dump(labels, f)

def get_feature_vector(open_ctgr, open_city, open_hours, categories=None, cities=None):
    """Build a single-row feature vector for one hypothetical business.

    Layout: one indicator per category, then one indicator per city, then the
    opening hours as the last element.

    Args:
        open_ctgr: Iterable of category names to switch on.
        open_city: City name to switch on.
        open_hours: Value stored in the last position of the vector.
        categories: Ordered list of category names. Defaults to the
            module-level ``categories_mlb_list``.
        cities: Ordered list of city names. Defaults to the module-level
            ``cities_list_real``.

    Returns:
        ``numpy.ndarray`` of shape ``(1, len(categories) + len(cities) + 1)``.

    Raises:
        ValueError: if a category or the city is not in its vocabulary.
    """
    if categories is None:
        categories = categories_mlb_list
    if cities is None:
        cities = cities_list_real

    # Sizes come from the vocabularies instead of hard-coded 995 / 1012, so the
    # vector stays correct when the set of categories or cities changes.
    n_categories = len(categories)
    vec = np.zeros(n_categories + len(cities) + 1)
    one_indices = [categories.index(c) for c in open_ctgr]

    city_idx = cities.index(open_city)
    print(city_idx)
    # city indicators start right after the category indicators
    one_indices.append(city_idx + n_categories)
    vec[one_indices] = 1
    vec[-1] = open_hours
    return vec.reshape(1, -1)

vec = get_feature_vector(open_ctgr, open_city, 55)
print(vec)
print(vec.shape)

# the model was fitted on log(count + 1), so invert that to get a check-in count
log_pred = model.predict(vec)
pred = np.exp(log_pred) - 1
print(pred)

['food trucks']
reno
6
[[ 0.  0.  0. ...  0.  0. 55.]]
(1, 1012)
[8.90114]
