In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier

## Constants

In [43]:
# Input CSV locations, given as paths relative to the notebook's working directory.
TRAIN_FILE = 'data/train.csv'
# NOTE(review): TEST_FILE is not read by any cell visible here; the load cell
# uses REVEALED_TEST_FILE instead — confirm whether it is still needed.
TEST_FILE = 'data/test.csv'
REVEALED_TEST_FILE = 'data/revealed_test.csv'
CENSUS_FILE = 'data/census_starter.csv'

## Utilities

In [44]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    return pd.read_csv(file_path)

def one_hot(df):
    """Replace every category/object column of ``df`` with indicator columns.

    Columns are handled in the order ``select_dtypes`` reports them. For each
    one, dummy columns named ``<column>_<level>`` are appended on the right
    (the first level is dropped) and the source column is removed.

    Returns the encoded frame; the frame passed in is left as it was.
    """
    encoded = df
    for col in df.select_dtypes(include=['category', 'object']).columns:
        dummies = pd.get_dummies(encoded[col], prefix=col, drop_first=True)
        widened = pd.concat([encoded, dummies], axis=1)
        encoded = widened.drop([col], axis=1)
    return encoded

def fill_na(df):
    """Impute missing values: mode for category/object columns, median for numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is copied, so the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each category/object column has its missing
        entries replaced by that column's most frequent value, and each numeric
        column has its missing entries replaced by that column's median.
        A category/object column with no observed values is returned unchanged.
    """
    filled = df.copy()
    for col in filled.select_dtypes(include=['category', 'object']).columns:
        modes = filled[col].mode()
        # An all-missing column has no mode; skip it instead of failing on .iloc[0].
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])
    # numeric_only=True restricts the median to numeric columns. Without it,
    # pandas 1.x emits a FutureWarning and pandas >= 2.0 raises TypeError as
    # soon as the frame contains a text column.
    return filled.fillna(filled.median(numeric_only=True))

## Load Data

In [45]:
# Read the three input tables in one pass. Note that test_raw comes from the
# *revealed* test file, not from TEST_FILE.
train_raw, test_raw, census_raw = map(
    load_data, (TRAIN_FILE, REVEALED_TEST_FILE, CENSUS_FILE)
)

In [46]:
# Left-join the census table onto each raw table. No join key is passed, so
# pandas joins on every column name the two frames have in common.
train, test = (
    pd.merge(raw, census_raw, how='left') for raw in (train_raw, test_raw)
)
# Stack train on top of test; each part keeps its own row labels.
all_data = pd.concat([train, test])

## Data Cleaning

In [47]:
# Remove the row_id column. first_day_of_month stays in the frame (the line
# that dropped it was disabled).
all_data = all_data.drop(columns=['row_id'])

## Feature Engineering

In [48]:
# Row position of the train/test cut: the first 80% of all_data (rounded to an int).
NUM_TRAIN = round(0.8 * len(all_data))

In [49]:
# Imputation first, then one-hot encoding, so the dummies are built from
# already-filled columns.
all_data = fill_na(all_data)
all_data = one_hot(all_data)

# Drop every row that still holds NaN or +/-inf. `axis` must be passed by
# keyword: pandas 2.0 made it keyword-only, so the positional `.any(1)` raises
# TypeError there (and warned on 1.x).
all_data = all_data[~all_data.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

# Positional split at NUM_TRAIN.
# NOTE(review): NUM_TRAIN was computed before the filter above, so if any rows
# were dropped the cut no longer sits at exactly 80% of the remaining rows.
train = all_data[:NUM_TRAIN]
test = all_data[NUM_TRAIN:]

  df = df.fillna(df.median())


In [50]:
# Separate the predictors from the target column for each partition.
X_train, y_train = (
    train.drop(columns=['microbusiness_density']),
    train['microbusiness_density'],
)
X_test, y_test = (
    test.drop(columns=['microbusiness_density']),
    test['microbusiness_density'],
)

# Try different base models

## Linear Regression

In [51]:
import statsmodels.formula.api as smf  # formula interface, comparable to R's lm

# Ordinary least squares of microbusiness_density on cfips, active and the
# 2017-2021 census columns. The formula is split into one literal per feature
# family; adjacent literals are concatenated into a single string.
reg_setup_1 = smf.ols(
    formula=(
        'microbusiness_density ~ cfips + active'
        ' + pct_bb_2017 + pct_bb_2018 + pct_bb_2019 + pct_bb_2020 + pct_bb_2021'
        ' + pct_college_2017 + pct_college_2018 + pct_college_2019 + pct_college_2020 + pct_college_2021'
        ' + pct_foreign_born_2017 + pct_foreign_born_2018 + pct_foreign_born_2019 + pct_foreign_born_2020 + pct_foreign_born_2021'
        ' + pct_it_workers_2017 + pct_it_workers_2018 + pct_it_workers_2019 + pct_it_workers_2020 + pct_it_workers_2021'
        ' + median_hh_inc_2017 + median_hh_inc_2018 + median_hh_inc_2019 + median_hh_inc_2020 + median_hh_inc_2021'
    ),
    data=train,
)
model_1 = reg_setup_1.fit()
print(model_1.summary())

                              OLS Regression Results                             
Dep. Variable:     microbusiness_density   R-squared:                       0.293
Model:                               OLS   Adj. R-squared:                  0.292
Method:                    Least Squares   F-statistic:                     1575.
Date:                   Mon, 13 Mar 2023   Prob (F-statistic):               0.00
Time:                           01:13:25   Log-Likelihood:            -2.9237e+05
No. Observations:                 102828   AIC:                         5.848e+05
Df Residuals:                     102800   BIC:                         5.851e+05
Df Model:                             27                                         
Covariance Type:               nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------


## Random Forest

In [54]:
from sklearn.ensemble import RandomForestRegressor

# Depth-capped random forest baseline. random_state is pinned because the
# forest is grown from random bootstrap samples; without a seed the score
# below changes from run to run.
randomforest = RandomForestRegressor(max_depth=10, random_state=1)
randomforest.fit(X_train, y_train)
randomforest_predictions = randomforest.predict(X_test)
# Last expression of the cell: R^2 on the held-out rows.
randomforest.score(X_test, y_test)

0.510002154553909

## XGBoost

In [55]:
from xgboost.sklearn import XGBRegressor

# XGBoost regressor with library-default settings, fitted on the training
# split (fit returns the estimator itself).
xgboost = XGBRegressor().fit(X_train, y_train)
# Last expression of the cell: R^2 on the held-out rows.
xgboost.score(X_test, y_test)

0.3596705882917912

## GBM

In [56]:
# Gradient boosting baseline with 800 boosting stages. random_state is pinned
# so that re-running the cell reproduces the same fitted model and score.
gradient_booster = GradientBoostingRegressor(n_estimators=800, random_state=1)
gradient_booster.fit(X_train, y_train)
test_predictions = gradient_booster.predict(X_test)
# Last expression of the cell: R^2 on the held-out rows.
gradient_booster.score(X_test, y_test)

0.5149709427626208

# Tune best model

In [None]:
# For hyperparameter tuning only; comment out when producing final output.
# KFold is already imported in the notebook's import cell.
# 3-fold shuffled CV with a fixed seed, so every candidate is scored on the same folds.
cv = KFold(n_splits=3, random_state=1, shuffle=True)

# NOTE(review): the search runs on all_data, which also contains the rows held
# out as `test` earlier — confirm that this is intended.
X = all_data.drop(['microbusiness_density'], axis=1)
y = all_data['microbusiness_density']

n_estimators = np.arange(100, 2000, 100)
max_depths = np.arange(3, 7)
max_n = 0
max_m = 0
# Scores are negative MSE (never above 0, higher is better). The running best
# must therefore start below every possible score; starting at 0 meant no
# candidate was ever recorded as the best.
max_accuracy = -np.inf
for n in n_estimators:
    for m in max_depths:
        gradient_booster_tuned = GradientBoostingRegressor(
            n_estimators=n,
            max_depth=m,
            subsample=0.8,
            learning_rate=0.01,
            random_state=1,  # subsample < 1 makes fitting stochastic; seed it
        )
        scores = cross_val_score(gradient_booster_tuned, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
        mock_accuracy = np.mean(scores)
        if mock_accuracy > max_accuracy:
            max_accuracy = mock_accuracy
            max_n = n
            max_m = m
        # The value is a negative MSE, not an AUC, so it is labelled as such.
        print("Finished for n: ", n, " and m: ", m, "with neg MSE: ", mock_accuracy)
# Report the best pair found, not the final values of the loop variables.
print("max neg MSE: ", max_accuracy, "n: ", max_n, " m: ", max_m)


Finished for n:  100  and m:  3 with auc:  -12.666262153123222
Finished for n:  100  and m:  4 with auc:  -10.764196031112867
Finished for n:  100  and m:  5 with auc:  -9.131694693527441
Finished for n:  100  and m:  6 with auc:  -7.949267596756333
Finished for n:  200  and m:  3 with auc:  -8.851354927353258
Finished for n:  200  and m:  4 with auc:  -6.78975550106081
Finished for n:  200  and m:  5 with auc:  -4.987894637911757
Finished for n:  200  and m:  6 with auc:  -3.8660882202140114
Finished for n:  300  and m:  3 with auc:  -7.231156589466001
Finished for n:  300  and m:  4 with auc:  -5.251301067265561
Finished for n:  300  and m:  5 with auc:  -3.7029061661040217
Finished for n:  300  and m:  6 with auc:  -2.633285544026506
Finished for n:  400  and m:  3 with auc:  -6.419284614912161
Finished for n:  400  and m:  4 with auc:  -4.472633719702337
Finished for n:  400  and m:  5 with auc:  -3.071649776528384
Finished for n:  400  and m:  6 with auc:  -2.147979327748996
Finis

# 5-NN

You will probably need to normalize (scale) the features for KNN: it is distance-based, so features on larger scales would otherwise dominate.