In [None]:
import time
from tabulate import tabulate

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model, kernel_ridge
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

from sklearn import preprocessing
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [None]:
# Load the full dataset into a DataFrame.
# NOTE(review): the path ends in .csv but is read with read_pickle - presumably the
# file was written with DataFrame.to_pickle; confirm. Unpickling can execute
# arbitrary code, so only load this file from a trusted source.
data = pd.read_pickle("../full_dataset.csv")
# Target dtype for every listed column: most become int, the change_in_visits*,
# target and *_num_biz columns stay float, and naics_2..naics_5 are cast to str.
dtype_dict = {"week":"int",
"visits_2020":"int",
"postal_code":"int",
"naics_code":"int",
"raw_visitor_counts":"int",
"median_dwell_2020":"int",
"num_visitor_country_of_origin":"int",
"num_visitor_home_cbgs":"int",
"num_related_same_day_brand_2020":"int",
"max_hourly_visits":"int",
"visits_2019":"int",
"distance_from_home_2019":"int",
"median_dwell_2019":"int",
"num_related_same_day_brand_2019":"int",
"change_in_visits":"float",
"visits_2020_lastweek":"int",
"raw_visitor_counts_lastweek":"int",
"median_dwell_2020_lastweek":"int",
"num_visitor_country_of_origin_lastweek":"int",
"num_visitor_home_cbgs_lastweek":"int",
"num_related_same_day_brand_2020_lastweek":"int",
"max_hourly_visits_lastweek":"int",
"visits_2019_lastweek":"int",
"median_dwell_2019_lastweek":"int",
"num_related_same_day_brand_2019_lastweek":"int",
"change_in_visits_lastweek":"float",
"visits_2019_nextweek":"int",
"median_dwell_2019_nextweek":"int",
"num_related_same_day_brand_2019_nextweek":"int",
"target":"float",
"naics_2":"str",
"naics_3":"str",
"naics_4":"str",
"naics_5":"str",
"distance_from_home_2019_missing":"int",
"distance_from_home_2019_missing_lastweek":"int",
"distance_from_home_2019_missing_nextweek":"int",
"distance_from_home_2019_lastweek":"int",
"distance_from_home_2019_nextweek":"int",
"naics_2_num_biz":"float",
"naics_3_num_biz":"float",
"naics_4_num_biz":"float",
"naics_5_num_biz":"float",
"naics_6_num_biz":"float"}

# Apply all casts in one pass; columns not named in dtype_dict keep their dtype.
data = data.astype(dtype_dict)
# Persist the typed frame as CSV. The DataFrame index is written as an extra
# leading column (to_csv default), so a later read_csv needs index_col=0 to
# avoid picking it up as a data column.
data.to_csv("../full_dataset_ints.csv")

In [None]:
# `data` is still in memory from the cell above. To start from the saved file
# instead, reload it first:
# data = pd.read_csv("../full_dataset_ints.csv")

# Remove rows whose target exceeds 5, i.e. increases of 500%+.
drop_rows = data.index[data['target'] > 5]
data = data.drop(index=drop_rows)

In [None]:
# Optional one-hot encoding - Adds lots of columns but didn't have much benefit for lasso, ridge, or xgb

def one_hot_data(data,naics_cols):
    '''
    One-hot encode the given NAICS columns and swap them for indicator columns.

    Inputs:
        data: DataFrame containing every column named in naics_cols
        naics_cols: list of column names to encode

    Outputs:
        new_data: copy of data with a fresh 0..n-1 index, the naics_cols removed,
            and one "naics_<category>" indicator column per category, except
            "naics_0", which is dropped
        one_hot_col_names: names of all generated indicator columns in encoder
            order; this list still contains "naics_0" even though that column
            is no longer in new_data
    '''
    encoder = preprocessing.OneHotEncoder()
    dense_labels = encoder.fit_transform(data[naics_cols]).toarray()

    # One name per category, walking the encoded columns in the same order the
    # encoder lays out its output.
    one_hot_col_names = ["naics_" + str(category)
                         for level in encoder.categories_
                         for category in level]
    one_hot_df = pd.DataFrame(dense_labels, columns=one_hot_col_names)

    # The indicator frame has a fresh RangeIndex, so the source frame needs one
    # too for the column-wise concat to line rows up.
    combined = pd.concat([data.reset_index(drop=True), one_hot_df], axis=1)
    new_data = combined.drop(naics_cols, axis=1).drop('naics_0', axis=1)
    return new_data, one_hot_col_names

# Only the naics_2 level is encoded for now; "naics_3", "naics_4" and "naics_5"
# can be added to this list to encode them as well.
naics_cols = ["naics_2"]
data, one_hot_col_names = one_hot_data(data, naics_cols)
# one_hot_data already removed the naics_0 column from the frame; drop its name
# too (assumed to be the first entry of the returned list).
one_hot_col_names = one_hot_col_names[1:]

In [None]:
# Base model inputs. "naics_2" is intentionally absent: it is listed in
# naics_cols and replaced by the one-hot indicator columns appended below.
feature_cols = [
    "visits_2020",
    "naics_code",
    "raw_visitor_counts",
    "median_dwell_2020",
    "num_visitor_country_of_origin",
    "num_visitor_home_cbgs",
    "num_related_same_day_brand_2020",
    "max_hourly_visits",
    "visits_2019",
    "distance_from_home_2019",
    "median_dwell_2019",
    "num_related_same_day_brand_2019",
    "change_in_visits",
    "visits_2020_lastweek",
    "raw_visitor_counts_lastweek",
    "median_dwell_2020_lastweek",
    "num_visitor_country_of_origin_lastweek",
    "num_visitor_home_cbgs_lastweek",
    "num_related_same_day_brand_2020_lastweek",
    "max_hourly_visits_lastweek",
    "visits_2019_lastweek",
    "median_dwell_2019_lastweek",
    "num_related_same_day_brand_2019_lastweek",
    "change_in_visits_lastweek",
    "visits_2019_nextweek",
    "median_dwell_2019_nextweek",
    "num_related_same_day_brand_2019_nextweek",
    "naics_3",
    "naics_4",
    "naics_5",
    "distance_from_home_2019_missing",
    "distance_from_home_2019_missing_lastweek",
    "distance_from_home_2019_missing_nextweek",
    "distance_from_home_2019_lastweek",
    "distance_from_home_2019_nextweek",
    "naics_2_num_biz",
    "naics_3_num_biz",
    "naics_4_num_biz",
    "naics_5_num_biz",
    "naics_6_num_biz",
]

# Columns handed to the scaler: every base feature, but none of the one-hot
# indicator columns. This used to be a second hand-maintained copy of the list
# above; taking the copy before the one-hot names are appended keeps the two in
# sync. Remove entries from scale_cols here if a column should not be scaled.
scale_cols = list(feature_cols)

# extend() replaces a list comprehension that was used only for its side effect
# and made the cell display a list of None values.
feature_cols.extend(one_hot_col_names)

In [None]:
# Report the variance of the target for each week, in order of first appearance.
print("Target variance by week\n"+"*"*25)
for week in data['week'].unique():
    ind = data['week'] == week
    print("Week {}: {:2.4f}".format(week, np.var(data.loc[ind, 'target'])))

In [None]:
# Time-based split: week 15 is the test set, week 14 the validation set, and
# every week before 14 is used for training.
test_data = data.loc[data['week'] == 15]
val_data = data.loc[data['week'] == 14]
train_data = data.loc[data['week'] < 14]

# Copy features and target out of each slice so later in-place edits do not
# touch (or warn about) the original frame.
X_train, y_train = train_data[feature_cols].copy(), train_data['target'].copy()
X_val, y_val = val_data[feature_cols].copy(), val_data['target'].copy()
X_test, y_test = test_data[feature_cols].copy(), test_data['target'].copy()

# Free the full frame and the intermediate slices; cells above this one must be
# re-run before this cell can be executed again.
del data, test_data, val_data, train_data
# Random-split alternative kept for reference:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=17)



In [None]:
# Variance of the target within each split
print("Test variance (Week 15): {:2.4f}".format(np.var(y_test)))
print("Validation variance (Week 14): {:2.4f}".format(np.var(y_val)))
# The training label was missing its closing parenthesis.
print("Train variance (Week 10-13): {:2.4f}".format(np.var(y_train)))

In [None]:
# Center scale_cols using statistics learned from X_train only, then apply the
# same shift to the validation and test features. with_std=False means the
# columns are mean-centered but not divided by their standard deviation.
scaler = StandardScaler(with_std=False).fit(X_train[scale_cols])
# Alternative that also scales to unit variance:
# scaler = StandardScaler()
for features in (X_train, X_val, X_test):
    features[scale_cols] = scaler.transform(features[scale_cols])

In [None]:
'''To Consider 
- Normalize features for some models
- Should we use time-based folds?
- Set a standard scoring function in GridsearchCV?  Otherwise it defers to the individual regressors.
'''

### Set param_grid

In [None]:
def _fit_grid_search(reg, X, y, params, n_jobs):
    '''
    Shared implementation behind every *_covid function below.

    Inputs:
        reg: unfitted scikit-learn regressor
        X: array/DataFrame of training features
        y: array/Series of training labels
        params: parameter grid passed to GridSearchCV
        n_jobs: number of parallel workers for the search

    Output:
        cv_reg: fitted GridSearchCV (5 folds, refit on all of X, y with the best
            parameters). No `scoring` is set, so each regressor's own default
            score is used to rank candidates.
    '''
    cv_reg = GridSearchCV(reg,
                          params,
                          n_jobs=n_jobs,
                          cv=5,
                          refit=True)
    cv_reg.fit(X, y)
    return cv_reg


def lasso_covid(X,y,params):
    '''Grid-search a Lasso regressor over params; returns the fitted GridSearchCV.'''
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; on newer versions this constructor call raises TypeError.
    return _fit_grid_search(linear_model.Lasso(normalize=True), X, y, params, n_jobs=4)


def ridge_covid(X,y,params):
    '''Grid-search a Ridge regressor over params; returns the fitted GridSearchCV.'''
    # NOTE(review): same `normalize` caveat as lasso_covid.
    return _fit_grid_search(linear_model.Ridge(normalize=True), X, y, params, n_jobs=4)


def kernel_ridge_covid(X,y,params):
    '''Grid-search a KernelRidge regressor over params; returns the fitted GridSearchCV.'''
    return _fit_grid_search(kernel_ridge.KernelRidge(), X, y, params, n_jobs=4)


def random_forest_covid(X,y,params):
    '''Grid-search a RandomForestRegressor over params; returns the fitted GridSearchCV.'''
    return _fit_grid_search(RandomForestRegressor(), X, y, params, n_jobs=-1)


def gradient_boosting_covid(X,y,params):
    '''Grid-search a GradientBoostingRegressor over params; returns the fitted GridSearchCV.'''
    # These constructor values are only starting points; any of them listed in
    # params is overridden by the search.
    reg = GradientBoostingRegressor(n_estimators=100,learning_rate=.1,subsample=1.)
    return _fit_grid_search(reg, X, y, params, n_jobs=-1)


def adaboost_covid(X,y,params):
    '''Grid-search an AdaBoostRegressor over params; returns the fitted GridSearchCV.'''
    return _fit_grid_search(AdaBoostRegressor(n_estimators=200), X, y, params, n_jobs=-1)


def mlp_covid(X,y,params):
    '''Grid-search an MLPRegressor over params; returns the fitted GridSearchCV.'''
    # NOTE(review): hidden_layer_sizes=(2,40) means two hidden layers with 2 and
    # 40 units respectively, not 2 layers of 40 units - confirm this is intended.
    reg = MLPRegressor(hidden_layer_sizes = (2,40))
    return _fit_grid_search(reg, X, y, params, n_jobs=-1)

In [None]:
# Validation function
def evaluate(reg_list,X_val,y_val):
    '''
    Score each fitted model on one dataset.

    Inputs:
        reg_list: list of 2-tuples (reg,name); reg must expose predict(X)
        X_val: array/DataFrame of features
        y_val: array/series of labels

    Output:
        score_df: DF with one row per model, in reg_list order, and columns
            'Model', 'MSE', 'MAE', 'r2'. An empty reg_list gives an empty DF
            that still has those columns.
    '''
    rows = []
    for entry in reg_list:
        # Predict once and reuse the result for all three metrics.
        predictions = entry[0].predict(X_val)
        rows.append({'Model': entry[1],
                     'MSE': mean_squared_error(y_val, predictions),
                     'MAE': mean_absolute_error(y_val, predictions),
                     'r2': r2_score(y_val, predictions)})
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every iteration.
    return pd.DataFrame(rows, columns=['Model','MSE','MAE','r2'])

In [None]:
# Search grids, one per model; each key is an estimator parameter and each value
# is the tuple of candidates to try.

# Linear models: regularisation strength only.
lasso_params = dict(alpha=(1e-9, 1e-8, 1e-7, 1e-6, 1e-5))

ridge_params = dict(alpha=(1e-9, 1e-8, 1e-7, 1e-6, 1e-5))

# Tree ensembles.
random_forest_params = dict(n_estimators=(100, 500),
                            max_depth=(None, 10))

# Only the number of boosting stages is searched; candidate grids that are
# currently switched off:
#     'learning_rate':(.01,0.1,1.)
#     'subsample':(0.1,0.5,1.0)
gradient_boosting_params = dict(n_estimators=(100, 500, 1000))

adaboost_params = dict(n_estimators=(100, 500),
                       learning_rate=(1e-3, .1))

# Each hidden_layer_sizes tuple lists the unit count of successive hidden layers.
mlp_params = dict(alpha=(1e-5, 1e-3, 0.1),
                  hidden_layer_sizes=((2, 25), (3, 40), (2, 100), (5, 50)))

In [None]:
# Run all the models
# lasso and ridge must be fitted here: the scoring cell and the zero-weight
# feature cell further down both read `lasso` / `ridge`, so leaving these two
# lines commented out breaks a Restart & Run All.
lasso = lasso_covid(X_train,y_train,lasso_params)
ridge = ridge_covid(X_train,y_train,ridge_params)
# time.clock() was removed in Python 3.8. perf_counter() measures elapsed
# wall-clock time, which is the relevant number here because the grid searches
# do their work in parallel worker processes.
a = time.perf_counter()
random_forest = random_forest_covid(X_train,y_train,random_forest_params)
b = time.perf_counter()
print("Run time: {:2.4f}seconds".format(b-a))
gradient_boosting = gradient_boosting_covid(X_train,y_train,gradient_boosting_params)
c = time.perf_counter()
print("Run time: {:2.4f}seconds".format(c-b))
adaboost = adaboost_covid(X_train,y_train,adaboost_params)
d = time.perf_counter()
print("Run time: {:2.4f}seconds".format(d-c))
mlp = mlp_covid(X_train,y_train,mlp_params)
e = time.perf_counter()
print("Run time: {:2.4f}seconds".format(e-d))

In [None]:
# Create list of best estimators of each type.
# Every entry is the refit best_estimator_ of its grid search; adaboost and mlp
# used to be passed as the GridSearchCV wrapper itself, unlike the other entries.
reg_list = [(lasso.best_estimator_,"Lasso"),
            (ridge.best_estimator_,"Ridge"),
#             (XGB_reg,"XGB"),
            (random_forest.best_estimator_,"Random Forest"),
            (gradient_boosting.best_estimator_,"Gradient Boosting"),
            (adaboost.best_estimator_,"Adaboost"),
            (mlp.best_estimator_,"MLP")
           ]
# Training scores
training_score_df = evaluate(reg_list,X_train,y_train)
# Validation scores
val_score_df = evaluate(reg_list,X_val,y_val)



In [None]:
# Show the training and validation score tables one after the other, each under
# its own heading.
for heading, score_table in (("Training:", training_score_df),
                             ("\nValidation:", val_score_df)):
    print(heading)
    print(tabulate(score_table, headers=score_table.columns))

#### XGBoost parameter tuning (don't worry about this part)
Source: https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f

In [None]:
# Fit an XGBoost regressor through the scikit-learn wrapper.
XGB_reg = xgb.XGBRegressor(eta= .01, objective='reg:squarederror')
XGB_reg.fit(X_train,y_train)

# DMatrix inputs for the native xgb.train / xgb.cv API.
dtrain = xgb.DMatrix(X_train,label=y_train)
dval = xgb.DMatrix(X_val,label=y_val)

params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    # 'reg:linear' is the deprecated alias of 'reg:squarederror'; use the current
    # name, matching the XGBRegressor above.
    'objective':'reg:squarederror',
    'eval_metric':'mae'
}
num_boost_round = 33 # MAE of 0.136

# Train while monitoring MAE on the validation matrix (reported under the label
# "Test"); stop early if it has not improved for 10 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dval, "Test")],
    early_stopping_rounds=10
)
# best_iteration is zero-based, hence the +1 when reporting a round count.
print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

In [None]:
# 5-fold cross-validation of the current `params` on the training matrix:
# up to 100 boosting rounds, MAE as the metric, stopping once it has failed to
# improve for 10 rounds. The fixed seed keeps the fold assignment repeatable.
cv_results = xgb.cv(params, dtrain,
                    nfold=5,
                    num_boost_round=100,
                    early_stopping_rounds=10,
                    metrics={'mae'},
                    seed=17)
# Display the per-round results table.
cv_results

In [None]:
# Identify the zero-weighted features from lasso.
# Pair each training column with its coefficient instead of looping over a
# hard-coded range(61), which breaks (or silently skips features) whenever the
# number of feature columns changes.
for feature_name, weight in zip(X_train.columns, lasso.best_estimator_.coef_):
    if weight == 0:
        print(feature_name)