In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
# Load the input tables (non-OHE data).
# NOTE(review): the full dataset has a .csv.gz name but is read as a gzip-compressed
# pickle -- confirm the file really is a pickle, and only unpickle files from a trusted source.
# Rows with any missing value are dropped from the full dataset straight after loading.
data_full = pd.read_pickle("../data/full_dataset_with_features.csv.gz", compression='gzip').dropna()
data_lag = pd.read_csv('../data/safegraph_lag.csv.gz', compression='gzip')
data_nolag = pd.read_csv('../data/safegraph_no-lag.csv.gz', compression='gzip')

# Three-column subset of the no-lag table: the week, one predictor and the target.
basic_cols = ['week', 'change_in_visits', 'target']
data_basic = data_nolag[basic_cols]

In [5]:
# Report every column of the full dataset whose dtype is neither int64 nor float64.
# Each hit prints the column name, its dtype and a blank separator.
for col_name in data_full.columns:
    col_dtype = data_full[col_name].dtype
    if col_dtype == 'int64' or col_dtype == 'float64':
        continue
    print(col_name)
    print(col_dtype)
    print('\n')

In [6]:
def _select_split(data, row_mask, feature_cols, cap=None):
    '''
    Return (X, y) for the rows of data selected by row_mask.

    X holds feature_cols and y holds the 'target' column. When cap is not
    None, every target value above cap is replaced by cap.
    '''
    X = data.loc[row_mask, feature_cols]
    y = data.loc[row_mask, 'target']
    # Compare against None explicitly so that cap=0 is honoured as a real cap.
    if cap is not None:
        y = y.clip(upper=cap)
    return X, y


def run_xgb(data, week=False, cap=None):
    '''
    Fit and evaluate optimal XGBoost on data

    Rows with week < 14 form the training set and rows with week == 14 the
    validation set. Week 15 is the held-out test set: it is neither fitted
    on nor scored by this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'week' column (used for splitting) and a 'target'
        column. The columns 'postal_code', 'ZIP', 'ZIP_y' and 'target' are
        never used as features; every other column is.
    week : bool, default False
        If falsy, 'week' is excluded from the features as well.
    cap : number or None, default None
        Upper bound applied to the target of both splits before fitting and
        scoring. None disables capping; 0 is treated as a valid cap.

    Returns
    -------
    (train_results, val_results) : tuple of pandas.DataFrame
        Single-row frames with columns 'Train R2', 'Train MSE', 'Train MAE'
        and 'Val R2', 'Val MSE', 'Val MAE' respectively. The same metrics
        are also printed.

    Raises
    ------
    ValueError
        If the training or the validation split contains no rows.
    '''

    # Set features: everything except identifiers/target (and week unless requested)
    excluded = {'postal_code', 'ZIP', 'ZIP_y', 'target'}
    if not week:
        excluded.add('week')
    feature_cols = [col for col in data.columns if col not in excluded]

    # Split by week: < 14 is train, 14 is validation (15 stays untouched as test)
    weeks = data['week']
    X_train, y_train = _select_split(data, weeks < 14, feature_cols, cap)
    X_val, y_val = _select_split(data, weeks == 14, feature_cols, cap)
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError('data must contain rows with week < 14 (train) '
                         'and week == 14 (validation)')

    # Centre the features using the training-set means only
    # (with_std=False: no division by the standard deviation)
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)

    # instantiate model with optimal hyperparamters
    reg = xgb.XGBRegressor(max_depth=9,
                           min_child_weight=8,
                           eta=.2,
                           subsample=1,
                           colsample_bytree=1,
                           objective='reg:squarederror')
    reg.fit(X_train, y_train)

    # Evaluation
    pred_train = reg.predict(X_train)
    pred_val = reg.predict(X_val)

    # (name used in the printed report, name used in the result columns, scorer)
    metrics = [('r2', 'R2', r2_score),
               ('MSE', 'MSE', mean_squared_error),
               ('MAE', 'MAE', mean_absolute_error)]

    train_scores = {}
    val_scores = {}
    for printed_name, column_name, metric in metrics:
        train_score = metric(y_train, pred_train)
        val_score = metric(y_val, pred_val)
        print('Training set {}: '.format(printed_name), train_score)
        print('Validation set {}: '.format(printed_name), val_score)
        print('\n')
        train_scores['Train ' + column_name] = [train_score]
        val_scores['Val ' + column_name] = [val_score]

    train_results = pd.DataFrame(data=train_scores)
    val_results = pd.DataFrame(data=val_scores)

    return train_results, val_results

In [7]:
# set caps to test
caps = [5]   # if we can wait for results: [None, 1, 2, 5, 10, 100]

# One entry per experiment:
# (description printed in the header, label used in the results index,
#  data to fit on, whether 'week' is used as a feature)
experiments = [
    ('previous target', 'Previous target only', data_basic, False),
    ('SafeGraph data without lag variables', 'SG without lag', data_nolag, False),
    ('SafeGraph data with lag variables', 'SG with lag', data_lag, False),
    ('full dataset without week variable', 'Full dataset without week', data_full, False),
    ('full dataset with week variable', 'Full dataset with week', data_full, True),
]

index_cols = ['Outlier Cap', 'Dataset']

# Collect the single-row result frames and concatenate once at the end,
# instead of growing the result frames inside the loop.
train_frames = []
val_frames = []

for cap in caps:

    # label for the 'Outlier Cap' index level; only None means "no cap"
    captag = 'No cap' if cap is None else 'Cap={}'.format(cap)

    for description, dataset_label, dataset, use_week in experiments:
        print('XGBoost on {}, cap={}\n'.format(description, cap))
        train_df, val_df = run_xgb(dataset, cap=cap, week=use_week)

        print('-------------------------------------------------------------\n')

        # tag each result row with its (cap, dataset) multi-index without
        # mutating the frames returned by run_xgb
        tags = {'Outlier Cap': captag, 'Dataset': dataset_label}
        train_frames.append(train_df.assign(**tags).set_index(index_cols))
        val_frames.append(val_df.assign(**tags).set_index(index_cols))

# concatenate results (an empty frame if no cap was tested)
train_results = pd.concat(train_frames) if train_frames else pd.DataFrame()
val_results = pd.concat(val_frames) if val_frames else pd.DataFrame()
    

XGBoost on previous target, cap=5



  if getattr(data, 'base', None) is not None and \


Training set r2:  0.6575404459836365
Validation set r2:  0.6442246880505691


Training set MSE:  0.2865149027327939
Validation set MSE:  0.17797321076007167


Training set MAE:  0.28352824334241145
Validation set MAE:  0.18415744727742175


-------------------------------------------------------------

XGBoost on SafeGraph data without lag variables, cap=5



  if getattr(data, 'base', None) is not None and \


Training set r2:  0.7717852317231119
Validation set r2:  0.6327891666197507


Training set MSE:  0.19093329816085478
Validation set MSE:  0.1836937214234075


Training set MAE:  0.24558214698033934
Validation set MAE:  0.2221255250272201


-------------------------------------------------------------

XGBoost on SafeGraph data with lag variables, cap=5



  if getattr(data, 'base', None) is not None and \


Training set r2:  0.9179459992273304
Validation set r2:  0.8404551070279238


Training set MSE:  0.055925230778745484
Validation set MSE:  0.07981081291736272


Training set MAE:  0.14372857258824093
Validation set MAE:  0.13761497043952184


-------------------------------------------------------------

XGBoost on full dataset without week variable, cap=5



  if getattr(data, 'base', None) is not None and \


Training set r2:  0.9262404649812979
Validation set r2:  0.8408505918025544


Training set MSE:  0.04691470246198212
Validation set MSE:  0.07452559148533108


Training set MAE:  0.13332819570526777
Validation set MAE:  0.13262738731006873


-------------------------------------------------------------

XGBoost on full dataset with week variable, cap=5



  if getattr(data, 'base', None) is not None and \


Training set r2:  0.9265451656601505
Validation set r2:  0.8395354577815843


Training set MSE:  0.04672089780086662
Validation set MSE:  0.07514143506216472


Training set MAE:  0.1332021618537542
Validation set MAE:  0.13657246136391135


-------------------------------------------------------------



In [8]:
train_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Train R2,Train MSE,Train MAE
Outlier Cap,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cap=5,Previous target only,0.65754,0.286515,0.283528
Cap=5,SG without lag,0.771785,0.190933,0.245582
Cap=5,SG with lag,0.917946,0.055925,0.143729
Cap=5,Full dataset without week,0.92624,0.046915,0.133328
Cap=5,Full dataset with week,0.926545,0.046721,0.133202


In [9]:
val_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Val R2,Val MSE,Val MAE
Outlier Cap,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cap=5,Previous target only,0.644225,0.177973,0.184157
Cap=5,SG without lag,0.632789,0.183694,0.222126
Cap=5,SG with lag,0.840455,0.079811,0.137615
Cap=5,Full dataset without week,0.840851,0.074526,0.132627
Cap=5,Full dataset with week,0.839535,0.075141,0.136572
