In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [2]:
# Full feature set; rows containing any missing value are discarded on load.
# NOTE(review): this file is read with read_pickle although its name ends in
# .csv.gz - confirm it really is a gzip-compressed pickle. Unpickling can
# execute arbitrary code, so only load files from a trusted source.
data_full = pd.read_pickle(
    "./data/full_dataset_with_features.csv.gz", compression='gzip'
).dropna()

# SafeGraph one-hot-encoded tables, with and without lag variables
data_lag = pd.read_csv('./data/safegraph_lag_ohe.csv.gz', compression='gzip')
data_nolag = pd.read_csv('./data/safegraph_no-lag_ohe.csv.gz', compression='gzip')

# Minimal baseline: a single predictor plus the split key and the target
data_basic = data_nolag.loc[:, ['week', 'change_in_visits', 'target']]

# Report how many rows each variant provides
print('Number of rows in SafeGraph data without lag variables: ', data_nolag.shape[0])
print('Number of rows in SafeGraph data with lag variables: ', data_lag.shape[0])
print('Number of rows in full dataset: ', data_full.shape[0])

Number of rows in SafeGraph data without lag variables:  267239
Number of rows in SafeGraph data with lag variables:  229062
Number of rows in full dataset:  222263


In [3]:
def _features_and_target(rows, feature_cols, cap):
    '''
    Return (X, y) for one split: X is an independent copy of the feature
    columns, y is the target with values above `cap` replaced by `cap`.
    '''
    # Explicit copy so the later in-place centring never writes into a slice
    X = rows[feature_cols].copy()
    y = rows['target']
    # `is not None` (rather than truthiness) so that cap=0 is honoured
    if cap is not None:
        y = y.clip(upper=cap)
    return X, y


def LR(data, cap=None):
    '''
    Fit and evaluate linear regression on data

    The frame is split on its `week` column: weeks before 14 form the
    training set and week 14 the validation set. Week 15 is reserved as the
    test set and is intentionally not touched by this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `week` and `target`. Every other column except
        `postal_code`, `ZIP` and `ZIP_y` is used as a feature.
    cap : number or None, default None
        Upper bound applied to the target in both splits; None leaves the
        target unchanged.

    Returns
    -------
    (train_results, val_results) : tuple of pandas.DataFrame
        One-row frames with columns `Train R2`/`Train MSE`/`Train MAE` and
        `Val R2`/`Val MSE`/`Val MAE` respectively. The same numbers are
        also printed.

    Raises
    ------
    ValueError
        If the training or validation split contains no rows.
    '''

    # Set features, specify which ones to centre
    excluded = {'week', 'postal_code', 'ZIP', 'ZIP_y', 'target'}
    feature_cols = [col for col in data.columns if col not in excluded]
    # Columns prefixed `naics_` are left as they are
    scale_cols = [col for col in feature_cols if not col.startswith('naics_')]

    # Week 14 validates, everything earlier trains (week 15 stays held out)
    X_train, y_train = _features_and_target(data[data['week'] < 14], feature_cols, cap)
    X_val, y_val = _features_and_target(data[data['week'] == 14], feature_cols, cap)

    if X_train.empty:
        raise ValueError('No training rows: data has no rows with week < 14')
    if X_val.empty:
        raise ValueError('No validation rows: data has no rows with week == 14')

    # Centre features on the training means only (with_std=False, so the
    # columns are not rescaled to unit variance); skip when nothing to centre
    if scale_cols:
        scaler = StandardScaler(with_std=False)
        scaler.fit(X_train[scale_cols])
        X_train[scale_cols] = scaler.transform(X_train[scale_cols])
        X_val[scale_cols] = scaler.transform(X_val[scale_cols])

    # Fit model
    reg = LinearRegression().fit(X_train, y_train)

    # Evaluation
    pred_train = reg.predict(X_train)
    pred_val = reg.predict(X_val)

    r2_train = reg.score(X_train, y_train)
    r2_val = reg.score(X_val, y_val)
    mse_train = mean_squared_error(y_train, pred_train)
    mse_val = mean_squared_error(y_val, pred_val)
    mae_train = mean_absolute_error(y_train, pred_train)
    mae_val = mean_absolute_error(y_val, pred_val)

    for label, train_value, val_value in [('r2', r2_train, r2_val),
                                          ('MSE', mse_train, mse_val),
                                          ('MAE', mae_train, mae_val)]:
        print('Training set {}: '.format(label), train_value)
        print('Validation set {}: '.format(label), val_value)
        print('\n')

    train_results = pd.DataFrame(data={'Train R2': [r2_train],
                                       'Train MSE': [mse_train],
                                       'Train MAE': [mae_train]})

    val_results = pd.DataFrame(data={'Val R2': [r2_val],
                                     'Val MSE': [mse_val],
                                     'Val MAE': [mae_val]})

    return train_results, val_results

In [4]:
# set caps to test (None leaves the target uncapped)
caps = [None, 1, 2, 5, 10, 100]

# One entry per dataset variant:
# (label used in the results index, banner printed before the fit, frame)
runs = [
    ('Previous target only',
     'Linear regression on previous target, cap={}\n', data_basic),
    ('SG without lag',
     'Linear regression on SafeGraph data without lag variables, cap={}\n', data_nolag),
    ('SG with lag',
     'Linear regression on SafeGraph data with lag variables, cap={}\n', data_lag),
    ('Full dataset',
     'Linear regression on full data, cap={}\n', data_full),
]

index_cols = ['Outlier Cap', 'Dataset']

# Collect the one-row result frames and concatenate once after the loop;
# growing a DataFrame with concat on every iteration is quadratic
train_frames = []
val_frames = []

for cap in caps:

    # set captag
    captag = 'No cap' if cap is None else 'Cap={}'.format(cap)

    for dataset_name, banner, frame in runs:
        print(banner.format(cap))
        train_df, val_df = LR(frame, cap=cap)

        print('-------------------------------------------------------------\n')

        # Tag each result row with its (cap, dataset) pair as a multi-index
        labels = {'Outlier Cap': captag, 'Dataset': dataset_name}
        train_frames.append(train_df.assign(**labels).set_index(index_cols))
        val_frames.append(val_df.assign(**labels).set_index(index_cols))

# concatenate results
train_results = pd.concat(train_frames)
val_results = pd.concat(val_frames)
    

Linear regression on previous target, cap=None

Training set r2:  0.6408983008428235
Validation set r2:  0.6044081770816977


Training set MSE:  7.724115323590728
Validation set MSE:  2.8867799754372427


Training set MAE:  0.45007589540713294
Validation set MAE:  0.31525566741349303


-------------------------------------------------------------

Linear regression on SafeGraph data without lag variables, cap=None

Training set r2:  0.6449673975146761
Validation set r2:  0.6061943828394556


Training set MSE:  7.636590892405923
Validation set MSE:  2.8737453707897878


Training set MAE:  0.45549306840885717
Validation set MAE:  0.3379327823227568


-------------------------------------------------------------

Linear regression on SafeGraph data with lag variables, cap=None

Training set r2:  0.6611735142999902
Validation set r2:  0.6140908635180845


Training set MSE:  3.880313735794483
Validation set MSE:  2.81612182809032


Training set MAE:  0.3667433426817209
Validation set MAE:  

Training set r2:  0.5022054765397468
Validation set r2:  0.4268659403605689


Training set MSE:  2.239160213415324
Validation set MSE:  1.9723046976109695


Training set MAE:  0.3927586236708415
Validation set MAE:  0.41222114316055775


-------------------------------------------------------------



In [5]:
# Show the accumulated training-set metrics (R2, MSE, MAE) as a rich table
print('\nTraining Set Results:')
display(train_results)



Training Set Results:


Unnamed: 0_level_0,Unnamed: 1_level_0,Train R2,Train MSE,Train MAE
Outlier Cap,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No cap,Previous target only,0.640898,7.724115,0.450076
No cap,SG without lag,0.644967,7.636591,0.455493
No cap,SG with lag,0.661174,3.880314,0.366743
No cap,Full dataset,0.661591,3.255859,0.351224
Cap=1,Previous target only,0.039996,0.257263,0.383273
Cap=1,SG without lag,0.220832,0.208802,0.33923
Cap=1,SG with lag,0.260287,0.171187,0.293761
Cap=1,Full dataset,0.243215,0.170585,0.291978
Cap=2,Previous target only,0.056657,0.394342,0.430185
Cap=2,SG without lag,0.22216,0.325158,0.387359


In [6]:
# Show the accumulated validation-set metrics (R2, MSE, MAE) as a rich table
print('\nValidation Set Results:')
display(val_results)


Validation Set Results:


Unnamed: 0_level_0,Unnamed: 1_level_0,Val R2,Val MSE,Val MAE
Outlier Cap,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No cap,Previous target only,0.604408,2.88678,0.315256
No cap,SG without lag,0.606194,2.873745,0.337933
No cap,SG with lag,0.614091,2.816122,0.347874
No cap,Full dataset,0.584153,2.526766,0.373648
Cap=1,Previous target only,-0.127136,0.204343,0.374237
Cap=1,SG without lag,0.120659,0.159419,0.304102
Cap=1,SG with lag,0.163677,0.15162,0.275704
Cap=1,Full dataset,0.139318,0.151381,0.281535
Cap=2,Previous target only,-0.07364,0.288253,0.415347
Cap=2,SG without lag,0.132617,0.232876,0.340742


In [8]:
# view best-performing datasets at each cap level
# The winners are derived from val_results instead of being typed in by
# hand, so the table stays correct when the data, caps or model change.

# Keep the cap levels in the order they were evaluated (unstack sorts them)
cap_order = val_results.index.get_level_values('Outlier Cap').unique()

# (output column, metric column in val_results, True when larger is better)
criteria = [("Best Val R2", 'Val R2', True),
            ("Best Val MSE", 'Val MSE', False),
            ("Best Val MAE", 'Val MAE', False)]

best_by_cap = {}
for out_col, metric, higher_is_better in criteria:
    # rows: cap level, columns: dataset name
    by_dataset = val_results[metric].unstack('Dataset')
    if higher_is_better:
        best_by_cap[out_col] = by_dataset.idxmax(axis=1)
    else:
        best_by_cap[out_col] = by_dataset.idxmin(axis=1)

pd.DataFrame(best_by_cap).reindex(cap_order).rename_axis(None)

Unnamed: 0,Best Val R2,Best Val MSE,Best Val MAE
No cap,SG with lag,Full dataset,Previous target only
Cap=1,SG with lag,Full dataset,SG with lag
Cap=2,SG with lag,SG with lag,SG with lag
Cap=5,SG with lag,SG with lag,SG with lag
Cap=10,SG with lag,SG with lag,SG with lag
Cap=100,SG with lag,Full dataset,SG with lag


In [7]:
# View target distribution with different outlier caps

# Cap levels to summarise, listed from loosest to tightest.
# NOTE: this rebinds the notebook-level name `caps`.
caps = [100, 10, 5, 2, 1]

# Only rows without any missing value are summarised
target = data_full.dropna(axis=0)['target']

# First row is the uncapped target; `.iloc[1:]` drops the 'count' entry
labels = ['None']
summaries = [target.describe().iloc[1:]]

for cap in caps:
    # Each cap is applied to the untouched target, so the result does not
    # depend on the order of `caps` and no frame is mutated along the way
    labels.append(cap)
    summaries.append(target.clip(upper=cap).describe().iloc[1:])

# Build the table in one go rather than concatenating inside the loop
captable = pd.DataFrame(summaries,
                        index=pd.Index(labels, name='Cap on Outliers'))
captable

Unnamed: 0_level_0,mean,std,min,25%,50%,75%,max
Cap on Outliers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,-0.393399,3.014309,-0.998963,-0.854369,-0.693182,-0.416667,564.0
100.0,-0.405346,2.06631,-0.998963,-0.854369,-0.693182,-0.416667,100.0
10.0,-0.453671,0.988641,-0.998963,-0.854369,-0.693182,-0.416667,10.0
5.0,-0.478745,0.771843,-0.998963,-0.854369,-0.693182,-0.416667,5.0
2.0,-0.515173,0.566684,-0.998963,-0.854369,-0.693182,-0.416667,2.0
1.0,-0.541941,0.463714,-0.998963,-0.854369,-0.693182,-0.416667,1.0
