In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [2]:
data_lag = pd.read_csv('./data/safegraph_lag_ohe.csv.gz', compression='gzip')
data_nolag = pd.read_csv('./data/safegraph_no-lag_ohe.csv.gz', compression='gzip')
data_basic = data_nolag[['week', 'change_in_visits', 'target']]

print('Number of rows without lag variables: ', len(data_nolag))
print('Number of rows with lag variables: ', len(data_lag))

Number of rows without lag variables:  267239
Number of rows with lag variables:  229062


In [3]:
def LR(data, cap=None):
    '''
    Fit and evaluate linear regression on data
    '''

    # Set features, specify which ones to scale
    feature_cols = list(data.columns)
    for x in ['week', 'postal_code', 'target']:
        if x in feature_cols:
            feature_cols.remove(x)
    scale_cols = [x for x in feature_cols if not x.startswith('naics_')]


    # Make week 15 the test set
    test_data = data[data['week']==15].copy()
    X_test = test_data[feature_cols]
    y_test = test_data['target']
    if cap:
        y_test = y_test.apply(lambda x: min(x, cap))

    # Make week 14 the validation set
    val_data = data[data['week']==14].copy()
    X_val = val_data[feature_cols]
    y_val = val_data['target']
    if cap:
        y_val = y_val.apply(lambda x: min(x, cap))

    # Make train everything else
    train_data = data[data['week']<14].copy()
    X_train = train_data[feature_cols]
    y_train = train_data['target']
    if cap:
        y_train = y_train.apply(lambda x: min(x, cap))

    del(test_data,val_data,train_data)

    # Scale everything based on X_train
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_train[scale_cols])
    X_train[scale_cols] = scaler.transform(X_train[scale_cols])
    X_val[scale_cols] = scaler.transform(X_val[scale_cols])
    X_test[scale_cols] = scaler.transform(X_test[scale_cols])

    # Fit model
    reg = LinearRegression().fit(X_train, y_train)

    # Evaluation
    r2_train = reg.score(X_train, y_train)
    r2_val = reg.score(X_val, y_val)
    print('Training set r2: ', r2_train)
    print('Validation set r2: ', r2_val)
    print('\n')
    
    pred_train = reg.predict(X_train)
    pred_val = reg.predict(X_val)
    
    mse_train = mean_squared_error(y_train, pred_train)
    mse_val = mean_squared_error(y_val, pred_val)
    print('Training set MSE: ', mse_train)
    print('Validation set MSE: ', mse_val)
    print('\n')
    
    mae_train = mean_absolute_error(y_train, pred_train)
    mae_val = mean_absolute_error(y_val, pred_val)
    print('Training set MAE: ', mae_train)
    print('Validation set MAE: ', mae_val)
    print('\n')
    
    train_results = pd.DataFrame(data={'R2':[r2_train], 
                                       'MSE':[mse_train], 
                                       'MAE':[mae_train]})

    val_results = pd.DataFrame(data={'R2':[r2_val], 
                                       'MSE':[mse_val], 
                                       'MAE':[mae_val]})
    
    
    return train_results, val_results

In [4]:
# set caps to test
caps = [None, 1, 2, 5, 10, 100]

# initialize empty results frames
train_results = pd.DataFrame()
val_results = pd.DataFrame()

for cap in caps:
    
    # LR with only one variable
    print('Linear regression on previous target, cap={}\n'.format(cap))
    basic_train, basic_val = LR(data_basic, cap=cap)
    
    print('-------------------------------------------------------------\n')

    # LR with no lag
    print('Linear regression on data without lag variables, cap={}\n'.format(cap))
    nolag_train, nolag_val = LR(data_nolag, cap=cap)
    
    print('-------------------------------------------------------------\n')

    # LR with lag
    print('Linear regression on data with lag variables, cap={}\n'.format(cap))
    lag_train, lag_val = LR(data_lag, cap=cap)
    
    print('-------------------------------------------------------------\n')
    
    # set captag
    if cap==None:
        captag = 'no-cap'
    else:
        captag = 'cap={}'.format(cap)
        
    # create list of results dataframes
    train_dfs = [basic_train, nolag_train, lag_train]
    val_dfs =  [basic_train, nolag_train, lag_train]
    
    # create list of lists
    df_lists = [train_dfs, val_dfs]
    
    # create list of datasets
    data_list = ['basic', 'no-lag', 'lag']
        
    # set multi-index
    for df_list in df_lists:
        for i, df in enumerate(df_list):
            df['cap'] = captag
            df['data'] = data_list[i]
            df.set_index(['cap', 'data'], inplace=True) 
    
    # concatenate results
    train_results = pd.concat([train_results] + train_dfs)
    val_results = pd.concat([val_results] + val_dfs)
    

Linear regression on previous target, cap=None

Training set r2:  0.6408983008428235
Validation set r2:  0.6044081770816977


Training set MSE:  7.724115323590728
Validation set MSE:  2.8867799754372427


Training set MAE:  0.45007589540713294
Validation set MAE:  0.31525566741349303


-------------------------------------------------------------

Linear regression on data without lag variables, cap=None

Training set r2:  0.6449673975146761
Validation set r2:  0.6061943828394556


Training set MSE:  7.636590892405923
Validation set MSE:  2.8737453707897878


Training set MAE:  0.45549306840885717
Validation set MAE:  0.3379327823227568


-------------------------------------------------------------

Linear regression on data with lag variables, cap=None

Training set r2:  0.6611735142999902
Validation set r2:  0.6140908635180845


Training set MSE:  3.880313735794483
Validation set MSE:  2.81612182809032


Training set MAE:  0.3667433426817209
Validation set MAE:  0.34787371892450913


In [5]:
print('\nTraining Set Results:')
display(train_results)



Training Set Results:


Unnamed: 0_level_0,Unnamed: 1_level_0,R2,MSE,MAE
cap,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no-cap,basic,0.640898,7.724115,0.450076
no-cap,no-lag,0.644967,7.636591,0.455493
no-cap,lag,0.661174,3.880314,0.366743
cap=1,basic,0.039996,0.257263,0.383273
cap=1,no-lag,0.220832,0.208802,0.33923
cap=1,lag,0.260287,0.171187,0.293761
cap=2,basic,0.056657,0.394342,0.430185
cap=2,no-lag,0.22216,0.325158,0.387359
cap=2,lag,0.260144,0.262191,0.331597
cap=5,basic,0.09356,0.758363,0.494951


In [6]:
print('\nValidation Set Results:')
display(val_results)


Validation Set Results:


Unnamed: 0_level_0,Unnamed: 1_level_0,R2,MSE,MAE
cap,data,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no-cap,basic,0.640898,7.724115,0.450076
no-cap,no-lag,0.644967,7.636591,0.455493
no-cap,lag,0.661174,3.880314,0.366743
cap=1,basic,0.039996,0.257263,0.383273
cap=1,no-lag,0.220832,0.208802,0.33923
cap=1,lag,0.260287,0.171187,0.293761
cap=2,basic,0.056657,0.394342,0.430185
cap=2,no-lag,0.22216,0.325158,0.387359
cap=2,lag,0.260144,0.262191,0.331597
cap=5,basic,0.09356,0.758363,0.494951
