In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [2]:
# Load the lagged and non-lagged SafeGraph feature tables (gzip-compressed CSVs).
data_lag, data_nolag = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in (
        './data/safegraph_lag_ohe.csv.gz',
        './data/safegraph_no-lag_ohe.csv.gz',
    )
)

In [3]:
def LR(data, val_week=14, target_cap=5):
    '''
    Fit a linear regression on the early weeks and print its fit quality.

    Rows with ``week < val_week`` form the training set and rows with
    ``week == val_week`` form the validation set.  Later weeks (with the
    defaults, this includes the week-15 test set) are held out: they are
    never used for fitting, centering or evaluation here.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'week', 'postal_code' and 'target'.
        Every other column is used as a feature.  ``data`` itself is not
        modified.
    val_week : int, default 14
        Week used for validation; all strictly earlier weeks are training.
    target_cap : number, default 5
        Upper bound applied to the target before fitting and scoring.

    Returns
    -------
    None
        R^2, MSE and MAE for the training and validation sets are printed.

    Raises
    ------
    ValueError
        If a required column is missing or either split has no rows.
    '''

    # Everything except the identifiers and the target is a feature.
    non_features = ['week', 'postal_code', 'target']
    missing = [col for col in non_features if col not in data.columns]
    if missing:
        raise ValueError('data is missing required column(s): %s' % missing)
    feature_cols = [col for col in data.columns if col not in non_features]

    # 'naics_*' columns are left as-is (presumably indicator columns --
    # TODO confirm); all remaining features get mean-centered below.
    scale_cols = [col for col in feature_cols if not col.startswith('naics_')]

    def _split(mask):
        # Explicit copy so the in-place centering below never writes into
        # (a view of) the caller's frame.
        features = data.loc[mask, feature_cols].copy()
        # Vectorized equivalent of min(value, target_cap) per row.
        target = data.loc[mask, 'target'].clip(upper=target_cap)
        return features, target

    X_train, y_train = _split(data['week'] < val_week)
    X_val, y_val = _split(data['week'] == val_week)

    if X_train.empty or X_val.empty:
        raise ValueError(
            'empty split: %d training row(s), %d validation row(s) for val_week=%r'
            % (len(X_train), len(X_val), val_week)
        )

    # Center (with_std=False: subtract the mean only, no variance scaling)
    # using statistics learned from the training rows alone.
    if scale_cols:
        scaler = StandardScaler(with_std=False)
        scaler.fit(X_train[scale_cols])
        for features in (X_train, X_val):
            features[scale_cols] = scaler.transform(features[scale_cols])

    # Fit model
    reg = LinearRegression().fit(X_train, y_train)

    # Evaluation
    print('Training set r2: ', reg.score(X_train, y_train))
    print('Validation set r2: ', reg.score(X_val, y_val))
    print('\n')
    pred_train = reg.predict(X_train)
    pred_val = reg.predict(X_val)
    print('Training set MSE: ', mean_squared_error(y_train, pred_train))
    print('Validation set MSE: ', mean_squared_error(y_val, pred_val))
    print('\n')
    print('Training set MAE: ', mean_absolute_error(y_train, pred_train))
    print('Validation set MAE: ', mean_absolute_error(y_val, pred_val))
    print('\n')

In [4]:
# Run the same fit/evaluate routine on both feature sets, printing a
# divider line between the two reports.
runs = [
    ('Linear regression on data without lag variables\n', data_nolag),
    ('Linear regression on data with lag variables\n', data_lag),
]
for run_idx, (heading, frame) in enumerate(runs):
    if run_idx > 0:
        print('-----------------------------------------------\n')
    print(heading)
    LR(frame)



Linear regression on data without lag variables

Training set r2:  0.23035741165378942
Validation set r2:  0.15805063384253915


Training set MSE:  0.6439127445937558
Validation set MSE:  0.42117715018334123


Training set MAE:  0.4586889968624915
Validation set MAE:  0.3932796157323233


-----------------------------------------------

Linear regression on data with lag variables

Training set r2:  0.2719772584571466
Validation set r2:  0.22706518114514807


Training set MSE:  0.49619566931001247
Validation set MSE:  0.38665328031362983


Training set MAE:  0.38393375732497287
Validation set MAE:  0.3504706704067968


