In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [2]:
# Load the modelling table from the gzip-compressed CSV.
# (The file name suggests SafeGraph data that is already one-hot encoded;
# confirm against the data-preparation step.)
data = pd.read_csv(
    './data/safegraph_ohe.csv.gz',
    compression='gzip',
)

In [3]:
# All SafeGraph features: every column except the split key ('week'),
# the location key ('postal_code') and the label ('target').
# list.remove raises ValueError if one of these columns is missing.
feature_cols = data.columns.tolist()
for col in ('week', 'postal_code', 'target'):
    feature_cols.remove(col)

# Features that get centred before fitting: everything that is not a
# 'naics_' column (presumably one-hot indicator columns -- confirm).
scale_cols = [col for col in feature_cols if not col.startswith('naics_')]

# SafeGraph features without lag: drop the '_lastweek' / '_nextweek' columns.
basic_feature_cols = [
    col for col in feature_cols
    if not col.endswith(('_lastweek', '_nextweek'))
]
basic_scale_cols = [
    col for col in basic_feature_cols if not col.startswith('naics_')
]

In [4]:
# Linear Regression on SafeGraph features without lag
#
# Weeks before 14 are used for training, week 14 for validation and
# week 15 for testing. The test split is prepared and centred here but is
# not scored in this cell.

# Row masks for the three splits
train_rows = data['week'] < 14
val_rows = data['week'] == 14
test_rows = data['week'] == 15

# Rows and columns are selected in a single .loc call and the result is
# copied, so the in-place centring below writes to frames this cell owns
# rather than to column slices derived from another frame.

# Make week 15 the test set
X_test = data.loc[test_rows, basic_feature_cols].copy()
y_test = data.loc[test_rows, 'target']

# Make week 14 the validation set
X_val = data.loc[val_rows, basic_feature_cols].copy()
y_val = data.loc[val_rows, 'target']

# Make weeks before 14 the training set
X_train = data.loc[train_rows, basic_feature_cols].copy()
y_train = data.loc[train_rows, 'target']

del train_rows, val_rows, test_rows

# Centre everything based on X_train.
# with_std=False: the scaler only subtracts the per-column training mean;
# it does not divide by the standard deviation.
scaler_basic = StandardScaler(with_std=False)
scaler_basic.fit(X_train[basic_scale_cols])
X_train[basic_scale_cols] = scaler_basic.transform(X_train[basic_scale_cols])
X_val[basic_scale_cols] = scaler_basic.transform(X_val[basic_scale_cols])
X_test[basic_scale_cols] = scaler_basic.transform(X_test[basic_scale_cols])

# Fit model
reg_basic = LinearRegression().fit(X_train, y_train)

# Evaluation on the training and validation splits
print('Training set r2: ', reg_basic.score(X_train, y_train))
print('Validation set r2: ', reg_basic.score(X_val, y_val))
print('\n')
pred_train = reg_basic.predict(X_train)
pred_val = reg_basic.predict(X_val)
print('Training set MSE: ', mean_squared_error(y_train, pred_train))
print('Validation set MSE: ', mean_squared_error(y_val, pred_val))
print('\n')
print('Training set MAE: ', mean_absolute_error(y_train, pred_train))
print('Validation set MAE: ', mean_absolute_error(y_val, pred_val))

# Release the split frames before the next experiment rebuilds them
del X_train, y_train, X_val, y_val, X_test, y_test

Training set r2:  0.6464452846205154
Validation set r2:  0.6164407714770381


Training set MSE:  4.0489845875171
Validation set MSE:  2.798973679804583


Training set MAE:  0.36741371793675637
Validation set MAE:  0.30227328813574694


In [5]:
# Linear Regression on all SafeGraph features
#
# Weeks before 14 are used for training, week 14 for validation and
# week 15 for testing. The test split is prepared and centred here but is
# not scored in this cell.

# Row masks for the three splits
train_rows = data['week'] < 14
val_rows = data['week'] == 14
test_rows = data['week'] == 15

# Rows and columns are selected in a single .loc call and the result is
# copied, so the in-place centring below writes to frames this cell owns
# rather than to column slices derived from another frame.

# Make week 15 the test set
X_test = data.loc[test_rows, feature_cols].copy()
y_test = data.loc[test_rows, 'target']

# Make week 14 the validation set
X_val = data.loc[val_rows, feature_cols].copy()
y_val = data.loc[val_rows, 'target']

# Make weeks before 14 the training set
X_train = data.loc[train_rows, feature_cols].copy()
y_train = data.loc[train_rows, 'target']

del train_rows, val_rows, test_rows

# Centre everything based on X_train.
# with_std=False: the scaler only subtracts the per-column training mean;
# it does not divide by the standard deviation.
scaler_all = StandardScaler(with_std=False)
scaler_all.fit(X_train[scale_cols])
X_train[scale_cols] = scaler_all.transform(X_train[scale_cols])
X_val[scale_cols] = scaler_all.transform(X_val[scale_cols])
X_test[scale_cols] = scaler_all.transform(X_test[scale_cols])

# Fit model
reg_all = LinearRegression().fit(X_train, y_train)

# Evaluation on the training and validation splits
print('Training set r2: ', reg_all.score(X_train, y_train))
print('Validation set r2: ', reg_all.score(X_val, y_val))
print('\n')
pred_train = reg_all.predict(X_train)
pred_val = reg_all.predict(X_val)
print('Training set MSE: ', mean_squared_error(y_train, pred_train))
print('Validation set MSE: ', mean_squared_error(y_val, pred_val))
print('\n')
print('Training set MAE: ', mean_absolute_error(y_train, pred_train))
print('Validation set MAE: ', mean_absolute_error(y_val, pred_val))

Training set r2:  0.6611908575627418
Validation set r2:  0.6140657768618413


Training set MSE:  3.8801151170219876
Validation set MSE:  2.8163048947077223


Training set MAE:  0.3666799905087766
Validation set MAE:  0.347673262427686
