In [1]:
import pandas as pd
import numpy as np
import random 
import model_preparation

from model_preparation import prepare_data, get_features, get_bounds, get_interval_accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, LassoCV, Lasso

# Pull the already-partitioned train/test data and the feature names from model_preparation
X_train, X_test, y_train, y_test = prepare_data()
features = get_features()

# Bounds around the train and test targets, built with arguments 5 and 10
# (presumably +/-5% and +/-10% tolerance bands -- confirm in model_preparation.get_bounds)
train_bounds_5, train_bounds_10 = (get_bounds(y_train, pct) for pct in (5, 10))
test_bounds_5, test_bounds_10 = (get_bounds(y_test, pct) for pct in (5, 10))

In [2]:
# Ordinary least-squares baseline (no regularization penalty); fit() returns the estimator itself
lm = LinearRegression().fit(X_train, y_train)

# Predictions for both partitions
y_pred_train, y_pred_test = lm.predict(X_train), lm.predict(X_test)

# In-sample fit: mean squared error, then R^2
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Held-out fit: MSE and r2 are rebound here, so after this cell they hold the test-set values
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")


Train Scores:
102.27541467609375
0.8431733491391098
Test Scores:
103.26946049382812
0.8414673849451206


In [3]:
# Tabulate the fitted linear-model coefficients, one row per feature name.
# scikit-learn exposes coef_ as (n_features,) when fit on a 1-D target and as
# (n_targets, n_features) when fit on a 2-D target. Indexing a 1-D coef_ with [0]
# would give a single scalar, which pandas broadcasts down every row of the frame,
# so promote to 2-D first and then take the first target's row. For a 2-D coef_
# the result is unchanged.
# NOTE(review): the recorded output shows magnitudes around 1e11-1e12 for originpop,
# destinationpop and distance -- possibly collinear or unscaled features; worth confirming.
coef_df = pd.DataFrame(np.atleast_2d(lm.coef_)[0], features, columns=['Coefficient'])
coef_df

Unnamed: 0,Coefficient
travel_mins,-38.31961
originpop,535256200000.0
destinationpop,5170169000000.0
days_to_holiday,1.263608
days_from_holiday,-0.7444206
distance,-2762356000000.0
month,-2.830604
date,1.546332
hour,-0.3804377
minute,-3.685436


In [5]:
# Report the interval accuracy score of the test predictions against the 5 and 10 bounds
# (presumably the share of predictions that land inside each band -- confirm in
# model_preparation.get_interval_accuracy_score)
for _label, _bounds in (("5% +/- limit:", test_bounds_5), ("10% +/- limit:", test_bounds_10)):
    print(_label)
    print(get_interval_accuracy_score(_bounds, y_pred_test))

5% +/- limit:
0.2989
10% +/- limit:
0.53445
