In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline

In [2]:
wego = pd.read_csv('data/wego_ml.csv')

In [3]:
linreg = LinearRegression()
selector = VarianceThreshold(.001)

1. Fit a linear regression model predicting the ADHERENCE using the ROUTE_ABBR and ROUTE_DIRECTION_NAME columns. Measure the performance of the model using the R^2 and mean absolute error metrics. Interpret the meaning of each metric.

In [4]:
predictorsq1 = ['ROUTE_ABBR', 'ROUTE_DIRECTION_NAME']
categorical_predictorsq1 = ['ROUTE_ABBR', 'ROUTE_DIRECTION_NAME']

Xq1 = wego[predictorsq1]
Xq1 = pd.get_dummies(Xq1, columns = categorical_predictorsq1, drop_first= True)
yq1 = wego['ADHERENCE']

In [5]:
X_trainq1, X_testq1, y_trainq1, y_testq1 = train_test_split(Xq1, yq1, random_state = 321)

In [6]:
linregq1 = linreg.fit(X_trainq1, y_trainq1)
y_predq1 = linregq1.predict(X_testq1)

In [7]:
print(f'MSE with route_abbr and route_direction_name: {mean_squared_error(y_testq1, y_predq1)}')
print(f'MAE with route_abbr and route_direction_name: {mean_absolute_error(y_testq1, y_predq1)}')
print(f'R^2 with route_abbr and route_direction_name: {r2_score(y_testq1, y_predq1)}')

MSE with route_abbr and route_direction_name: 29.781969782409398
MAE with route_abbr and route_direction_name: 3.47130898325546
R^2 with route_abbr and route_direction_name: 0.08768878523151136


2. Now, try using the ROUTE_ABBR, ROUTE_DIRECTION_NAME, and OPERATOR. Does this improve the model?

Warning: Your model may perform very poorly once you add the OPERATOR. If so, this is likely caused because some operators have very few observations. One option to correct this is to assign an "Other" (or -999999) value to operators with few observations.

In [20]:
predictorsq2 = ['ROUTE_ABBR', 'ROUTE_DIRECTION_NAME', 'OPERATOR']
categorical_predictorsq2 = ['ROUTE_ABBR', 'ROUTE_DIRECTION_NAME', 'OPERATOR']
operators = wego['OPERATOR'].value_counts()[lambda x: x> 50].index.tolist()

Xq2 = wego[predictorsq2]
Xq2.loc[~Xq2['OPERATOR'].isin(operators), 'OPERATOR'] = 999999
Xq2 = pd.get_dummies(Xq2, columns = categorical_predictorsq2, drop_first= True)

yq2 = wego['ADHERENCE']

In [21]:
X_trainq2, X_testq2, y_trainq2, y_testq2 = train_test_split(Xq2, yq2, random_state = 321)

In [22]:
linregq2 = linreg.fit(X_trainq2, y_trainq2)
y_predq2 = linregq2.predict(X_testq2)

In [23]:
print(f'MSE with route_abbr and route_direction_name: {mean_squared_error(y_testq2, y_predq2)}')
print(f'MAE with route_abbr and route_direction_name: {mean_absolute_error(y_testq2, y_predq2)}')
print(f'R^2 with route_abbr and route_direction_name: {r2_score(y_testq2, y_predq2)}')

MSE with route_abbr and route_direction_name: 26.623028069959926
MAE with route_abbr and route_direction_name: 3.192851104437311
R^2 with route_abbr and route_direction_name: 0.18445666096717983


3. Finally, the data you have been provided has an STARTING_ADHERENCE column, which contains the ADHERENCE at the beginning of the route. If you add this metric, does it improve the model? Is this of any practical use?

In [24]:
predictorsq3 = ['ROUTE_ABBR', 'ROUTE_DIRECTION_NAME', 'OPERATOR', 'STARTING_ADHERENCE']
categorical_predictorsq3 = ['ROUTE_ABBR', 'ROUTE_DIRECTION_NAME', 'OPERATOR']
operators = wego['OPERATOR'].value_counts()[lambda x: x> 50].index.tolist()

Xq3 = wego[predictorsq3]
Xq3.loc[~Xq3['OPERATOR'].isin(operators), 'OPERATOR'] = 999999
Xq3 = pd.get_dummies(Xq3, columns = categorical_predictorsq3, drop_first= True)
yq3 = wego['ADHERENCE']

In [25]:
X_trainq3, X_testq3, y_trainq3, y_testq3 = train_test_split(Xq3, yq3, random_state = 321)

In [26]:
linregq3 = linreg.fit(X_trainq3, y_trainq3)
y_predq3 = linregq3.predict(X_testq3)

In [27]:
print(f'MSE with route_abbr and route_direction_name: {mean_squared_error(y_testq3, y_predq3)}')
print(f'MAE with route_abbr and route_direction_name: {mean_absolute_error(y_testq3, y_predq3)}')
print(f'R^2 with route_abbr and route_direction_name: {r2_score(y_testq3, y_predq3)}')

MSE with route_abbr and route_direction_name: 17.573309428241508
MAE with route_abbr and route_direction_name: 2.7557565402756365
R^2 with route_abbr and route_direction_name: 0.4616767329657633


In [28]:
linregq1.coef_

array([ 7.50248390e-01, -2.27304615e-01,  7.79277950e-01,  9.26312558e-01,
        1.00444363e+00,  3.41660721e-02, -1.91353622e+00,  5.40040012e-02,
        1.30333832e+00,  8.90592322e-01,  1.59114063e+00, -1.53017547e+00,
        8.59115315e-01, -1.05509893e+00, -9.36576094e-01,  1.30591984e+00,
        3.50968966e-01,  3.88742814e-01, -1.35475117e-01,  7.83673517e-02,
       -8.82693138e-02,  1.61392971e+00,  7.55197259e-01,  1.51204312e+00,
        1.08025328e+00,  1.87887049e+00,  2.46627280e+00,  2.11389690e+00,
        1.89159093e+00,  1.63711273e+00,  9.73228068e-01,  1.39860941e+00,
        1.69390296e+00,  4.17419929e-01,  2.43495754e+00,  3.32772044e+00,
        2.49233045e-01,  1.07902482e+00,  1.60613121e+00,  6.25864760e-01,
       -1.34891837e-01,  2.51649988e-01,  2.98201360e+00,  1.27532308e-01,
       -1.76895055e-01,  5.24664468e-01, -2.21638368e-01, -1.03538705e+00,
        1.02297303e+00,  8.29986645e-01,  2.58702291e+00, -2.32281079e+00,
        1.00055819e+00,  