# Logistic Regression Model

In [1]:
# Import packages and dataset
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load the processed bus network table; the first CSV column becomes the index.
df = pd.read_csv(
    'data/processed/bus_network_data.csv',
    index_col=0,
    low_memory=False,
)

In [2]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time,...,Humidity,PrecipitationIn,TemperatureF,VisibilityMPH,Wind SpeedMPH,totalInjuries,pavementScore,potholeCount,prev_trip_ratio,ntwk_delay_lag1hr
0,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,44,38.45,...,58.0,0.0,39.9,10.0,10.4,162,2110.0,237,0.638492,2.0
1,B11,1.0,2016-01-01,1,1,Friday,PeakAM,9,52,53.9,...,58.0,0.0,39.9,10.0,10.4,162,2110.0,237,1.05552,2.0
2,B11,2.0,2016-01-01,1,1,Friday,MidDay,10,10,34.7,...,58.0,0.0,39.9,10.0,6.9,162,2110.0,237,1.017161,4.0
3,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,23,41.05,...,58.0,0.0,39.9,10.0,6.9,162,2110.0,237,0.979406,4.0
4,B11,2.0,2016-01-01,1,1,Friday,MidDay,10,33,40.183333,...,58.0,0.0,39.9,10.0,6.9,162,2110.0,237,0.591185,4.0


In [3]:
# Cast the 1-hour lagged network delay to strings so the column becomes an
# object column (pd.get_dummies one-hot encodes object columns by default).
df['ntwk_delay_lag1hr'] = df['ntwk_delay_lag1hr'].astype(str)

# Show the resulting column dtypes.
df.dtypes

bus_line              object
direction            float64
date                  object
month                  int64
day                    int64
day_of_week           object
time_period           object
hour                   int64
minute                 int64
trip_time            float64
avg_trip_time        float64
std_trip_time        float64
delay_time           float64
delay                 object
Conditions            object
Humidity             float64
PrecipitationIn      float64
TemperatureF         float64
VisibilityMPH        float64
Wind SpeedMPH         object
totalInjuries          int64
pavementScore        float64
potholeCount           int64
prev_trip_ratio      float64
ntwk_delay_lag1hr     object
dtype: object

In [4]:
# Select target variable and feature space
# Object-typed columns in this list are one-hot encoded by pd.get_dummies;
# numeric columns pass through unchanged.
feature_columns = [
    u'bus_line', u'direction', u'month', u'day_of_week',
    u'hour', u'Conditions', u'Humidity', u'PrecipitationIn',
    u'TemperatureF', u'VisibilityMPH', u'Wind SpeedMPH', u'totalInjuries',
    u'pavementScore', u'potholeCount', u'prev_trip_ratio', u'ntwk_delay_lag1hr',
]
X = pd.get_dummies(df[feature_columns])

# Target: the 'delay' label column.
Y = df['delay']

# Fixed-seed 70/30 hold-out split, used for the hyper-parameter search.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)

In [6]:
# Find best C parameter
from sklearn.model_selection import GridSearchCV

# Candidate values for C: exp() of 20 evenly spaced exponents between -5 and 5.
param_grid = {'C': list(map(np.exp, np.linspace(-5, 5, 20)))}

# 5-fold cross-validated grid search over C, fitted on the training split.
logit = LogisticRegression()
C_param = GridSearchCV(estimator=logit, param_grid=param_grid, cv=5)
C_param.fit(X_train, y_train)

# Display the selected hyper-parameter.
C_param.best_params_

{'C': 10.680514992399115}

In [7]:
# Estimate out-of-sample accuracy with the best C parameter.
# This is repeated random hold-out (100 independent 67/33 splits of X, Y),
# not k-fold cross-validation.
# NOTE(review): C was tuned on the earlier 70/30 training split, which overlaps
# with the test rows drawn here, so this estimate may be slightly optimistic.

# Look the value up by key: best_params_.values()[0] only works on Python 2,
# because dict views are not subscriptable on Python 3.
best_C = C_param.best_params_['C']

avg_acc = []  # one test-set accuracy per random split
for i in range(100):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, random_state=i) #Use random_state to fix samples

    # X is already one-hot encoded, so the splits are used directly. Calling
    # pd.get_dummies separately on train and test adds nothing here and could
    # produce mismatched columns if any categorical column were still present.
    clf = LogisticRegression(C=best_C)
    clf.fit(X_train, Y_train)

    # score() returns the mean accuracy on the given test data and labels.
    avg_acc.append(clf.score(X_test, Y_test))

print("Successfully (OS) predict {}% of bus statuses".format(np.mean(avg_acc)*100))

Successfully (OS) predict 41.2610822614% of bus statuses
