In [2]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn import linear_model
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm

import warnings
warnings.filterwarnings('ignore')

In [3]:
directory = os.getcwd()
data_all = pd.read_csv(directory + '/EMS_2017_2018_cat_final_onehot_thresh.csv', sep=',') 

In [4]:
data_all.head()

Unnamed: 0,INITIAL_SEVERITY_LEVEL_CODE,DISPATCH_RESPONSE_SECONDS_QY,INCIDENT_RESPONSE_SECONDS_QY,INCIDENT_TRAVEL_TM_SECONDS_QY,HELD_INDICATOR,T0,T1,T3,T4,T5,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,4,287,579.0,292.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,3,11,301.0,290.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,14,270.0,256.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,9,25.0,16.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,3,18,218.0,200.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
data = data_all[['INITIAL_SEVERITY_LEVEL_CODE', 'DISPATCH_RESPONSE_SECONDS_QY','HELD_INDICATOR', 'T0', 'T1', 'T4',\
                 'T5', 'T10', 'T12', 'T14', 'T15','T17', 'T21', 'T25', 'T27', 'T35', 'T37', '10457.0', '11207.0',\
                 'City_2.0', 'City_4.0', 'City_15.0', 'City_17.0', 'City_37.0','City_42.0', 'Comm_105.0', 'Schl_8.0',\
                 'Schl_9.0', 'Schl_10.0','Schl_27.0', 'Schl_28.0', 'Con_6.0', 'Con_7.0', 'Con_8.0', 'Con_9.0',\
                 'Con_10.0', 'Con_11.0', 'Con_12.0', 'Con_13.0', 'Con_14.0', 'Con_15.0','Day_0', 'Day_1', 'Day_2',\
                 'Day_3', 'Day_4', 'Day_5', 'Day_6', 'Hr_0','Hr_1', 'Hr_2', 'Hr_3', 'Hr_4', 'Hr_5', 'Hr_6', 'Hr_7', \
                 'Hr_8', 'Hr_9','Hr_10', 'Hr_11', 'Hr_12', 'Hr_13', 'Hr_14', 'Hr_15', 'Hr_16', 'Hr_17','Hr_18', \
                 'Hr_19', 'Hr_20', 'Hr_21', 'Hr_22', 'Hr_23', 'Month_1','Month_2', 'Month_3', 'Month_4', 'Month_5',\
                 'Month_6', 'Month_7','Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12','INCIDENT_TRAVEL_TM_SECONDS_QY']]

In [6]:
data.head()

Unnamed: 0,INITIAL_SEVERITY_LEVEL_CODE,DISPATCH_RESPONSE_SECONDS_QY,HELD_INDICATOR,T0,T1,T4,T5,T10,T12,T14,...,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,INCIDENT_TRAVEL_TM_SECONDS_QY
0,4,287,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,292.0
1,3,11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,290.0
2,2,14,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,256.0
3,4,9,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,16.0
4,3,18,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,200.0


In [7]:
data.shape

(50000, 85)

***LASSO REGRESSION***

Create lasso linear regression model and fit with training dataset

In [52]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [9]:
#X is predictors, Y is value to be predicted
X = data.drop(columns = ["INCIDENT_TRAVEL_TM_SECONDS_QY"])
Y = data["INCIDENT_TRAVEL_TM_SECONDS_QY"]

In [10]:
X.head()

Unnamed: 0,INITIAL_SEVERITY_LEVEL_CODE,DISPATCH_RESPONSE_SECONDS_QY,HELD_INDICATOR,T0,T1,T4,T5,T10,T12,T14,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,4,287,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,3,11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,9,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,3,18,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
Y.head()

0    292.0
1    290.0
2    256.0
3     16.0
4    200.0
Name: INCIDENT_TRAVEL_TM_SECONDS_QY, dtype: float64

In [12]:
#splitting into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.75)

In [13]:
X_train.head()

Unnamed: 0,INITIAL_SEVERITY_LEVEL_CODE,DISPATCH_RESPONSE_SECONDS_QY,HELD_INDICATOR,T0,T1,T4,T5,T10,T12,T14,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
10907,2,21,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
44741,4,242,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
32909,4,27,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
6308,7,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19520,4,12,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [14]:
Y_train.head()

10907    217.0
44741    371.0
32909    497.0
6308     859.0
19520    163.0
Name: INCIDENT_TRAVEL_TM_SECONDS_QY, dtype: float64

In [15]:
X_test.head()

Unnamed: 0,INITIAL_SEVERITY_LEVEL_CODE,DISPATCH_RESPONSE_SECONDS_QY,HELD_INDICATOR,T0,T1,T4,T5,T10,T12,T14,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
33488,2,6,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
16949,7,107,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
15674,4,43,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
40359,3,19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
24478,6,12,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [16]:
Y_test.head()

33488    378.0
16949    457.0
15674    563.0
40359    566.0
24478    313.0
Name: INCIDENT_TRAVEL_TM_SECONDS_QY, dtype: float64

***Lasso Model***

In [78]:
lasso = linear_model.Lasso()
parameters = {'alpha': [1e-15,1e-10,1e-8,1e-4,1e-3,5e-2,1e-2,0.5,1,5,10,20]}
lasso_reg = GridSearchCV(lasso,parameters,scoring = 'neg_mean_squared_error',cv=10)
lasso_reg.fit(X_train,Y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.05,
                                   0.01, 0.5, 1, 5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [79]:
lasso_reg.best_params_

{'alpha': 0.01}

In [80]:
lasso_reg.best_score_

-35681.87736545877

In [None]:
lasso = linear_model.Lasso(normalize = True,max_iter = 100000)
parameters = {'alpha': [1e-15,1e-10,1e-8,1e-4,1e-3,5e-2,1e-2,0.5,1,5,10,20]}
lasso_reg = GridSearchCV(lasso,parameters,scoring = 'neg_mean_squared_error',cv=10)
lasso_reg.fit(X_train,Y_train)

In [85]:
lasso_reg.best_params_

{'alpha': 0.001}

In [86]:
lasso_reg.best_score_

-35680.8481866258

***Ridge Regression***

In [59]:
from sklearn.linear_model import Ridge

In [81]:
ridge = linear_model.Ridge()
parameters = {'alpha': [1e-2,0.5,1,5,10,15,20,22,25,26,27,28,30,35,40]}
ridge_reg = GridSearchCV(ridge, parameters,scoring = 'neg_mean_squared_error',cv=5)
ridge_reg.fit(X_train,Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.01, 0.5, 1, 5, 10, 15, 20, 22, 25, 26, 27,
                                   28, 30, 35, 40]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [82]:
ridge_reg.best_params_

{'alpha': 26}

In [83]:
ridge_reg.best_score_

-35693.78879846299