# Logistic Regression Model
We opted to fit a logistic regression first because of its fast compute time and straightforward implementation.


In [8]:
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# legacy sklearn.grid_search module was removed in 0.20, so it is only a fallback.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Loading in the pickled data

In [9]:
# Read the pickled train/test feature frames and target vectors from the
# assets folder (features first, then targets).
X_train, X_test = (
    pd.read_pickle(path)
    for path in ('../assets/X_train.pkl', '../assets/X_test.pkl')
)
y_train, y_test = (
    pd.read_pickle(path)
    for path in ('../assets/y_train.pkl', '../assets/y_test.pkl')
)

In [10]:
# List the feature names present in the training frame.
X_train.columns


Index(['Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
       'WetBulb', 'Cool', 'PrecipTotal', 'StnPressure', 'Latitude',
       'Longitude', 'Month', 'Day_length_exp', 'Tavg_shift', 'Heat_exp',
       'Cool_shift', 'Tmax_shift', 'Tmin_shift', 'Depart_shift',
       'ResultSpeed_shift', 'ResultDir_exp', 'PrecipTotal_exp', 'WetBulb_exp',
       'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS'],
      dtype='object')

# Setting up the pipeline
We're optimizing the model first by scaling our values.
Next we cycle through both L1 and L2 penalties to determine which type of regularization works best for our model.
Finally we iterate through 50 evenly spaced C values between .001 and .95, which spans the min and max of the range we considered acceptable.

In [11]:
# Two-stage pipeline: standardize every feature ('ss'), then fit a liblinear
# logistic regression ('lr') on the scaled values.
pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(solver='liblinear')),
])

In [12]:
# Hyper-parameter grid for the 'lr' pipeline step: both penalty types crossed
# with 50 evenly spaced C values running from 0.001 to 0.95.
param_grid = dict(
    lr__penalty=['l1', 'l2'],
    lr__C=np.linspace(0.001, 0.95, num=50),
)

In [16]:
# Exhaustive search over param_grid, ranking candidates by cross-validated
# ROC AUC. cv is pinned to 3 folds so the search matches the recorded
# "Fitting 3 folds" run even on scikit-learn releases whose default differs.
gs = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=1, scoring='roc_auc')

In [None]:
# Run the grid search on the training data: 2 penalties x 50 C values gives
# 100 candidate settings, each evaluated with cross-validation.
gs.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


# Scoring our model

In [None]:
# Score the fitted search on the training data. The search was built with
# scoring='roc_auc', so this is expected to be ROC AUC rather than accuracy.
gs.score(X_train,y_train)

In [None]:
# Score the fitted search on the held-out test data (same scorer as the
# training score; expected to be ROC AUC given scoring='roc_auc').
gs.score(X_test,y_test)

In [None]:
# Class predictions for every training row, produced by the fitted search.
X_train_preds = gs.predict(X_train)

# Making a predictions dataframe

In [None]:
# Pair each training prediction with its true label for side-by-side checks.
preds = pd.DataFrame(dict(preds=X_train_preds, truth=y_train))


In [None]:
# Column totals: assuming both columns hold 0/1 labels, this compares the
# number of predicted positives with the number of actual positives.
preds.sum()

# Inspecting the best params

In [13]:
# Show the hyper-parameter combination the grid search selected as best.
gs.best_params_

{'lr__C': 0.001, 'lr__penalty': 'l2'}

In [14]:
# Sanity check: number of training labels.
y_train.shape

(8189,)

In [15]:
# Sanity check: training rows and feature count; the row count should match
# the number of labels.
X_train.shape

(8189, 37)

In [16]:
# Sanity check: one prediction per training row.
X_train_preds.shape

(8189,)

In [17]:
# Preview the first few rows of the prediction/truth table.
preds.head()

Unnamed: 0,preds,truth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [18]:
# One row of fitted logistic-regression coefficients, labelled with the
# training feature names. The model was fit on X_train, so its column order
# is the one the coefficients follow. The values apply to the standardized
# features, since the scaler runs before the 'lr' step.
coefs = pd.DataFrame(
    gs.best_estimator_.named_steps['lr'].coef_,
    columns=list(X_train.columns),
)

In [19]:
# Per-feature variances learned by the StandardScaler step. Despite its name,
# `scalar` holds the variance array, not the scaler object. The step is looked
# up by its name ('ss') instead of by position, so the lookup still works if
# the pipeline's step order changes.
scalar = gs.best_estimator_.named_steps['ss'].var_

In [20]:
# Transpose so each feature becomes a row, which reads more easily than one
# wide row of coefficients.
coefs.T

Unnamed: 0,0
Day_length,-0.010777
Tmax,-0.005629
Tmin,0.044464
Tavg,0.018795
ResultSpeed,-0.007766
ResultDir,-0.017823
AvgSpeed,-0.002121
Sunset,-0.005906
Sunrise,0.04293
Heat,-0.016123


In [21]:
# Persist the whole fitted grid search (not just the best estimator) so it can
# be reloaded later. Plain write-binary mode is enough: nothing is read back.
with open('../assets/logistic_regression.pkl', 'wb') as f:
    pickle.dump(gs, f)