# Logistic Regression Model
We opted to fit a logistic regression first because of its fast compute time and straightforward implementation.

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.grid_search import GridSearchCV



In [2]:
# Load the train/test feature frames and label vectors from ../assets.
# The four files are read in the order listed and bound to the matching names.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        '../assets/X_train.pkl',
        '../assets/X_test.pkl',
        '../assets/y_train.pkl',
        '../assets/y_test.pkl',
    )
)

In [3]:
# List the feature names available to the model.
X_train.columns


Index(['Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Sunset', 'Heat', 'Depart', 'DewPoint', 'Cool',
       'PrecipTotal', 'StnPressure', 'Latitude', 'Longitude', 'Month',
       'Day_length_exp', 'Tavg_shift', 'Heat_exp', 'Cool_shift', 'Tmax_shift',
       'Tmin_shift', 'Depart_shift', 'ResultSpeed_shift', 'ResultDir_exp',
       'PrecipTotal_exp', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS'],
      dtype='object')

In [4]:
# Two-step modelling pipeline: standardise the features, then fit a logistic
# regression. The step names 'ss' and 'lr' are the prefixes used to address
# each step's parameters (e.g. 'lr__C') and to look the steps up afterwards.
pipe = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('lr', LogisticRegression(solver='liblinear')),
    ]
)

In [5]:
# Hyper-parameter grid for the 'lr' pipeline step: both penalty types crossed
# with 50 evenly spaced values of C between 0.001 and 0.95 (100 candidates).
param_grid = dict(
    lr__penalty=['l1', 'l2'],
    lr__C=np.linspace(0.001, 0.95, num=50),
)

In [6]:
# Exhaustive search over param_grid, ranking candidates by ROC AUC.
# cv is not set, so the library default applies (the fit log reports 3 folds).
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=1,
)

In [7]:
# Run the grid search on the training data. The repr below shows refit=True,
# so the best candidate is refit and used by gs.score / gs.predict afterwards.
gs.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   22.6s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'lr__penalty': ['l1', 'l2'], 'lr__C': array([0.001  , 0.02037, 0.03973, 0.0591 , 0.07847, 0.09784, 0.1172 ,
       0.13657, 0.15594, 0.17531, 0.19467, 0.21404, 0.23341, 0.25278,
       0.27214, 0.29151, 0.31088, 0.33024, 0.34961, 0.36898, 0.38835,
       0.40771, 0.42708, 0.44645, 0.4658...69, 0.79506,
       0.81443, 0.8338 , 0.85316, 0.87253, 0.8919 , 0.91127, 0.93063,
       0.95   ])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [8]:
# Training-set score of the refit best pipeline, computed with the 'roc_auc'
# scorer that was passed to GridSearchCV.
gs.score(X_train,y_train)

0.8477156203854571

In [9]:
# Held-out test score with the same 'roc_auc' scorer.
# NOTE(review): the recorded test score is roughly 0.11 below the training
# score -- worth checking for overfitting or a train/test distribution shift.
gs.score(X_test,y_test)

0.7380049521236992

In [10]:
# Hard class predictions on the training data from the refit best pipeline.
X_train_preds = gs.predict(X_train)

In [11]:
# Put the training predictions next to the true labels for inspection.
preds = pd.DataFrame(dict(preds=X_train_preds, truth=y_train))


In [12]:
# Column totals; assuming 0/1 labels, these count predicted vs. actual positives.
# NOTE(review): the recorded output shows 0 predicted positives against 261
# actual positives, i.e. the default decision threshold never flags the
# minority class. Consider predict_proba with a tuned threshold, or
# class_weight='balanced' -- TODO confirm with the team.
preds.sum()

preds      0
truth    261
dtype: int64

In [13]:
# Best hyper-parameter combination found by the grid search.
# NOTE(review): the recorded best C (~0.0204) is the second-smallest grid value,
# so the optimum sits in the sparsely sampled low end of the linear grid;
# a log-spaced grid may resolve it better.
gs.best_params_

{'lr__C': 0.02036734693877551, 'lr__penalty': 'l2'}

In [14]:
# Sanity check: number of training labels (should equal the row count of X_train).
y_train.shape

(6483,)

In [15]:
# Sanity check: (rows, features) of the training matrix.
X_train.shape

(6483, 34)

In [16]:
# Sanity check: one prediction per training row.
X_train_preds.shape

(6483,)

In [20]:
# Peek at the first rows of the prediction/truth table.
preds.head()

Unnamed: 0,preds,truth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [128]:
# Coefficients of the refit logistic-regression step as a one-row frame with
# one column per feature. The values are on the standardised feature scale
# because the 'lr' step sits behind the StandardScaler in the pipeline.
# Columns are labelled from X_train -- the frame the model was fitted on --
# so the labels cannot drift from the fitted feature order.
coefs = pd.DataFrame(
    gs.best_estimator_.named_steps['lr'].coef_,
    columns=list(X_train.columns),
)

In [139]:
# Per-feature variances learned by the fitted StandardScaler step ('ss'),
# looked up by step name rather than by position in the pipeline.
scalar = gs.best_estimator_.named_steps['ss'].var_

In [144]:
# Transpose so each feature is a row, which is easier to read than one wide row.
coefs.T

Unnamed: 0,0
Day_length,-0.110032
Tmax,0.008569
Tmin,0.092149
Tavg,0.03873
ResultSpeed,0.046628
ResultDir,0.015374
AvgSpeed,-0.047583
Sunset,-0.133089
Heat,0.024957
Depart,-0.140172


In [18]:
# Persist the fitted GridSearchCV object (including the refit best pipeline)
# for later use. The handle is only written to, so plain binary write mode is
# enough. NOTE: a pickle must be loaded with a compatible scikit-learn version
# and only from a trusted location, since unpickling can execute arbitrary code.
with open('../assets/logistic_regression.pkl', 'wb') as f:
    pickle.dump(gs, f)