# Logistic Regression Model
We opted to fit a logistic regression first because of it's fast compute time and straightforaward implementation. 

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVC




# Loading in the pickled data

In [2]:
X_train = pd.read_pickle('../assets/X_train.pkl')
X_test = pd.read_pickle('../assets/X_test.pkl')
y_train = pd.read_pickle('../assets/y_train.pkl')
y_test = pd.read_pickle('../assets/y_test.pkl')

In [3]:
X_train.columns

Index(['Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Sunset', 'Heat', 'Depart', 'DewPoint', 'Cool',
       'PrecipTotal', 'StnPressure', 'Latitude', 'Longitude', 'Month',
       'Day_length_exp', 'Tavg_shift', 'Heat_exp', 'Cool_shift', 'Tmax_shift',
       'Tmin_shift', 'Depart_shift', 'ResultSpeed_shift', 'ResultDir_exp',
       'PrecipTotal_exp', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS'],
      dtype='object')

# Setting up the pipeline

In [4]:
pipe = Pipeline([
    ('ss',StandardScaler()),
    ('pca',PCA(svd_solver='randomized')),
#     ('svc',SVC(kernel='poly',C=.001,class_weight='balanced')),
    ('lr',LogisticRegression(solver='liblinear',max_iter=1000)),
    
])

In [5]:
param_grid =  {
    'lr__penalty':['l1','l2'],
    'lr__C': np.linspace(.95,50)
        
}

In [28]:
gs = GridSearchCV(pipe, param_grid=param_grid,verbose=1,scoring='roc_auc')

In [None]:
gs.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


# Scoring our model

In [None]:
gs.score(X_train,y_train)

In [None]:
gs.score(X_test,y_test)

In [None]:
X_train_preds = gs.predict(X_train)

# Making a predictions dataframe

In [None]:
preds = pd.DataFrame({
    "preds":X_train_preds,
    "truth":y_train
})


In [19]:
preds.sum()

preds      0
truth    261
dtype: int64

Inspecting the best params

In [20]:
gs.best_params_

{'lr__C': 0.95, 'lr__penalty': 'l2'}

In [21]:
y_train.shape

(6483,)

In [22]:
X_train.shape

(6483, 34)

In [16]:
X_train_preds.shape

(6483,)

In [23]:
preds.head()

Unnamed: 0,preds,truth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [24]:
coefs = pd.DataFrame(gs.best_estimator_.named_steps['lr'].coef_,columns=list(X_test.columns))

In [25]:
scalar = gs.best_estimator_.steps[0][1].var_

In [26]:
coefs.T

Unnamed: 0,0
Day_length,-0.2809045
Tmax,-0.5415549
Tmin,-0.1359957
Tavg,-0.09147725
ResultSpeed,-0.3354903
ResultDir,-0.1766774
AvgSpeed,0.5557218
Sunset,-0.5347795
Heat,0.1213418
Depart,-0.5156955


In [27]:
with open('../assets/logistic_regression_pca.pkl','wb+') as f:
    pickle.dump(gs,f)