# Gradient Descent

In [6]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier

In [7]:
X_train = pd.read_pickle('../assets/X_train.pkl')
X_test = pd.read_pickle('../assets/X_test.pkl')
y_train = pd.read_pickle('../assets/y_train.pkl')
y_test = pd.read_pickle('../assets/y_test.pkl')

In [8]:
X_train.columns


Index(['Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Sunset', 'Heat', 'Depart', 'DewPoint', 'Cool',
       'PrecipTotal', 'StnPressure', 'Latitude', 'Longitude', 'Month',
       'Day_length_exp', 'Tavg_shift', 'Heat_exp', 'Cool_shift', 'Tmax_shift',
       'Tmin_shift', 'Depart_shift', 'ResultSpeed_shift', 'ResultDir_exp',
       'PrecipTotal_exp', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS'],
      dtype='object')

In [25]:
pipe = Pipeline([
    ('ss',StandardScaler()),
    ('gd',SGDClassifier(penalty='elasticnet',loss='log')),    
])

In [26]:
param_grid =  {
    'gd__alpha':[.001,.0025,.03,.05,1.0,],
    'gd__epsilon': [.02,.25,.50,1.0],
    'gd__l1_ratio':[.15,.25,.40,.60,.90]
        
}

In [27]:
gs = GridSearchCV(pipe, param_grid=param_grid,verbose=1,scoring='roc_auc')

In [28]:
gs.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits






























[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    3.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gd', SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gd__alpha': [0.001, 0.0025, 0.03, 0.05, 1.0], 'gd__epsilon': [0.02, 0.25, 0.5, 1.0], 'gd__l1_ratio': [0.15, 0.25, 0.4, 0.6, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [29]:
gs.score(X_train,y_train)

0.7959520721799178

In [30]:
gs.score(X_test,y_test)

0.6721580326068404

In [10]:
X_train_preds = gs.predict(X_train)

In [11]:
preds = pd.DataFrame({
    "preds":X_train_preds,
    "truth":y_train
})


In [12]:
preds.sum()

preds      0
truth    261
dtype: int64

In [13]:
gs.best_params_

{'lr__C': 0.02036734693877551, 'lr__penalty': 'l2'}

In [14]:
y_train.shape

(6483,)

In [15]:
X_train.shape

(6483, 34)

In [16]:
X_train_preds.shape

(6483,)

In [20]:
preds.head()

Unnamed: 0,preds,truth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [128]:
coefs = pd.DataFrame(gs.best_estimator_.named_steps['lr'].coef_,columns=list(X_test.columns))

In [139]:
scalar = gs.best_estimator_.steps[0][1].var_

In [142]:
np.exp(scalar * coefs)

Unnamed: 0,Day_length,Tmax,Tmin,Tavg,ResultSpeed,ResultDir,AvgSpeed,Sunset,Heat,Depart,...,ResultSpeed_shift,ResultDir_exp,PrecipTotal_exp,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS
0,0.0,1.908157,325.148433,10.964893,1.396208,3.174682,0.773008,2.06802e-251,1.242849,0.001397,...,0.363686,0.624065,1.000533,1.0,1.021285,1.015052,0.980486,0.998986,0.99999,0.995542


In [18]:
with open('../assets/gd_','wb+') as f:
    pickle.dump(gs,f)