In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, make_scorer
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [58]:
#code found from http://www.davidsbatista.net/blog/2018/02/23/model_optimization/
class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]


In [68]:
classifier_model_params = {
    'LogisticRegression' : {
        'penalty' : ['l1', 'l2'],
        'C' : np.arange(.05, 1, .05) },
    'VanillaLogRegression' : {
    },
    'KNN' : {
        'n_neighbors' : np.arange(3, 22, 2) },
    'NaiveBayes' : {
        'alpha' : np.arange(.05, 2, .05)},
    'DecisionTree': {
        'max_depth' : [None, 6, 10, 14], 
        'min_samples_leaf' : [1, 2],
        'min_samples_split': [2, 3] },
    'BaggedDecisionTree' : {
        'n_estimators' : [20, 60, 100] },
    'RandomForest' : {
        'n_estimators' : [20, 60, 100],
        'max_depth' : [None, 2, 6, 10],
        'min_samples_split' : [2, 3, 4] },
    'ExtraTrees' : {
        'n_estimators' : [20, 60, 100],
        'max_depth' : [None, 6, 10, 14],
        'min_samples_leaf' : [1, 2], 
        'min_samples_split' : [2, 3], },
    'AdaBoost' : {
        'n_estimators' : np.arange(100, 151, 25),
        'learning_rate' : np.linspace(0.05, 1, 10) },
    'GradientBoosting' : {
        'n_estimators' : np.arange(5, 150, 15),
        'learning_rate' : np.linspace(0.05, 1, 10),
        'max_depth' : [1, 2, 3] },
    'SVM' : {
        'C' : np.arange(0.05, 1, .05),
        'kernel' : ['rbf', 'linear'] },
     'XGBoost' : {
        'n_estimators'  : np.arange(100, 151, 25), 
        'learning_rate' : np.arange(0.1, 1, .3),
        'max_depth' : [3],
        'alpha' : np.arange(0, 1, .3),
        'lambda' : np.arange(0, 1, .3),
        'gamma' : np.arange(0, 1, .3),
        'subsample' : [.5],
        'n_jobs' : [4],}
        }

In [69]:
scaler = StandardScaler()
sm = SMOTE(sampling_strategy=1,random_state=666)

train = pd.read_csv('./data/train_weather.csv')
train_dummies = pd.get_dummies(train,drop_first=True,columns=['Species','Street'])
y = train_dummies['WnvPresent']
X = train_dummies[[col for col in train_dummies.columns if col != 'WnvPresent']]

train_x, test_x, train_y, test_y = train_test_split(X,y,test_size = 0.3, random_state = 666,stratify=y)
train_x=scaler.fit_transform(train_x)
test_x=scaler.transform(test_x)
sampledX,sampledy = sm.fit_sample(train_x,train_y)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  # This is added back by InteractiveShellApp.init_path()


In [70]:
classifier_models = {
    'LogisticRegression' : LogisticRegression(random_state = 42),
    'VanillaLogRegression' : LogisticRegression(random_state = 42),
    'KNN': KNeighborsClassifier(), 
#     'NaiveBayes' : MultinomialNB(), #does not work with negative vals
#     'DecisionTree' : DecisionTreeClassifier(random_state = 42), 
#     'BaggedDecisionTree' : BaggingClassifier(random_state = 42),
#     'RandomForest' : RandomForestClassifier(random_state = 42), 
#     'ExtraTrees' : ExtraTreesClassifier(random_state = 42), 
#     'AdaBoost' : AdaBoostClassifier(random_state=42), 
#     'GradientBoosting' : GradientBoostingClassifier(random_state = 42),
#     'SVM' : SVC(random_state=42),
#     'XGBoost' : XGBClassifier(random_state=42)
}

In [71]:
rec=make_scorer(recall_score,average='binary',pos_label=1)
search = EstimatorSelectionHelper(classifier_models, classifier_model_params)
search.fit(sampledX,sampledy, scoring=rec, n_jobs=3)

Running GridSearchCV for LogisticRegression.
Fitting 3 folds for each of 38 candidates, totalling 114 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   57.9s
[Parallel(n_jobs=3)]: Done 114 out of 114 | elapsed:  2.6min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Running GridSearchCV for VanillaLogRegression.
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Running GridSearchCV for KNN.
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=3)]: Done  30 out of  30 | elapsed:  5.7min finished


We then score our different models and output our results to a csv for archival.

In [73]:
score1=search.score_summary(sort_by='mean_score')

LogisticRegression
VanillaLogRegression
KNN


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [76]:
score1=pd.read_csv(r'.\data\score3.csv')

In [17]:
score1=pd.concat([score1,score2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [18]:
score1.to_csv(r'.\data\score3.csv',index=False)

In [74]:
score1

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,n_neighbors,penalty
41,KNN,0.939303,0.951636,0.965992,0.0109895,,7.0,
39,KNN,0.939303,0.950632,0.956952,0.00802905,,3.0,
42,KNN,0.935428,0.948335,0.959966,0.0100579,,9.0,
40,KNN,0.93672,0.946901,0.952627,0.00721786,,5.0,
43,KNN,0.928971,0.941591,0.950065,0.00909664,,11.0,
44,KNN,0.929832,0.940873,0.950495,0.00849502,,13.0,
45,KNN,0.92768,0.936998,0.944899,0.00710075,,15.0,
46,KNN,0.917348,0.934415,0.944468,0.0121319,,17.0,
47,KNN,0.911322,0.93341,0.952217,0.016856,,19.0,
48,KNN,0.904864,0.932981,0.950926,0.020133,,21.0,


In [79]:
score1=score1.sort_values('mean_score',ascending=False)

In [81]:
scores = score1[score1['mean_score'] == score1.groupby('estimator')['mean_score'].transform('max')]

In [83]:
scores.groupby('estimator').head(1)

Unnamed: 0,C,alpha,estimator,gamma,kernel,lambda,learning_rate,max_depth,max_score,mean_score,min_samples_leaf,min_samples_split,min_score,n_estimators,n_jobs,n_neighbors,penalty,std_score,subsample
13,,,ExtraTrees,,,,,,0.991387,0.984927,2.0,2.0,0.972868,60.0,,,,0.008534,
25,,,RandomForest,,,,,10.0,0.986219,0.971433,,3.0,0.945306,100.0,,,,0.018528,
28,,,DecisionTree,,,,,14.0,0.985788,0.959231,1.0,2.0,0.921189,,,,,0.027595,
14,,,BaggedDecisionTree,,,,,,0.991387,0.957221,,,0.894057,100.0,,,,0.044713,
470,,0.0,XGBoost,0.3,,0.9,0.4,3.0,0.988372,0.951766,,,0.879845,150.0,4.0,,,0.050858,0.5
824,,,GradientBoosting,,,,0.261111,3.0,0.98708,0.949038,,,0.875538,140.0,,,,0.051983,
64,,,KNN,,,,,,0.955642,0.942578,,,0.920758,,,3.0,,0.015529,
153,,,AdaBoost,,,,0.788889,,0.964255,0.931668,,,0.867356,150.0,,,,0.045477,
172,0.9,,SVM,,rbf,,,,0.94186,0.923629,,,0.890612,,,,,0.023389,
135,0.15,,LogisticRegression,,,,,,0.860465,0.855154,,,0.851852,,,,l2,0.003793,


In [90]:
estimators = {
    'XGB' : XGBClassifier(gamma=0.3,learning_rate=0.4,max_depth=3,n_estimators=150,subsample=0.5),
    'ADA' : AdaBoostClassifier(learning_rate=0.788889,n_estimators=150),
    'GBoost' : GradientBoostingClassifier(learning_rate=0.261111,max_depth=3,n_estimators=140),
    'SVC' : SVC(kernel='rbf'),
    'DecisionTree' : DecisionTreeClassifier(max_depth=None,min_samples_leaf=1,min_samples_split=2)
}

In [92]:
for e_name,estimator in estimators.items():
    estimator.fit(sampledX,sampledy)
    pred=estimator.predict(test_x)
    print(e_name)
    print(classification_report(test_y,pred))
    recall_score(test_y,pred,average='binary',pos_label=1)

XGB
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      2987
           1       0.27      0.24      0.25       165

   micro avg       0.93      0.93      0.93      3152
   macro avg       0.61      0.60      0.61      3152
weighted avg       0.92      0.93      0.92      3152

ADA
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2987
           1       0.24      0.50      0.32       165

   micro avg       0.89      0.89      0.89      3152
   macro avg       0.60      0.70      0.63      3152
weighted avg       0.93      0.89      0.91      3152

GBoost
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      2987
           1       0.29      0.32      0.30       165

   micro avg       0.92      0.92      0.92      3152
   macro avg       0.62      0.64      0.63      3152
weighted avg       0.93      0.92      0.92      3152





SVC
              precision    recall  f1-score   support

           0       0.97      0.84      0.90      2987
           1       0.16      0.53      0.24       165

   micro avg       0.83      0.83      0.83      3152
   macro avg       0.56      0.68      0.57      3152
weighted avg       0.93      0.83      0.87      3152



In [55]:
dt=
dt.fit(sampledX,sampledy)
pred=dt.predict(test_x)
print(classification_report(test_y,pred))
recall_score(test_y,pred,average='binary',pos_label=1)

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2989
           1       0.21      0.24      0.23       163

   micro avg       0.92      0.92      0.92      3152
   macro avg       0.59      0.60      0.59      3152
weighted avg       0.92      0.92      0.92      3152



0.2392638036809816

In [56]:
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(sampledX,sampledy)
pred=knn.predict(test_x)
print(classification_report(test_y,pred))
recall_score(test_y,pred,average='binary',pos_label=1)

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      2989
           1       0.17      0.37      0.23       163

   micro avg       0.87      0.87      0.87      3152
   macro avg       0.56      0.63      0.58      3152
weighted avg       0.92      0.87      0.89      3152



0.36809815950920244

In [45]:
et=ExtraTreesClassifier(min_samples_leaf=2,min_samples_split=2,n_estimators=60)
et.fit(sampledX,sampledy)
pred=et.predict(test_x)
print(classification_report(test_y,pred))
recall_score(test_y,pred,average='binary',pos_label=1)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2989
           1       0.34      0.38      0.36       163

   micro avg       0.93      0.93      0.93      3152
   macro avg       0.65      0.67      0.66      3152
weighted avg       0.93      0.93      0.93      3152



0.3803680981595092

In [53]:
rf=RandomForestClassifier(max_depth=10,min_samples_split=2,n_estimators=100)
rf.fit(sampledX,sampledy)
pred=rf.predict(test_x)
print(classification_report(test_y,pred))
recall_score(test_y,pred,average='binary',pos_label=1)

              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2989
           1       0.24      0.53      0.33       163

   micro avg       0.89      0.89      0.89      3152
   macro avg       0.60      0.72      0.63      3152
weighted avg       0.93      0.89      0.91      3152



0.5276073619631901

In [52]:
lr=LogisticRegression(penalty='l2',C=0.05)
lr.fit(sampledX,sampledy)
pred=lr.predict(test_x)
print(classification_report(test_y,pred))
recall_score(test_y,pred,average='binary',pos_label=1)



              precision    recall  f1-score   support

           0       0.98      0.80      0.88      2989
           1       0.15      0.67      0.25       163

   micro avg       0.79      0.79      0.79      3152
   macro avg       0.57      0.74      0.56      3152
weighted avg       0.94      0.79      0.85      3152



0.6748466257668712

In [50]:
bd=BaggingClassifier(n_estimators=100)
bd.fit(sampledX,sampledy)
pred=bd.predict(test_x)
print(classification_report(test_y,pred))
recall_score(test_y,pred,average='binary',pos_label=1)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2989
           1       0.32      0.26      0.29       163

   micro avg       0.93      0.93      0.93      3152
   macro avg       0.64      0.62      0.63      3152
weighted avg       0.93      0.93      0.93      3152



0.26380368098159507