This Notebook explores Binary Classification Algorithms along side other sklearn perks like Pipeline, GridSearch, etc, ...

In [10]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, roc_auc_score, roc_curve
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [11]:
dataset = load_breast_cancer()
X, y = dataset['data'], dataset['target']

In [124]:
dataset['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [17]:
df = pd.DataFrame(X)
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


# Train - Test

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [81]:
def conf_matrix(pip, train, test, y_train, y_test):
    
    yp_train = pip.predict(train)
    yp_test = pip.predict(test)  
    
    res = confusion_matrix(y_train, yp_train)
    res2 = confusion_matrix(y_test, yp_test)
    
    print("{:9} | {:9}".format('Train', 'Test'))
    print('--------------------')
    print("{} | {}".format(res[0,:], res2[0,:]))
    print("{} | {}".format(res[1,:], res2[1,:]))
    

# Logistic Regression

In [107]:
scaler = MinMaxScaler()
clf = LogisticRegression(solver='lbfgs', max_iter=1000, penalty='none')

## w/o scaling

In [108]:
pipeline = Pipeline([('clf', clf)])
pipeline = pipeline.fit(X_train, y_train)



In [109]:
conf_matrix(pipeline, X_train, X_test, y_train, y_test)

Train     | Test     
--------------------
[157  10] | [43  2]
[  4 284] | [ 3 66]


## w/ scaling

In [110]:
pipeline = Pipeline([('scaler', scaler), ('clf', clf)])
pipeline = pipeline.fit(X_train, y_train)

In [111]:
conf_matrix(pipeline, X_train, X_test, y_train, y_test)

Train     | Test     
--------------------
[167   0] | [43  2]
[  0 288] | [ 9 60]


## w/ penalization & w/ scaling

In [112]:
clf = LogisticRegression(solver='lbfgs', max_iter=1000, penalty='l2')

In [113]:
pipeline = Pipeline([('scaler', scaler), ('clf', clf)], verbose=False)
pipeline = pipeline.fit(X_train, y_train)

In [114]:
conf_matrix(pipeline, X_train, X_test, y_train, y_test)

Train     | Test     
--------------------
[156  11] | [41  4]
[  1 287] | [ 0 69]


In [115]:
pipeline.score(X_test, y_test)

0.9649122807017544

## W/ balanced weights

In [116]:
clf = LogisticRegression(solver='lbfgs', max_iter=1000, penalty='l2', class_weight ='balanced')

In [117]:
pipeline = Pipeline([('scaler', scaler), ('clf', clf)], verbose=False)
pipeline = pipeline.fit(X_train, y_train)
conf_matrix(pipeline, X_train, X_test, y_train, y_test)

Train     | Test     
--------------------
[162   5] | [42  3]
[  4 284] | [ 2 67]


In [118]:
pipeline.score(X_test, y_test)

0.956140350877193

# Naive Bayes

In [119]:
from sklearn.naive_bayes import GaussianNB

In [120]:
clf = GaussianNB()

In [121]:
pipeline = Pipeline([('clf', clf)], verbose=False)
pipeline = pipeline.fit(X_train, y_train)
conf_matrix(pipeline, X_train, X_test, y_train, y_test)

Train     | Test     
--------------------
[147  20] | [42  3]
[  9 279] | [ 4 65]


In [122]:
pipeline = Pipeline([('scaler', scaler), ('clf', clf)], verbose=False)
pipeline = pipeline.fit(X_train, y_train)
conf_matrix(pipeline, X_train, X_test, y_train, y_test)

Train     | Test     
--------------------
[149  18] | [42  3]
[ 14 274] | [ 4 65]


In [123]:
pipeline.score(X_test, y_test)

0.9385964912280702

# SVM

In [139]:
from sklearn.svm import SVC

In [140]:
clf = SVC(gamma='auto')

In [141]:
pipeline = Pipeline([('scaler', scaler), ('clf', clf)], verbose=False)
pipeline = pipeline.fit(X_train, y_train)
conf_matrix(pipeline, X_train, X_test, y_train, y_test)
pipeline.score(X_test, y_test)

Train     | Test     
--------------------
[145  22] | [41  4]
[  0 288] | [ 1 68]


0.956140350877193

In [142]:
pipeline[1].get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

## GridSearch

In [143]:
from sklearn.model_selection import GridSearchCV

In [153]:
param_grid = [{'kernel': ['linear', 'rbf', 'sigmoid'], 'C':[1, 10, 100]},
              {'kernel': ['poly'], 'C':[1, 10, 100], 'degree': [1, 2, 3, 5, 10]}]

In [154]:
svc = SVC(gamma="scale")
clf = GridSearchCV(svc, param_grid, cv=5, verbose = 0)

In [155]:
clf = clf.fit(dataset.data, dataset.target)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=1, kernel=linear, total=   0.9s
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ............................... C=1, kernel=linear, total=   2.1s
[CV] C=1, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   1.1s
[CV] C=1, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.6s
[CV] C=1, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   1.1s
[CV] C=1, kernel=rbf .................................................
[CV] .................................. C=1, kernel=rbf, total=   0.0s
[CV] C=1, kernel=rbf .................................................
[CV] .................................. C=1, kernel=rbf, total=   0.0s
[CV] C=1, kernel=rbf .................................................
[CV] .................................. C=1, kernel=rbf, total=   0.0s
[CV] C=1, kernel=rbf .................................................
[CV] .

[CV] ...................... C=1, degree=10, kernel=poly, total=   0.0s
[CV] C=10, degree=1, kernel=poly .....................................
[CV] ...................... C=10, degree=1, kernel=poly, total=   0.0s
[CV] C=10, degree=1, kernel=poly .....................................
[CV] ...................... C=10, degree=1, kernel=poly, total=   0.0s
[CV] C=10, degree=1, kernel=poly .....................................
[CV] ...................... C=10, degree=1, kernel=poly, total=   0.0s
[CV] C=10, degree=1, kernel=poly .....................................
[CV] ...................... C=10, degree=1, kernel=poly, total=   0.0s
[CV] C=10, degree=1, kernel=poly .....................................
[CV] ...................... C=10, degree=1, kernel=poly, total=   0.0s
[CV] C=10, degree=2, kernel=poly .....................................
[CV] ...................... C=10, degree=2, kernel=poly, total=   0.0s
[CV] C=10, degree=2, kernel=poly .....................................
[CV] .

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.0min finished


In [156]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [157]:
report(clf.cv_results_)

Model with rank: 1
Mean validation score: 0.963 (std: 0.017)
Parameters: {'C': 100, 'kernel': 'linear'}

Model with rank: 2
Mean validation score: 0.953 (std: 0.014)
Parameters: {'C': 10, 'kernel': 'linear'}

Model with rank: 3
Mean validation score: 0.946 (std: 0.019)
Parameters: {'C': 1, 'kernel': 'linear'}



## Verification

In [166]:
clf = SVC(kernel='linear', C=100, gamma='auto')

In [167]:
pipeline = Pipeline([('scaler', scaler), ('clf', clf)], verbose=False)
pipeline = pipeline.fit(X_train, y_train)
conf_matrix(pipeline, X_train, X_test, y_train, y_test)
pipeline.score(X_test, y_test)

Train     | Test     
--------------------
[164   3] | [44  1]
[  0 288] | [ 2 67]


0.9736842105263158

Get the best score in the training ! Perfect !

# GridSearch with multiple models

In [190]:
# Credit to http://www.davidsbatista.net/blog/2018/02/23/model_optimization/
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np


class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [187]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
    'RandomForestClassifier': { 'n_estimators': [16, 32] },
    'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
    'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
    'SVC': [
        {'kernel': ['linear'], 'C': [1, 10]},
        {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
    ]
}

In [188]:
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(dataset.data, dataset.target, scoring='f1', n_jobs=1)

Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running GridSearchCV for SVC.
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   14.9s finished


In [189]:
helper1.score_summary(sort_by='max_score')

ExtraTreesClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
SVC


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,gamma,kernel,learning_rate,n_estimators
5,AdaBoostClassifier,0.962343,0.974907,0.991667,0.0123335,,,,,32.0
0,ExtraTreesClassifier,0.956522,0.969012,0.987552,0.0133707,,,,,16.0
4,AdaBoostClassifier,0.95279,0.966463,0.983333,0.0126727,,,,,16.0
6,GradientBoostingClassifier,0.943723,0.959359,0.979253,0.0148142,,,,0.8,16.0
3,RandomForestClassifier,0.957983,0.966363,0.979079,0.00914232,,,,,32.0
1,ExtraTreesClassifier,0.962656,0.967988,0.975207,0.00529468,,,,,32.0
7,GradientBoostingClassifier,0.929825,0.957324,0.975207,0.0197359,,,,0.8,32.0
10,SVC,0.95122,0.961108,0.975207,0.0102354,1.0,,linear,,
2,RandomForestClassifier,0.937238,0.959676,0.974576,0.0161479,,,,,16.0
9,GradientBoostingClassifier,0.950413,0.959665,0.970954,0.00850878,,,,1.0,32.0
