In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Loading data (Breast Cancer Wisconsin)

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Fetch the WDBC table straight from the UCI repository. The file is read
# with header=None, so pandas labels the columns 0, 1, 2, ...
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

# Column 1 is used as the class label and every column from 2 onwards as a
# feature; column 0 is not used by this notebook.
X = df.iloc[:, 2:].values

# Map the raw class labels to consecutive integers. `le` is kept so the
# original labels can be recovered later via le.classes_ / inverse_transform.
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, 1].values)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=1,
)

# Streamlining workflows with pipelines

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Build a two-step pipeline with explicitly chosen step names:
# standardise the features, then fit a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression()),
    ]
)
# .steps is the list of (name, estimator) tuples in execution order.
print("The pipeline steps: ", pipeline.steps)
pipeline


The pipeline steps:  [('scaler', StandardScaler()), ('classifier', LogisticRegression())]


In [4]:
# Look up the final estimator by the step name it was registered under.
# named_steps also exposes each step as an attribute of the same name.
classifier = pipeline.named_steps.classifier
classifier

In [5]:
# Look up the preprocessing step by the step name it was registered under.
# named_steps also exposes each step as an attribute of the same name.
scaler = pipeline.named_steps.scaler
scaler

In [6]:
from sklearn.pipeline import make_pipeline

# make_pipeline builds the same kind of Pipeline without explicit step
# names; each step is named after its estimator class in lower case.
pipeline = make_pipeline(StandardScaler(), LogisticRegression())
print("The pipeline steps: ", pipeline.steps)
pipeline

The pipeline steps:  [('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())]


In [7]:
# With make_pipeline the classifier is registered under its auto-generated
# step name, 'logisticregression'.
classifier = pipeline.named_steps.logisticregression
classifier

In [8]:
# With make_pipeline the scaler is registered under its auto-generated
# step name, 'standardscaler'.
scaler = pipeline.named_steps.standardscaler
scaler

# K-fold cross-validation

In [21]:
# Create model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Model under evaluation: standardise the features, project them onto the
# first two principal components, then classify with logistic regression.
# Wrapping all three steps in one pipeline means scaling and PCA are
# re-fitted on whatever data .fit() receives.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(solver='lbfgs', random_state=1),
)


In [22]:
# Stratified cross-validation
from sklearn.model_selection import StratifiedKFold

# Produce the (train indices, test indices) pairs for 10 stratified folds of
# the training set. split() returns a one-shot generator, so this whole cell
# has to be re-run if the folds need to be iterated a second time.
splitter = StratifiedKFold(n_splits=10)
kfold = splitter.split(X_train, y_train)
scores = []

for k, (train, test) in enumerate(kfold):
    # Fit the full pipeline on this fold's training rows only, then measure
    # accuracy on the rows that were held out.
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    # Per-class sample counts of the training part, to show the folds
    # preserve the class proportions.
    class_counts = np.bincount(y_train[train])
    print(f"Fold: {k}, Class dist.: {class_counts}, Acc: {score}")

# Summarise the per-fold accuracies as mean +/- standard deviation.
cv_mean = round(np.mean(scores), 3)
cv_std = round(np.std(scores), 3)
print(f"CV Accuracy: {cv_mean} +/- {cv_std}")


Fold: 0, Class dist.: [256 153], Acc: 0.9347826086956522
Fold: 1, Class dist.: [256 153], Acc: 0.9347826086956522
Fold: 2, Class dist.: [256 153], Acc: 0.9565217391304348
Fold: 3, Class dist.: [256 153], Acc: 0.9565217391304348
Fold: 4, Class dist.: [256 153], Acc: 0.9347826086956522
Fold: 5, Class dist.: [257 153], Acc: 0.9555555555555556
Fold: 6, Class dist.: [257 153], Acc: 0.9777777777777777
Fold: 7, Class dist.: [257 153], Acc: 0.9333333333333333
Fold: 8, Class dist.: [257 153], Acc: 0.9555555555555556
Fold: 9, Class dist.: [257 153], Acc: 0.9555555555555556
CV Accuracy: 0.95 +/- 0.014


In [24]:
# Shortcut function

from sklearn.model_selection import cross_val_score
# One call replaces the manual fold loop: the pipeline is cloned, fitted and
# scored once per fold. n_jobs=1 runs the folds sequentially in this process.
scores = cross_val_score(pipe_lr, X_train, y_train, cv=10, n_jobs=1)
print(f'CV accuracy scores: {scores}')

# Summarise the per-fold accuracies as mean +/- standard deviation.
cv_mean = round(np.mean(scores), 3)
cv_std = round(np.std(scores), 3)
print(f"CV Accuracy: {cv_mean} +/- {cv_std}")


CV accuracy scores: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]
CV Accuracy: 0.95 +/- 0.014


# Tuning hyperparameters via Grid Search Cross Validation

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Estimator to tune: standardise the features, then fit a support vector
# classifier. make_pipeline names the SVC step 'svc', which is why the grid
# keys below carry the 'svc__' prefix.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate values on a log scale, shared by C and gamma.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids, because gamma is only searched for the RBF kernel.
linear_grid = {
    'svc__C': param_range,
    'svc__kernel': ['linear'],
}
rbf_grid = {
    'svc__C': param_range,
    'svc__gamma': param_range,
    'svc__kernel': ['rbf'],
}
param_grid = [linear_grid, rbf_grid]

# Exhaustive search scored by 10-fold cross-validated accuracy.
# refit=True retrains the best configuration on all of X_train afterwards;
# n_jobs=-1 uses every available CPU core.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameters that achieved it.
print(gs.best_score_)
print(gs.best_params_)

0.9846859903381642
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


In [None]:
# NOTE:
# The grid search was created with refit=True, so gs.best_estimator_ is
#   already the best pipeline retrained on the whole training set.
#   Fitting it again manually via clf.fit(X_train, y_train) is therefore
#   not necessary and has been left out. With refit=False,
#   best_estimator_ would not be available, and a pipeline would have to
#   be configured from gs.best_params_ and fitted by hand instead.
clf = gs.best_estimator_

# Final, unbiased estimate: accuracy on the held-out test set, which played
# no part in the hyperparameter search.
print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')