# CH06: Learning Best Practices for Model Evaluation and Hyperparameter Tuning

## Streamlining workflows with pipelines

### Loading the Breast Cancer Wisconsin dataset

In [21]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

In [23]:
# Load the bundled dataset and build a DataFrame with one named column per feature.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Store the label as a letter code: target value 0 becomes "M" and 1 becomes "B"
# (presumably malignant / benign -- confirm against data.target_names).
df["target"] = np.take(["M", "B"], data.target)

In [28]:
# Summarize the frame: index range, plus the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

In [29]:
from sklearn.preprocessing import LabelEncoder

In [43]:
# Separate the feature matrix (every column except the label) from the label
# vector; "target" is the last column of df.
X = df.drop("target", axis=1).values
y = df["target"].values

In [44]:
# Encode the string labels as integers. The fitted encoder is kept in `le` so the
# mapping can be inspected (le.classes_) or reversed (le.inverse_transform) later.
le = LabelEncoder()
y = le.fit_transform(y)

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [62]:
# Compare the class proportions of the two splits to confirm the split kept them
# balanced. The top-level pd.value_counts() function is deprecated in recent
# pandas releases, so the label arrays are wrapped in a Series and the method
# form is used instead; the reported proportions are the same.
print(pd.Series(y_train).value_counts(normalize=True))
print()
print(pd.Series(y_test).value_counts(normalize=True))

0    0.626374
1    0.373626
dtype: float64

0    0.631579
1    0.368421
dtype: float64


### Combining transformers and estimators in a pipeline

In [65]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [66]:
# Chain the preprocessing and the classifier into a single estimator:
#   1. standardize the features,
#   2. project them onto the first two principal components,
#   3. fit a logistic regression on the projected data.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)

# Fitting the pipeline fits every step in order on the training split only.
# Left as the last expression so the cell displays the fitted pipeline.
pipe_lr.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [67]:
# Predict class labels for the held-out test split; the pipeline applies its
# transformer steps to X_test before the final estimator predicts.
y_pred = pipe_lr.predict(X_test)

In [68]:
# Report the pipeline's score on the test split, rounded to three decimals.
test_accuracy = pipe_lr.score(X_test, y_test)
print("Test accuracy: {:.3f}".format(test_accuracy))

Test accuracy: 0.956


## Using k-fold cross-validation to assess model performance

In [70]:
from sklearn.model_selection import StratifiedKFold

In [77]:
# Build the 10 stratified (train, test) index pairs over the training split.
#
# random_state is intentionally not passed: it has no effect unless shuffle=True,
# and recent scikit-learn releases raise a ValueError for random_state without
# shuffling. The resulting folds are unchanged.
#
# .split() returns a one-shot generator, so the pairs are materialized into a
# list; this lets any cell that iterates over `kfold` be re-run without silently
# looping over an exhausted generator.
kfold = list(StratifiedKFold(n_splits=10).split(X_train, y_train))

In [78]:
# Manual k-fold evaluation: fit on each fold's training indices, score on its
# held-out indices, and log the training class counts next to the accuracy.
# Note: pipe_lr is refitted in place on every iteration, so after the loop it
# holds the fit from the last fold rather than a fit on all of X_train.
scores = []
for fold_no, (train_idx, val_idx) in enumerate(kfold, start=1):
    pipe_lr.fit(X_train[train_idx], y_train[train_idx])
    fold_acc = pipe_lr.score(X_train[val_idx], y_train[val_idx])
    scores.append(fold_acc)

    class_counts = np.bincount(y_train[train_idx])
    print("Fold: {:d}, Class dist.: {}, Acc: {:.3f}".format(fold_no, class_counts, fold_acc))

Fold: 1, Class dist.: [256 153], Acc: 0.935
Fold: 2, Class dist.: [256 153], Acc: 0.935
Fold: 3, Class dist.: [256 153], Acc: 0.957
Fold: 4, Class dist.: [256 153], Acc: 0.957
Fold: 5, Class dist.: [256 153], Acc: 0.935
Fold: 6, Class dist.: [257 153], Acc: 0.956
Fold: 7, Class dist.: [257 153], Acc: 0.978
Fold: 8, Class dist.: [257 153], Acc: 0.933
Fold: 9, Class dist.: [257 153], Acc: 0.956
Fold: 10, Class dist.: [257 153], Acc: 0.956


In [79]:
from sklearn.model_selection import cross_val_score

In [81]:
# Same 10-fold evaluation done by scikit-learn's helper; returns one score per fold.
scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=1,
)

# Left as the last expression so the cell displays the per-fold scores.
scores

array([0.93478261, 0.93478261, 0.95652174, 0.95652174, 0.93478261,
       0.95555556, 0.97777778, 0.93333333, 0.95555556, 0.95555556])