In [2]:
# Load the UCI Wisconsin breast-cancer (wdbc) data straight from the archive.
# The file is read with header=None, so columns get integer labels.
import pandas as pd
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

In [3]:
# assign features to an array and use LabelEncoder to transform the class labels into integers

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Columns labelled 2 and up are used as the feature matrix.
X = df.loc[:, 2:].values

# Column 1 carries the string class label; encode it to integers in one step.
y = le.fit_transform(df.loc[:, 1].values)

# Show the label -> integer ordering learned by the encoder.
le.classes_

array(['B', 'M'], dtype=object)

In [4]:
# Sanity-check the fitted encoder: per the recorded output, 'M' maps to 1 and 'B' to 0.
le.transform(['M', 'B'])

array([1, 0])

In [5]:
# split the data into training and test sets

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, stratifying on y so both splits keep
# the same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [6]:
# chain feature standardization, PCA compression from 30 dimensions to 2, and logistic regression

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Combine the three stages into a single estimator: standardize the features,
# project them onto 2 principal components, then classify with logistic regression.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(),
)

# Every stage is fitted on the training split only.
pipe_lr.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = pipe_lr.predict(X_test)

# Accuracy of the whole pipeline on the held-out rows.
test_acc = pipe_lr.score(X_test, y_test)
print(f'Test Accuracy: {test_acc:.3f}')

Test Accuracy: 0.956


In [7]:
# use stratified k-fold cross validation
import numpy as np
from sklearn.model_selection import StratifiedKFold

from sklearn.base import clone

# Stratified 10-fold cross-validation over the training split only; the test
# split is never touched here. split() hands back a one-shot iterator of
# (train_indices, test_indices) pairs, so `kfold` is exhausted after the loop.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

# Per-fold accuracy, in fold order.
scores = []

for k, (train, test) in enumerate(kfold):
    # Fit an unfitted copy of the pipeline for each fold. Calling
    # pipe_lr.fit(...) here would overwrite the model fitted on the full
    # training split and leave pipe_lr trained on the last fold's subset only.
    fold_pipe = clone(pipe_lr)
    fold_pipe.fit(X_train[train], y_train[train])

    # Evaluate on the rows held out for this fold.
    score = fold_pipe.score(X_train[test], y_train[test])
    scores.append(score)

    # np.bincount shows the class counts in this fold's training part.
    print(f'Fold: {k+1:02d} Class distr.: {np.bincount(y_train[train])} Acc.: {score:.3f}')

Fold: 01 Class distr.: [256 153] Acc.: 0.935
Fold: 02 Class distr.: [256 153] Acc.: 0.935
Fold: 03 Class distr.: [256 153] Acc.: 0.957
Fold: 04 Class distr.: [256 153] Acc.: 0.957
Fold: 05 Class distr.: [256 153] Acc.: 0.935
Fold: 06 Class distr.: [257 153] Acc.: 0.956
Fold: 07 Class distr.: [257 153] Acc.: 0.978
Fold: 08 Class distr.: [257 153] Acc.: 0.933
Fold: 09 Class distr.: [257 153] Acc.: 0.956
Fold: 10 Class distr.: [257 153] Acc.: 0.956
