In [1]:
import pandas as pd
# Source of the raw data: the `wdbc.data` file in the UCI repository's
# breast-cancer-wisconsin directory. The file is read with header=None, so
# pandas assigns integer column labels (0, 1, 2, ...).
WDBC_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
df = pd.read_csv(WDBC_URL, header=None)

In [2]:
from sklearn.preprocessing import LabelEncoder

# Columns 2 onward are used as the feature matrix; column 1 is the class
# label. Column 0 is left out (presumably a sample ID -- confirm against the
# dataset description).
X = df.loc[:, 2:].to_numpy()

# Encode the string labels as integers. The fitted encoder is kept in `le`
# so the label <-> integer mapping can be inspected or reused later.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].to_numpy())

# Show the classes the encoder learned.
le.classes_

array(['B', 'M'], dtype=object)

In [3]:
# Sanity check of the encoding: look up the integer codes that the fitted
# encoder assigns to the two class labels.
le.transform(["B", "M"])

array([0, 1])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. `stratify=y` keeps the class
# proportions of y in both parts, and the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Chain the preprocessing and the classifier into a single estimator:
# standardize the features -> project onto 2 principal components ->
# logistic regression.
pipeline_steps = (
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)
pipe_lr = make_pipeline(*pipeline_steps)

# Fit on the training split only, then predict and score on the held-out split.
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)

test_accuracy = pipe_lr.score(X_test, y_test)
print("Test Accuracy: %.3f" % test_accuracy)

Test Accuracy: 0.956


In [6]:
from sklearn import set_config

# Switch scikit-learn's global display setting to 'diagram' for estimators
# shown as cell output.
set_config(display="diagram")

In [7]:
# Display the pipeline as the cell's output; the display mode was set to
# 'diagram' via set_config in the previous cell.
pipe_lr

In [8]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [9]:
from sklearn.base import clone

# Manual stratified 10-fold cross-validation on the training split.
# `kfold` yields (train_indices, test_indices) pairs that index into
# X_train / y_train; it is consumed by the loop below.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []

for k, (train, test) in enumerate(kfold):
    # Fit a fresh, unfitted copy of the pipeline for every fold. Calling
    # pipe_lr.fit() here directly would overwrite the model that was fitted
    # on the full training split in the earlier cell and leave `pipe_lr`
    # fitted on only the last fold's training subset.
    fold_pipe = clone(pipe_lr)
    fold_pipe.fit(X_train[train], y_train[train])

    # Accuracy on the held-out part of this fold.
    score = fold_pipe.score(X_train[test], y_train[test])
    scores.append(score)

    # np.bincount reports how many samples of each encoded class are in the
    # fold's training part.
    print(f"Fold: {k+1}, Class Distribution: {np.bincount(y_train[train])}, Accuracy: {score}")

Fold: 1, Class Distribution: [256 153], Accuracy: 0.9347826086956522
Fold: 2, Class Distribution: [256 153], Accuracy: 0.9347826086956522
Fold: 3, Class Distribution: [256 153], Accuracy: 0.9565217391304348
Fold: 4, Class Distribution: [256 153], Accuracy: 0.9565217391304348
Fold: 5, Class Distribution: [256 153], Accuracy: 0.9347826086956522
Fold: 6, Class Distribution: [257 153], Accuracy: 0.9555555555555556
Fold: 7, Class Distribution: [257 153], Accuracy: 0.9777777777777777
Fold: 8, Class Distribution: [257 153], Accuracy: 0.9333333333333333
Fold: 9, Class Distribution: [257 153], Accuracy: 0.9555555555555556
Fold: 10, Class Distribution: [257 153], Accuracy: 0.9555555555555556


In [10]:
# Summarize the per-fold accuracies collected in `scores`: mean and
# standard deviation (np.std default, i.e. ddof=0).
cv_mean = np.mean(scores)
cv_std = np.std(scores)
print(f"CV Accuracy: {cv_mean} +/- {cv_std}")

CV Accuracy: 0.9495169082125603 +/- 0.013854294239660376
