# 1. Cross-Validation

In [1]:
import numpy as np
import pandas as pd
from scipy.special import comb

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    LeaveOneOut,
    LeavePOut,
    StratifiedKFold,
    cross_validate,
    train_test_split,
)

In [2]:
# Print NumPy arrays with two decimal places so the per-fold score arrays
# displayed further down stay compact.
np.set_printoptions(precision=2)

In [3]:
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

In [4]:
y = pd.Series(y).map({0:1, 1:0})

In [5]:
# Peek at three random labels; the fixed seed keeps the displayed rows
# identical on every Restart & Run All.
y.sample(3, random_state=0)

157    0
193    1
67     0
Name: target, dtype: int64

In [6]:
# Class balance as proportions (fractions of 1, not percentages):
# benign (0) vs. malignant (1)

y.value_counts(normalize=True)

target
0    0.627417
1    0.372583
Name: proportion, dtype: float64

In [7]:
# Hold out 30% of the rows as a test set; the seed fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

# Show the (rows, columns) shape of each feature split.
display(X_train.shape, X_test.shape)

(398, 30)

(171, 30)

# 2. K-Fold Cross-Validation

In [8]:
# Shuffled 5-fold splitter; the seed makes the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=31)

# L2-penalised logistic regression fitted with the liblinear solver.
logit = LogisticRegression(
    penalty='l2',
    C=10,
    solver='liblinear',
    random_state=14,
    max_iter=10_000,
)

# cross_validate returns a dict of per-fold arrays; train scores are requested
# as well so they can be compared with the held-out scores in the next cell.
clf = cross_validate(
    logit, X_train, y_train,
    cv=kf,
    scoring='accuracy',
    return_train_score=True,
)

# Held-out accuracy of each of the 5 folds.
clf['test_score']

array([0.95, 0.93, 0.94, 0.96, 0.95])

In [9]:
# Mean and standard deviation of the per-fold accuracies, train vs. held-out.
train_scores = clf['train_score']
test_scores = clf['test_score']

print('mean train set accuracy: ', np.mean(train_scores), '+-', np.std(train_scores))
print('mean test set accuracy: ', np.mean(test_scores), '+-', np.std(test_scores))

mean train set accuracy:  0.9673389720234222 +- 0.0054682032235071395
mean test set accuracy:  0.9447784810126582 +- 0.012569307936512805


# 3. Repeated K-Fold

In [10]:
logit = LogisticRegression(
    penalty='l2', C=1, solver='liblinear', random_state=25, max_iter=10_000)

# Single source of truth for the splitter configuration: the expected number
# of scores printed below is computed from the same values that are passed to
# RepeatedKFold, so the two can never drift apart.
n_splits, n_repeats = 5, 10

rkf = RepeatedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=124)

# Each repeat produces one score per fold.
print(f'We expect {n_splits * n_repeats} performance metrics.')

clf = cross_validate(
    logit, X_train, y_train,
    scoring='accuracy', return_train_score=True,
    cv=rkf)

print('Number of metrics obtained:', len(clf['test_score']))

# Held-out accuracy of every fold across all repeats.
clf['test_score']

We expect 50 performance metrics.
Number of metrics obtained: 50


array([0.99, 0.96, 0.94, 0.91, 0.94, 0.99, 0.93, 0.94, 0.96, 0.92, 0.96,
       0.96, 0.95, 0.91, 0.94, 0.94, 0.99, 0.91, 0.96, 0.96, 0.94, 0.97,
       0.91, 0.95, 0.97, 0.95, 0.96, 0.97, 0.94, 0.91, 0.94, 0.9 , 0.95,
       0.97, 0.99, 0.95, 0.95, 0.95, 0.96, 0.95, 0.93, 0.95, 0.96, 0.99,
       0.89, 0.97, 0.97, 0.9 , 0.95, 0.95])

In [11]:
# Summarise the repeated K-fold scores: mean and spread for train and held-out folds.
for label, key in (
    ('mean train set accuracy: ', 'train_score'),
    ('mean test set accuracy: ', 'test_score'),
):
    print(label, np.mean(clf[key]), '+-', np.std(clf[key]))

mean train set accuracy:  0.9579142761380888 +- 0.006060919300600648
mean test set accuracy:  0.9482310126582277 +- 0.02511411804647664


# 4. Leave One Out

In [12]:
# Leave-one-out: each training row is held out exactly once, so one model is
# fitted (and one score produced) per row of X_train.
loo = LeaveOneOut()

logit = LogisticRegression(
    penalty='l2',
    C=10,
    solver='liblinear',
    random_state=133,
    max_iter=10_000,
)

print(f'We expect {len(X_train)} metrics.')
clf = cross_validate(
    logit, X_train, y_train,
    cv=loo,
    scoring='accuracy',
    return_train_score=True,
)

print('Number of metrics obtained:', len(clf['test_score']))

# Same count again, shown as the cell's output value.
len(clf['test_score'])

We expect 398 metrics.
Number of metrics obtained: 398


398

In [13]:
# Mean and standard deviation of the leave-one-out scores.
# Each test fold holds a single row, so every individual test score is either
# 0 or 1; the test-set spread printed here is the spread of those 0/1 values.
loo_train_scores = clf['train_score']
loo_test_scores = clf['test_score']

print('mean train set accuracy: ', np.mean(loo_train_scores), '+-', np.std(loo_train_scores))
print('mean test set accuracy: ', np.mean(loo_test_scores), '+-', np.std(loo_test_scores))

mean train set accuracy:  0.965691176284445 +- 0.0019219466789449397
mean test set accuracy:  0.9547738693467337 +- 0.20779972993100837


# 5. Leave P Out

In [17]:
logit = LogisticRegression(
    penalty='l2', C=1, solver='liblinear', random_state=23, max_iter=10_000)

# Leave-p-out fits one model per possible test set of size p, so the number of
# fits grows combinatorially. Only the first rows of the training set are used
# to keep that number manageable.
n_rows_small = 100
p_out = 2

lpo = LeavePOut(p=p_out)

X_train_small = X_train.head(n_rows_small)
y_train_small = y_train.head(n_rows_small)

# The expected count is derived from the actual subset size and p, not from
# repeated literals. exact=True makes comb return an exact Python int
# (4950) instead of a float (4950.0), which is what a count should be.
print('We expect:', comb(len(X_train_small), p_out, exact=True), 'metrics.')

clf = cross_validate(
    logit,
    X_train_small,
    y_train_small,
    scoring='accuracy',
    return_train_score=True,
    cv=lpo)

print('Number of metrics obtained:', len(clf['test_score']))

We expect: 4950.0 metrics.
Number of metrics obtained: 4950


In [18]:
# Aggregate the leave-p-out scores into mean and standard deviation.
lpo_train_mean, lpo_train_std = np.mean(clf['train_score']), np.std(clf['train_score'])
lpo_test_mean, lpo_test_std = np.mean(clf['test_score']), np.std(clf['test_score'])

print('mean train set accuracy: ', lpo_train_mean, '+-', lpo_train_std)
print('mean test set accuracy: ', lpo_test_mean, '+-', lpo_test_std)

mean train set accuracy:  0.9411255411255409 +- 0.006452738887205399
mean test set accuracy:  0.9294949494949495 +- 0.18112651829883458


# 6. Stratified K-Fold Cross-Validation

In [19]:
# Work on the first 100 training rows only.
X_train_small = X_train.head(100)
y_train_small = y_train.head(100)

# Shuffled, stratified 5-fold splitter with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=134)

logit = LogisticRegression(
    penalty='l2',
    C=1,
    solver='liblinear',
    random_state=23,
    max_iter=10_000,
)

# Per-fold train and held-out accuracy on the small subset.
clf = cross_validate(
    logit, X_train_small, y_train_small,
    cv=skf,
    scoring='accuracy',
    return_train_score=True,
)

print('Number of metrics obtained:', len(clf['test_score']))

Number of metrics obtained: 5


In [20]:
# Mean and spread of the stratified K-fold accuracies, train first, then held-out.
for label, fold_scores in (
    ('mean train set accuracy: ', clf['train_score']),
    ('mean test set accuracy: ', clf['test_score']),
):
    print(label, np.mean(fold_scores), '+-', np.std(fold_scores))

mean train set accuracy:  0.9650000000000001 +- 0.021505813167606556
mean test set accuracy:  0.9099999999999999 +- 0.05830951894845301
