In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

<span style='color:blue'> Stratified K-Fold Cross-Validation </span>

In [13]:
# Build a toy, imbalanced dataset: 10 negative integers and 40 non-negative ones

# Consecutive integers -10, -9, ..., 39
data = np.arange(-10, 40)

# The first 10 samples (the negatives) are class 0; the remaining 40 (0 and up) are class 1
labels = np.repeat([0, 1], [10, 40])

# Show every sample next to its class label
df = pd.DataFrame({'Data': data, 'Class': labels})
print("Full Dataset:", df, sep="\n")

Full Dataset:
    Data  Class
0    -10      0
1     -9      0
2     -8      0
3     -7      0
4     -6      0
5     -5      0
6     -4      0
7     -3      0
8     -2      0
9     -1      0
10     0      1
11     1      1
12     2      1
13     3      1
14     4      1
15     5      1
16     6      1
17     7      1
18     8      1
19     9      1
20    10      1
21    11      1
22    12      1
23    13      1
24    14      1
25    15      1
26    16      1
27    17      1
28    18      1
29    19      1
30    20      1
31    21      1
32    22      1
33    23      1
34    24      1
35    25      1
36    26      1
37    27      1
38    28      1
39    29      1
40    30      1
41    31      1
42    32      1
43    33      1
44    34      1
45    35      1
46    36      1
47    37      1
48    38      1
49    39      1


In [14]:
# Stratified splitter with 5 folds; shuffling with a fixed seed keeps the
# fold assignment reproducible from run to run
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [15]:
# Walk through the stratified folds and show each held-out (test) set
for fold_number, (train_index, test_index) in enumerate(skf.split(data, labels), start=1):

    # Slice samples and labels with the index arrays produced by the splitter
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Tabulate the held-out samples next to their class labels
    fold_df = pd.DataFrame({'Test Set': X_test, 'Label Set': y_test})

    # Display the fold number and the corresponding test set and labels
    print(f"\nFold {fold_number}:")
    print(fold_df)

    # Count how many test samples fall in each class. The values are converted to
    # plain Python ints so the printed dict reads {0: 2, 1: 8} on every NumPy
    # version (NumPy >= 2 would otherwise show np.int64(...) for each key/value).
    unique, counts = np.unique(y_test, return_counts=True)
    class_distribution = dict(zip(unique.tolist(), counts.tolist()))
    print(f"Class Distribution in Test Set for Fold {fold_number}: {class_distribution}")


Fold 1:
   Test Set  Label Set
0        -9          0
1        -7          0
2         3          1
3         8          1
4        11          1
5        12          1
6        24          1
7        27          1
8        30          1
9        33          1
Class Distribution in Test Set for Fold 1: {0: 2, 1: 8}

Fold 2:
   Test Set  Label Set
0        -5          0
1        -2          0
2         6          1
3        10          1
4        15          1
5        17          1
6        18          1
7        19          1
8        28          1
9        37          1
Class Distribution in Test Set for Fold 2: {0: 2, 1: 8}

Fold 3:
   Test Set  Label Set
0        -8          0
1        -3          0
2         7          1
3        16          1
4        20          1
5        31          1
6        32          1
7        35          1
8        38          1
9        39          1
Class Distribution in Test Set for Fold 3: {0: 2, 1: 8}

Fold 4:
   Test Set  Label Set
0        -6   

<span style='color:blue'>Bias Calculation</span>

In [18]:
# Four reference series of ten observations each, keyed True_1 .. True_4
true_values = dict(
    True_1=[10.0, 7.5, -3.0, 15.0, 2.0, 0.0, -5.0, 9.0, -2.0, 8.0],
    True_2=[5.0, 8.0, 12.0, -6.0, -1.0, 4.0, 10.0, 6.0, 3.0, -3.0],
    True_3=[-2.0, 0.5, 5.0, -3.0, 7.0, 2.0, -4.0, 0.0, 1.0, 4.0],
    True_4=[3.0, -4.0, 2.0, 10.0, -8.0, 1.0, -2.0, 3.0, -7.0, 0.0],
)

# Matching predictions, keyed Predicted_1 .. Predicted_4; Predicted_i pairs with True_i
predicted_values = dict(
    Predicted_1=[11.0, 8.5, -2.0, 16.0, 3.0, 1.0, -4.0, 10.0, -1.0, 9.0],
    Predicted_2=[4.5, 7.5, 11.5, -6.5, -1.5, 3.5, 9.5, 5.5, 2.5, -3.5],
    Predicted_3=[-1.2, 1.3, 5.8, -2.2, 7.8, 2.8, -3.2, 0.8, 1.8, 4.8],
    Predicted_4=[1.8, -5.2, 0.8, 8.8, -9.2, -0.2, -3.2, 1.8, -8.2, -1.2],
)

In [19]:
# Per-sample prediction error ("bias") and its Euclidean length, one series pair at a time
for i in range(1, 5):
    # Look up the i-th true/predicted pair and turn both lists into arrays
    true_vals = np.asarray(true_values[f'True_{i}'])
    predicted_vals = np.asarray(predicted_values[f'Predicted_{i}'])

    # Signed error of every prediction (predicted minus true)
    bias = np.subtract(predicted_vals, true_vals)

    # L2 norm condenses the error vector into a single magnitude
    norm = np.linalg.norm(bias)

    # Same two report lines as before, emitted by a single print call
    print(
        f"\nBias for Predicted_{i}: {bias}",
        f"Euclidean norm for Predicted_{i}: {norm}",
        sep="\n",
    )


Bias for Predicted_1: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Euclidean norm for Predicted_1: 3.1622776601683795

Bias for Predicted_2: [-0.5 -0.5 -0.5 -0.5 -0.5 -0.5 -0.5 -0.5 -0.5 -0.5]
Euclidean norm for Predicted_2: 1.5811388300841898

Bias for Predicted_3: [0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8]
Euclidean norm for Predicted_3: 2.529822128134703

Bias for Predicted_4: [-1.2 -1.2 -1.2 -1.2 -1.2 -1.2 -1.2 -1.2 -1.2 -1.2]
Euclidean norm for Predicted_4: 3.7947331922020546


<span style='color:blue'>Impact of the Number of Folds (k)</span>

a) How Increasing or Decreasing the Number of Folds Affects Bias and Variance
- Increasing the number of folds (larger k):
        - Lower bias: The training set size per fold rises with the number of folds, so the model is trained on more data. As a result of the model learning from a bigger portion of the dataset, bias is typically reduced.
        - Higher variance: The validation set becomes smaller as k rises. Per-fold performance estimates then vary more, because a small validation set is more sensitive to which particular samples it happens to contain.
- Decreasing the number of folds (smaller k):
        - Higher bias: With fewer folds, each model is trained on a smaller share of the data (e.g., 50% with k=2 versus 90% with k=10). A model trained on less data tends to perform worse, so the performance estimate is biased pessimistically.
        - Lower variance: The variance of the performance estimates falls as the validation set gets bigger (with fewer folds). Because larger validation sets are less sensitive to individual samples, they yield a more stable evaluation.

b) Good Choices for the Number of Folds (k)

i. A small dataset:
- A larger k (e.g., 10 or more) is generally preferred for small datasets. This maximizes the training set size in each fold, allowing the model to learn from more data while still performing multiple evaluations. However, be cautious not to make k too large, as very small validation sets can lead to high variance in performance estimates.

ii. A large dataset:
- A smaller k (e.g., 5) is usually sufficient for large datasets. Each fold will still contain enough data for the model to learn effectively and provide stable validation results. Using a smaller k also reduces computational overhead, as fewer model training and evaluation cycles are needed.

iii. An imbalanced dataset:
- Stratified K-Fold with k=10 is often recommended. Stratification ensures that each fold contains the minority and majority classes in roughly the same proportions as the full dataset, which helps to mitigate the issues caused by class imbalance. A higher number of folds also exposes each model to more of the minority-class samples during training, leading to more reliable performance metrics, especially for the minority class. However, k should not exceed the number of minority-class samples; otherwise some folds cannot contain any minority example.

<span style='color:blue'>Implementing Standard K-Fold (Without Stratification)</span>

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Plain (non-stratified) 5-fold splitter, shuffled with a fixed seed for reproducibility
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Show the held-out samples and their labels for every fold
fold_number = 1
for train_index, test_index in kf.split(data):
    X_test = data[test_index]
    y_test = labels[test_index]

    # One print call emitting the same three report lines per fold
    print(
        f"\nFold {fold_number} Test Set (without stratification):",
        f"Test Data: {X_test}",
        f"Test Labels: {y_test}",
        sep="\n",
    )

    fold_number += 1


Fold 1 Test Set (without stratification):
Test Data: [ 3  7  9 15 16 20 22 29 35 38]
Test Labels: [1 1 1 1 1 1 1 1 1 1]

Fold 2 Test Set (without stratification):
Test Data: [-7 -6 -4 -2  2  5 27 31 36 37]
Test Labels: [0 0 0 0 1 1 1 1 1 1]

Fold 3 Test Set (without stratification):
Test Data: [-10  -5  -1   6  14  17  21  23  24  34]
Test Labels: [0 0 0 1 1 1 1 1 1 1]

Fold 4 Test Set (without stratification):
Test Data: [-9 -8  1 11 13 19 25 26 30 33]
Test Labels: [0 0 1 1 1 1 1 1 1 1]

Fold 5 Test Set (without stratification):
Test Data: [-3  0  4  8 10 12 18 28 32 39]
Test Labels: [0 1 1 1 1 1 1 1 1 1]
