03/23/2021

# Table of Contents

1. [How to Fix k-Fold Cross-Validation for Imbalanced Classification](#1)
    * [Repeated Stratified K-Fold Cross-Validation](#1.1)
    * [Stratified K-Fold Cross-Validation](#1.2)
    * [Train-Test Split Cross-Validation](#1.3)
2. [How to Avoid Data Leakage](#2)
    * [With Data Leakage](#2.1)
    * [Without Data Leakage](#2.2)
3. [Example of Stratified K-Fold Cross Validation & Data Leakage](#3)
    * [With Data Leakage (Original)](#3.1)
    * [Without Data Leakage (Fixed by Myself)](#3.2)

<a id='1'>
    <h2 style='font-size:210%;'>
        How to Fix k-Fold Cross-Validation for Imbalanced Classification
    </h2>
 </a>

[Source: How to Fix k-Fold Cross-Validation for Imbalanced Classification, *Machine Learning Mastery*](https://machinelearningmastery.com/cross-validation-for-imbalanced-classification/)

<a id='1.1'>
    <h2 style='font-size:180%;'>
        Repeated Stratified K-Fold Cross-Validation
    </h2>
 </a>

In [1]:
# example of repeated stratified k-fold cross-validation with an imbalanced dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold

# generate a 2-class dataset weighted 99:1, with no label flipping (flip_y=0)
X, y = make_classification(n_samples=1000, n_classes=2, weights=[0.99, 0.01], flip_y=0, random_state=1)
# 5 folds repeated 10 times; n_repeats is spelled out so the split count printed below is explicit
kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1)
# walk the splits exactly once: tally the class composition of each split and count the splits
c = 0
for train_ix, test_ix in kfold.split(X, y):
    # select rows
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    # class composition of this split (intentionally not printed: that would be one line per split)
    train_0, train_1 = len(train_y[train_y == 0]), len(train_y[train_y == 1])
    test_0, test_1 = len(test_y[test_y == 0]), len(test_y[test_y == 1])
    c += 1
# total number of train/test splits produced (n_splits * n_repeats)
print(c)

50


<a id='1.2'>
    <h2 style='font-size:180%;'>
        Stratified K-Fold Cross-Validation
    </h2>
 </a>

In [2]:
# Stratified k-fold cross-validation on an imbalanced dataset:
# print the class composition of the train and test part of every fold.
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold

# build the two-class dataset
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.99, 0.01],
    flip_y=0,
    random_state=1,
)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for train_ix, test_ix in kfold.split(X, y):
    # rows belonging to this fold
    train_X = X[train_ix]
    test_X = X[test_ix]
    train_y = y[train_ix]
    test_y = y[test_ix]
    # number of samples per class on each side of the split
    train_0 = int((train_y == 0).sum())
    train_1 = int((train_y == 1).sum())
    test_0 = int((test_y == 0).sum())
    test_1 = int((test_y == 1).sum())
    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

>Train: 0=792, 1=8, Test: 0=198, 1=2
>Train: 0=792, 1=8, Test: 0=198, 1=2
>Train: 0=792, 1=8, Test: 0=198, 1=2
>Train: 0=792, 1=8, Test: 0=198, 1=2
>Train: 0=792, 1=8, Test: 0=198, 1=2


<a id='1.3'>
    <h2 style='font-size:180%;'>
        Train-Test Split Cross-Validation
    </h2>
 </a>

In [3]:
# Stratified 50/50 train/test split on an imbalanced dataset:
# print the class composition of both halves.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# build the two-class dataset
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.99, 0.01],
    flip_y=0,
    random_state=1,
)
# stratify=y asks the splitter to keep the class ratio of y in both halves
trainX, testX, trainy, testy = train_test_split(
    X, y, test_size=0.5, random_state=2, stratify=y
)
# number of samples per class on each side of the split
train_0 = int((trainy == 0).sum())
train_1 = int((trainy == 1).sum())
test_0 = int((testy == 0).sum())
test_1 = int((testy == 1).sum())
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

>Train: 0=495, 1=5, Test: 0=495, 1=5


<a id='2'>
    <h2 style='font-size:210%;'>
        How to Avoid Data Leakage
    </h2>
 </a>

[Source: How to Avoid Data Leakage When Performing Data Preparation, *Machine Learning Mastery*](https://machinelearningmastery.com/data-preparation-without-data-leakage/)

<a id='2.1'>
    <h2 style='font-size:180%;'>
        With Data Leakage
    </h2>
 </a>

In [4]:
# Naive approach (contains data leakage on purpose): the scaler is fitted on the
# whole dataset BEFORE the train/test split, so the test rows influence the scaling.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# build the dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)
# normalize every row with min/max taken over the full dataset
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
# only now split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
# train the classifier on the training part
model = LogisticRegression().fit(X_train, y_train)
# predict the held-out part and score the predictions
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy*100))

Accuracy: 84.848


<a id='2.2'>
    <h2 style='font-size:180%;'>
        Without Data Leakage
    </h2>
 </a>

In [5]:
# Leakage-free approach: split first, fit the scaler on the training rows only,
# then apply that same fitted scaler to both the training and the test rows.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# build the dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)
# split into train and test sets before any preparation step
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
# the scaler learns its min/max from the training rows only
scaler = MinMaxScaler().fit(X_train)
# both parts are transformed with the training-set statistics
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# train the classifier on the scaled training part
model = LogisticRegression().fit(X_train, y_train)
# predict the held-out part and score the predictions
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy*100))

Accuracy: 85.455


<a id='3'>
    <h2 style='font-size:210%;'>
        Example of Stratified K-Fold Cross Validation & Data Leakage
    </h2>
 </a>

[Source: Stratified K Fold Cross Validation, *Geeks for Geeks*](https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/)

<a id='3.1'>
    <h2 style='font-size:180%;'>
        With Data Leakage (Original)
    </h2>
 </a>

In [6]:
# Stratified 10-fold cross-validation, original version.
# NOTE: the scaler is fitted on the full dataset before the folds are formed,
# so every test fold has already influenced the scaling (data leakage).

# Required modules.
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
from sklearn import datasets

# Load the breast cancer dataset and pull out features and target.
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target

# Min-max scale all input features at once (this is where the leakage happens).
scaler = preprocessing.MinMaxScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# Classifier and stratified 10-fold splitter.
lr = linear_model.LogisticRegression()
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# One accuracy value (rounded to 2 decimals) per fold.
lst_accu_stratified = []
for train_index, test_index in skf.split(x, y):
    # Slice the pre-scaled features and the targets for this fold.
    x_train_fold = x_scaled[train_index]
    x_test_fold = x_scaled[test_index]
    y_train_fold = y[train_index]
    y_test_fold = y[test_index]
    # Refit the classifier on the training rows and score it on the held-out rows.
    lr.fit(x_train_fold, y_train_fold)
    fold_accuracy = lr.score(x_test_fold, y_test_fold)
    lst_accu_stratified.append(round(fold_accuracy, 2))

# Summarize the per-fold accuracies.
print('List of possible accuracy:', lst_accu_stratified)
print(
    '\nMaximum Accuracy That can be obtained from this model is:',
    round(max(lst_accu_stratified)*100,2),
    '%',
)
print(
    '\nMinimum Accuracy:',
    round(min(lst_accu_stratified)*100,2),
    '%',
)
print(
    '\nOverall Accuracy:',
    round(mean(lst_accu_stratified)*100,2),
    '%',
)
print('\nStandard Deviation is:', round(stdev(lst_accu_stratified),2))


List of possible accuracy: [0.93, 0.96, 0.98, 1.0, 0.96, 0.96, 0.98, 0.95, 0.95, 0.98]

Maximum Accuracy That can be obtained from this model is: 100.0 %

Minimum Accuracy: 93.0 %

Overall Accuracy: 96.5 %

Standard Deviation is: 0.02


<a id='3.2'>
    <h2 style='font-size:180%;'>
        Without Data Leakage (Fixed by Myself)
    </h2>
 </a>

In [7]:
# STRATIFIED K-FOLD CROSS VALIDATION { 10-fold }, with the scaler fitted inside each fold
# so that no statistics from the held-out rows reach the training step.

# Import Required Modules.
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
from sklearn import datasets

# FETCHING FEATURES AND TARGET VARIABLES IN ARRAY FORMAT.
cancer = datasets.load_breast_cancer()

# Input_x_Features.
x = cancer.data

# Input_ y_Target_Variable.
y = cancer.target

# Create classifier object.
lr = linear_model.LogisticRegression()

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified = []

for train_index, test_index in skf.split(x, y):
    # Slice the RAW features: this cell must not depend on an array that was
    # scaled with statistics from the whole dataset.
    x_train_fold, x_test_fold = x[train_index], x[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Feature scaling: a fresh scaler per fold, fitted on the training rows only,
    # then applied unchanged to both the training and the held-out rows.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(x_train_fold)
    x_train_scaled = scaler.transform(x_train_fold)
    x_test_scaled = scaler.transform(x_test_fold)

    # Refit the classifier on this fold and record its accuracy (rounded to 2 decimals).
    lr.fit(x_train_scaled, y_train_fold)
    lst_accu_stratified.append(round(lr.score(x_test_scaled, y_test_fold), 2))

# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      round(max(lst_accu_stratified)*100, 2), '%')
print('\nMinimum Accuracy:',
      round(min(lst_accu_stratified)*100, 2), '%')
print('\nOverall Accuracy:',
      round(mean(lst_accu_stratified)*100, 2), '%')
print('\nStandard Deviation is:', round(stdev(lst_accu_stratified), 2))

List of possible accuracy: [0.93, 0.96, 0.98, 0.98, 0.96, 0.96, 0.98, 0.95, 0.95, 0.98]

Maximum Accuracy That can be obtained from this model is: 98.0 %

Minimum Accuracy: 93.0 %

Overall Accuracy: 96.3 %

Standard Deviation is: 0.02
