<a href="https://colab.research.google.com/github/cagBRT/Data/blob/main/Imbalanced_Dataset_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Cross Validation for Imbalanced Datasets**

In [None]:
from numpy import unique
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [None]:
# Build a synthetic two-class dataset that is heavily imbalanced:
# 1000 samples, class weights of 0.99 and 0.01, flip_y=0, fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.99, 0.01],
    flip_y=0,
    random_state=1,
)

# Report, for every class label, its example count and share of the dataset.
classes, counts = unique(y, return_counts=True)
total = len(y)
for c, n_examples in zip(classes, counts.tolist()):
    percent = n_examples / total * 100
    print('> Class=%d : %d/%d (%.1f%%)' % (c, n_examples, total, percent))

In [None]:
# Plain (non-stratified) 5-fold cross-validation on the imbalanced dataset.
# Only X is handed to split(), so the labels play no part in how the folds
# are drawn; the per-fold class counts printed below show the consequence.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

for train_ix, test_ix in kfold.split(X):
    # Slice features and labels for this fold.
    train_X, train_y = X[train_ix], y[train_ix]
    test_X, test_y = X[test_ix], y[test_ix]

    # Count class 0 / class 1 members on each side of the split.
    train_0 = int((train_y == 0).sum())
    train_1 = int((train_y == 1).sum())
    test_0 = int((test_y == 0).sum())
    test_1 = int((test_y == 1).sum())

    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

In [None]:
# Plain (non-stratified) 50/50 train/test split of the imbalanced dataset.
# No `stratify` argument is passed here, so nothing forces the two halves
# to share the same class ratio; the printed counts show what was drawn.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=2,
)

# Count class 0 / class 1 members in each half.
train_0 = int((trainy == 0).sum())
train_1 = int((trainy == 1).sum())
test_0 = int((testy == 0).sum())
test_1 = int((testy == 1).sum())

print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

In [None]:
# Stratified 5-fold cross-validation on the imbalanced dataset.

# Regenerate the same two-class dataset so this cell can run on its own.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.99, 0.01],
    flip_y=0,
    random_state=1,
)

# The labels are passed to split() so the splitter can stratify on them.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Walk the folds and print the class make-up of each train/test pair.
for train_ix, test_ix in kfold.split(X, y):
    # Slice features and labels for this fold.
    train_X, train_y = X[train_ix], y[train_ix]
    test_X, test_y = X[test_ix], y[test_ix]

    # Count class 0 / class 1 members on each side of the split.
    train_0 = int((train_y == 0).sum())
    train_1 = int((train_y == 1).sum())
    test_0 = int((test_y == 0).sum())
    test_1 = int((test_y == 1).sum())

    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

In [None]:
# Stratified 50/50 train/test split of the imbalanced dataset.

# Regenerate the same two-class dataset so this cell can run on its own.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.99, 0.01],
    flip_y=0,
    random_state=1,
)

# Passing stratify=y asks train_test_split to keep the class ratio of y
# in both the training half and the test half.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=2,
    stratify=y,
)

# Count class 0 / class 1 members in each half.
train_0 = int((trainy == 0).sum())
train_1 = int((trainy == 1).sum())
test_0 = int((testy == 0).sum())
test_1 = int((testy == 1).sum())

print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))