In [21]:
import pandas as pd
import numpy as np
#%pip install PyCaret        #to run caret dataset on Python
from pycaret.datasets import get_data

### The Function

In [22]:
def create_K_folds(vector, K, seed):
    vector=pd.Series(vector)
    N = vector.count() #length of the vector
    C = vector.nunique() #number of classes in a given vector
    vector.value_counts().index[C-1] #to iterate over individual class values to obtain a vector for each class

    Classes={} #empty dictionary to store class-wise observations
    for i in range(1,C+1):
        Classes['Class{0}'.format(i)] = vector[vector==vector.value_counts().index[C-i]] #dictionary of classes as keys and values as observations within the classes.

    Obs_perClass = np.int32(np.round([N/K * (Classes[i].count()/N) for i in Classes.keys()])) #Number of observations per class that every Kth fold must contain (based on the proportion in the dataset)

    j=0
    Index_perClass = {} #empty dictionary to store class-wise indices
    np.random.seed(seed=seed)
    for i in Classes.keys(): # loop to iterate over the Classes dictionary and produce a new dictionary with the indices for each class as a matrix
        Index_perClass['{0}'.format(i)]=np.random.choice(Classes[i].index,(K,Obs_perClass[j])) #creating randomly sampled matrix with K rows and columns=number of observations for each class
        j+=1

    train_idx={} #empty dictionary to store indices with classes combined
    for i in range(0,K): #to iterate over folds
        train_idx['Train{0}'.format(i)]=[]
        for key in Index_perClass.keys():
            train_idx['Train'+str(i)].extend(Index_perClass[key][i]) #adding index values to keys that are named for the Kth number of fold.
    
    train_idx=pd.DataFrame(train_idx)

    iterations={} #empty dictionary to store itetation indices
    for i in range(len(train_idx.columns)): 
        iterations['iter'+str(i)]= train_idx[np.delete(train_idx.columns,i)].melt()['value'] #looping over K-training sets to delete one out and store indices of 
                                                                                             #K-1 folds K times in the dictionary.

    iterations=pd.DataFrame(iterations)


    return iterations


### Applied to the GermanCredit data

In [23]:
#loading the dataset from PyCaret
df = get_data('GermanCredit')

Unnamed: 0.1,Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,...,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,1,6,1169,4,4,67,2,1,0,1,...,0,0,1,0,1,0,0,0,1,0
1,2,48,5951,2,2,22,1,1,1,1,...,0,0,1,0,1,0,0,0,1,0
2,3,12,2096,2,3,49,1,2,1,1,...,0,0,1,0,1,0,0,1,0,0
3,4,42,7882,2,4,45,1,2,1,1,...,0,0,1,0,0,1,0,0,1,0
4,5,24,4870,3,4,53,2,2,1,1,...,0,0,1,0,0,1,0,0,1,0


In [24]:
df['Class'] #the labels

0      Good
1       Bad
2      Good
3      Good
4       Bad
       ... 
995    Good
996    Good
997    Good
998     Bad
999    Good
Name: Class, Length: 1000, dtype: object

In [25]:
df['Class'].value_counts() #7:3 ratio

Good    700
Bad     300
Name: Class, dtype: int64

In [27]:
folds = create_K_folds(df['Class'], 10, 77) #dataframe of 10 training iterations (900 observations to include 9 folds and exclude 1 for testing in each iteration)

In [30]:
folds

Unnamed: 0,iter0,iter1,iter2,iter3,iter4,iter5,iter6,iter7,iter8,iter9
0,471,722,722,722,722,722,722,722,722,722
1,585,349,349,349,349,349,349,349,349,349
2,105,308,308,308,308,308,308,308,308,308
3,973,788,788,788,788,788,788,788,788,788
4,10,973,973,973,973,973,973,973,973,973
...,...,...,...,...,...,...,...,...,...,...
895,567,567,567,567,567,567,567,567,567,928
896,422,422,422,422,422,422,422,422,422,792
897,572,572,572,572,572,572,572,572,572,459
898,758,758,758,758,758,758,758,758,758,644


In [32]:
for cols in folds.columns:
    
    print(df['Class'][folds[cols]].value_counts()) #7:3 lables ratio preserved after creating 10 folds.

Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
Good    630
Bad     270
Name: Class, dtype: int64
