In [495]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
pd.set_option('display.max_columns', None)
import numbers

In [496]:
# Load the normalized dataset and, in the same expression, select its columns
# in a fixed order so that the target column `is_approved` is the last one.
df = pd.read_csv('dataset/dataset-normalizado.csv', header=0).loc[:, [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
    'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4',
    'Fedu_0', 'Fedu_1', 'Fedu_2', 'Fedu_3', 'Fedu_4',
    'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher',
    'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher',
    'reason_course', 'reason_home', 'reason_other', 'reason_reputation',
    'guardian_father', 'guardian_mother', 'guardian_other',
    'is_approved',
]]


In [497]:
# Number of instances (rows) in the loaded dataset
df.shape[0]

1044

## Rascunho da função

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
df_copy = df

# Features are every column except the last one; the target is the last column.
# Both are converted straight to numpy arrays.
X = df.drop(df.columns[-1], axis=1).to_numpy()
y = df[df.columns[-1]].to_numpy()

In [None]:
k = 5
rows, columns = X.shape

# check if k is a valid number of folds
# (check_k and split are defined in the helper cells further down in this
# notebook; those cells must have been executed before this one)
check_k(k, rows)

# get the distinct target values and how many instances each one has
(unique, counts) = np.unique(y, return_counts=True)

# calculate percentage of each class over the whole dataset
prop_neg = counts[0]/rows*100
prop_pos = counts[1]/rows*100

print("k = {}, Dataset: {} positivas, {} negativas ({:.2f}% x {:.2f}%)".format(k, counts[1], counts[0], prop_pos, prop_neg))

# map index positions on target for class 0 and class 1.
# Computed here so that this cell does not depend on names that are only
# defined by a cell located later in the notebook.
class_0_index = [idx for idx, j in enumerate(y) if j == unique[0]]
class_1_index = [idx for idx, j in enumerate(y) if j == unique[1]]

# calculate folds size: one (class 0 count, class 1 count) pair per fold
fold_size = split(class_0_index, class_1_index, k)

In [None]:
folds = []
for idx, f in enumerate(fold_size):
    # each entry of fold_size is a (class 0 count, class 1 count) pair
    temp_neg = int(f[0])
    temp_pos = int(f[1])
    total = temp_neg+temp_pos
    # A fold can end up empty when k is larger than what both classes can
    # fill; report 0% for it instead of raising ZeroDivisionError.
    prop_temp_neg = temp_neg/total*100 if total else 0.0
    prop_temp_pos = temp_pos/total*100 if total else 0.0
    print("Fold {}: Pos: {}, Neg: {}, Total: {}, Proporção: {:.2f}% , {:.2f}%".format(idx, temp_pos, temp_neg, total, prop_temp_pos, prop_temp_neg))
    

In [None]:
# map index positions on target for class 0 and class 1
class_0_index = [idx for idx, j in enumerate(y) if j==unique[0]]
class_1_index = [idx for idx, j in enumerate(y) if j==unique[1]]

# map usage of each instance
bool_0_index = [False] * len(class_0_index)
bool_1_index = [False] * len(class_1_index)

## Funções auxiliares

In [514]:
def split(a, b, n):
    """Return, for each of ``n`` folds, how many items of ``a`` and ``b`` it gets.

    Each sequence is cut into ``n`` consecutive chunks whose lengths differ by
    at most one; the leading chunks receive the extra items left over by the
    integer division.

    Parameters
    ----------
    a, b : sequences supporting ``len`` and slicing
    n : int
        Number of folds.

    Returns
    -------
    list of (int, int)
        One ``(len of a-chunk, len of b-chunk)`` tuple per fold.
    """
    def _chunk_lengths(items):
        # base size of every chunk, plus how many chunks take one extra item
        base, extra = divmod(len(items), n)
        return [
            len(items[pos * base + min(pos, extra):(pos + 1) * base + min(pos + 1, extra)])
            for pos in range(n)
        ]

    return list(zip(_chunk_lengths(a), _chunk_lengths(b)))

In [519]:
def check_k(k, rows):
    """Validate the requested number of folds.

    Parameters
    ----------
    k : int
        Requested number of folds.
    rows : int
        Number of instances in the dataset.

    Raises
    ------
    ValueError
        If ``k`` is not an integer, is lower than 2, or is greater than
        ``rows``.
    """
    # Type check first: the numeric comparisons below are only meaningful for
    # integers. `(k,)` keeps the message formatting safe when k is a tuple.
    if not isinstance(k, numbers.Integral):
        raise ValueError("Expected number of folds as an integer. Got %s." % (k,))
    # Reject 0 and negative values as well as 1; they would otherwise reach
    # the fold-size computation and divide by zero or produce no folds.
    if k < 2:
        raise ValueError("Expected number of folds higher than 1. Got %s." % (k,))
    if k > rows:
        raise ValueError("Expected number of folds lower than dataset instances. Got %s." % (k,))

## Função final

In [516]:
def StratifiedKFold(df, k):
    """Print the class balance of a stratified k-fold partition of ``df``.

    The last column of ``df`` is used as the target. Instances of each target
    class are divided separately into ``k`` nearly equal parts, and the size
    and class proportion of every resulting fold is printed. Nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose last column is the target.

    k : int
        Determines the number of folds.

    Raises
    ------
    ValueError
        If ``k`` is rejected by ``check_k`` or the target does not have
        exactly two distinct values.

    Printed information
    -------------------
        k = 3, Dataset: P positivas, N negativas (P% x N%)

        Fold 0: Pos: xx, Neg: yy, Total: cc, Proporção: P% , N%
        ....

    The second distinct target value (in sorted order) is reported as the
    positive class and the first one as the negative class.
    """
    # target = last column, as a numpy array; one entry per instance
    y = df[df.columns[-1]].to_numpy()
    rows = len(y)

    # check if k is a valid number of folds
    check_k(k, rows)

    # distinct target values (sorted) and how many instances each one has
    unique, counts = np.unique(y, return_counts=True)
    if len(unique) != 2:
        raise ValueError("Expected a target with exactly 2 classes. Got %s." % len(unique))

    # percentage of each class over the whole dataset
    prop_neg = counts[0]/rows*100
    prop_pos = counts[1]/rows*100

    print("k = {}, Dataset: {} positivas, {} negativas ({:.2f}% x {:.2f}%)".format(k, counts[1], counts[0], prop_pos, prop_neg))

    # Row positions of each class, derived from the dataframe passed in rather
    # than from variables that happen to exist in the notebook namespace.
    class_0_index = np.flatnonzero(y == unique[0])
    class_1_index = np.flatnonzero(y == unique[1])

    # per-fold (class 0 count, class 1 count)
    fold_size = split(class_0_index, class_1_index, k)

    for idx, f in enumerate(fold_size):
        temp_neg = int(f[0])
        temp_pos = int(f[1])
        total = temp_neg+temp_pos
        # A fold may be empty when k exceeds what both classes can fill;
        # report 0% for it instead of raising ZeroDivisionError.
        prop_temp_neg = temp_neg/total*100 if total else 0.0
        prop_temp_pos = temp_pos/total*100 if total else 0.0
        print("Fold {}: Pos: {}, Neg: {}, Total: {}, Proporção: {:.2f}% , {:.2f}%".format(idx, temp_pos, temp_neg, total, prop_temp_pos, prop_temp_neg))

In [526]:
# Report the stratified split of the full dataset into five folds
StratifiedKFold(df, k=5)

k = 5, Dataset: 557 positivas, 487 negativas (53.35% x 46.65%)
Fold 0: Pos: 112, Neg: 98, Total: 210, Proporção: 53.33% , 46.67%
Fold 1: Pos: 112, Neg: 98, Total: 210, Proporção: 53.33% , 46.67%
Fold 2: Pos: 111, Neg: 97, Total: 208, Proporção: 53.37% , 46.63%
Fold 3: Pos: 111, Neg: 97, Total: 208, Proporção: 53.37% , 46.63%
Fold 4: Pos: 111, Neg: 97, Total: 208, Proporção: 53.37% , 46.63%
