In [18]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
pd.set_option('display.max_columns', None)

## Load data

In [19]:
# Load the original (normalized) dataset; row 0 of the CSV is used as the header
df = pd.read_csv('dataset/dataset-normalizado.csv', header=0)

# Target vector: the 'is_approved' column as a NumPy array
y = df['is_approved'].to_numpy()
# Feature matrix: every remaining column as a NumPy array
X = df.drop(columns='is_approved').to_numpy()

In [23]:
def describe_dataset(X, y, k):
    """Print a one-line summary of the class balance of a two-class dataset.

    Parameters
    ----------
    X : array with a 2-element ``shape``; only the row count is used.
    y : array of labels. Labels are taken in ascending order: the first
        one is reported as "negativas" and the second one as "positivas".
    k : value printed after ``k = `` (the notebook passes the fold count).

    Percentages are truncated with ``int``, so they may not add up to 100.
    """
    n_instances, _n_features = X.shape

    # np.unique yields the labels sorted ascending, with counts aligned to them
    _labels, class_counts = np.unique(y, return_counts=True)
    negatives = class_counts[0]
    positives = class_counts[1]

    pct_neg = int(negatives / n_instances * 100)
    pct_pos = int(positives / n_instances * 100)

    summary = "k = {}, Dataset: {} positivas, {} negativas ({}% x {}%)"
    print(summary.format(k, positives, negatives, pct_pos, pct_neg))

In [24]:
def get_classes_from_index(y, n_splits=None):
    """Print the per-fold class counts of a stratified split of ``y``.

    The allocation is computed by encoding the classes, sorting the encoded
    labels and dealing them round-robin into ``n_splits`` folds. This looks
    like the scheme scikit-learn's unshuffled ``StratifiedKFold`` uses
    (NOTE(review): confirm against the installed scikit-learn version).

    Parameters
    ----------
    y : 1-D array of labels. Two classes are expected: the smaller label
        value is reported as "Neg" and the larger one as "Pos".
    n_splits : int, optional
        Number of folds. When omitted, falls back to ``skf.n_splits`` from
        the notebook-level ``skf`` object, which must already be defined.

    Returns
    -------
    None. One line per fold is printed; percentages are truncated with
    ``int`` and therefore may not add up to 100.
    """
    if n_splits is None:
        # Backward-compatible default: use the notebook's global splitter
        n_splits = skf.n_splits

    _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
    # class_perm[j] is the code given to the j-th label in ascending order;
    # codes follow the order in which each class first appears in y.
    _, class_perm = np.unique(y_idx, return_inverse=True)
    y_encoded = class_perm[y_inv]
    y_order = np.sort(y_encoded)
    n_classes = len(y_idx)
    allocation = np.asarray(
            [np.bincount(y_order[i::n_splits], minlength=n_classes)
             for i in range(n_splits)])

    # Columns are indexed by appearance-order code. Reorder them to ascending
    # label order so column 0 is always the smaller label (negative) and
    # column 1 the larger one (positive), whichever class shows up first in y.
    allocation = allocation[:, class_perm]

    for idx, f in enumerate(allocation):
        count_neg = int(f[0])
        count_pos = int(f[1])
        total = count_neg+count_pos
        prop_temp_neg = int(count_neg/total*100)
        prop_temp_pos = int(count_pos/total*100)
        print("Fold {}: Pos: {}, Neg: {}, Total: {}, Proporção: {}% x {}%".format(idx, count_pos, count_neg, total, prop_temp_pos, prop_temp_neg))

## Create StratifiedKFold

In [20]:
from sklearn.model_selection import StratifiedKFold

In [28]:
# Stratified cross-validator with 10 folds; every other option keeps its default
skf = StratifiedKFold(n_splits=10)
# Number of splitting iterations reported by the cross-validator
k = skf.get_n_splits(X, y)

In [25]:
# Print the overall positive/negative balance of the full dataset
describe_dataset(X, y, k)

k = 10, Dataset: 557 positivas, 487 negativas (53% x 46%)


In [29]:
for train_index, test_index in skf.split(X, y):
    # Report how many samples fall on each side of this fold
    print("TRAIN: {}  TEST: {}".format(len(train_index), len(test_index)))

    # Slice out this fold's partitions. Each pass overwrites the previous
    # ones, so only the last fold's arrays remain after the loop finishes.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: 939  TEST: 105
TRAIN: 939  TEST: 105
TRAIN: 939  TEST: 105
TRAIN: 939  TEST: 105
TRAIN: 940  TEST: 104
TRAIN: 940  TEST: 104
TRAIN: 940  TEST: 104
TRAIN: 940  TEST: 104
TRAIN: 940  TEST: 104
TRAIN: 940  TEST: 104


In [27]:
# Print the class counts of each fold, using the fold count of the global `skf`
get_classes_from_index(y)

Fold 0: Pos: 56, Neg: 49, Total: 105, Proporção: 53% x 46%
Fold 1: Pos: 56, Neg: 49, Total: 105, Proporção: 53% x 46%
Fold 2: Pos: 56, Neg: 49, Total: 105, Proporção: 53% x 46%
Fold 3: Pos: 56, Neg: 49, Total: 105, Proporção: 53% x 46%
Fold 4: Pos: 55, Neg: 49, Total: 104, Proporção: 52% x 47%
Fold 5: Pos: 55, Neg: 49, Total: 104, Proporção: 52% x 47%
Fold 6: Pos: 55, Neg: 49, Total: 104, Proporção: 52% x 47%
Fold 7: Pos: 56, Neg: 48, Total: 104, Proporção: 53% x 46%
Fold 8: Pos: 56, Neg: 48, Total: 104, Proporção: 53% x 46%
Fold 9: Pos: 56, Neg: 48, Total: 104, Proporção: 53% x 46%
