# Análise de Risco de Crédito - Divisão e Balanceamento

In [18]:
import pandas as pd
from numpy import vstack
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, shuffle

In [19]:
# Load the tab-separated dataset file into a pandas DataFrame.
data_set = pd.read_csv(
    'data/TRN',
    delimiter='\t',
)

In [20]:
# Preview the first rows of the dataset (head() returns 5 rows by default).
data_set.head()

Unnamed: 0,INDEX,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1,IND_BOM_1_2
0,0,1,1,1,0,0,0,0,0.135098,1,...,0,0,1,1,0,1,1,1,0,1
1,1,1,0,1,0,0,1,0,0.273504,1,...,0,1,0,1,1,0,0,0,1,0
2,2,1,0,1,0,0,1,0,0.28191,0,...,1,1,0,0,0,0,1,0,1,0
3,3,1,1,1,0,0,0,0,0.225741,0,...,1,1,0,1,1,0,1,0,1,0
4,4,1,1,0,0,0,1,0,0.480403,0,...,1,1,1,0,0,1,0,1,1,0


In [21]:
# Split the rows into one DataFrame per class, keyed on the 'IND_BOM_1_2' flag
# (0 -> class 1, 1 -> class 2).
is_class_1 = data_set['IND_BOM_1_2'] == 0
is_class_2 = data_set['IND_BOM_1_2'] == 1
class_1_df = data_set.loc[is_class_1]
class_2_df = data_set.loc[is_class_2]

# Balance the classes by up-sampling class 2: draw rows with replacement until
# it has as many rows as class 1. The fixed seed keeps the draw reproducible.
# NOTE(review): this resampling runs before the train/validation/test split, so
# copies of one source row can end up in more than one partition unless the
# split accounts for it.
class_2_up_sampled_df = resample(
    class_2_df,
    replace=True,
    n_samples=len(class_1_df),
    random_state=123,
)

In [22]:
# Separate features from targets for each class and convert them to numpy arrays.
#
# Columns are selected by name rather than with iloc[:, :-1], because the
# positional slice kept three columns in X that are not legitimate model inputs:
#   - 'IND_BOM_1_1': in the previewed rows it is always the complement of the
#     target 'IND_BOM_1_2', so leaving it in X leaks the label;
#   - 'Unnamed: 0' and 'INDEX': in the previewed rows these are plain row
#     counters, i.e. identifiers rather than predictive features.
# NOTE(review): this makes X three columns narrower than before; confirm that
# no later cell indexes X by fixed column position.
target_column = 'IND_BOM_1_2'
non_feature_columns = ['Unnamed: 0', 'INDEX', 'IND_BOM_1_1', target_column]
feature_columns = [col for col in class_1_df.columns
                   if col not in non_feature_columns]

class_1_X = class_1_df.loc[:, feature_columns].values
class_1_y = class_1_df.loc[:, target_column].values

# Row order is kept identical to class_2_up_sampled_df.
class_2_X = class_2_up_sampled_df.loc[:, feature_columns].values
class_2_y = class_2_up_sampled_df.loc[:, target_column].values

In [23]:
# Target proportions per class: 50% train, 25% validation, 25% test.
# 25% is held out for test first; validation is then 1/3 of the remaining 75%
# (1/3 * 0.75 = 0.25). A fraction of 0.33 would give ~24.75% validation and
# ~50.25% train instead.

# Class 1 was not resampled, so its rows are split directly.
class_1_X_train, class_1_X_test, class_1_y_train, class_1_y_test = train_test_split(
    class_1_X, class_1_y, test_size=0.25, random_state=42, stratify=class_1_y)

class_1_X_train, class_1_X_val, class_1_y_train, class_1_y_val = train_test_split(
    class_1_X_train, class_1_y_train, test_size=1 / 3, random_state=42,
    stratify=class_1_y_train)

# Class 2 was up-sampled with replacement, so the same source row can occur
# several times. Splitting the up-sampled rows directly would let copies of one
# row land in train, validation and test at once, which leaks training data
# into the evaluation sets. Instead, split the distinct source-row labels
# (preserved in the index of class_2_up_sampled_df) and send every copy of a
# row to the partition its label was assigned to.
# Assumes class_2_X / class_2_y are in the same row order as
# class_2_up_sampled_df and that the DataFrame index labels are unique per
# source row (the default index produced when the file is read).
class_2_source_rows = class_2_up_sampled_df.index

class_2_train_ids, class_2_test_ids = train_test_split(
    class_2_source_rows.unique().values, test_size=0.25, random_state=42)

class_2_train_ids, class_2_val_ids = train_test_split(
    class_2_train_ids, test_size=1 / 3, random_state=42)

# Boolean row masks over the up-sampled arrays, one per partition.
class_2_in_train = class_2_source_rows.isin(class_2_train_ids)
class_2_in_val = class_2_source_rows.isin(class_2_val_ids)
class_2_in_test = class_2_source_rows.isin(class_2_test_ids)

# Because whole groups of copies move together, the class 2 partition sizes are
# only approximately 50/25/25 of the up-sampled rows.
class_2_X_train, class_2_y_train = class_2_X[class_2_in_train], class_2_y[class_2_in_train]
class_2_X_val, class_2_y_val = class_2_X[class_2_in_val], class_2_y[class_2_in_val]
class_2_X_test, class_2_y_test = class_2_X[class_2_in_test], class_2_y[class_2_in_test]

In [24]:
# Stack the per-class arrays into one array per partition. Feature matrices are
# stacked row-wise as they are; each 1-D target array is first turned into a
# column of shape (n, 1), so every resulting y array is a single column.

# Training partition.
X_train = vstack([class_1_X_train, class_2_X_train])
y_train = vstack([class_1_y_train.reshape(-1, 1),
                  class_2_y_train.reshape(-1, 1)])

# Test partition.
X_test = vstack([class_1_X_test, class_2_X_test])
y_test = vstack([class_1_y_test.reshape(-1, 1),
                 class_2_y_test.reshape(-1, 1)])

# Validation partition.
X_val = vstack([class_1_X_val, class_2_X_val])
y_val = vstack([class_1_y_val.reshape(-1, 1),
                class_2_y_val.reshape(-1, 1)])

In [25]:
# Randomise the row order of each partition. Features and targets go through
# the same shuffle call so that they stay aligned row by row; the fixed seed
# makes the ordering reproducible.
X_train, y_train = shuffle(
    X_train, y_train, random_state=42)
X_test, y_test = shuffle(
    X_test, y_test, random_state=42)
X_val, y_val = shuffle(
    X_val, y_val, random_state=42)