In [1]:
# Imports padrão
import os
import sys
import time

# Adicionando o caminho ao sys.path
sys.path.append('../src')

# Imports de bibliotecas de terceiros
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Imports locais
from sbkm import SimilarityBasedKM

# Notebook-wide settings
# `seed` is the random state reused by the sampling and bootstrap cells below.
seed = 42
# Directory prefix prepended to the CSV file names when the data is loaded
# (note: despite the name, this is a directory, not a single file).
input_file = '../data/processed/'


In [2]:
# Covariates used by the model; all of them are numerical in this experiment.
numerical = ['amount', 'installment_rate']
# `features` intentionally aliases `numerical` (same list object).
features = numerical
# One type tag per feature, in the same order as `features`.
feature_types = ['numerical'] * len(numerical)

In [3]:
def censored_data(data, perc=0.5, n=250, random_state=None):
    """Draw a sample of ``n`` rows with a chosen share of censored observations.

    Rows with ``full_repaid == 0`` are treated as censored and rows with
    ``full_repaid == 1`` as uncensored (failures). ``round(n * perc)`` censored
    rows and the remaining ``n - round(n * perc)`` uncensored rows are sampled
    without replacement and concatenated (censored rows first).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; must contain a ``full_repaid`` column.
    perc : float, default 0.5
        Target share of censored rows in the returned sample.
    n : int, default 250
        Total number of rows in the returned sample.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. When ``None`` the notebook-level
        ``seed`` variable is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when more rows are requested from
        a group than that group contains.
    """
    if random_state is None:
        # Backward-compatible default: fall back to the notebook-wide seed.
        random_state = seed

    # Note: Python's round() rounds halves to the nearest even integer,
    # e.g. round(12.5) == 12, so the realised share can differ slightly from perc.
    num_censored = round(n * perc)
    num_uncensored = n - num_censored

    censored_rows = data[data['full_repaid'] == 0].sample(n=num_censored, random_state=random_state)
    uncensored_rows = data[data['full_repaid'] == 1].sample(n=num_uncensored, random_state=random_state)

    final_data = pd.concat([censored_rows, uncensored_rows])

    # Guard the rate against an empty sample (n == 0) instead of dividing by zero.
    total = final_data.shape[0]
    censoring_rate = num_censored / total if total else float('nan')
    print(f'Censuras: {num_censored} Falhas: {num_uncensored} Tamanho: {total} Taxa de censura: {censoring_rate}')
    return final_data

## Censoring rate: 5%

In [4]:
# Load the three pre-split partitions and report their shapes.
train = pd.read_csv(f'{input_file}train.csv')
print("Train: {}.".format(train.shape))
valid = pd.read_csv(f'{input_file}validation.csv')
print("Valid: {}.".format(valid.shape))
test = pd.read_csv(f'{input_file}test.csv')
print("Test: {}.".format(test.shape))

# Resample the training partition so that `perc` of its rows are censored.
perc = 0.05
train = censored_data(train, perc)

# Target columns: removed from the covariates and kept together as y.
target_cols = ['duration','full_repaid']

# Training covariates / targets.
X_train = train.drop(columns=target_cols)[features]
y_train = train[target_cols]

print("X: {}.".format(X_train.shape))
print("y: {}.\n".format(y_train.shape))

# Validation covariates / targets.
X_valid = valid.drop(columns=target_cols)[features]
y_valid = valid[target_cols]

print("X: {}.".format(X_valid.shape))
print("y: {}.\n".format(y_valid.shape))

# Test covariates / targets.
X_test = test.drop(columns=target_cols)[features]
y_test = test[target_cols]

print("X: {}.".format(X_test.shape))
print("y: {}.\n".format(y_test.shape))

# Fit the scaler on the training covariates only, then apply the same
# transformation to the held-out partitions.
scaler = MinMaxScaler()
X_train[numerical] = scaler.fit_transform(X_train[numerical])
for held_out in (X_valid, X_test):
    held_out[numerical] = scaler.transform(held_out[numerical])

Train: (560, 19).
Valid: (140, 19).
Test: (300, 19).
Censuras: 12 Falhas: 238 Tamanho: 250 Taxa de censura: 0.048
X: (250, 2).
y: (250, 2).

X: (140, 2).
y: (140, 2).

X: (300, 2).
y: (300, 2).



In [5]:
# Fit a fresh SimilarityBasedKM model on the training sample and time the fit.
sbkm = SimilarityBasedKM()
start = time.time()
sbkm.fit(
    X_train,
    y_train['duration'],
    y_train['full_repaid'],
    feature_types,
    'likelihood',
)
end = time.time()
elapsed = end - start
print("Time: {}.".format(elapsed))

# Summarise the fitted model on the validation partition; the summary is the
# cell's last expression so the notebook displays it.
valid_durations = y_valid['duration'].values
valid_events = y_valid['full_repaid'].values
sbkm.print_summary(X_valid.values, valid_durations, valid_events)

Time: 4.475844144821167.


Unnamed: 0,Weight,Train CI,Test CI,Train BS,Test BS,Likelihood,SD
0,"[78.90177707588361, 21.0982229241164]",0.773465,0.779136,0.072537,0.10483,9.105266999999999e-178,7.043595


In [8]:
# Bootstrap evaluation on the test partition: resample the test rows with
# replacement, summarise the fitted model on each resample, and collect the
# per-resample summaries.
rng = np.random.RandomState(seed=seed)

concatenated_dfs = []
bootstrap_n_samples = 200
n_test_rows = X_test.shape[0]  # loop-invariant, so computed once
for bootstrap_idx in range(bootstrap_n_samples):
    # Same positional indices for X and y keep covariates and targets aligned.
    bootstrap_sample_indices = rng.choice(n_test_rows, size=n_test_rows, replace=True)
    _X_test = X_test.iloc[bootstrap_sample_indices]
    _y_test = y_test.iloc[bootstrap_sample_indices]
    # NOTE(review): print_summary is assumed to return a DataFrame, since its
    # results are concatenated below — confirm against the sbkm module.
    result = sbkm.print_summary(_X_test.values, _y_test['duration'].values, _y_test['full_repaid'].values)
    concatenated_dfs.append(result)

final_results = pd.concat(concatenated_dfs)

# Make sure the output directory exists before writing, so a fresh checkout
# does not fail here after the whole bootstrap loop has already run.
results_path = f'../data/results/censored/{perc}.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
final_results.to_csv(results_path, index=False)

# Summary statistics with a 95% percentile interval for each metric.
final_results[['Test CI', 'Test BS', 'SD']].describe(percentiles=[.025, .975])

Unnamed: 0,Test CI,Test BS,SD
count,200.0,200.0,200.0
mean,0.775318,0.092561,6.180083
std,0.017725,0.006221,0.537389
min,0.734062,0.074173,4.826134
2.5%,0.742492,0.081297,5.122519
50%,0.775999,0.092213,6.176903
97.5%,0.810869,0.105746,7.154741
max,0.812498,0.115156,7.600499


## Censoring rate: 20%

In [9]:
# Load the three pre-split partitions and report their shapes.
train = pd.read_csv(f'{input_file}train.csv')
print("Train: {}.".format(train.shape))
valid = pd.read_csv(f'{input_file}validation.csv')
print("Valid: {}.".format(valid.shape))
test = pd.read_csv(f'{input_file}test.csv')
print("Test: {}.".format(test.shape))

# Resample the training partition so that `perc` of its rows are censored.
perc = 0.2
train = censored_data(train, perc)

# Target columns: removed from the covariates and kept together as y.
target_cols = ['duration','full_repaid']

# Training covariates / targets.
X_train = train.drop(columns=target_cols)[features]
y_train = train[target_cols]

print("X: {}.".format(X_train.shape))
print("y: {}.\n".format(y_train.shape))

# Validation covariates / targets.
X_valid = valid.drop(columns=target_cols)[features]
y_valid = valid[target_cols]

print("X: {}.".format(X_valid.shape))
print("y: {}.\n".format(y_valid.shape))

# Test covariates / targets.
X_test = test.drop(columns=target_cols)[features]
y_test = test[target_cols]

print("X: {}.".format(X_test.shape))
print("y: {}.\n".format(y_test.shape))

# Fit the scaler on the training covariates only, then apply the same
# transformation to the held-out partitions.
scaler = MinMaxScaler()
X_train[numerical] = scaler.fit_transform(X_train[numerical])
for held_out in (X_valid, X_test):
    held_out[numerical] = scaler.transform(held_out[numerical])

Train: (560, 19).
Valid: (140, 19).
Test: (300, 19).
Censuras: 50 Falhas: 200 Tamanho: 250 Taxa de censura: 0.2
X: (250, 2).
y: (250, 2).

X: (140, 2).
y: (140, 2).

X: (300, 2).
y: (300, 2).



In [10]:
# Fit a fresh SimilarityBasedKM model on the training sample and time the fit.
sbkm = SimilarityBasedKM()
start = time.time()
sbkm.fit(
    X_train,
    y_train['duration'],
    y_train['full_repaid'],
    feature_types,
    'likelihood',
)
end = time.time()
elapsed = end - start
print("Time: {}.".format(elapsed))

# Summarise the fitted model on the validation partition; the summary is the
# cell's last expression so the notebook displays it.
valid_durations = y_valid['duration'].values
valid_events = y_valid['full_repaid'].values
sbkm.print_summary(X_valid.values, valid_durations, valid_events)

Time: 4.7695276737213135.


Unnamed: 0,Weight,Train CI,Test CI,Train BS,Test BS,Likelihood,SD
0,"[70.83119000734469, 29.168809992655316]",0.766297,0.780699,0.08867,0.117155,8.737089e-186,8.129795


In [12]:
# Bootstrap evaluation on the test partition: resample the test rows with
# replacement, summarise the fitted model on each resample, and collect the
# per-resample summaries.
rng = np.random.RandomState(seed=seed)

concatenated_dfs = []
bootstrap_n_samples = 200
n_test_rows = X_test.shape[0]  # loop-invariant, so computed once
for bootstrap_idx in range(bootstrap_n_samples):
    # Same positional indices for X and y keep covariates and targets aligned.
    bootstrap_sample_indices = rng.choice(n_test_rows, size=n_test_rows, replace=True)
    _X_test = X_test.iloc[bootstrap_sample_indices]
    _y_test = y_test.iloc[bootstrap_sample_indices]
    # NOTE(review): print_summary is assumed to return a DataFrame, since its
    # results are concatenated below — confirm against the sbkm module.
    result = sbkm.print_summary(_X_test.values, _y_test['duration'].values, _y_test['full_repaid'].values)
    concatenated_dfs.append(result)

final_results = pd.concat(concatenated_dfs)

# Make sure the output directory exists before writing, so a fresh checkout
# does not fail here after the whole bootstrap loop has already run.
results_path = f'../data/results/censored/{perc}.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
final_results.to_csv(results_path, index=False)

# Summary statistics with a 95% percentile interval for each metric.
final_results[['Test CI', 'Test BS', 'SD']].describe(percentiles=[.025, .975])

Unnamed: 0,Test CI,Test BS,SD
count,200.0,200.0,200.0
mean,0.756182,0.122515,7.208507
std,0.018937,0.011259,0.571
min,0.711014,0.093035,5.723321
2.5%,0.72053,0.103655,6.082149
50%,0.757057,0.120419,7.213453
97.5%,0.78794,0.144924,8.305754
max,0.806252,0.152475,8.766234
