# Lectura de datos master y estructura

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score,roc_auc_score,roc_curve,log_loss,confusion_matrix

In [2]:
path = '/home/angelmc/MyFirstDSProject'

In [4]:
df = pd.read_csv(path+'/data/Dataset Endeudamiento Crediticio.csv',sep=';')

In [5]:
df.head()

Unnamed: 0,ID,Default,Prct_uso_tc,Edad,Nro_prestao_retrasados,Prct_deuda_vs_ingresos,Mto_ingreso_mensual,Nro_prod_financieros_deuda,Nro_retraso_60dias,Nro_creditos_hipotecarios,Nro_retraso_ultm3anios,Nro_dependiente
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# Tratamiento de datos

In [5]:
# Definiendo el target
target = 'Default'
# Definiendo las columnas que servirán para predecir el target
columnas = list(set(df.columns) - set([target]))
coltotal = columnas + [target]

In [6]:
df_miss = pd.DataFrame({'count': df.isnull().sum(), 'percentage': df.isnull().mean()*100})
df_miss.to_csv(path+'/reports/1st_missing_report.csv')

In [7]:
def setTratamientoCotasSup(base,var_T,q):
    p=base[var_T].quantile(q=q)
    qq=(q+0.05) if (q+0.05<1) else q
    base[f"{var_T}"]=base[var_T].apply(lambda x: p if x>=p else x)
def setTratamientoCotasInf(base,var_T,q):
    p=base[var_T].quantile(q=q)
    qq=(q-0.05) if (q-0.05>0) else q
    base[f"{var_T}"]=base[var_T].apply(lambda x: p if x<=p else x)

In [8]:
setTratamientoCotasSup(df,'Edad',0.99)
setTratamientoCotasInf(df,'Edad',0.01)

setTratamientoCotasSup(df,'Mto_ingreso_mensual',0.99)
setTratamientoCotasInf(df,'Mto_ingreso_mensual',0.02)

setTratamientoCotasSup(df,'Prct_uso_tc',0.97)

setTratamientoCotasSup(df,'Prct_deuda_vs_ingresos',0.80)

In [9]:
col_nulls = ['Mto_ingreso_mensual','Nro_dependiente']

def setTratamientoMissingsVarContinua(base,var_T):
    imputer=base[var_T].mode().values[0]
    base[f"{var_T}"]=base[var_T].fillna(imputer)
    print(f"{var_T}(Mode: {imputer})")

for i in col_nulls:
    setTratamientoMissingsVarContinua(df,i)

Mto_ingreso_mensual(Mode: 5000.0)
Nro_dependiente(Mode: 0.0)


In [10]:
df_miss = pd.DataFrame({'count': df.isnull().sum(), 'percentage': df.isnull().mean()*100})
df_miss.to_csv(path+'/reports/2nd_missing_report.csv')

In [12]:
X,y = df.drop(columns=[target]),df[target]

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=0)

In [14]:
df_train = pd.concat([X_train,y_train],axis=1)

# Balanceo de datos

In [15]:
# Oversampling
count_class_0, count_class_1 = df_train.Default.value_counts()

df_class_0 = df_train[df_train['Default'] == 0]
df_class_1 = df_train[df_train['Default'] == 1]

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_train_over = pd.concat([df_class_0, df_class_1_over], axis=0)

In [16]:
# Undersampling
df_class_0_under = df_class_0.sample(count_class_1)
df_train_under = pd.concat([df_class_0_under, df_class_1], axis=0)

In [21]:
# Modelo de árbol de decision
model = DecisionTreeClassifier(random_state=0)
# Entrenando el modelo
model.fit(X_train, y_train)
# Obteniendo la predicción
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy DATOS ORIGINALES: %.2f%%" % (accuracy * 100.0))

ll = log_loss(y_test, y_pred)
print("Log Loss: {}".format(ll))

Accuracy DATOS ORIGINALES: 89.54%
Log Loss: 3.7701661445016548


In [22]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[39447  2545]
 [ 2162   846]]


In [23]:
# Calculando indicadores
print("\tIndicadores:")
print("\t1. Accuracy: %1.3f" % accuracy_score(y_test, y_pred))
print("\t2. Precision: %1.3f" % precision_score(y_test, y_pred))
print("\t3. Recall: %1.3f" % recall_score(y_test, y_pred))
print("\t4. F1: %1.3f" % f1_score(y_test, y_pred))
print("\t5. AUC: %1.3f" % roc_auc_score(y_test, y_pred))
print("\t6. Gini: %1.3f" % (2*roc_auc_score(y_test, y_pred)-1))

	Indicadores:
	1. Accuracy: 0.895
	2. Precision: 0.249
	3. Recall: 0.281
	4. F1: 0.264
	5. AUC: 0.610
	6. Gini: 0.221
