In [None]:
# Bring in pandas, the Python data-analysis library
import pandas as pd

# Read the credit applications CSV into the `data` dataframe
data = pd.read_csv(filepath_or_buffer="data/creditos.csv")

# Preview the first 5 rows of `data`
data.head(n=5)

In [None]:
import numpy as np

# Derive the applicant's age in whole years: application timestamp minus birth date.
data['fechaHora'] = pd.to_datetime(data['fechaHora'])
data['nacimiento'] = pd.to_datetime(data['nacimiento'])
# Divide by a fixed-length year instead of np.timedelta64(1, 'Y'): pandas 2.x
# rejects the ambiguous 'Y' unit. 365.2425 days is the year length numpy used
# for that unit, so the resulting ages are unchanged.
# NOTE(review): astype(int) raises if either date is missing (NaT) - confirm
# both columns are always populated.
data['edad'] = ((data['fechaHora'] - data['nacimiento']) / pd.Timedelta(days=365.2425)).astype(int)

# Keep the application, financial-system and target variables; discard the
# post-approval ones. Columns are picked by position.
# NOTE(review): presumably position 83 is the 'edad' column appended above and
# position 82 is the target 'resultadoFinal' - verify against the CSV layout.
df1 = data.iloc[:,2:3]
df2 = data.iloc[:,83:84]
df3 = data.iloc[:,4:68]
df4 = data.iloc[:,82:83]

# Merge the selected slices side by side into one filtered dataframe
df = pd.concat([df1,df2,df3,df4], axis=1)

# One-hot encode the categorical variables
dfOHEncoded = pd.get_dummies(df)
dfOHEncoded.head()

In [None]:
# Load the decision tree, the hold-out split helper and the metrics module
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Target: the dummy column that flags resultadoFinal == 'BIEN'.
y = dfOHEncoded['resultadoFinal_BIEN']
# Features: every column except the target's own dummies. They are selected by
# name rather than by a fixed position (the original took the first 110
# columns) so that a target dummy can never leak into X if the number of
# one-hot columns changes.
# NOTE(review): confirm the first 110 columns were exactly the non-target ones.
X = dfOHEncoded.loc[:, ~dfOHEncoded.columns.str.startswith('resultadoFinal_')]

# Split the dataset into train (70%) and test (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train a decision tree using entropy and a maximum depth of 6.
# random_state is fixed so that repeated runs build the same tree.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=6, random_state=1)
clf = clf.fit(X_train,y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Accuracy: (tp+tn)/n
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Per-class precision, recall, f-score and support
metrics.precision_recall_fscore_support(y_test, y_pred, average=None)

In [None]:
# Load the graph exporter for fitted trees
from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz .dot file.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
export_graphviz(clf, out_file="data/creditos.dot",
                filled=True, rounded=True,
                special_characters=True,
                feature_names=list(X.columns), class_names=['0','1'])

# Render the .dot file to a PNG image with pydot
import pydot
(graph,) = pydot.graph_from_dot_file('data/creditos.dot')
graph.write_png('data/creditos.png')

In [None]:
# List the categorical variables, i.e. the columns stored with dtype 'object'
df.dtypes.loc[lambda column_dtypes: column_dtypes == 'object']
#df["nacionalidad"].unique()

In [None]:
# Load the label encoder (LabelEncoder)
from sklearn import preprocessing


def encode_column(frame, column, as_str=False):
    """Label-encode ``frame[column]`` in place and return the fitted encoder.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataframe whose column is overwritten with the integer codes.
    column : str
        Name of the column to encode.
    as_str : bool, default False
        When True the values are cast with ``astype(str)`` before fitting
        (presumably so that missing values become the label 'nan' - confirm).

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        The fitted encoder, usable later with ``inverse_transform``.
    """
    encoder = preprocessing.LabelEncoder()
    values = frame[column].astype(str) if as_str else frame[column]
    frame[column] = encoder.fit_transform(values)
    return encoder


# Encode every categorical variable with its own encoder.
# This overwrites the columns of df, so running the cell a second time would
# fit the encoders on the integer codes instead of the original labels.
leNacionalidad = encode_column(df, "nacionalidad")
leSexo = encode_column(df, "sexo", as_str=True)
leEstCivil = encode_column(df, "est_civil", as_str=True)
leOcupCargo = encode_column(df, "ocup_cargo", as_str=True)
leClienteNoR = encode_column(df, "cliente_nuevo_o_recurrente")
leTieneVC = encode_column(df, "tiene_visa_classic")
leTieneVG = encode_column(df, "tiene_visa_gold")
leTieneMG = encode_column(df, "tiene_mc_gold")
leTieneFC = encode_column(df, "tiene_fc")
leTieneMC = encode_column(df, "tiene_mc_classic")
leFaja = encode_column(df, "respuesta_iconf_faja_score", as_str=True)
leResultadoFinal = encode_column(df, "resultadoFinal")

In [None]:
# Inspect the first 5 rows of the label-encoded dataframe
df.head(n=5)

# To check that an encoding can be reversed:
#leFaja.inverse_transform(df["respuesta_iconf_faja_score"])
#leResultadoFinal.inverse_transform(df["resultadoFinal"])

In [None]:
# Split into train and test.
# The features are every column except the target, selected by name instead of
# by position (the original used the first 66 columns) so the target can never
# end up inside X.
# NOTE(review): confirm 'resultadoFinal' was the only column past position 65.
X = df.drop(columns=['resultadoFinal'])
y = df['resultadoFinal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train a decision tree with entropy, a minimum leaf size of 50 and a maximum
# depth of 6. This line was indented in the original, which raises
# IndentationError at top level. random_state is fixed so that repeated runs
# build the same tree.
clf = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=50, max_depth=6, random_state=1)
clf = clf.fit(X_train,y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Accuracy: (tp+tn)/n
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Export the tree to a Graphviz .dot file.
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index here.
# NOTE(review): class_names assumes the encoded target has exactly the classes
# 'BIEN' (code 0) and 'MAL' (code 1) - confirm with leResultadoFinal.classes_.
export_graphviz(clf, out_file="data/creditos2.dot",
                filled=True, rounded=True,
                special_characters=True,
                feature_names=list(X.columns), class_names=['BIEN','MAL'])

# Render the .dot file to a PNG image
(graph,) = pydot.graph_from_dot_file('data/creditos2.dot')
graph.write_png('data/creditos2.png')

In [None]:
# Build the confusion matrix: actual classes as rows, predicted classes as columns
#metrics.confusion_matrix(y_test, y_pred)
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['actual'],
    colnames=['pred'],
    margins=False,
    margins_name="Total",
)

#leResultadoFinal.classes_

In [None]:
# Compute the per-class metrics: precision, recall, f-measure (f-score) and support

#print(metrics.recall_score(y_test, y_pred, average=None))
#print(metrics.precision_score(y_test, y_pred, average=None))
metrics.precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
    average=None,
)

In [None]:
# Pair every feature name with its importance in the fitted tree
fi = pd.DataFrame(
    list(zip(X.columns, clf.feature_importances_)),
    columns=['feature', 'importance'],
)

# Show only the features the tree actually used, most important first
fi.loc[fi['importance'] > 0.0].sort_values(by='importance', ascending=False)

In [None]:
# Get the per-class prediction scores for the test set
y_scores = clf.predict_proba(X_test)
#y_scores

# Build a predictions dataframe: actual class, predicted class and the scores
# from the first two columns of y_scores ('prob_yes' is column 0, 'prob_no' is
# column 1).
dp = pd.DataFrame(
    list(zip(y_test, y_pred, y_scores[:, 0], y_scores[:, 1])),
    columns=['actual', 'pred', 'prob_yes', 'prob_no'],
)

# Keep the rows that meet the approval criterion (prob_yes of at least 80%)
dp.loc[dp['prob_yes'] >= 0.80]