In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier , export_graphviz
from sklearn.model_selection import train_test_split # Import train_test_split function
import os
import s3fs
import pyarrow.parquet as pq

pd.set_option('max_rows',100)
pd.set_option('max_columns',None)

# os.chdir('/../mnt')

def train_tree(mdt, y_var, params_dict):
    """
    Trains the tree to produce the profile
    :param params_dict:
    :return:
    """
    # Parameters
    predictive_vars = params_dict['predictive_cols']
    min_percent_leaf = params_dict['min_percent_leaf']

    mdt = mdt[predictive_vars]

    reg_tree = DecisionTreeClassifier(criterion="entropy", max_depth=3,min_samples_leaf=int(mdt.shape[0] * min_percent_leaf))

    reg_tree.fit(mdt, y_var)

    return reg_tree


def produce_dot_file(trained_tree, file_name):
    full_path = params_dict['output_path'] + file_name
    export_graphviz(trained_tree, feature_names=params_dict['predictive_cols'],
                    out_file=full_path,
                    filled=True, proportion=True,
                    rounded=True)


def main(mdt, y_var):
    name = 'tree_'+params_dict['segment_name']+'_V0.dot'
    my_tree = train_tree(mdt, y_var, params_dict)
    produce_dot_file(my_tree, name)
    print('Tree produced')
    return my_tree



In [2]:
# Definir parámetros
params_dict = {'min_percent_leaf': 0.05,
               'input_path':'s3://adl-refined-dev-popular/parquet/TC_adquisicion/total_tdc_paprob',
              'output_path':'/mnt/work/CU_adquisicionTC/Notebooks/creacion modelo/resultados_arbolV0/',
              'segment_name':'preaprobados'}

In [3]:
fs = s3fs.S3FileSystem()

# Leyendo base
input = params_dict['input_path']
dataset = pq.ParquetDataset(input, filesystem=fs)
table = dataset.read()
mdt = table.to_pandas()

del input, dataset, table

# Definiendo Var Obj'var_final'etivo
y_var = '30first_use'

In [4]:
mdt.shape

(35614, 1559)

In [6]:
mdt.columns

Index(['id_cliente', 'fecha_tx_pav', 'fecha_tx_act', 'fecha_pasivo',
       'fecha_lib', 'fecha_activo', 'fecha_buro', 'data_camp', 'mes_campaña',
       'venta',
       ...
       'prom_mov_ult_3meses_ahr', 'prom_mov_ult_4meses_ahr',
       'prom_mov_ult_6meses_ahr', 'mode_dia_ahr', 'estado_cta_actual_ahr',
       'tipo_transaccion_ahr', 'sum_tx_linea_ahr', 'sum_tx_nolinea_ahr',
       'fecha_ult_tx_ahr', 'dias_desde_ult_tx_ahr'],
      dtype='object', length=1559)

In [7]:
np.random.seed(seed=4321) # Semilla
## Muestra del 50%
#Se generan numeros aleatorios entre 0 y 1
#mdt['ran']=np.random.random(size=mdt.shape[0])

#Se seleccion train : 50% 
mdt_v0=mdt.copy()

# quitando otras variables VO
mdt_v0.drop(columns=['var_final','mes_campaña','venta', 'fecha_activo', 'fecha_buro', 'fecha_data', 'fecha_envio',
                     'tipo_campana', 'periodo','cedulaenc','tipo_cliente','tipo_id','derogatorio',
                     'fecha_real','fecha_mas_antigua_apertura','timestamp','lag_t_6_saldo_tot_tot_SMLV', 
                     'lag_t_1_cuota_tot_LB_SMLV', 'lag_t_6_cuota_tot_LB_SMLV', 'lag_t_6_cuota_tot_tot_SMLV', 
                     'lag_t_1_amortizacion_prom_LB', 'lag_t_6_amortizacion_prom_LB', 'lag_t_1_amortizacion_prom_tot', 
                     'lag_t_6_amortizacion_prom_tot', 'lag_t_6_saldo_tot_LB_SMLV', 'lag_t_1_saldo_tot_tot_SMLV', 
                     'lag_t_1_cuota_tot_tot_SMLV', 'lag_t_6_cupo_tot_LB_SMLV', 'lag_t_1_cupo_tot_tot_SMLV', 'lag_t_6_cupo_tot_tot_SMLV'], inplace = True)

In [8]:
mdt_v0.shape

(35614, 1529)

In [9]:
## Vars modelo:

top_vars = [
'acierta_a_financiero',
'acep_oferta_prev',
'amortizacion_max_LB',
'amortizacion_max_tot',
'amortizacion_min_LB',
'amortizacion_min_tot',
'amortizacion_prom_LB',
'amortizacion_prom_LB_ult3_meses',
'amortizacion_prom_LB_ult9_meses',
'amortizacion_prom_tot',
'amortizacion_prom_tot_ult3_meses',
'amortizacion_prom_tot_ult9_meses',
'ano_po',
'antiguedad_tot_conlibranza',
'cartera_banca_alt_max_de_mora_was_is',
'cartera_coope_alt_max_de_mora_was_is',
'cartera_hipote_alt_max_de_mora_was_is',
'ctas_de_ahorro_act_ctas_bancadif',
'cuota_max_LB_SMLV',
'cuota_max_tot_SMLV',
'cuota_paga_prom_ult3_meses_total',
'cuota_paga_prom_ult7_meses_total',
'cuota_prom_LB_SMLV',
'cuota_prom_tot_SMLV',
'cuota_tot_LB_SMLV',
'cuota_tot_LB_SMLV_ult3_meses',
'cuota_tot_LB_SMLV_ult8_meses',
'cuota_tot_tot_SMLV',
'cuota_tot_tot_SMLV_ult3_meses',
'cuota_tot_tot_SMLV_ult8_meses',
'cuota_mercado_smlv',
'cupo_max_LB_SMLV',
'cupo_max_tot_SMLV',
'cupo_mercadodif',
'cupo_prom_LB_SMLV',
'cupo_prom_tot_SMLV',
'cupo_tot_LB_SMLV',
'cupo_tot_LB_SMLV_ult3_meses',
'cupo_tot_LB_SMLV_ult9_meses',
'cupo_tot_tot_SMLV',
'cupo_tot_tot_SMLV_ult3_meses',
'cupo_tot_tot_SMLV_ult9_meses',
'cuposectorbancario_sin_popular_smlv',
'dias_desde_ult_tx_ahr',
'dias_desde_ult_tx_cdt',
'dias_desde_ult_tx_cte',
'dias_desde_ult_tx_pasv',
'dif_porc_t_1_amortizacion_prom_LB',
'dif_porc_t_1_amortizacion_prom_tot',
'dif_porc_t_6_amortizacion_prom_LB',
'dif_porc_t_6_amortizacion_prom_tot',
'dif_porc_t_6_saldo_tot_tot_SMLV',
'estado_cta_actual_ahr',
'estado_cta_actual_cdt',
'estado_cta_actual_cte',
'estado_cta_actual_pasv',
'fec_mas_anti_aper_tdc_sin_popu',
'fecantiapersectorbancasinpopu',
'fecha',
'fecha_lib',
'fecha_pasivo',
'fecha_tx_act',
'fecha_tx_pav',
'fecha_ult_tx_ahr',
'fecha_ult_tx_cdt',
'fecha_ult_tx_cte',
'fecha_ult_tx_pasv',
'lag_t_1_cupo_tot_LB_SMLV',
'lag_t_1_saldo_tot_LB_SMLV',
'marca_derogatorio',
'max_cupo_tdc_sin_populardif',
'max_dias_procesamiento',
'maxcuposectorbancasin_popular_smlv',
'meses_ultim_aper_LB',
'meses_ultim_aper_tot',
'mode_dia_ahr',
'mode_dia_cdt',
'mode_dia_cte',
'mode_dia_pasv',
'moramax_mercado_was_is',
'n_novaciones',
'num_act_utl_meses',
'num_aper_ultim_12meses_LB',
'num_aper_ultim_12meses_tot',
'num_aper_ultim_6meses_LB',
'num_aper_ultim_6meses_tot',
'num_camp_ult_meses',
'num_lib_solicitadas',
'num_meses_ult_camp',
'num_no_aceptado',
'num_tx_ult_1mes_ahr',
'num_tx_ult_1mes_cdt',
'num_tx_ult_1mes_cte',
'num_tx_ult_1mes_pasv',
'num_tx_ult_2meses_ahr',
'num_tx_ult_2meses_cdt',
'num_tx_ult_2meses_cte',
'num_tx_ult_2meses_pasv',
'num_tx_ult_3meses_ahr',
'num_tx_ult_3meses_cdt',
'num_tx_ult_3meses_cte',
'num_tx_ult_3meses_pasv',
'num_tx_ult_4meses_ahr',
'num_tx_ult_4meses_cdt',
'num_tx_ult_4meses_cte',
'num_tx_ult_4meses_pasv',
'num_tx_ult_6meses_ahr',
'num_tx_ult_6meses_cdt',
'num_tx_ult_6meses_cte',
'num_tx_ult_6meses_pasv',
'numero_creditos_cf',
'numero_creditos_codeudores',
'numero_obligaciones_activasdif',
'numoblvigensectorbancasin_popu',
'pasv_antig_total',
'pasv_dias_desde_ultima_trans',
'pasv_num_ctas_vig_ca',
'pasv_num_ctas_vig_cc',
'pasv_num_ctas_vig_cdt',
'pasv_num_ctas_vig_tot',
'pasv_num_meses_ult_apertura',
'pasv_saldo_ca_1mes_atras',
'pasv_saldo_ca_3mes_atras',
'pasv_saldo_ca_6mes_atras',
'pasv_saldo_ca_fin_mes_smlv',
'pasv_saldo_cc_1mes_atras',
'pasv_saldo_cc_3mes_atras',
'pasv_saldo_cc_6mes_atras',
'pasv_saldo_cc_fin_mes_smlv',
'pasv_saldo_cdt_1mes_atras',
'pasv_saldo_cdt_3mes_atras',
'pasv_saldo_cdt_6mes_atras',
'pasv_saldo_cdt_fin_mes_smlv',
'pasv_saldo_max_ca_fin_mes_smlv',
'pasv_saldo_max_cc_fin_mes_smlv',
'pasv_saldo_max_cdt_fin_mes_smlv',
'pasv_saldo_max_fin_mes_smlv',
'pasv_saldo_min_ca_fin_mes_smlv',
'pasv_saldo_min_cc_fin_mes_smlv',
'pasv_saldo_min_cdt_fin_mes_smlv',
'pasv_saldo_min_fin_mes_smlv',
'pasv_saldo_tot_fin_mes_smlv',
'pasv_saldo_total_1mes_atras',
'pasv_saldo_total_3mes_atras',
'pasv_saldo_total_6mes_atras',
'peor_calif_trim_2_endeud',
'peor_califi_trim_1_endeud',
'prom_dias_procesamiento',
'prom_monto_aprobado',
'prom_monto_desembolsado',
'prom_monto_novado',
'prom_monto_solicitado',
'prom_n_cuotas',
'prom_prop_desembolso',
'prom_tasa',
'prom_tx_ult_1meses_ahr',
'prom_tx_ult_1meses_cdt',
'prom_tx_ult_1meses_cte',
'prom_tx_ult_1meses_pasv',
'prom_tx_ult_2meses_ahr',
'prom_tx_ult_2meses_cdt',
'prom_tx_ult_2meses_cte',
'prom_tx_ult_2meses_pasv',
'prom_tx_ult_3meses_ahr',
'prom_tx_ult_3meses_cdt',
'prom_tx_ult_3meses_cte',
'prom_tx_ult_3meses_pasv',
'prom_tx_ult_4meses_ahr',
'prom_tx_ult_4meses_cdt',
'prom_tx_ult_4meses_cte',
'prom_tx_ult_4meses_pasv',
'prom_tx_ult_6meses_ahr',
'prom_tx_ult_6meses_cdt',
'prom_tx_ult_6meses_cte',
'prom_tx_ult_6meses_pasv',
'prom_valor_cuota',
'promcuposectorbancasin_popular_smlv',
'quanto_mod',
'rango_2',
'rango_4',
'rango_5',
'saldo_max_LB_SMLV',
'saldo_max_tot_SMLV',
'saldo_prom_LB_SMLV',
'saldo_prom_tot_SMLV',
'saldo_prom_ult3_meses_total',
'saldo_prom_ult6_meses_total',
'saldo_prom_ult7_meses_total',
'saldo_tot_LB_SMLV',
'saldo_tot_LB_SMLV_ult3_meses',
'saldo_tot_LB_SMLV_ult7_meses',
'saldo_tot_tot_SMLV',
'saldo_tot_tot_SMLV_ult3_meses',
'saldo_tot_tot_SMLV_ult7_meses',
'sector_ultlibranza',
'subsector_ultlibranza',
'sum_tx_linea_ahr',
'sum_tx_linea_cte',
'sum_tx_linea_pasv',
'sum_tx_nolinea_ahr',
'sum_tx_nolinea_cdt',
'sum_tx_nolinea_pasv',
'tdc_altura_maxima_de_mora_was_is',
'tiene_cartfinanciera',
'tiene_codeudor',
'tiene_sectorreal',
'tipo_pagaduria_ultlibranza',
'tipo_transaccion',
'tipo_transaccion_ahr',
'tipo_transaccion_cdt',
'tipo_transaccion_cte',
'utilizacion_sin_popular',
'util_prom_ult3_meses_total',
'util_prom_ult6_meses_total',
'util_prom_ult9_meses_total',
'util_sector_banca_sin_popular',
'val_cuo_sector_banca_sin_popu',
'valor_cuotas_cbdif',
'valor_cuotas_codeudores_smlv',
'valor_cuotas_sr_smlv',
'valor_inicial_cbdif',
'valor_inicial_cf_smlv',
'valor_saldo_codeudores_smlv',
'valor_saldo_cbdif',
'valor_saldo_srdif',
'valor_saldo_sr_smlv',
'valor_utilisectorbancasin_popu_smlv',
'valor_utilisectorbancasin_popudif',
'valor_utilizado_smlv',
'valor_utilizadodif',
'was_is_CC',
'was_is_CE',
'was_is_CH',
'was_is_LB',
'was_is_LE',
'was_is_ME',
'was_is_NV',
'was_is_SO',
'was_is_TC',
'was_is_TC_T',
'was_is_TC_X',
'porcentaje_utilizacion'
]

In [10]:
mdt_v0 = mdt_v0[['id_cliente','data_camp','30first_use']+top_vars]

In [11]:
mdt_v0['num_nulos'] = mdt_v0.apply(lambda row: (row.isnull()).sum(), axis=1)
mdt_v0 = mdt_v0[mdt_v0['num_nulos']<50]

In [12]:
mdt['30first_use'].value_counts(normalize=True)*100

0    95.63374
1     4.36626
Name: 30first_use, dtype: float64

In [13]:
mdt_v0['30first_use'].value_counts(normalize=True)*100

0    95.109385
1     4.890615
Name: 30first_use, dtype: float64

In [14]:
mdt_v0.set_index(['id_cliente','data_camp'], inplace=True)

In [15]:
cat_cols = list(mdt_v0.dtypes[mdt_v0.dtypes=='object'].index)
cat_cols = cat_cols + ['fecha']

cat_cols

['cartera_banca_alt_max_de_mora_was_is',
 'cartera_coope_alt_max_de_mora_was_is',
 'cartera_hipote_alt_max_de_mora_was_is',
 'estado_cta_actual_ahr',
 'estado_cta_actual_cdt',
 'estado_cta_actual_cte',
 'estado_cta_actual_pasv',
 'fecha_ult_tx_ahr',
 'fecha_ult_tx_cdt',
 'fecha_ult_tx_cte',
 'fecha_ult_tx_pasv',
 'marca_derogatorio',
 'mode_dia_ahr',
 'mode_dia_cdt',
 'mode_dia_cte',
 'mode_dia_pasv',
 'moramax_mercado_was_is',
 'peor_calif_trim_2_endeud',
 'peor_califi_trim_1_endeud',
 'sector_ultlibranza',
 'subsector_ultlibranza',
 'tdc_altura_maxima_de_mora_was_is',
 'tipo_pagaduria_ultlibranza',
 'tipo_transaccion',
 'tipo_transaccion_ahr',
 'tipo_transaccion_cdt',
 'tipo_transaccion_cte',
 'was_is_CC',
 'was_is_CE',
 'was_is_CH',
 'was_is_LB',
 'was_is_LE',
 'was_is_ME',
 'was_is_NV',
 'was_is_SO',
 'was_is_TC',
 'was_is_TC_T',
 'was_is_TC_X',
 'fecha']

In [16]:
#feature engineering

#mdt_v0['ciudad_de_expedicion_cat'] = (mdt_v0['ciudad_de_expedicion'] == 'BOGOTA D.C.').astype('int')
#mdt_v0['genero_cat'] = (mdt_v0['genero'] == 'F').astype('int')
mdt_v0['peor_califi_trim_1_endeud_cat'] = (mdt_v0['peor_califi_trim_1_endeud'] == 'mantiene').astype('int')
mdt_v0['peor_calif_trim_2_endeud_cat'] = (mdt_v0['peor_calif_trim_2_endeud'] == 'mantiene').astype('int')
#mdt_v0['rango_aproximado_edad_cat'] = (mdt_v0['rango_aproximado_edad'] == 'mantiene').astype('int')
mdt_v0['marca_derogatorio_cat'] = (mdt_v0['marca_derogatorio'] == 'aprobado').astype('int')
mdt_v0['tdc_altura_maxima_de_mora_was_is_cat'] = (mdt_v0['tdc_altura_maxima_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['cartera_banca_alt_max_de_mora_was_is_cat'] = (mdt_v0['cartera_banca_alt_max_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['cartera_coope_alt_max_de_mora_was_is_cat'] = (mdt_v0['cartera_coope_alt_max_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['cartera_hipote_alt_max_de_mora_was_is_cat'] = (mdt_v0['cartera_hipote_alt_max_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['moramax_mercado_was_is_cat'] = (mdt_v0['moramax_mercado_was_is'] == 'mantiene').astype('int')
mdt_v0['was_is_LB_cat'] = (mdt_v0['was_is_LB'] == 'mantiene').astype('int')
mdt_v0['was_is_TC_cat'] = (mdt_v0['was_is_TC'] == 'mantiene').astype('int')
mdt_v0['was_is_NV_cat'] = (mdt_v0['was_is_NV'] == 'mantiene').astype('int')
mdt_v0['was_is_CH_cat'] = (mdt_v0['was_is_CH'] == 'mantiene').astype('int')
mdt_v0['was_is_CE_cat'] = (mdt_v0['was_is_CE'] == 'mantiene').astype('int')
mdt_v0['was_is_LE_cat'] = (mdt_v0['was_is_LE'] == 'mantiene').astype('int')
mdt_v0['was_is_ME_cat'] = (mdt_v0['was_is_ME'] == 'mantiene').astype('int')
mdt_v0['was_is_SO_cat'] = (mdt_v0['was_is_SO'] == 'mantiene').astype('int')
mdt_v0['was_is_CC_cat'] = (mdt_v0['was_is_CC'] == 'mantiene').astype('int')
mdt_v0['was_is_TC_X_cat'] = (mdt_v0['was_is_TC_X'] == 'mantiene').astype('int')
mdt_v0['was_is_TC_T_cat'] = (mdt_v0['was_is_TC_T'] == 'mantiene').astype('int')
mdt_v0['was_is_tot_cat'] = (mdt_v0['was_is_tot'] == 'mantiene').astype('int')

mdt_v0['tipo_transaccion_cat'] = (mdt_v0['tipo_transaccion'] == '2').astype('int')
mdt_v0['tipo_transaccion_ahr_cat'] = (mdt_v0['tipo_transaccion_ahr'] == '2').astype('int')
mdt_v0['tipo_transaccion_cdt_cat'] = (mdt_v0['tipo_transaccion_cdt'] == '2').astype('int')
mdt_v0['tipo_transaccion_cte_cat'] = (mdt_v0['tipo_transaccion_cte'] == '2').astype('int')

# mdt_v0['tiene_sector_libranza'] = (mdt_v0['sector_ultlibranza'].notnull()).astype('int')
# mdt_v0['tiene_subsector_libranza'] = (mdt_v0['subsector_ultlibranza'] == 'mantiene').astype('int')
# mdt_v0['tiene_pagaduria_libranza'] = (mdt_v0['tipo_pagaduria_ultlibranza'] == 'mantiene').astype('int')
# mdt_v0['estado_cta_actual_pasv1'] = (mdt_v0['estado_cta_actual_pasv'] == '1').astype('int')

# mdt_v0['tipo_transaccion_pasv2'] = (mdt_v0['tipo_transaccion'] == '2').astype('int')
# mdt_v0['estado_cta_actual_pasv1'] = (mdt_v0['estado_cta_actual_pasv'] == '1').astype('int')

# mdt_v0['tipo_transaccion_cte2'] = (mdt_v0['tipo_transaccion_cte'] == '2').astype('int')
# mdt_v0['estado_cta_actual_cte1'] = (mdt_v0['estado_cta_actual_cte'] == '1').astype('int')

# mdt_v0['tipo_transaccion_ahr2'] = (mdt_v0['tipo_transaccion_ahr'] == '2').astype('int')
# mdt_v0['estado_cta_actual_ahr1'] = (mdt_v0['estado_cta_actual_ahr'] == '1').astype('int')

# mdt_v0['tipo_transaccion_pasv2'] = (mdt_v0['tipo_transaccion'] == '2').astype('int')
# mdt_v0['estado_cta_actual_pasv1'] = (mdt_v0['estado_cta_actual_pasv'] == '1').astype('int')

KeyError: 'was_is_tot'

In [None]:
mdt_v0.reset_index(inplace=True)

In [None]:
# Fill missing con categoria 'missing' para variables categoricas existentes
vars_cat = ['estado_cta_actual_ahr',
'estado_cta_actual_cdt',
'estado_cta_actual_cte',
'estado_cta_actual_pasv',
'mode_dia_ahr',
'mode_dia_cdt',
'mode_dia_cte',
'mode_dia_pasv',
'sector_ultlibranza']

mdt_v0[vars_cat] = mdt_v0[vars_cat].apply(lambda x: x.fillna('missing'))

In [None]:
mdt_v0['estado_cta_actual_ahr'] = pd.Categorical(
                   mdt_v0.estado_cta_actual_ahr,
                   ['missing','1','4','0']
                   )

mdt_v0['estado_cta_actual_cdt'] = pd.Categorical(
                   mdt_v0.estado_cta_actual_cdt,
                   ['missing','1','4','0']
                   )

mdt_v0['estado_cta_actual_cte'] = pd.Categorical(
                   mdt_v0.estado_cta_actual_cte,
                   ['missing','1']
                   )

mdt_v0['estado_cta_actual_pasv'] = pd.Categorical(
                   mdt_v0.estado_cta_actual_pasv,
                   ['missing','1','4','0']
                   )

mdt_v0['mode_dia_ahr'] = pd.Categorical(
                   mdt_v0.mode_dia_ahr,
                   ['missing','Friday','Tuesday','Thursday', 'Monday', 'Wednesday', 'Saturday', 'Sunday']
                   )

mdt_v0['mode_dia_cdt'] = pd.Categorical(
                   mdt_v0.mode_dia_cdt,
                   ['missing','Friday','Tuesday','Thursday', 'Monday', 'Wednesday', 'Saturday', 'Sunday']
                   )

mdt_v0['mode_dia_pasv'] = pd.Categorical(
                   mdt_v0.mode_dia_pasv,
                   ['missing','Friday','Tuesday','Thursday', 'Monday', 'Wednesday', 'Saturday', 'Sunday']
                   )

mdt_v0['sector_ultlibranza'] = pd.Categorical(
                   mdt_v0.sector_ultlibranza,
                   ['missing','PENSIONADOS','ENTES TERRITORIALES','ORDEN NACIONAL', 'ENTES DESENTRALIZADOS', 'PRIVADO']
                   )

In [None]:
## Categorizing variables
mdt_v0 = mdt_v0.join(pd.get_dummies(mdt_v0[[
'estado_cta_actual_ahr',
'estado_cta_actual_cdt',
'estado_cta_actual_cte',
'estado_cta_actual_pasv',
'mode_dia_ahr',
'mode_dia_cdt',
'mode_dia_cte',
'mode_dia_pasv',
'sector_ultlibranza'
]]), how='left')

In [None]:
mdt_v0.drop(columns=['data_camp']+cat_cols, inplace = True)

In [None]:
mdt_v0.reset_index(inplace=True)

In [None]:
mdt_v0.drop(columns=['index'], inplace = True)

In [None]:
mdt_v0.set_index(['id_cliente'], inplace=True)

In [None]:
np.random.seed(seed=1234) # Semilla

#Se generan numeros aleatorios entre 0 y 1
mdt_v0['ran2']=np.random.random(size=mdt_v0.shape[0])

#Se seleccion train : 50% 
mdt_train=mdt_v0[mdt_v0['ran2']<=0.7]
mdt_test=mdt_v0[mdt_v0['ran2']>0.7]

###
y = mdt_train['30first_use']
y_test = mdt_test['30first_use']

mdt_train.drop(columns=['30first_use','ran2'],inplace=True)
mdt_test.drop(columns=['30first_use','ran2'],inplace=True)

In [None]:
mdt['30first_use'].value_counts(normalize=True)

In [None]:
mdt_v0['30first_use'].value_counts(normalize=True)

In [None]:
y.value_counts(normalize=True)

In [None]:
y_test.value_counts(normalize=True)

In [None]:
cols_modelo = mdt_train.columns
params_dict['predictive_cols'] = cols_modelo

In [None]:
np.random.seed(seed=1234) # Semilla
# Split dataset into training set and test set
X_train, X_val, y_train, y_val = train_test_split(mdt_train, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [None]:
y_train.value_counts(normalize=True)*100

In [None]:
y_val.value_counts(normalize=True)*100

In [None]:
y_test.value_counts(normalize=True)*100

In [None]:
vars_final = list(cols_modelo)
# vars_final = ['acep_oferta_prev', 'num_act_utl_meses', 'num_camp_ult_meses', 'num_meses_ult_camp', 'num_no_aceptado', 
# 'num_lib_solicitadas', 'prom_monto_solicitado', 'prom_monto_aprobado', 'prom_monto_desembolsado', 'prom_monto_novado', 
# 'prom_prop_desembolso', 'prom_n_cuotas', 'prom_valor_cuota', 'prom_tasa', 'prom_dias_procesamiento', 'max_dias_procesamiento', 
# 'n_novaciones', 'pasv_num_ctas_vig_tot', 'pasv_num_ctas_vig_cc', 'pasv_num_ctas_vig_ca', 'pasv_num_ctas_vig_cdt', 
# 'pasv_dias_desde_ultima_trans', 'pasv_antig_total', 'pasv_num_meses_ult_apertura', 'pasv_saldo_tot_fin_mes_smlv', 
# 'pasv_saldo_cc_fin_mes_smlv', 'pasv_saldo_ca_fin_mes_smlv', 'pasv_saldo_cdt_fin_mes_smlv', 'pasv_saldo_min_fin_mes_smlv', 
# 'pasv_saldo_min_cc_fin_mes_smlv', 'pasv_saldo_min_ca_fin_mes_smlv', 'pasv_saldo_min_cdt_fin_mes_smlv', 
# 'pasv_saldo_max_fin_mes_smlv', 'pasv_saldo_max_cc_fin_mes_smlv', 'pasv_saldo_max_ca_fin_mes_smlv', 
# 'pasv_saldo_max_cdt_fin_mes_smlv', 'pasv_saldo_total_1mes_atras', 'pasv_saldo_cc_1mes_atras', 'pasv_saldo_ca_1mes_atras', 
# 'pasv_saldo_cdt_1mes_atras', 'pasv_saldo_total_3mes_atras', 'pasv_saldo_cc_3mes_atras', 'pasv_saldo_ca_3mes_atras', 
# 'pasv_saldo_cdt_3mes_atras', 'pasv_saldo_total_6mes_atras', 'pasv_saldo_cc_6mes_atras', 'pasv_saldo_ca_6mes_atras', 
# 'pasv_saldo_cdt_6mes_atras', 'tiene_sector_libranza', 'tiene_subsector_libranza', 'tiene_pagaduria_libranza']

In [None]:
list(cols_modelo)

### Modelo V0 1200 variables

In [None]:
import shap
import scikitplot as skplt
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

#Se ajusta el modelo con las mejores variables best_20_v3
#'multi:softmax'
xgb_model = xgb.XGBClassifier(objective = 'binary:logistic', 
                              seed = 1,  
                              max_depth = 5, 
                              colsample_bytree = 0.8,
                              learning_rate = 0.08,
                              gamma=0.2,
                              scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int),
                              n_estimators=300, min_child_weight = 5)

xgb_model.fit(X_train[vars_final], y_train, 
              eval_set = [(X_train[vars_final], y_train), (X_val[vars_final], y_val)],
              early_stopping_rounds=100)

In [None]:
# Predecir probabilidades para data test (20%)

probs_test = xgb_model.predict_proba(mdt_test[vars_final])

fpr, tpr, thresholds = metrics.roc_curve(y_test, probs_test[:, 1])

#AUC
metrics.auc(fpr, tpr)

In [None]:
#Matriz de confusión
predict_test = (probs_test[:, 1] < 0.00036)*1
metrics.confusion_matrix(y_test,predict_test)

In [None]:
#Accuracy
mdt_test['probs'] = probs_test[:, 1]
metrics.accuracy_score(y_test, predict_test)

In [None]:
#Recall
metrics.recall_score(y_test, predict_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager
%matplotlib inline

# Graficar probabilidades según Fraude/No Fraude observado (real)

subset_si = mdt_test[y_test == 1]

subset_no = mdt_test[y_test == 0]

sns.distplot(subset_no[['probs']], hist = True, norm_hist = True, bins = 10, kde = False, kde_kws = {'linewidth' : 3},
            label = 'No usa', color = 'green')

sns.distplot(subset_si[['probs']], hist = True, norm_hist = True, bins = 10, kde = False, kde_kws = {'linewidth' : 3},
            label = 'Si usa', color = 'red')

plt.legend(prop = {'size' : 10})
plt.title('Distribución de scores según Primer uso')
plt.xlabel('Probabilidad')
plt.ylabel('Porcentaje')
plt.show()

In [None]:
skplt.metrics.plot_cumulative_gain(y_test, probs_test)

In [None]:
shap_values = shap.TreeExplainer(xgb_model).shap_values(mdt_test[vars_final])
shap.summary_plot(shap_values, mdt_test[vars_final])


In [None]:
shap.summary_plot(shap_values, mdt_test[vars_final], plot_type = 'bar')

## Grid Search

In [None]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=150, max_depth=5,
 scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int),
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X_train,y_train)

In [None]:
gsearch1.best_score_,gsearch1.best_params_, gsearch1.best_score_

In [None]:
param_test2 = {
 'max_depth':[8,9,10],
 'min_child_weight':[1,2,3]
}
gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=9,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int)
                                                      ,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

In [None]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int)
                                                      ,seed=27), 
param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

In [None]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=5,
 min_child_weight=5, gamma=0.2, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int)
                                                      ,seed=27), 
param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

In [None]:
param_test2 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int)
                                                      ,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# #Grid seach on subsample and max_features
# #Choose all predictors except target & IDcols
# #kappa_scorer=make_scorer(cohen_kappa_score)
# scoring = {'Precision': 'precision', 'Recall': 'recall', 'Roc':'roc_auc'}
# param_test1 = {
#     'max_depth': range(4,12,2),
#     'min_child_weight':range(4,8,2),
#     'learning_rate': [0.01, 0.1, 0.2]
# }
# gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate =0.1, n_estimators=150, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4,                                       
#                                                       scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int), seed=None), 
#                        param_grid = param_test1, scoring=scoring, n_jobs=4,iid=False, cv=3)
# gsearch1.fit(X_train,y_train)

### Modelo con variables más importantes (top 60)

In [None]:
def train_eval_xgboost(df, features, y, model_num, df_importance):
    
    """
    Función que realiza lo siguiente: 
    1. A partir de Train, realiza un sampling aleatorio 70-30 para Train y Validación
    2. Entrena un modelo XGBoost con la data de Train, usando tanto la data Train como Validación 
        para evaluar (métrica: AUC)
    3. Calcula mean(|SHAP Value|) por variable para base de Train y guarda el valor en df_importance
    4. Calcula auc, accuracy y recall sobre base de Test y guarda los valores en df_metricas, utilizando
    un threshold determinado por el usuario (0.5 por default)
    """
    
    X_train, X_val, y_train, y_val = train_test_split(df[features], y, test_size = .3)
    
    print(model_num)
    
    "Entrenar modelo XGBoost"
    
    xgb_model = xgb.XGBClassifier(objective = 'binary:logistic', 
                              seed = 1,  
                              max_depth = 9, 
                              colsample_bytree = 0.8,
                              learning_rate = 0.08,
                              gamma=0.2,
                              scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int),
                              n_estimators=150, min_child_weight = 1)

    xgb_model.fit(X_train, y_train, 
                  eval_set = [(X_train, y_train), (X_val, y_val)], 
                  early_stopping_rounds = 500, 
                  verbose = False)
    
    print('Modelo entrenado')
    
    "Calcular y guardar SHAP Values"
    
    shap_values = shap.TreeExplainer(xgb_model).shap_values(df[features])
    shap_feature = np.abs(shap_values).mean(axis = 0)
    
    df_importance['mean_SHAP_'+model_num] = shap_feature
    
    print('SHAP Values guardados')

In [None]:
features_x = cols_modelo
feature_importances = pd.DataFrame({'feature_name' : features_x}, columns = ['feature_name'])

In [None]:
cols_modelo

In [None]:
#Se ejecuta 10 veces la función train_eval_xgboost para la base train (80% de la base total de desarrollo) 
# y se calculan los shap values (para las 10 iteraciones) que quedan guardados en el feature_importances

#for i in range(1, 3):
for i in range(1, 11):
    
    model_num = 'model_'+str(i)
    
    train_eval_xgboost(mdt_train, features_x, y, model_num, feature_importances)

In [None]:
#A partir de los shap values guardados en feature_importances, se otorga un ranking a las variables y se ordenan según ese ranking

for col in feature_importances.columns[1:]:
    
    feature_importances[col + '_rank'] = feature_importances[col].rank(ascending = False)

In [None]:
feature_importances

In [None]:
# Se seleccionan las 20 mejores variables según ranking anterior
# OJO!!!!!En feature_importances.columns[5:9] se debe poner la posicion de la primera variable _rank hasta la posicion de la ultima variable _rank
principales = []
for col in feature_importances.columns[11:]:
    
    best_60 = [x for x in feature_importances[feature_importances[col] < 40].feature_name.values]
    principales.extend(best_60)

In [None]:
# Se seleccionan las 20 mejores variables según ranking anterior
# OJO!!!!!En feature_importances.columns[5:9] se debe poner la posicion de la primera variable _rank hasta la posicion de la ultima variable _rank
peores = []
for col in feature_importances.columns[11:]:
    
    peorest = [x for x in feature_importances[feature_importances[col] > 90].feature_name.values]
    peores.extend(peorest)

In [None]:
peores

In [None]:
best_60

In [None]:
best_60 = ['acierta_a_financiero',
'amortizacion_prom_tot',
'cuota_paga_prom_ult3_meses_total',
'cuota_mercado_smlv',
'cupo_mercadodif',
'dif_porc_t_6_saldo_tot_tot_SMLV',
#'lag_t_1_saldo_tot_LB_SMLV',
'max_dias_procesamiento',
'meses_ultim_aper_LB',
'num_tx_ult_1mes_ahr',
'num_tx_ult_6meses_ahr',
'num_tx_ult_6meses_pasv',
'pasv_antig_total',
# 'pasv_dias_desde_ultima_trans',
'pasv_num_meses_ult_apertura',
'pasv_saldo_ca_3mes_atras',
'pasv_saldo_ca_6mes_atras',
'pasv_saldo_ca_fin_mes_smlv',
'pasv_saldo_min_fin_mes_smlv',
'pasv_saldo_total_6mes_atras',
'prom_dias_procesamiento',
'prom_monto_aprobado',
'prom_monto_desembolsado',
'prom_monto_novado',
'prom_n_cuotas',
'prom_prop_desembolso',
'prom_tasa',
'quanto_mod',
'sum_tx_linea_ahr',
'sum_tx_nolinea_ahr',
'util_sector_banca_sin_popular',
'valor_saldo_cbdif',
'valor_utilizado_smlv',
'porcentaje_utilizacion'
]

In [None]:
#Se ajusta el modelo con las mejores variables best_20_v3
#'multi:softmax'
xgb_model_v2 = xgb.XGBClassifier(objective = 'binary:logistic', 
                              seed = 1,  
                              max_depth = 9, 
                              colsample_bytree = 0.8,
                              learning_rate = 0.1,
                              gamma=0.4,
                              scale_pos_weight=np.ceil(y_train.shape[0]/y_train.sum()).astype(int)*2,
                              n_estimators=300, min_child_weight = 1)

xgb_model_v2.fit(X_train[best_60], y_train, 
              eval_set = [(X_train[best_60], y_train), (X_val[best_60], y_val)],
              early_stopping_rounds=100)

In [None]:
# Predecir probabilidades para data test (20%)

probs_test = xgb_model_v2.predict_proba(mdt_test[best_60])

fpr, tpr, thresholds = metrics.roc_curve(y_test, probs_test[:, 1])

#AUC
metrics.auc(fpr, tpr)

In [None]:
#Matriz de confusión
predict_test = (probs_test[:, 1] < 0.005)*1
metrics.confusion_matrix(y_test,predict_test)


In [None]:
#Accuracy
mdt_test['probs'] = probs_test[:, 1]
metrics.accuracy_score(y_test, predict_test)


In [None]:
#Recall
metrics.recall_score(y_test, predict_test)

In [None]:
# Graficar probabilidades según Fraude/No Fraude observado (real)

subset_si = mdt_test[y_test == 1]

subset_no = mdt_test[y_test == 0]

sns.distplot(subset_no[['probs']], hist = True, norm_hist = True, bins = 10, kde = False, kde_kws = {'linewidth' : 3},
            label = 'No usa < 30D', color = 'green')

sns.distplot(subset_si[['probs']], hist = True, norm_hist = True, bins = 10, kde = False, kde_kws = {'linewidth' : 3},
            label = 'Si usa < 30D', color = 'red')

plt.legend(prop = {'size' : 10})
plt.title('Distribución de scores según Primer uso')
plt.xlabel('Probabilidad')
plt.ylabel('Porcentaje')
plt.show()

In [None]:
skplt.metrics.plot_cumulative_gain(y_test, probs_test)

In [None]:
shap_values = shap.TreeExplainer(xgb_model_v2).shap_values(mdt_test[best_60])
shap.summary_plot(shap_values, mdt_test[best_60])

In [None]:
shap.summary_plot(shap_values, mdt_test[best_60], plot_type = 'bar')

## Tablas de validación

In [None]:
#Test
mdt_test['probs'] = xgb_model_v2.predict_proba(mdt_test[best_60])[:, 1]
mdt_test['decil_ModSinAcierta']=pd.qcut(mdt_test['probs'].rank(method='first'),9, labels=False)
pd.crosstab(mdt_test['decil_ModSinAcierta'],y_test, margins=False)

In [None]:
#Train
mdt_test['probs'] = xgb_model_v2.predict_proba(mdt_test[best_60])[:, 1]
mdt_test['decil_ModSinAcierta']=pd.qcut(mdt_test['probs'].rank(method='first'),9, labels=False)
pd.crosstab(mdt_test['decil_ModSinAcierta'],y_test, margins=False)

In [None]:
## Correr arbol
#main(X_train,y_train)


In [None]:
# Testing tree
my_tree(X_test)

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))