In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier , export_graphviz
from sklearn.model_selection import train_test_split # Import train_test_split function
import os
import s3fs
import pyarrow.parquet as pq

# os.chdir('/../mnt')

def train_tree(mdt, y_var, params_dict):
    """
    Trains the tree to produce the profile
    :param params_dict:
    :return:
    """
    # Parameters
    predictive_vars = params_dict['predictive_cols']
    min_percent_leaf = params_dict['min_percent_leaf']

    mdt = mdt[predictive_vars]

    reg_tree = DecisionTreeClassifier(criterion="entropy", max_depth=3,min_samples_leaf=int(mdt.shape[0] * min_percent_leaf))

    reg_tree.fit(mdt, y_var)

    return reg_tree


def produce_dot_file(trained_tree, file_name):
    full_path = params_dict['output_path'] + file_name
    export_graphviz(trained_tree, feature_names=params_dict['predictive_cols'],
                    out_file=full_path,
                    filled=True, proportion=True,
                    rounded=True)


def main(mdt, y_var):
    name = 'tree_'+params_dict['segment_name']+'_V0.dot'
    my_tree = train_tree(mdt, y_var, params_dict)
    produce_dot_file(my_tree, name)
    print('Tree produced')
    return my_tree


In [2]:
# Definir parámetros
params_dict = {'min_percent_leaf': 0.05,
               'input_path':'s3://adl-refined-dev-popular/parquet/TC_adquisicion/total_tdc_paprob',
               'output_path':'/mnt/work/CU_adquisicionTC/Notebooks/creacion modelo/resultados_arbolV0/',
               'segment_name':'preaprobados'}

In [3]:
fs = s3fs.S3FileSystem()

# Leyendo base
input = params_dict['input_path']
dataset = pq.ParquetDataset(input, filesystem=fs)
table = dataset.read()
mdt = table.to_pandas()

del input, dataset, table

# Definiendo Var Obj'var_final'etivo
y_var = 'var_final'

In [5]:
## Muestra del 50%
#Se generan numeros aleatorios entre 0 y 1
mdt['ran']=np.random.random(size=mdt.shape[0])

#Se seleccion train : 50% 
mdt_v0=mdt[mdt['ran']<=0.5]

In [8]:
# Completitud

completitud = pd.DataFrame(1-mdt_v0.count()/len(mdt_v0)).reset_index()
completitud.columns =['var', 'porcentaje'] 
print(completitud.head())
print(completitud.shape)

vars_final = completitud[completitud["porcentaje"]<=0.20]
print(vars_final.head())
print(vars_final.shape)

vars_finales = pd.Series(vars_final["var"])

mdt_v0 = mdt_v0[vars_finales]

            var  porcentaje
0    id_cliente         0.0
1  fecha_activo         0.0
2    fecha_buro         0.0
3   mes_campaña         0.0
4         venta         0.0
(1261, 2)
            var  porcentaje
0    id_cliente         0.0
1  fecha_activo         0.0
2    fecha_buro         0.0
3   mes_campaña         0.0
4         venta         0.0
(722, 2)


In [9]:
# Var target
y = mdt_v0[y_var]

# quitando otras variables VO
mdt_v0.drop(columns=['var_final','30first_use','venta'], inplace = True)

# Defining varss modelo
cols_modelo = mdt_v0.columns[~mdt_v0.columns.isin(
    ['id_cliente', 'fecha_activo', 'fecha_buro', 'mes_campaña','tipo_campana', 'periodo',
    'cedulaenc','tipo_cliente','tipo_id','derogatorio'])
                            ]

params_dict['predictive_cols'] = cols_modelo

In [10]:
cat_cols = list(mdt_v0.dtypes[mdt_v0.dtypes=='object'].index)
cat_cols

['tipo_campana',
 'periodo',
 'ciudad_de_expedicion',
 'genero',
 'peor_califi_trim_1_endeud',
 'rango_aproximado_edad',
 'tipo_id',
 'marca_derogatorio',
 'tdc_altura_maxima_de_mora_was_is',
 'cartera_banca_alt_max_de_mora_was_is',
 'cartera_coope_alt_max_de_mora_was_is',
 'cartera_hipote_alt_max_de_mora_was_is',
 'moramax_mercado_was_is',
 'was_is_LB',
 'was_is_TC',
 'was_is_NV',
 'was_is_CH',
 'was_is_CE',
 'was_is_LE',
 'was_is_ME',
 'was_is_SO',
 'was_is_CC',
 'was_is_TC_X',
 'was_is_TC_T',
 'was_is_tot']

In [12]:
# feature engineering

mdt_v0['ciudad_de_expedicion_cat'] = (mdt_v0['ciudad_de_expedicion'] == 'BOGOTA D.C.').astype('int')
mdt_v0['genero_cat'] = (mdt_v0['genero'] == 'F').astype('int')
mdt_v0['peor_califi_trim_1_endeud_cat'] = (mdt_v0['peor_califi_trim_1_endeud'] == 'mantiene').astype('int')
mdt_v0['rango_aproximado_edad_cat'] = (mdt_v0['rango_aproximado_edad'] == 'mantiene').astype('int')
mdt_v0['marca_derogatorio_cat'] = (mdt_v0['marca_derogatorio'] == 'aprobado').astype('int')
mdt_v0['tdc_altura_maxima_de_mora_was_is_cat'] = (mdt_v0['tdc_altura_maxima_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['cartera_banca_alt_max_de_mora_was_is_cat'] = (mdt_v0['cartera_banca_alt_max_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['cartera_coope_alt_max_de_mora_was_is_cat'] = (mdt_v0['cartera_coope_alt_max_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['cartera_hipote_alt_max_de_mora_was_is_cat'] = (mdt_v0['cartera_hipote_alt_max_de_mora_was_is'] == 'mantiene').astype('int')
mdt_v0['moramax_mercado_was_is_cat'] = (mdt_v0['moramax_mercado_was_is'] == 'mantiene').astype('int')
mdt_v0['was_is_LB_cat'] = (mdt_v0['was_is_LB'] == 'mantiene').astype('int')
mdt_v0['was_is_TC_cat'] = (mdt_v0['was_is_TC'] == 'mantiene').astype('int')
mdt_v0['was_is_NV_cat'] = (mdt_v0['was_is_NV'] == 'mantiene').astype('int')
mdt_v0['was_is_CH_cat'] = (mdt_v0['was_is_CH'] == 'mantiene').astype('int')
mdt_v0['was_is_CE_cat'] = (mdt_v0['was_is_CE'] == 'mantiene').astype('int')
mdt_v0['was_is_LE_cat'] = (mdt_v0['was_is_LE'] == 'mantiene').astype('int')
mdt_v0['was_is_ME_cat'] = (mdt_v0['was_is_ME'] == 'mantiene').astype('int')
mdt_v0['was_is_SO_cat'] = (mdt_v0['was_is_SO'] == 'mantiene').astype('int')
mdt_v0['was_is_CC_cat'] = (mdt_v0['was_is_CC'] == 'mantiene').astype('int')
mdt_v0['was_is_TC_X_cat'] = (mdt_v0['was_is_TC_X'] == 'mantiene').astype('int')
mdt_v0['was_is_TC_T_cat'] = (mdt_v0['was_is_TC_T'] == 'mantiene').astype('int')
mdt_v0['was_is_tot_cat'] = (mdt_v0['was_is_tot'] == 'mantiene').astype('int')


In [13]:
mdt_v0.drop(columns=cat_cols, inplace = True)
cols_modelo = mdt_v0.columns

In [14]:
params_dict['predictive_cols'] = cols_modelo

In [15]:
# Split dataset into training set and test set
X_train, X_val, y_train, y_val = train_test_split(mdt_v0, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [16]:
## Correr arbol
main(X_train,y_train)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
cols_modelo = mdt_v0.columns[~mdt_v0.columns.isin(
    ['id_cliente', 'fecha_activo', 'fecha_buro', 'mes_campaña','tipo_campana', 'periodo',
    'cedulaenc','tipo_cliente','tipo_id','derogatorio'])
                            ]

In [None]:
params_dict['predictive_cols'] = cols_modelo

In [None]:
params_dict['predictive_cols']