# BC Dataton

In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns



# Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

#modeling

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

# Feature engineerin and Dimension reduction
from sklearn.feature_selection import VarianceThreshold


# Utils
from tqdm import tqdm
from pandas_profiling import ProfileReport
from pycaret.regression import *


%matplotlib inline

sns.set_style("white")
matplotlib.rc('xtick', labelsize=15)
matplotlib.rc('ytick', labelsize=15)
plt.rcParams['figure.figsize'] = [16.0, 10.0]

In [34]:
data = pd.read_csv("train_201910.csv")
data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,ddd,eee,fff,ggg,hhh,iii,jjj,kkk,lll,mmm
0,201910,2089776,19840630,35.23614,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,0,0,629869.0,C,0,\N,1255032,\N,95511.0
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,...,0,\N,\N,4742261.859999999,SIN INFO,\N,\N,\N,\N,1929721.0
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,...,0,\N,\N,1698047.975,SIN INFO,\N,\N,\N,\N,374940.0
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,...,0,5,0,2185655.0,B,0,\N,\N,\N,2165030.0
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,...,0,2,0,-42000.0,SIN INFO,0,\N,\N,\N,77469.29


# Data Processing class

In [14]:
class DataFrameBuilder:
    
    HEADER="https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/header.txt"
    NUMERIC_COLUMNS=[
        "edad",
        "ingreso_segurida_social",
        "mora_max",
        "ingreso_nomina",
        "ind",
        "ingreso_final",
        "cuota_cred_hipot",
        "saldo_prom3_tdc_mdo"
    ]
    DROP_COLUMNS=[
        "fecha_nacimiento",
        "profesion",
        "ocupacion",
        "codigo_ciiu",
        "ciudad_residencia",
        "ciudad_laboral",
        "departamento_laboral",
        "nivel_academico",
        "tipo_vivienda",
        "categoria",
        "rechazo_credito",
        "cartera_castigada",
        "cant_moras_30_ult_12_meses",
        "cant_moras_60_ult_12_meses",
        "cant_moras_90_ult_12_meses",
        "ctas_embargadas",
        "tiene_ctas_embargadas",
        "pension_fopep",
        "tiene_cred_hipo_1",
        "tiene_cred_hipo_2",
        "cant_cast_ult_12m_sr",
        "tenencia_tc",
        "tiene_consumo",
        "tiene_crediagil",
        "pol_centr_ext",
        "tiene_ctas_activas"
    ]
    
    def __init__(self, dataframe, keep_original=False, test=False):
        self.test=test
        self.original_dataframe = self._assign_columns(dataframe.copy())
        self.cleaned_dataframe = None
        self.keep_original = keep_original

        
    def _assign_columns(self, dataframe):
        column_names = pd.read_csv(DataFrameBuilder.HEADER).columns.to_list()
        if self.test:
            column_names.remove("gasto_familiar")
            column_names.insert(0, "id_registro")
        dataframe.columns = column_names
        return dataframe
    
    # Manejo de datos faltantes
    # Reemplazar \N por NA
    def cleanNA(self, dataframe):
        
        for column in dataframe.columns:
            if column in DataFrameBuilder.NUMERIC_COLUMNS :
                dataframe[column] = dataframe[column].replace("\\N", np.nan).astype('float')
        return dataframe
    
    # Modificacion de columnas existentes
    def process_columns(self, dataframe):
        
        # Procesamiento columnas demograficas
        dataframe['edad'] =  dataframe['edad'].round().fillna(method='ffill').astype('int') 
        dataframe['departamento_residencia'] = dataframe['departamento_residencia'].str.strip()
        dataframe['estado_civil'] = np.where(
                    dataframe['estado_civil'] == "SOLTERO", "SOL",
                    np.where(
                        dataframe['estado_civil'] == "CASADO", "CAS",
                        np.where(
                            dataframe['estado_civil'] == "UNION LIBRE", "UL",
                                np.where(
                                    dataframe['estado_civil'] == "NO INFORMA", "NI",
                                        np.where(
                                            dataframe['estado_civil'] == "DIVORCIADO", "DIV",
                                            np.where(
                                                dataframe['estado_civil'] == "VIUDO", "VIU",
                                                    np.where(
                                                        dataframe['estado_civil'] == "\\N", "NI",
                                                        dataframe['estado_civil']
                                                        )
                                                )
                                            )
                                    )
                            )
                        )
                    )
        ########## Procesamiento columnas financieras
        dataframe['convenio_lib'] = np.where(dataframe['convenio_lib'] == "\\N", "N", "S")
        #dataframe['tiene_consumo'] = np.where(dataframe['tiene_consumo'] == "\\N", "N", "S")
        #dataframe['tenencia_tc'] = np.where(dataframe['tenencia_tc'] == "NO", "N", "S")
        dataframe['cat_ingreso'] = np.where(
                                        dataframe['cat_ingreso'] == "\\N","OTROS",
                                        dataframe['cat_ingreso']
        )
        
        dataframe['cuota_cred_hipot'] = dataframe['cuota_cred_hipot'].fillna(0)
        dataframe['cant_oblig_tot_sf'] = pd.Series(np.where(
                                            dataframe['cant_oblig_tot_sf'] == "\\N", "0",
                                            dataframe['cant_oblig_tot_sf']
        )).astype("int")
        
        dataframe['ingreso_nomina'] = dataframe['ingreso_nomina'].fillna(0)
        dataframe['ingreso_segurida_social'] = dataframe['ingreso_segurida_social'].fillna(0)
        
        dataframe['ctas_activas'] = pd.Series(np.where(dataframe['ctas_activas'] =="\\N", "0",
                                             dataframe['ctas_activas']
                                            )).astype("int")
        dataframe['nro_tot_cuentas'] = pd.Series(np.where(dataframe['nro_tot_cuentas'] =="\\N", "0",
                                             dataframe['nro_tot_cuentas']
                                            )).astype("int")
        ########### Procesamiento columnas de riesgo
        dataframe['ind_mora_vigente'] = np.where(
                                        dataframe['ind_mora_vigente'] == '\\N', "NApl",
                                        dataframe['ind_mora_vigente']
        )
        dataframe['rep_calif_cred'] = np.where(
                                        dataframe['rep_calif_cred'] == "SIN INFO","NApl",
                                        dataframe['rep_calif_cred']
        )
        
        dataframe['mora_max'] = np.where(
                                   dataframe['mora_max'] < 30, "Entre 0 y 30 dias",
                                   np.where(
                                       dataframe['mora_max'] < 60, "Entre 31 y 60 dias",
                                       np.where(
                                           dataframe['mora_max'] > 60, "Mas de 60", "NApl")
                                   )
        )
        
        dataframe['cant_mora_30_tdc_ult_3m_sf'] = np.where(
                                                    dataframe['cant_mora_30_tdc_ult_3m_sf'] == "\\N", "NApl",
                                                    np.where(
                                                        dataframe['cant_mora_30_tdc_ult_3m_sf'] == "0",
                                                            "SIN MORA", "CON MORA")
        )
        
        dataframe['cant_mora_30_consum_ult_3m_sf'] = np.where(
                                                    dataframe['cant_mora_30_consum_ult_3m_sf'] == "\\N", "NApl",
                                                    np.where(
                                                        dataframe['cant_mora_30_consum_ult_3m_sf'] == "0",
                                                            "SIN MORA", "CON MORA")
        )

        return dataframe
    
    # Para eliminar las columnas que no vamos a usar
    def remove_columns(self, dataframe):
        return dataframe.drop(DataFrameBuilder.DROP_COLUMNS, axis=1)
    
    # Para creacion de columnas nuevas
    def create_columns(self):
        # CREACION CUENTAS PASIVO Y CUENTAS ACTIVAS CON BANCOLOMBIA
        # POSIBLE: SUMAR LOS CUPOS DE TC
        
        pass
    
    def filter_rows(self, dataframe):
        # BORRAR CUENTAS ACTIVAS > 10
        # BORRAR OBLIGACIONES 10+
        pass
    
    # Guardar Dataframe
    def save_dataframe(self, dataframe, path):
        pass
    
    # En esta funcion va todo el flujo
    def build(self, to_s3=False):
        
        # Borrar variables
        sliced_dataframe = self.remove_columns(self.original_dataframe)
        # Missing values
        na_cleaned_dataframe = self.cleanNA(sliced_dataframe)
        # Procesamiento de columnas
        cleaned_dataframe = self.process_columns(na_cleaned_dataframe)
        self.cleaned_dataframe = cleaned_dataframe
        
        # Eliminar de memoria el dataframe original
        if not self.keep_original:
            self.original_dataframe = None
        
        # Guardado (En local o S3)
        
        return self.cleaned_dataframe
        

In [73]:
test_df = DataFrameBuilder(data)

In [74]:
%time test_df.build()

Wall time: 5.85 s


Unnamed: 0,periodo,id_cli,edad,genero,estado_civil,ult_actual,ind_mora_vigente,departamento_residencia,mora_max,cupo_total_tc,...,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,ind,rep_calif_cred,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201910,2089776,35,M,DIV,20180526,N,VALLE,NApl,0.0,...,0,0,0,0,629869.000,C,N,1255032.0,0.0,95511.00
1,201910,2088434,32,F,UL,20190710,N,HUILA,NApl,0.0,...,0,0,0,0,4742261.860,NApl,N,0.0,0.0,1929721.00
2,201910,2088089,33,M,UL,20190312,N,VALLE,Entre 0 y 30 dias,0.0,...,0,0,0,0,1698047.975,NApl,N,0.0,0.0,374940.00
3,201910,4780572,26,M,SOL,20190719,N,BOGOTA D.C.,Entre 0 y 30 dias,15700000.0,...,0,9517000,0,5,2185655.000,B,N,0.0,0.0,2165030.00
4,201910,3894402,28,M,SOL,20190311,N,BOGOTA D.C.,Entre 0 y 30 dias,0.0,...,0,2181000,0,2,-42000.000,NApl,N,0.0,0.0,77469.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118881,201910,2892117,35,M,UL,20190726,N,BOGOTA D.C.,Entre 0 y 30 dias,4200000.0,...,0,0,0,0,2145150.000,A,N,0.0,0.0,488768.00
1118882,201910,4772894,33,F,SOL,20190609,N,CUNDINAMARCA,Entre 0 y 30 dias,0.0,...,0,0,0,0,1021137.300,NApl,N,0.0,0.0,114924.71
1118883,201910,4774849,24,F,DIV,20190410,N,BOGOTA D.C.,Entre 0 y 30 dias,0.0,...,0,0,0,1,788100.000,E,N,0.0,0.0,0.00
1118884,201910,3895049,29,M,UL,20190909,N,ANTIOQUIA,Entre 0 y 30 dias,0.0,...,0,68000,0,2,1016335.400,H,S,2324915.0,1416107.0,153150.00


In [75]:
test_df.cleaned_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1118886 entries, 0 to 1118885
Data columns (total 39 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   periodo                        1118886 non-null  int64  
 1   id_cli                         1118886 non-null  int64  
 2   edad                           1118886 non-null  int32  
 3   genero                         1118886 non-null  object 
 4   estado_civil                   1118886 non-null  object 
 5   ult_actual                     1118886 non-null  int64  
 6   ind_mora_vigente               1118886 non-null  object 
 7   departamento_residencia        1118886 non-null  object 
 8   mora_max                       1118886 non-null  object 
 9   cupo_total_tc                  1118886 non-null  float64
 10  cuota_tc_bancolombia           1118886 non-null  float64
 11  nro_tot_cuentas                1118886 non-null  int32  
 12  ctas_activas  

In [77]:
test_df.cleaned_dataframe['nro_tot_cuentas'].value_counts()

1     945797
2     115717
0      46248
3      10043
4        895
5        119
6         33
7         11
8          5
9          5
14         2
23         2
11         2
13         1
15         1
17         1
12         1
25         1
30         1
35         1
Name: nro_tot_cuentas, dtype: int64

In [22]:
pd.Series(np.where(test_df.cleaned_dataframe['convenio_lib'] == "\\N", "N", "S")).value_counts()

N    959199
S    159687
dtype: int64

In [79]:
test_df.cleaned_dataframe.groupby("categoria")['gasto_familiar'].mean()

categoria
1     5.263293e+05
2     1.072414e+06
3     1.725755e+06
4     1.085627e+06
5     4.580606e+05
\N    2.588147e+06
Name: gasto_familiar, dtype: float64

In [78]:
test_df.cleaned_dataframe.to_csv("cleaned_df.csv", index=False)

In [94]:
df = test_df.original_dataframe
df['ind_mora_vigente'].value_counts()

N     1101373
S       17210
\N        303
Name: ind_mora_vigente, dtype: int64

In [48]:
pd.crosstab(df['ind_mora_vigente'], df['rep_calif_cred'])

rep_calif_cred,A,B,C,D,E,F,G,H,SIN INFO
ind_mora_vigente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
N,249144,158664,152652,118046,78455,46784,138942,59662,99024
S,1284,1122,2095,2389,2047,1636,3184,2420,1033
\N,1,6,43,59,28,17,92,45,12


In [28]:
df['mora_max'].value_counts()[1] / len(df) * 100

28.513539359684543

In [235]:
test_df.cleaned_dataframe.isnull().sum()

periodo                               0
id_cli                                0
edad                                  0
genero                                0
estado_civil                          0
nivel_academico                       0
tipo_vivienda                         0
ult_actual                            0
categoria                             0
ind_mora_vigente                      0
cartera_castigada                     0
departamento_residencia               0
rechazo_credito                       0
mora_max                              0
cant_moras_30_ult_12_meses            0
cant_moras_60_ult_12_meses            0
cant_moras_90_ult_12_meses            0
cupo_total_tc                         0
tenencia_tc                           0
cuota_tc_bancolombia                  0
tiene_consumo                         0
tiene_crediagil                       0
nro_tot_cuentas                       0
ctas_activas                          0
tiene_ctas_activas                    0


# Metadata report

In [22]:
class MetaDataReport:
    
    
    def __init__(self):
        pass

In [54]:
test_df = pd.read_csv("https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/dt2020_base_evaluar_corrected.csv")

# Test DataFrameBuilder in Train and test

In [62]:
train=[]
test=[]
for dataframe in [train, test]:
    pass

In [110]:
import pandas as pd

pycaret_df = pd.read_csv("cleaned_df.csv")
pycaret_df

Unnamed: 0,periodo,id_cli,edad,genero,estado_civil,ult_actual,ind_mora_vigente,departamento_residencia,mora_max,cupo_total_tc,...,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,ind,rep_calif_cred,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201910,2089776,35,M,DIV,20180526,N,VALLE,NApl,0.0,...,0,0,0,0,629869.000,C,N,1255032.0,0.0,95511.00
1,201910,2088434,32,F,UL,20190710,N,HUILA,NApl,0.0,...,0,0,0,0,4742261.860,NApl,N,0.0,0.0,1929721.00
2,201910,2088089,33,M,UL,20190312,N,VALLE,Entre 0 y 30 dias,0.0,...,0,0,0,0,1698047.975,NApl,N,0.0,0.0,374940.00
3,201910,4780572,26,M,SOL,20190719,N,BOGOTA D.C.,Entre 0 y 30 dias,15700000.0,...,0,9517000,0,5,2185655.000,B,N,0.0,0.0,2165030.00
4,201910,3894402,28,M,SOL,20190311,N,BOGOTA D.C.,Entre 0 y 30 dias,0.0,...,0,2181000,0,2,-42000.000,NApl,N,0.0,0.0,77469.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118881,201910,2892117,35,M,UL,20190726,N,BOGOTA D.C.,Entre 0 y 30 dias,4200000.0,...,0,0,0,0,2145150.000,A,N,0.0,0.0,488768.00
1118882,201910,4772894,33,F,SOL,20190609,N,CUNDINAMARCA,Entre 0 y 30 dias,0.0,...,0,0,0,0,1021137.300,NApl,N,0.0,0.0,114924.71
1118883,201910,4774849,24,F,DIV,20190410,N,BOGOTA D.C.,Entre 0 y 30 dias,0.0,...,0,0,0,1,788100.000,E,N,0.0,0.0,0.00
1118884,201910,3895049,29,M,UL,20190909,N,ANTIOQUIA,Entre 0 y 30 dias,0.0,...,0,68000,0,2,1016335.400,H,S,2324915.0,1416107.0,153150.00


In [111]:
pycaret_df.drop(["periodo", "id_cli", "departamento_residencia", 
                 "ind", "ult_actual", "rep_calif_cred", "nro_tot_cuentas", "ctas_activas", "genero"], axis=1, inplace=True)

In [112]:
pycaret_df

Unnamed: 0,edad,estado_civil,ind_mora_vigente,mora_max,cupo_total_tc,cuota_tc_bancolombia,cuota_cred_hipot,mediana_nom3,mediana_pen3,ingreso_nompen,...,cupo_tc_mdo,saldo_prom3_tdc_mdo,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,35,DIV,N,NApl,0.0,0.000000e+00,0.0,1255032.0,0.0,1255032.0,...,0,0.0,0,0,0,0,N,1255032.0,0.0,95511.00
1,32,UL,N,NApl,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0,0.0,0,0,0,0,N,0.0,0.0,1929721.00
2,33,UL,N,Entre 0 y 30 dias,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0,0.0,0,0,0,0,N,0.0,0.0,374940.00
3,26,SOL,N,Entre 0 y 30 dias,15700000.0,2.796635e+06,0.0,0.0,0.0,0.0,...,0,0.0,0,9517000,0,5,N,0.0,0.0,2165030.00
4,28,SOL,N,Entre 0 y 30 dias,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0,0.0,0,2181000,0,2,N,0.0,0.0,77469.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118881,35,UL,N,Entre 0 y 30 dias,4200000.0,1.218000e+05,0.0,0.0,0.0,0.0,...,0,0.0,0,0,0,0,N,0.0,0.0,488768.00
1118882,33,SOL,N,Entre 0 y 30 dias,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0,0.0,0,0,0,0,N,0.0,0.0,114924.71
1118883,24,DIV,N,Entre 0 y 30 dias,0.0,0.000000e+00,0.0,0.0,0.0,0.0,...,0,0.0,0,0,0,1,N,0.0,0.0,0.00
1118884,29,UL,N,Entre 0 y 30 dias,0.0,0.000000e+00,0.0,2371104.0,0.0,2371104.0,...,7500000,0.0,0,68000,0,2,S,2324915.0,1416107.0,153150.00


In [65]:
pycaret_df['cat_ingreso'].unique()

array(['NOM', 'OTROS', 'PEN', 'NOM PEN'], dtype=object)

In [82]:
pycaret_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1118886 entries, 0 to 1118885
Data columns (total 33 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   edad                           1118886 non-null  int64  
 1   genero                         1118886 non-null  object 
 2   estado_civil                   1118886 non-null  object 
 3   ind_mora_vigente               1118886 non-null  object 
 4   mora_max                       1118886 non-null  object 
 5   cupo_total_tc                  1118886 non-null  float64
 6   cuota_tc_bancolombia           1118886 non-null  float64
 7   nro_tot_cuentas                1118886 non-null  int64  
 8   ctas_activas                   1118886 non-null  int64  
 9   cuota_cred_hipot               1118886 non-null  float64
 10  mediana_nom3                   1118886 non-null  float64
 11  mediana_pen3                   1118886 non-null  float64
 12  ingreso_nompen

In [None]:
from pycaret.regression import *

In [None]:
sample_df = pycaret_df.sample(n=250000,random_state=None).dropna()
sample_df

In [None]:
categorical_features=sample_df.select_dtypes("object").columns.tolist()
categorical_features

In [None]:
sample_df

In [122]:
reg1 = setup(data = sample_df.drop(categorical_features, axis=1), target = 'gasto_familiar',
             session_id=123, fold_shuffle=True, fold=5, preprocess=False)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,gasto_familiar
2,Original Data,"(99996, 23)"
3,Missing Values,False
4,Numeric Features,22
5,Categorical Features,0
6,Transformed Train Set,"(69997, 22)"
7,Transformed Test Set,"(29999, 22)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [None]:
best_model = compare_models(fold=5)

IntProgress(value=0, description='Processing: ', max=99)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ridge,Ridge Regression,648549.9375,2524049309696.0,1565193.4,0.089,3.1811,144.0803,0.07
llar,Lasso Least Angle Regression,649288.6539,2530969423191.6465,1566964.8903,0.0874,3.1826,148.7454,0.092
lasso,Lasso Regression,649319.075,2531030728704.0,1566989.925,0.0873,3.1825,148.6931,0.674
en,Elastic Net,649210.95,2531032039424.0,1566989.625,0.0873,3.1826,148.8762,0.656
lar,Least Angle Regression,649515.6267,2531028125567.5,1566989.2389,0.0873,3.1824,148.3154,0.086
lr,Linear Regression,648762.4125,2532805260083.2,1567569.3,0.0866,3.1829,150.9876,0.048
br,Bayesian Ridge,649911.3825,2534547563763.2607,1568165.0681,0.0859,3.1831,152.6632,0.108
omp,Orthogonal Matching Pursuit,652320.6098,2541005333148.775,1570142.2593,0.0837,3.1879,158.5463,0.09
huber,Huber Regressor,621163.5207,2804976718627.6846,1656397.8675,-0.028,3.014,102.1053,1.128
dt,Decision Tree Regressor,898402.4756,4777388950172.699,2161346.1832,-0.7548,4.221,125.5676,9.254


In [12]:
best_model

NameError: name 'best_model' is not defined

In [1]:
import pandas as pd

In [3]:
eval_df = pd.read_csv("../data/dt2020_base_evaluar_corrected.csv")

In [4]:
eval_df

Unnamed: 0,id_registro,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,...,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social
0,4055#201902,201902,4055,19721103,46.203970,M,CASADO,UNIVERSITARIO,DISEÑO Y PUBLICIDAD,Empleado,...,127148000.0,1797000.0,7.0,0.0,2.060457e+06,A,0.0,74339,2734528.0,
1,12598#201902,201902,12598,19750930,43.299110,M,SOLTERO,UNIVERSITARIO,ADMINISTRACION,Empleado,...,87661000.0,0.0,5.0,66.0,-7.187437e+07,H,15.0,,2291070.0,
2,23697#201902,201902,23697,19740130,44.963723,M,CASADO,UNIVERSITARIO,OTROS,Empleado,...,58967000.0,0.0,3.0,0.0,-1.448765e+06,A,0.0,,,
3,23768#201902,201902,23768,19610620,57.577002,M,NO INFORMA,BACHILLER,,Empleado,...,0.0,0.0,0.0,0.0,2.208260e+06,G,0.0,,,
4,7174#201902,201902,7174,19661010,52.271047,F,DIVORCIADO,UNIVERSITARIO,MEDICINA,Empleado,...,22746000.0,0.0,7.0,0.0,3.212636e+06,A,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281661,8596193#202011,202011,8596193,19740415,46.516085,F,SOLTERO,UNIVERSITARIO,OTROS,Empleado,...,0.0,0.0,2.0,0.0,2.101424e+06,A,0.0,70721,2281362.5,3876343.0
281662,8616083#202011,202011,8616083,19850620,35.334702,M,NO INFORMA,NO INFORMA,,Socio Empleado - Socio,...,99000.0,0.0,12.0,0.0,1.170550e+07,C,0.0,,14510896.0,
281663,8670207#202011,202011,8670207,19740316,46.598220,M,DIVORCIADO,NO INFORMA,,Profesional Independiente,...,300000.0,0.0,17.0,0.0,-4.154790e+07,D,0.0,,,
281664,8677655#202011,202011,8677655,19780722,42.247775,F,CASADO,ESPECIALIZACION,ECONOMIA,Empleado,...,184811000.0,0.0,18.0,0.0,5.313457e+06,B,0.0,70077,11401109.0,14481371.0


In [13]:
list_ =[1,2,3]
list_.remove(3).insert(0, 8)
list_

[1, 2]

In [15]:
dataset_test=DataFrameBuilder(eval_df, test=True)

In [18]:
import numpy as np

In [19]:
dataset_test.build()

ValueError: invalid literal for int() with base 10: '7.0'

In [25]:
eval_df['cant_oblig_tot_sf'].astype(float).astype(int)

ValueError: Cannot convert non-finite values (NA or inf) to integer