# BC Dataton

In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns



# Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

#modeling

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

# Feature engineerin and Dimension reduction
from sklearn.feature_selection import VarianceThreshold


# Utils
from tqdm import tqdm
from pandas_profiling import ProfileReport
from pycaret.regression import *


%matplotlib inline

sns.set_style("white")
matplotlib.rc('xtick', labelsize=15)
matplotlib.rc('ytick', labelsize=15)
plt.rcParams['figure.figsize'] = [16.0, 10.0]

In [None]:
data = pd.read_csv("https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/train/train_201910.csv")
data.head()

# Data Processing class

In [151]:
class DataFrameBuilder:
    
    HEADER="https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/header.txt"
    NUMERIC_COLUMNS=[
        "ingreso_segurida_social",
        "mora_max"
    ]
    STRING_COLUMNS=[]
    BOOLEAN_COLUMNS=[]
    DROP_COLUMNS=[
        "fecha_nacimiento",
        "profesion",
        "ocupacion"
    ]
    
    def __init__(self, dataframe, keep_original=False):
        self.original_dataframe = self._assign_columns(dataframe.copy())
        self.cleaned_dataframe = None
        self.keep_original = keep_original
        
    def _assign_columns(self, dataframe):
        column_names = pd.read_csv(DataFrameBuilder.HEADER).columns.to_list()
        dataframe.columns = column_names
        return dataframe
    
    # Reemplazar \N por NA
    def cleanNA(self, dataframe):
        
        for column in dataframe.columns:
            if column in DataFrameBuilder.NUMERIC_COLUMNS :
                dataframe[column] = dataframe[column].replace("\\N", np.nan).astype('float')
        return dataframe
    
    # Manejo de datos faltantes
    def handle_missing_data(self):
        pass
    
    # Modificacion de columnas existentes
    def process_columns(self, dataframe):
        
        # Procesamiento columnas demograficas
        # Procesamiento columnas financieras
        # Procesamiento columnas de riesgo
        dataframe['rep_calif_cred'] = np.where(
                                        dataframe['rep_calif_cred'] == "SIN INFO","NA",
                                        dataframe['rep_calif_cred']
        )
        
        dataframe['mora_max'] = np.where(
                                   dataframe['mora_max'] < 30, "Entre 0 y 30 dias",
                                   np.where(
                                       dataframe['mora_max'] < 60, "Entre 31 y 60 dias",
                                       np.where(
                                           dataframe['mora_max'] > 60, "Mas de 60", "NA")
                                   )
        )
        return dataframe
    
    # Para eliminar las columnas que no vamos a usar
    def remove_columns(self, dataframe):
        
        return dataframe.drop(DataFrameBuilder.DROP_COLUMNS, axis=1)
    
    # Para creacion de columnas nuevas
    def create_columns(self):
        pass
    
    # Guardar Dataframe
    def save_dataframe(self):
        pass
    
    # En esta funcion va todo el flujo
    def build(self, to_s3=False):
        
        # Borrar variables
        sliced_dataframe = self.remove_columns(self.original_dataframe)
        # Missing values
        na_cleaned_dataframe = self.cleanNA(sliced_dataframe)
        # Procesamiento de columnas
        cleaned_dataframe = self.process_columns(na_cleaned_dataframe)
        self.cleaned_dataframe = cleaned_dataframe
        
        # Eliminar de memoria el dataframe original
        if not self.keep_original:
            self.original_dataframe = None
        
        # Guardado (En local o S3)
        
        return self.cleaned_dataframe
        

In [147]:
test_df = DataFrameBuilder(data)

In [148]:
test_df.build()['mora_max'].value_counts()

ValueError: could not convert string to float: 'NA'

In [145]:
test_df.cleaned_dataframe

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,ult_actual,categoria,codigo_ciiu,ind_mora_vigente,cartera_castigada,ciudad_residencia,departamento_residencia,ciudad_laboral,departamento_laboral,rechazo_credito,mora_max,cant_moras_30_ult_12_meses,cant_moras_60_ult_12_meses,cant_moras_90_ult_12_meses,cupo_total_tc,tenencia_tc,cuota_tc_bancolombia,tiene_consumo,tiene_crediagil,nro_tot_cuentas,ctas_activas,tiene_ctas_activas,ctas_embargadas,tiene_ctas_embargadas,pension_fopep,cuota_cred_hipot,tiene_cred_hipo_1,tiene_cred_hipo_2,mediana_nom3,mediana_pen3,ingreso_nompen,cat_ingreso,ingreso_final,cant_mora_30_tdc_ult_3m_sf,cant_mora_30_consum_ult_3m_sf,cuota_de_vivienda,cuota_de_consumo,cuota_rotativos,cuota_tarjeta_de_credito,cuota_de_sector_solidario,cuota_sector_real_comercio,cupo_tc_mdo,saldo_prom3_tdc_mdo,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201910,2089776,19840630,35.236140,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,20180526,1,10,N,N,CALI ...,VALLE ...,CALI ...,VALLE ...,\N,,\N,\N,\N,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1255032.0,0.0,1255032.0,NOM,1259738,0,0,0.0,0,0,0,0,0,0,0.0,0,0,0,0,0,629869,C,0,\N,1255032,,95511.00
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,20190710,4,10,N,N,GARZON ...,HUILA ...,\N,\N,\N,,\N,\N,\N,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7970188,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4742261.859999999,,\N,\N,\N,,1929721.00
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,20190312,4,8230,N,N,PALMIRA ...,VALLE ...,\N,\N,\N,Entre 0 y 30 dias,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3073390,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1698047.975,,\N,\N,\N,,374940.00
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,20190719,2,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,Entre 0 y 30 dias,0,0,0,15700000.0,SI,2.796635e+06,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7749000,0,0,0.0,1750000,0,675000,0,0,0,0.0,0,9517000,0,5,0,2185655,B,0,\N,\N,,2165030.00
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,20190311,4,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,Entre 0 y 30 dias,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,1200000,0,0,0.0,134000,0,0,0,244000,0,0.0,0,2181000,0,2,0,-42000,,0,\N,\N,,77469.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118881,201910,2892117,19840610,35.290897,M,UNION LIBRE,BACHILLER,\N,Comerciante,\N,20190726,4,4724,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,Entre 0 y 30 dias,0,0,0,4200000.0,SI,1.218000e+05,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3810000.0,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,2145150.000000,A,\N,\N,\N,,488768.00
1118882,201910,4772894,19860721,33.180014,F,SOLTERO,NO INFORMA,\N,Independiente,\N,20190609,4,10,N,N,BOGOTA D.C. ...,CUNDINAMARCA ...,BOGOTA D.C. ...,AMAZONAS ...,\N,Entre 0 y 30 dias,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,2002230.0,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1021137.300000,,\N,\N,\N,,114924.71
1118883,201910,4774849,19960318,23.520876,F,DIVORCIADO,NO INFORMA,\N,Empleado,\N,20190410,1,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,AMAZONAS ...,\N,Entre 0 y 30 dias,0,0,0,0.0,NO,0.000000e+00,\N,\N,0,0,\N,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,2110000.0,0,0,0.0,144000,0,0,0,144000,0,0.0,0,0,0,1,0,788100.000000,E,0,\N,\N,,0.00
1118884,201910,3895049,19910224,28.583162,M,UNION LIBRE,NO INFORMA,\N,Empleado,\N,20190909,1,10,N,N,APARTADO ...,ANTIOQUIA ...,NECOCLI ...,ANTIOQUIA ...,\N,Entre 0 y 30 dias,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,2371104.0,0.0,2371104.0,NOM,1955559.0,0,0,0.0,0,0,157000,0,0,7500000,0.0,0,68000,0,2,0,1016335.400000,H,0,72080,2324915,1416107.0,153150.00


In [150]:
data['mora_max']

0                         NA
1                         NA
2          Entre 0 y 30 dias
3          Entre 0 y 30 dias
4          Entre 0 y 30 dias
                 ...        
1118881    Entre 0 y 30 dias
1118882    Entre 0 y 30 dias
1118883    Entre 0 y 30 dias
1118884    Entre 0 y 30 dias
1118885    Entre 0 y 30 dias
Name: mora_max, Length: 1118886, dtype: object

In [94]:
df = test_df.original_dataframe
df['ind_mora_vigente'].value_counts()

N     1101373
S       17210
\N        303
Name: ind_mora_vigente, dtype: int64

In [48]:
pd.crosstab(df['ind_mora_vigente'], df['rep_calif_cred'])

rep_calif_cred,A,B,C,D,E,F,G,H,SIN INFO
ind_mora_vigente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
N,249144,158664,152652,118046,78455,46784,138942,59662,99024
S,1284,1122,2095,2389,2047,1636,3184,2420,1033
\N,1,6,43,59,28,17,92,45,12


In [28]:
df['mora_max'].value_counts()[1] / len(df) * 100

28.513539359684543

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1118886 entries, 0 to 1118885
Data columns (total 65 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   periodo                        1118886 non-null  int64  
 1   id_cli                         1118886 non-null  int64  
 2   fecha_nacimiento               1118886 non-null  int64  
 3   edad                           1118886 non-null  object 
 4   genero                         1118886 non-null  object 
 5   estado_civil                   1118886 non-null  object 
 6   nivel_academico                1118886 non-null  object 
 7   profesion                      1118886 non-null  object 
 8   ocupacion                      1118886 non-null  object 
 9   tipo_vivienda                  1118886 non-null  object 
 10  ult_actual                     1118886 non-null  int64  
 11  categoria                      1118886 non-null  object 
 12  codigo_ciiu   

# Metadata report

In [22]:
class MetaDataReport:
    
    
    def __init__(self):
        pass

In [58]:
test_df = pd.read_csv("https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/dt2020_base_evaluar_corrected.csv")

In [59]:
test_df

Unnamed: 0,id_registro,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,ult_actual,categoria,codigo_ciiu,ind_mora_vigente,cartera_castigada,ciudad_residencia,departamento_residencia,ciudad_laboral,departamento_laboral,rechazo_credito,mora_max,cant_moras_30_ult_12_meses,cant_moras_60_ult_12_meses,cant_moras_90_ult_12_meses,cupo_total_tc,tenencia_tc,cuota_tc_bancolombia,tiene_consumo,tiene_crediagil,nro_tot_cuentas,ctas_activas,tiene_ctas_activas,ctas_embargadas,tiene_ctas_embargadas,pension_fopep,cuota_cred_hipot,tiene_cred_hipo_1,tiene_cred_hipo_2,mediana_nom3,mediana_pen3,ingreso_nompen,cat_ingreso,ingreso_final,cant_mora_30_tdc_ult_3m_sf,cant_mora_30_consum_ult_3m_sf,cuota_de_vivienda,cuota_de_consumo,cuota_rotativos,cuota_tarjeta_de_credito,cuota_de_sector_solidario,cuota_sector_real_comercio,cupo_tc_mdo,saldo_prom3_tdc_mdo,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social
0,4055#201902,201902,4055,19721103,46.203970,M,CASADO,UNIVERSITARIO,DISEÑO Y PUBLICIDAD,Empleado,FAMILIAR,20180807,2.0,10,N,N,SABANETA - ANTIOQUIA ...,ANTIOQUIA ...,ENVIGADO ...,ANTIOQUIA ...,,,,,,0.0,NO,0.000,,,2,2,X,0,,,,,,7216586.0,0.0,7216586.0,NOM,4827796.0,0.0,0.0,0.00,1797000.00,954000.0,365000.0,0.0,0.0,18290000.0,2.825000e+05,19000.0,127148000.0,1797000.0,7.0,0.0,2.060457e+06,A,0.0,74339,2734528.0,
1,12598#201902,201902,12598,19750930,43.299110,M,SOLTERO,UNIVERSITARIO,ADMINISTRACION,Empleado,FAMILIAR,20180130,1.0,10,N,N,BOGOTA D.C. ...,CUNDINAMARCA ...,BOGOTA D.C. ...,CUNDINAMARCA ...,,1.0,0.0,0.0,0.0,0.0,NO,0.000,,,1,1,X,0,,,,,,2331528.0,0.0,2331528.0,NOM,2316057.0,0.0,6.0,0.00,4637000.00,0.0,0.0,0.0,68627000.0,0.0,0.000000e+00,0.0,87661000.0,0.0,5.0,66.0,-7.187437e+07,H,15.0,,2291070.0,
2,23697#201902,201902,23697,19740130,44.963723,M,CASADO,UNIVERSITARIO,OTROS,Empleado,ALQUILADA,20190110,2.0,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,,1.0,0.0,0.0,0.0,0.0,NO,0.000,,,2,2,X,0,,,,,,5954982.0,0.0,5954982.0,NOM,5354621.0,0.0,0.0,0.00,1750000.00,0.0,147000.0,0.0,0.0,7000000.0,3.300000e+04,33000.0,58967000.0,0.0,3.0,0.0,-1.448765e+06,A,0.0,,,
3,23768#201902,201902,23768,19610620,57.577002,M,NO INFORMA,BACHILLER,,Empleado,ALQUILADA,20180208,1.0,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,,,,,,0.0,NO,0.000,,,1,1,X,0,,,,,,0.0,0.0,0.0,,3711362.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,2.208260e+06,G,0.0,,,
4,7174#201902,201902,7174,19661010,52.271047,F,DIVORCIADO,UNIVERSITARIO,MEDICINA,Empleado,PROPIA,20180825,3.0,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,,0.0,0.0,0.0,0.0,5750000.0,SI,316178.937,,,1,1,X,0,,,,,,0.0,0.0,0.0,,9059892.0,0.0,0.0,0.00,1500000.00,81000.0,597000.0,0.0,0.0,29425000.0,4.088500e+06,11000.0,22746000.0,0.0,7.0,0.0,3.212636e+06,A,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281661,8596193#202011,202011,8596193,19740415,46.516085,F,SOLTERO,UNIVERSITARIO,OTROS,Empleado,FAMILIAR,20201010,2.0,10,N,N,MEDELLIN ...,ANTIOQUIA ...,MEDELLIN ...,ANTIOQUIA ...,,0.0,0.0,0.0,0.0,6000000.0,SI,508134.276,,,1,1,X,0,,,,,,2281362.0,0.0,2281362.0,NOM,3876343.0,0.0,0.0,0.00,0.00,0.0,205000.0,0.0,0.0,6800000.0,0.000000e+00,0.0,0.0,0.0,2.0,0.0,2.101424e+06,A,0.0,70721,2281362.5,3876343.0
281662,8616083#202011,202011,8616083,19850620,35.334702,M,NO INFORMA,NO INFORMA,,Socio Empleado - Socio,PROPIA,20201003,3.0,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,,,,28.0,0.0,0.0,0.0,71000000.0,SI,2485000.000,,X,2,2,X,0,,,,,,14956096.0,0.0,14956096.0,NOM,26913722.0,0.0,0.0,0.00,889000.00,886000.0,2700000.0,0.0,14000.0,121890000.0,8.286667e+06,237000.0,99000.0,0.0,12.0,0.0,1.170550e+07,C,0.0,,14510896.0,
281663,8670207#202011,202011,8670207,19740316,46.598220,M,DIVORCIADO,NO INFORMA,,Profesional Independiente,FAMILIAR,20200616,3.0,10,N,N,LA CALERA ...,CUNDINAMARCA ...,BOGOTA D.C. ...,BOGOTA D.C. ...,,23.0,0.0,0.0,0.0,150800000.0,SI,7620965.514,X,X,1,1,X,0,,,9794649.65,X,,0.0,0.0,0.0,,16301000.0,0.0,0.0,9794649.65,15648142.70,163000.0,26334000.0,0.0,0.0,30100000.0,3.091700e+07,10350000.0,300000.0,0.0,17.0,0.0,-4.154790e+07,D,0.0,,,
281664,8677655#202011,202011,8677655,19780722,42.247775,F,CASADO,ESPECIALIZACION,ECONOMIA,Empleado,NO INFORMA,20200423,2.0,10,N,N,MEDELLIN ...,ANTIOQUIA ...,MEDELLIN ...,ANTIOQUIA ...,,0.0,0.0,0.0,0.0,26000000.0,SI,1014000.000,X,,1,1,X,0,,,,,,11916465.0,0.0,11916465.0,NOM,14481371.0,0.0,0.0,0.00,1380417.00,173000.0,1894000.0,0.0,0.0,102025000.0,3.760867e+07,1953000.0,184811000.0,0.0,18.0,0.0,5.313457e+06,B,0.0,70077,11401109.0,14481371.0


In [60]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281666 entries, 0 to 281665
Data columns (total 65 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   id_registro                    281666 non-null  object 
 1   periodo                        281666 non-null  int64  
 2   id_cli                         281666 non-null  int64  
 3   fecha_nacimiento               281666 non-null  int64  
 4   edad                           281666 non-null  float64
 5   genero                         281666 non-null  object 
 6   estado_civil                   281148 non-null  object 
 7   nivel_academico                281666 non-null  object 
 8   profesion                      232080 non-null  object 
 9   ocupacion                      281650 non-null  object 
 10  tipo_vivienda                  190599 non-null  object 
 11  ult_actual                     281666 non-null  int64  
 12  categoria                     

# Test DataFrameBuilder in Train and test

In [62]:
train=[]
test=[]
for dataframe in [train, test]:
    pass

In [113]:
df.head(20)

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,ult_actual,categoria,codigo_ciiu,ind_mora_vigente,cartera_castigada,ciudad_residencia,departamento_residencia,ciudad_laboral,departamento_laboral,rechazo_credito,mora_max,cant_moras_30_ult_12_meses,cant_moras_60_ult_12_meses,cant_moras_90_ult_12_meses,cupo_total_tc,tenencia_tc,cuota_tc_bancolombia,tiene_consumo,tiene_crediagil,nro_tot_cuentas,ctas_activas,tiene_ctas_activas,ctas_embargadas,tiene_ctas_embargadas,pension_fopep,cuota_cred_hipot,tiene_cred_hipo_1,tiene_cred_hipo_2,mediana_nom3,mediana_pen3,ingreso_nompen,cat_ingreso,ingreso_final,cant_mora_30_tdc_ult_3m_sf,cant_mora_30_consum_ult_3m_sf,cuota_de_vivienda,cuota_de_consumo,cuota_rotativos,cuota_tarjeta_de_credito,cuota_de_sector_solidario,cuota_sector_real_comercio,cupo_tc_mdo,saldo_prom3_tdc_mdo,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201910,2089776,19840630,35.23614,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,20180526,1,10,N,N,CALI ...,VALLE ...,CALI ...,VALLE ...,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1255032.0,0.0,1255032.0,NOM,1259738.0,0,0,0.0,0,0,0,0,0,0,0.0,0,0,0,0,0,629869.0,C,0,\N,1255032,\N,95511.0
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,20190710,4,10,N,N,GARZON ...,HUILA ...,\N,\N,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7970188.0,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4742261.859999999,,\N,\N,\N,\N,1929721.0
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,20190312,4,8230,N,N,PALMIRA ...,VALLE ...,\N,\N,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3073390.0,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1698047.975,,\N,\N,\N,\N,374940.0
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,20190719,2,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,0,0,0,0,15700000.0,SI,2796635.0,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7749000.0,0,0,0.0,1750000,0,675000,0,0,0,0.0,0,9517000,0,5,0,2185655.0,B,0,\N,\N,\N,2165030.0
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,20190311,4,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,1200000.0,0,0,0.0,134000,0,0,0,244000,0,0.0,0,2181000,0,2,0,-42000.0,,0,\N,\N,\N,77469.29
5,201910,3892351,19910108,28.711841,M,SOLTERO,TECNOLOGO,OTROS,Independiente,\N,20190802,4,10,N,N,MEDELLIN ...,ANTIOQUIA ...,\N,\N,\N,\N,\N,\N,\N,0.0,SI,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7372850.0,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4386845.75,,\N,\N,\N,\N,710999.0
6,201910,2897552,19900903,29.059548,M,SOLTERO,BACHILLER,\N,Empleado,\N,20190906,1,10,N,N,MEDELLIN ...,ANTIOQUIA ...,MEDELLIN ...,ANTIOQUIA ...,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1337120.0,0.0,1337120.0,NOM,1300628.0,0,0,0.0,0,0,0,0,524000,0,0.0,0,2481000,0,0,1,126314.0,F,7,\N,1252458.5,\N,210600.0
7,201910,2897375,19850119,34.680356,F,SOLTERO,UNIVERSITARIO,COMUNICACION SOCIAL,Empleado,\N,20190820,1,10,N,N,CALI ...,VALLE ...,SIN INFORMACION ...,SIN INFORMACION ...,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,2782797.0,0.0,2782797.0,NOM,3722742.0,0,0,0.0,221000,0,255000,0,0,500000,100000.0,567000,1237000,0,2,0,2129919.4,D,0,\N,\N,\N,344640.0
8,201910,4782141,19790623,40.257358,F,NO INFORMA,SIN INFORMACION,\N,Empleado,\N,20190725,1,10,N,N,BOGOTA D.C. ...,CUNDINAMARCA ...,\N,\N,\N,0,0,0,0,0.0,NO,0.0,X,\N,1,1,X,0,\N,\N,\N,\N,\N,4353538.0,0.0,4353538.0,NOM,3374070.0,0,0,0.0,386000,11000,1006000,0,29000,25800000,16527670.0,734000,248000,0,4,0,929849.0,C,0,70831,4351202,3500000,270189.0
9,201910,1259390,19830725,36.169747,M,SOLTERO,TECNOLOGO,OTROS,Empleado,FAMILIAR,20190429,1,10,N,N,MEDELLIN ...,ANTIOQUIA ...,\N,\N,\N,0,0,0,0,6000000.0,SI,126000.0,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,2629942.3,0,0,0.0,595000,17000,497000,0,95000,3855000,2666.667,5000,21423000,501000,10,0,1006462.495,A,0,\N,\N,\N,84040.0


In [107]:
df['gasto_familiar'].dtype

dtype('float64')

In [116]:
df.head(20)['ingreso_segurida_social'].replace("\\N", np.nan)

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7         NaN
8     3500000
9         NaN
10        NaN
11        NaN
12        NaN
13        NaN
14        NaN
15        NaN
16        NaN
17    1327000
18        NaN
19        NaN
Name: ingreso_segurida_social, dtype: object

In [122]:
df

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,ult_actual,categoria,codigo_ciiu,ind_mora_vigente,cartera_castigada,ciudad_residencia,departamento_residencia,ciudad_laboral,departamento_laboral,rechazo_credito,mora_max,cant_moras_30_ult_12_meses,cant_moras_60_ult_12_meses,cant_moras_90_ult_12_meses,cupo_total_tc,tenencia_tc,cuota_tc_bancolombia,tiene_consumo,tiene_crediagil,nro_tot_cuentas,ctas_activas,tiene_ctas_activas,ctas_embargadas,tiene_ctas_embargadas,pension_fopep,cuota_cred_hipot,tiene_cred_hipo_1,tiene_cred_hipo_2,mediana_nom3,mediana_pen3,ingreso_nompen,cat_ingreso,ingreso_final,cant_mora_30_tdc_ult_3m_sf,cant_mora_30_consum_ult_3m_sf,cuota_de_vivienda,cuota_de_consumo,cuota_rotativos,cuota_tarjeta_de_credito,cuota_de_sector_solidario,cuota_sector_real_comercio,cupo_tc_mdo,saldo_prom3_tdc_mdo,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201910,2089776,19840630,35.236140,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,20180526,1,10,N,N,CALI ...,VALLE ...,CALI ...,VALLE ...,\N,\N,\N,\N,\N,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1255032.0,0.0,1255032.0,NOM,1259738,0,0,0.0,0,0,0,0,0,0,0.0,0,0,0,0,0,629869,C,0,\N,1255032,\N,95511.00
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,20190710,4,10,N,N,GARZON ...,HUILA ...,\N,\N,\N,\N,\N,\N,\N,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7970188,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4742261.859999999,,\N,\N,\N,\N,1929721.00
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,20190312,4,8230,N,N,PALMIRA ...,VALLE ...,\N,\N,\N,1,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3073390,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1698047.975,,\N,\N,\N,\N,374940.00
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,20190719,2,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,0,0,0,0,15700000.0,SI,2.796635e+06,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7749000,0,0,0.0,1750000,0,675000,0,0,0,0.0,0,9517000,0,5,0,2185655,B,0,\N,\N,\N,2165030.00
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,20190311,4,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,1,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,1200000,0,0,0.0,134000,0,0,0,244000,0,0.0,0,2181000,0,2,0,-42000,,0,\N,\N,\N,77469.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118881,201910,2892117,19840610,35.290897,M,UNION LIBRE,BACHILLER,\N,Comerciante,\N,20190726,4,4724,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,0,0,0,0,4200000.0,SI,1.218000e+05,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3810000.0,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,2145150.000000,A,\N,\N,\N,\N,488768.00
1118882,201910,4772894,19860721,33.180014,F,SOLTERO,NO INFORMA,\N,Independiente,\N,20190609,4,10,N,N,BOGOTA D.C. ...,CUNDINAMARCA ...,BOGOTA D.C. ...,AMAZONAS ...,\N,1,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,2002230.0,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1021137.300000,,\N,\N,\N,\N,114924.71
1118883,201910,4774849,19960318,23.520876,F,DIVORCIADO,NO INFORMA,\N,Empleado,\N,20190410,1,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,AMAZONAS ...,\N,0,0,0,0,0.0,NO,0.000000e+00,\N,\N,0,0,\N,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,2110000.0,0,0,0.0,144000,0,0,0,144000,0,0.0,0,0,0,1,0,788100.000000,E,0,\N,\N,\N,0.00
1118884,201910,3895049,19910224,28.583162,M,UNION LIBRE,NO INFORMA,\N,Empleado,\N,20190909,1,10,N,N,APARTADO ...,ANTIOQUIA ...,NECOCLI ...,ANTIOQUIA ...,\N,0,0,0,0,0.0,NO,0.000000e+00,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,2371104.0,0.0,2371104.0,NOM,1955559.0,0,0,0.0,0,0,157000,0,0,7500000,0.0,0,68000,0,2,0,1016335.400000,H,0,72080,2324915,1416107,153150.00


In [138]:
df2 = df.copy()
df2['mora_max'] = df2['mora_max'].replace("\\N", np.nan).astype('float')

df2['mora_max'] = np.where(
               df2['mora_max'] < 30, "Entre 0 y 30 dias",
               np.where(
                   df2['mora_max'] < 60, "Entre 31 y 60 dias",
                   np.where(
                       df2['mora_max'] > 60, "Mas de 60", "NA"
                   )
               )
)

df2['mora_max'].value_counts()

Entreo 0 y 30 dias    776039
NA                    319400
Entre 31 y 60 dias     14283
Mas de 60               9164
Name: mora_max, dtype: int64

In [136]:
df2 = df.copy()
df2['mora_max'] = df2['mora_max'].replace("\\N", np.nan).astype('float64')

df2['mora_max'].value_counts()

0.0       551146
1.0       119640
16.0       14189
17.0        8872
13.0        7647
           ...  
1233.0         1
2971.0         1
386.0          1
369.0          1
294.0          1
Name: mora_max, Length: 582, dtype: int64

In [127]:

df['mora_max'].value_counts()

0       551146
\N      319034
1       119640
16       14189
17        8872
         ...  
824          1
728          1
580          1
369          1
1170         1
Name: mora_max, Length: 583, dtype: int64