# BC Dataton

In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns



# Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Modeling

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

# Feature engineering and dimension reduction
from sklearn.feature_selection import VarianceThreshold


# Utils
from tqdm import tqdm
from pandas_profiling import ProfileReport
from pycaret.regression import *


%matplotlib inline

# Notebook-wide plot styling: seaborn "white" style, tick-label size 15 on
# both axes, and a 16 x 10 default figure size.
sns.set_style("white")
for _tick_group in ('xtick', 'ytick'):
    matplotlib.rc(_tick_group, labelsize=15)
plt.rcParams['figure.figsize'] = [16.0, 10.0]

# Data Processing class

In [7]:
class DataFrameBuilder:
    """Wrap a raw frame and label its columns with the names from ``HEADER``.

    The column names are read from the file referenced by ``HEADER`` each
    time a builder is created and are applied to a relabelled view of the
    frame passed to the constructor; the caller's frame keeps its own labels.
    The remaining methods are placeholders for later processing stages and
    currently do nothing.

    Attributes:
        HEADER: Location of the text file whose first row lists the column
            names.
        original_dataframe: The input frame carrying the names from ``HEADER``.
    """

    HEADER="https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/header.txt"

    def __init__(self, dataframe):
        """Store ``dataframe`` relabelled with the column names from ``HEADER``.

        Args:
            dataframe: Frame with as many columns as ``HEADER`` has names.

        Raises:
            ValueError: If the number of columns does not match the header.
        """
        self.original_dataframe = self._assign_columns(dataframe)

    def _assign_columns(self, dataframe):
        """Return ``dataframe`` relabelled with the names read from ``HEADER``.

        The input frame itself is left untouched so that the cell creating
        the builder can be re-run without changing the raw data it was given.

        Args:
            dataframe: Frame whose columns should be renamed.

        Returns:
            A frame with the same rows and the header's column names.

        Raises:
            ValueError: If the header lists a different number of columns
                than ``dataframe`` has.
        """
        column_names = pd.read_csv(DataFrameBuilder.HEADER).columns.to_list()
        if len(column_names) != dataframe.shape[1]:
            raise ValueError(
                "Header lists {} column names but the dataframe has {} columns".format(
                    len(column_names), dataframe.shape[1]
                )
            )
        # A shallow copy gets its own column index while sharing the
        # underlying values, so relabelling does not duplicate the data and
        # does not rename the caller's frame.
        renamed = dataframe.copy(deep=False)
        renamed.columns = column_names
        return renamed

    def handle_missing_data(self):
        """Placeholder for missing-data handling; not implemented yet."""
        pass

    def process_columns(self):
        """Placeholder for column processing; not implemented yet."""
        pass

    def remove_columns(self):
        """Placeholder for dropping columns; not implemented yet."""
        pass

    def create_columns(self):
        """Placeholder for derived-column creation; not implemented yet."""
        pass

    def build(self):
        """Placeholder for assembling the final frame; currently returns None."""
        pass
        

In [8]:
# Wrap the raw training frame so it carries the column names from the header
# file. `data` comes from the train_201910.csv load cell, which has to be run
# before this one on a fresh kernel.
test_df = DataFrameBuilder(data)
# DataFrameBuilder defines no head() of its own; preview the frame it holds.
test_df.original_dataframe.head()

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,ult_actual,categoria,codigo_ciiu,ind_mora_vigente,cartera_castigada,ciudad_residencia,departamento_residencia,ciudad_laboral,departamento_laboral,rechazo_credito,mora_max,cant_moras_30_ult_12_meses,cant_moras_60_ult_12_meses,cant_moras_90_ult_12_meses,cupo_total_tc,tenencia_tc,cuota_tc_bancolombia,tiene_consumo,tiene_crediagil,nro_tot_cuentas,ctas_activas,tiene_ctas_activas,ctas_embargadas,tiene_ctas_embargadas,pension_fopep,cuota_cred_hipot,tiene_cred_hipo_1,tiene_cred_hipo_2,mediana_nom3,mediana_pen3,ingreso_nompen,cat_ingreso,ingreso_final,cant_mora_30_tdc_ult_3m_sf,cant_mora_30_consum_ult_3m_sf,cuota_de_vivienda,cuota_de_consumo,cuota_rotativos,cuota_tarjeta_de_credito,cuota_de_sector_solidario,cuota_sector_real_comercio,cupo_tc_mdo,saldo_prom3_tdc_mdo,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201910,2089776,19840630,35.23614,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,20180526,1,10,N,N,CALI ...,VALLE ...,CALI ...,VALLE ...,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1255032.0,0.0,1255032.0,NOM,1259738,0,0,0.0,0,0,0,0,0,0,0.0,0,0,0,0,0,629869.0,C,0,\N,1255032,\N,95511.0
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,20190710,4,10,N,N,GARZON ...,HUILA ...,\N,\N,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7970188,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4742261.859999999,SIN INFO,\N,\N,\N,\N,1929721.0
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,20190312,4,8230,N,N,PALMIRA ...,VALLE ...,\N,\N,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3073390,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1698047.975,SIN INFO,\N,\N,\N,\N,374940.0
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,20190719,2,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,0,0,0,0,15700000.0,SI,2796635.0,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7749000,0,0,0.0,1750000,0,675000,0,0,0,0.0,0,9517000,0,5,0,2185655.0,B,0,\N,\N,\N,2165030.0
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,20190311,4,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,1200000,0,0,0.0,134000,0,0,0,244000,0,0.0,0,2181000,0,2,0,-42000.0,SIN INFO,0,\N,\N,\N,77469.29


In [29]:
# Read the train_201910 extract straight from the Dataton S3 bucket and
# preview the first rows.
# NOTE(review): the preview shows the literal string \N in many fields —
# presumably a missing-value marker; confirm and consider read_csv's
# na_values option.
TRAIN_201910_URL = "https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/train/train_201910.csv"
data = pd.read_csv(TRAIN_201910_URL)
data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,aa,bb,cc,dd,ee,ff,gg,hh,ii,jj,kk,ll,mm,nn,oo,pp,qq,rr,ss,tt,uu,vv,ww,xx,yy,zz,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,iii,jjj,kkk,lll,mmm
0,201910,2089776,19840630,35.23614,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,20180526,1,10,N,N,CALI ...,VALLE ...,CALI ...,VALLE ...,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1255032.0,0.0,1255032.0,NOM,1259738,0,0,0.0,0,0,0,0,0,0,0.0,0,0,0,0,0,629869.0,C,0,\N,1255032,\N,95511.0
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,20190710,4,10,N,N,GARZON ...,HUILA ...,\N,\N,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7970188,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4742261.859999999,SIN INFO,\N,\N,\N,\N,1929721.0
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,20190312,4,8230,N,N,PALMIRA ...,VALLE ...,\N,\N,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3073390,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1698047.975,SIN INFO,\N,\N,\N,\N,374940.0
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,20190719,2,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,0,0,0,0,15700000.0,SI,2796635.0,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7749000,0,0,0.0,1750000,0,675000,0,0,0,0.0,0,9517000,0,5,0,2185655.0,B,0,\N,\N,\N,2165030.0
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,20190311,4,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,1200000,0,0,0.0,134000,0,0,0,244000,0,0.0,0,2181000,0,2,0,-42000.0,SIN INFO,0,\N,\N,\N,77469.29


In [37]:
# (rows, columns) of the raw training frame read from train_201910.csv.
data.shape

(1118886, 65)

In [46]:
# Work on the frame held by the builder and count how often each
# rep_calif_cred value occurs.
df = test_df.original_dataframe
df.loc[:, 'rep_calif_cred'].value_counts()

A           250429
B           159792
C           154790
G           142218
D           120494
SIN INFO    100069
E            80530
H            62127
F            48437
Name: rep_calif_cred, dtype: int64

In [48]:
# Contingency table: ind_mora_vigente values as rows, rep_calif_cred values
# as columns.
pd.crosstab(index=df['ind_mora_vigente'], columns=df['rep_calif_cred'])

rep_calif_cred,A,B,C,D,E,F,G,H,SIN INFO
ind_mora_vigente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
N,249144,158664,152652,118046,78455,46784,138942,59662,99024
S,1284,1122,2095,2389,2047,1636,3184,2420,1033
\N,1,6,43,59,28,17,92,45,12


In [28]:
# Percentage of all rows taken by one entry of the mora_max frequency table.
# NOTE(review): value_counts()[1] uses an integer key; depending on the index
# dtype pandas may treat it as the label 1 or as the second position — confirm
# the intent and make it explicit with .loc[...] or .iloc[...].
df['mora_max'].value_counts()[1] / len(df) * 100

28.513539359684543

In [16]:
# Per-column non-null counts and dtypes of the relabelled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1118886 entries, 0 to 1118885
Data columns (total 65 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   periodo                        1118886 non-null  int64  
 1   id_cli                         1118886 non-null  int64  
 2   fecha_nacimiento               1118886 non-null  int64  
 3   edad                           1118886 non-null  object 
 4   genero                         1118886 non-null  object 
 5   estado_civil                   1118886 non-null  object 
 6   nivel_academico                1118886 non-null  object 
 7   profesion                      1118886 non-null  object 
 8   ocupacion                      1118886 non-null  object 
 9   tipo_vivienda                  1118886 non-null  object 
 10  ult_actual                     1118886 non-null  int64  
 11  categoria                      1118886 non-null  object 
 12  codigo_ciiu   

# Metadata report

In [22]:
class MetaDataReport:
    """Placeholder for the metadata report; holds no state or behaviour yet."""

    def __init__(self):
        """Create an empty report object; nothing is initialised."""
        return None

In [56]:
# Evaluation set, read from a local CSV; the path is relative to the
# notebook's working directory.
test = pd.read_csv(filepath_or_buffer="../data/dt2020_base_evaluar.csv")

In [57]:
# Distinct values of the profesion column in the evaluation frame.
test['profesion'].unique()

array(['DISEÑO Y PUBLICIDAD', 'ADMINISTRACION', 'OTROS', nan, 'MEDICINA',
       'INGENIERO ELECTRONICO', 'ODONTOLOGIA', 'ZOOTECNIA',
       'INGENIERIA DE SISTEMAS', 'CONTADURIA', 'DERECHO',
       'COMUNICACION SOCIAL', 'BACTERIOLOGIA', 'MERCADOTECNIA',
       'BIBLIOTECARIOS, ARCHIVEROS Y ENCARGA DE MUSEO', 'SACERDOTE',
       'PSICOLOGIA', 'EDUCACION', 'INGENIERIA CIVIL', 'INGENIERO',
       'ECONOMIA', 'ARQUITECTURA', 'COMERCIO INTERNACIONAL',
       'INGENIERIA MECANICA', 'AGRONOMIA', 'ENFERMERIA',
       'FOTOGRAFOS Y OPERADORES DE CAMARA CINE Y TV', 'NINGUNA',
       'TECNOLOGIA EN CONSTRUCCION', 'TECNOLOGIA INDUSTRIAL', 'PILOTOS',
       'INGENIERIA INDUSTRIAL', 'INGENIERIA ELECTRICA', 'SECRETARIADO',
       'TECNOLOGIA ELECTRICIDAD', 'VETERINARIA',
       'TECNOLOGIA EN ADMINISTRACION', 'INGENIERIA AGRICOLA',
       'INGENIERIA AMBIENTAL', 'INGENIRIA QUIMICA', 'TECNOLOGIA MECANICA',
       'CARRERA MILITAR', 'INGENIERIA FORESTAL', 'AUXILIAR CONTABLE',
       'DECORADORES Y DI

In [55]:
# Distinct values of the profesion column in the evaluation frame.
# NOTE(review): this repeats the previous profesion.unique() cell — keep only
# one of them unless the file is meant to be re-read in between.
test['profesion'].unique()

array(['DISEÑO Y PUBLICIDAD', 'ADMINISTRACION', 'OTROS', nan, 'MEDICINA',
       'INGENIERO ELECTRONICO', 'ODONTOLOGIA', 'ZOOTECNIA',
       'INGENIERIA DE SISTEMAS', 'CONTADURIA', 'DERECHO',
       'COMUNICACION SOCIAL', 'BACTERIOLOGIA', 'MERCADOTECNIA',
       'BIBLIOTECARIOS ARCHIVEROS Y ENCARGA DE MUSEO', 'SACERDOTE',
       'PSICOLOGIA', 'EDUCACION', 'INGENIERIA CIVIL', 'INGENIERO',
       'ECONOMIA', 'ARQUITECTURA', 'COMERCIO INTERNACIONAL',
       'INGENIERIA MECANICA', 'AGRONOMIA', 'ENFERMERIA',
       'FOTOGRAFOS Y OPERADORES DE CAMARA CINE Y TV', 'NINGUNA',
       'TECNOLOGIA EN CONSTRUCCION', 'TECNOLOGIA INDUSTRIAL', 'PILOTOS',
       'INGENIERIA INDUSTRIAL', 'INGENIERIA ELECTRICA', 'SECRETARIADO',
       'TECNOLOGIA ELECTRICIDAD', 'VETERINARIA',
       'TECNOLOGIA EN ADMINISTRACION', 'INGENIERIA AGRICOLA',
       'INGENIERIA AMBIENTAL', 'INGENIRIA QUIMICA', 'TECNOLOGIA MECANICA',
       'CARRERA MILITAR', 'INGENIERIA FORESTAL', 'AUXILIAR CONTABLE',
       'DECORADORES Y DIS

In [53]:
# (rows, columns) of the evaluation frame read from dt2020_base_evaluar.csv.
test.shape

(281666, 65)