# BC Dataton

In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns



# Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

#modeling

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

# Feature engineering and Dimension reduction
from sklearn.feature_selection import VarianceThreshold


# Utils
from tqdm import tqdm
from pandas_profiling import ProfileReport
from pycaret.regression import *


%matplotlib inline

sns.set_style("white")
matplotlib.rc('xtick', labelsize=15)
matplotlib.rc('ytick', labelsize=15)
plt.rcParams['figure.figsize'] = [16.0, 10.0]

# Data Processing class

In [7]:
class DataFrameBuilder:
    """Staged builder around a raw DataFrame.

    On construction the raw frame is relabelled with the column names read
    from ``HEADER`` and stored as ``original_dataframe``. The remaining stage
    methods are placeholders: they currently do nothing and return ``None``.
    """

    # Location of the header file; its first row holds the column names.
    HEADER="https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/header.txt"

    def __init__(self, dataframe):
        """Store a relabelled version of ``dataframe``.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Raw frame with exactly as many columns as the header file has
            names. The caller's frame keeps its own column labels.

        Raises
        ------
        ValueError
            If the number of columns differs from the number of header names.
        """
        self.original_dataframe = self._assign_columns(dataframe)

    def _assign_columns(self, dataframe):
        """Return ``dataframe`` relabelled with the names from ``HEADER``.

        The header file is read on every call (a network request while
        ``HEADER`` is a URL). The input frame is not renamed; the result is a
        shallow copy, so the underlying values are shared rather than
        duplicated.
        """
        # nrows=0: only the header row is needed, so skip parsing any data rows.
        column_names = pd.read_csv(self.HEADER, nrows=0).columns.to_list()

        # Fail early with an explicit message instead of pandas' generic
        # length-mismatch error raised by the assignment below.
        if len(column_names) != dataframe.shape[1]:
            raise ValueError(
                f"Header defines {len(column_names)} column names but the "
                f"dataframe has {dataframe.shape[1]} columns"
            )

        # Relabel a shallow copy so the caller's frame is left untouched.
        labelled = dataframe.copy(deep=False)
        labelled.columns = column_names
        return labelled

    def handle_missing_data(self):
        """Placeholder for the missing-data stage; not implemented yet."""
        pass

    def process_columns(self):
        """Placeholder for the column-processing stage; not implemented yet."""
        pass

    def remove_columns(self):
        """Placeholder for the column-removal stage; not implemented yet."""
        pass

    def create_columns(self):
        """Placeholder for the column-creation stage; not implemented yet."""
        pass

    def build(self):
        """Placeholder for assembling the final frame; not implemented yet."""
        pass
        

In [8]:
# Wrap the raw frame in the builder and preview the frame the builder holds,
# rather than relying on `data` having been renamed as a side effect.
test_df = DataFrameBuilder(data)
test_df.original_dataframe.head()

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,ult_actual,categoria,codigo_ciiu,ind_mora_vigente,cartera_castigada,ciudad_residencia,departamento_residencia,ciudad_laboral,departamento_laboral,rechazo_credito,mora_max,cant_moras_30_ult_12_meses,cant_moras_60_ult_12_meses,cant_moras_90_ult_12_meses,cupo_total_tc,tenencia_tc,cuota_tc_bancolombia,tiene_consumo,tiene_crediagil,nro_tot_cuentas,ctas_activas,tiene_ctas_activas,ctas_embargadas,tiene_ctas_embargadas,pension_fopep,cuota_cred_hipot,tiene_cred_hipo_1,tiene_cred_hipo_2,mediana_nom3,mediana_pen3,ingreso_nompen,cat_ingreso,ingreso_final,cant_mora_30_tdc_ult_3m_sf,cant_mora_30_consum_ult_3m_sf,cuota_de_vivienda,cuota_de_consumo,cuota_rotativos,cuota_tarjeta_de_credito,cuota_de_sector_solidario,cuota_sector_real_comercio,cupo_tc_mdo,saldo_prom3_tdc_mdo,cuota_tc_mdo,saldo_no_rot_mdo,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201910,2089776,19840630,35.23614,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,20180526,1,10,N,N,CALI ...,VALLE ...,CALI ...,VALLE ...,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1255032.0,0.0,1255032.0,NOM,1259738,0,0,0.0,0,0,0,0,0,0,0.0,0,0,0,0,0,629869.0,C,0,\N,1255032,\N,95511.0
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,20190710,4,10,N,N,GARZON ...,HUILA ...,\N,\N,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7970188,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4742261.859999999,SIN INFO,\N,\N,\N,\N,1929721.0
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,20190312,4,8230,N,N,PALMIRA ...,VALLE ...,\N,\N,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3073390,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1698047.975,SIN INFO,\N,\N,\N,\N,374940.0
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,20190719,2,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,0,0,0,0,15700000.0,SI,2796635.0,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7749000,0,0,0.0,1750000,0,675000,0,0,0,0.0,0,9517000,0,5,0,2185655.0,B,0,\N,\N,\N,2165030.0
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,20190311,4,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,1200000,0,0,0.0,134000,0,0,0,244000,0,0.0,0,2181000,0,2,0,-42000.0,SIN INFO,0,\N,\N,\N,77469.29


In [2]:
# Load the 201910 training snapshot from the public bucket and preview it.
# NOTE(review): the preview shows literal `\N` strings in many columns; they
# look like a missing-value marker -- confirm before relying on column dtypes.
train_201910_url = "https://bc-dataton2020.s3.amazonaws.com/dataton_all_data/train/train_201910.csv"
data = pd.read_csv(train_201910_url)
data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,aa,bb,cc,dd,ee,ff,gg,hh,ii,jj,kk,ll,mm,nn,oo,pp,qq,rr,ss,tt,uu,vv,ww,xx,yy,zz,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,iii,jjj,kkk,lll,mmm
0,201910,2089776,19840630,35.23614,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,20180526,1,10,N,N,CALI ...,VALLE ...,CALI ...,VALLE ...,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,1255032.0,0.0,1255032.0,NOM,1259738,0,0,0.0,0,0,0,0,0,0,0.0,0,0,0,0,0,629869.0,C,0,\N,1255032,\N,95511.0
1,201910,2088434,19880109,31.709788,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,20190710,4,10,N,N,GARZON ...,HUILA ...,\N,\N,\N,\N,\N,\N,\N,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7970188,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,4742261.859999999,SIN INFO,\N,\N,\N,\N,1929721.0
2,201910,2088089,19860727,33.163587,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,20190312,4,8230,N,N,PALMIRA ...,VALLE ...,\N,\N,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,3073390,\N,\N,0.0,0,0,0,0,0,0,0.0,0,0,0,\N,\N,1698047.975,SIN INFO,\N,\N,\N,\N,374940.0
3,201910,4780572,19940208,25.626283,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,20190719,2,90,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,0,0,0,0,15700000.0,SI,2796635.0,X,X,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,7749000,0,0,0.0,1750000,0,675000,0,0,0,0.0,0,9517000,0,5,0,2185655.0,B,0,\N,\N,\N,2165030.0
4,201910,3894402,19910809,28.128679,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,20190311,4,10,N,N,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,BOGOTA D.C. ...,\N,1,0,0,0,0.0,NO,0.0,\N,\N,1,1,X,0,\N,\N,\N,\N,\N,0.0,0.0,0.0,\N,1200000,0,0,0.0,134000,0,0,0,244000,0,0.0,0,2181000,0,2,0,-42000.0,SIN INFO,0,\N,\N,\N,77469.29


In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(1042180, 65)

In [9]:
# Read the column names from the local header file and apply them to `data`
# (this relabels `data` in place).
column_names = list(pd.read_csv("../data/header.txt").columns)
data.columns = column_names

In [10]:
# Preview the first rows of `data` with its current column labels.
data.head()

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,...,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201902,2089776,19840630,34.54893908281998,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311306.0,C,0,\N,1172612,\N,170490.0
1,201902,2088434,19880109,31.02258726899384,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,...,0,\N,\N,947070.0,SIN INFO,\N,\N,\N,\N,41041.0
2,201902,4780572,19940208,24.93908281998631,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,...,0,7,0,1114487.0775,G,0,\N,\N,\N,959126.0
3,201902,2088089,19860727,32.47638603696099,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,...,0,\N,\N,1187025.0,SIN INFO,\N,\N,\N,\N,187990.0
4,201902,3892351,19910108,28.02464065708419,M,SOLTERO,TECNOLOGO,OTROS,Independiente,\N,...,0,\N,\N,4020204.37,SIN INFO,\N,70395,\N,\N,1323439.0


In [11]:
# Per-column summary: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042180 entries, 0 to 1042179
Data columns (total 65 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   periodo                        1042180 non-null  int64  
 1   id_cli                         1042180 non-null  int64  
 2   fecha_nacimiento               1042180 non-null  int64  
 3   edad                           1042180 non-null  object 
 4   genero                         1042180 non-null  object 
 5   estado_civil                   1042180 non-null  object 
 6   nivel_academico                1042180 non-null  object 
 7   profesion                      1042180 non-null  object 
 8   ocupacion                      1042180 non-null  object 
 9   tipo_vivienda                  1042180 non-null  object 
 10  ult_actual                     1042180 non-null  int64  
 11  categoria                      1042180 non-null  object 
 12  codigo_ciiu   

In [5]:
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.
# NOTE(review): this rebinds `data` to the 202011 snapshot, replacing whatever
# frame an earlier cell loaded under the same name.
data = pd.read_csv("../data/train_202011.csv")
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,ddd,eee,fff,ggg,hhh,iii,jjj,kkk,lll,mmm
0,202011,2089776,19840630,36.306639,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,0,0,569250.0,C,0,\N,\N,1150000,0.0
1,202011,2088434,19880109,32.780287,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Profesional Independiente,NO INFORMA,...,0,3,0,2104093.335,D,0,\N,\N,\N,2679975.0
2,202011,2088089,19860727,34.234086,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,...,0,\N,\N,1076100.0,SIN INFO,\N,\N,\N,\N,998342.0
3,202011,3894402,19910809,29.199179,M,SOLTERO,TECNOLOGO,DISEÑO Y PUBLICIDAD,Independiente,\N,...,0,1,0,291375.0,SIN INFO,0,\N,\N,\N,337689.99
4,202011,3892351,19910108,29.782341,M,SOLTERO,TECNOLOGO,OTROS,Independiente,\N,...,0,1,0,932844.32,B,0,\N,\N,2800000,818171.0
