# Seleção de características para aprovação de crédito

In [242]:
from csv import reader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import time
import statsmodels.api as sm
from sklearn.feature_selection import SelectFromModel

In [182]:
# fixar a semente do gerador de números aleatórios
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [183]:
# mostrar todas as colunas do dataset
pd.set_option('display.max_columns', None)

### 1. Explorando o dataset

In [184]:
# carregando o dataset
df = pd.read_csv("datasets/application_data.csv", sep=",", encoding="latin-1")
df.head(10)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,100008,0,Cash loans,M,N,Y,0,99000.0,490495.5,27517.5,454500.0,"Spouse, partner",State servant,Secondary / secondary special,Married,House / apartment,0.035792,-16941,-1588,-4970.0,-477,,1,1,1,1,1,0,Laborers,2.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,Other,,0.354225,0.621226,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2536.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
6,100009,0,Cash loans,F,Y,Y,1,171000.0,1560726.0,41301.0,1395000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.035792,-13778,-3130,-1213.0,-619,17.0,1,1,0,1,1,0,Accountants,3.0,2,2,SUNDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.774761,0.724,0.49206,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-1562.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,1.0,2.0
7,100010,0,Cash loans,M,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.003122,-18850,-449,-4597.0,-2379,8.0,1,1,1,1,0,0,Managers,2.0,3,3,MONDAY,16,0,0,0,0,1,1,Other,,0.714279,0.540654,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-1070.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
8,100011,0,Cash loans,F,N,Y,0,112500.0,1019610.0,33826.5,913500.0,Children,Pensioner,Secondary / secondary special,Married,House / apartment,0.018634,-20099,365243,-7427.0,-3514,,1,0,0,1,0,0,,2.0,2,2,WEDNESDAY,14,0,0,0,0,0,0,XNA,0.587334,0.205747,0.751724,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
9,100012,0,Revolving loans,M,N,Y,0,135000.0,405000.0,20250.0,405000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.019689,-14469,-2019,-14437.0,-3992,,1,1,0,1,0,0,Laborers,1.0,2,2,THURSDAY,8,0,0,0,0,0,0,Electricity,,0.746644,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-1673.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


In [185]:
df.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,307511.0,307511.0,104582.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307509.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,134133.0,306851.0,246546.0,151450.0,127568.0,157504.0,103023.0,92646.0,143620.0,152683.0,154491.0,98869.0,124921.0,97312.0,153161.0,93997.0,137829.0,151450.0,127568.0,157504.0,103023.0,92646.0,143620.0,152683.0,154491.0,98869.0,124921.0,97312.0,153161.0,93997.0,137829.0,151450.0,127568.0,157504.0,103023.0,92646.0,143620.0,152683.0,154491.0,98869.0,124921.0,97312.0,153161.0,93997.0,137829.0,159080.0,306490.0,306490.0,306490.0,306490.0,307510.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,-4986.120328,-2994.202373,12.061091,0.999997,0.819889,0.199368,0.998133,0.281066,0.05672,2.152665,2.052463,2.031521,12.063419,0.015144,0.050769,0.040659,0.078173,0.230454,0.179555,0.50213,0.5143927,0.510853,0.11744,0.088442,0.977735,0.752471,0.044621,0.078942,0.149725,0.226282,0.231894,0.066333,0.100775,0.107399,0.008809,0.028358,0.114231,0.087543,0.977065,0.759637,0.042553,0.07449,0.145193,0.222315,0.228058,0.064958,0.105645,0.105975,0.008076,0.027022,0.11785,0.087955,0.977752,0.755746,0.044595,0.078078,0.149213,0.225897,0.231625,0.067169,0.101954,0.108607,0.008651,0.028236,0.102547,1.422245,0.143421,1.405292,0.100049,-962.858788,4.2e-05,0.710023,8.1e-05,0.015115,0.088055,0.000192,0.081376,0.003896,2.3e-05,0.003912,7e-06,0.003525,0.002936,0.00121,0.009928,0.000267,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,3522.886321,1509.450419,11.944812,0.001803,0.38428,0.399526,0.043164,0.449521,0.231307,0.910682,0.509034,0.502737,3.265832,0.122126,0.219526,0.197499,0.268444,0.421124,0.383817,0.211062,0.1910602,0.194844,0.10824,0.082438,0.059223,0.11328,0.076036,0.134576,0.100049,0.144641,0.16138,0.081184,0.092576,0.110565,0.047732,0.069523,0.107936,0.084307,0.064575,0.110111,0.074445,0.132256,0.100977,0.143709,0.16116,0.08175,0.09788,0.111845,0.046276,0.070254,0.109076,0.082179,0.059897,0.112066,0.076144,0.134467,0.100368,0.145067,0.161934,0.082167,0.093642,0.11226,0.047415,0.070166,0.107462,2.400989,0.446698,2.379803,0.362291,826.808487,0.006502,0.453752,0.009016,0.12201,0.283376,0.01385,0.273412,0.062295,0.004771,0.062424,0.00255,0.059268,0.05411,0.03476,0.099144,0.016327,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,-24672.0,-7197.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014568,8.173617e-08,0.000527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,-7479.5,-4299.0,5.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334007,0.3924574,0.37065,0.0577,0.0442,0.9767,0.6872,0.0078,0.0,0.069,0.1667,0.0833,0.0187,0.0504,0.0453,0.0,0.0,0.0525,0.0407,0.9767,0.6994,0.0072,0.0,0.069,0.1667,0.0833,0.0166,0.0542,0.0427,0.0,0.0,0.0583,0.0437,0.9767,0.6914,0.0079,0.0,0.069,0.1667,0.0833,0.0187,0.0513,0.0457,0.0,0.0,0.0412,0.0,0.0,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,-4504.0,-3254.0,9.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.505998,0.5659614,0.535276,0.0876,0.0763,0.9816,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0481,0.0756,0.0745,0.0,0.0036,0.084,0.0746,0.9816,0.7648,0.019,0.0,0.1379,0.1667,0.2083,0.0458,0.0771,0.0731,0.0,0.0011,0.0864,0.0758,0.9816,0.7585,0.0208,0.0,0.1379,0.1667,0.2083,0.0487,0.0761,0.0749,0.0,0.0031,0.0688,0.0,0.0,0.0,0.0,-757.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,-2010.0,-1720.0,15.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,2.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.675053,0.6636171,0.669057,0.1485,0.1122,0.9866,0.8232,0.0515,0.12,0.2069,0.3333,0.375,0.0856,0.121,0.1299,0.0039,0.0277,0.1439,0.1124,0.9866,0.8236,0.049,0.1208,0.2069,0.3333,0.375,0.0841,0.1313,0.1252,0.0039,0.0231,0.1489,0.1116,0.9866,0.8256,0.0513,0.12,0.2069,0.3333,0.375,0.0868,0.1231,0.1303,0.0039,0.0266,0.1276,2.0,0.0,2.0,0.0,-274.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,0.0,0.0,91.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,3.0,3.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0,0.962693,0.8549997,0.89601,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,348.0,34.0,344.0,24.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


In [186]:
df["NAME_CONTRACT_TYPE"].value_counts()

Cash loans         278232
Revolving loans     29279
Name: NAME_CONTRACT_TYPE, dtype: int64

In [187]:
df["CODE_GENDER"].value_counts()

F      202448
M      105059
XNA         4
Name: CODE_GENDER, dtype: int64

In [188]:
df["FLAG_OWN_CAR"].value_counts()

N    202924
Y    104587
Name: FLAG_OWN_CAR, dtype: int64

In [189]:
df["FLAG_OWN_REALTY"].value_counts()

Y    213312
N     94199
Name: FLAG_OWN_REALTY, dtype: int64

### 2. Dividindo conjuntos de teste e treinamento

In [264]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
print("{} train + {} test".format(len(train_set), len(test_set)))

246008 train + 61503 test


In [266]:
train_set.head(10)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
123473,243191,0,Cash loans,F,Y,N,0,171000.0,555273.0,16366.5,463500.0,Unaccompanied,Pensioner,Secondary / secondary special,Widow,House / apartment,0.035792,-23349,365243,-3595.0,-4408,31.0,1,0,0,1,0,0,,1.0,2,2,TUESDAY,9,0,0,0,0,0,0,XNA,0.524685,0.358568,0.563835,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2058.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
10118,111778,0,Cash loans,M,N,Y,1,157500.0,198085.5,23638.5,171000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010032,-10921,-117,-4281.0,-3399,,1,1,1,1,1,0,Laborers,3.0,2,2,SATURDAY,7,0,0,0,0,0,0,Business Entity Type 2,0.244926,0.490305,0.595456,0.0784,0.0633,0.9742,0.6464,0.0266,0.0,0.1379,0.1667,0.2083,0.0409,0.063,0.0594,0.0039,0.0149,0.0798,0.0657,0.9742,0.6602,0.0269,0.0,0.1379,0.1667,0.2083,0.0418,0.0689,0.0619,0.0039,0.0158,0.0791,0.0633,0.9742,0.6511,0.0268,0.0,0.1379,0.1667,0.2083,0.0416,0.0641,0.0605,0.0039,0.0153,reg oper account,block of flats,0.0645,"Stone, brick",No,1.0,0.0,1.0,0.0,-73.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
64716,175057,1,Cash loans,M,Y,Y,0,135000.0,776304.0,25173.0,648000.0,Unaccompanied,Working,Lower secondary,Civil marriage,House / apartment,0.035792,-23213,-2157,-5680.0,-5009,8.0,1,1,0,1,0,0,Drivers,2.0,2,2,FRIDAY,13,0,0,0,0,0,0,Self-employed,,0.643404,0.706205,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-1959.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
234940,372147,0,Cash loans,M,Y,Y,1,164133.0,900000.0,36787.5,900000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.030755,-10703,-2530,-2618.0,-2751,15.0,1,1,1,1,1,0,High skill tech staff,3.0,2,2,TUESDAY,10,0,0,0,0,1,1,Trade: type 3,0.288642,0.426431,0.506484,0.1495,0.1136,0.9841,0.7824,0.0942,0.16,0.1379,0.3333,0.0417,0.0374,0.121,0.0917,0.0039,0.2368,0.1523,0.1179,0.9841,0.7909,0.0951,0.1611,0.1379,0.3333,0.0417,0.0382,0.1322,0.0955,0.0039,0.2507,0.1509,0.1136,0.9841,0.7853,0.0948,0.16,0.1379,0.3333,0.0417,0.038,0.1231,0.0933,0.0039,0.2418,reg oper account,terraced house,0.1222,Panel,No,0.0,0.0,0.0,0.0,-531.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0
236051,373412,0,Cash loans,M,N,Y,0,225000.0,533668.5,21294.0,477000.0,"Spouse, partner",Commercial associate,Secondary / secondary special,Married,House / apartment,0.025164,-15798,-3520,-8006.0,-5001,,1,1,0,1,0,0,Laborers,2.0,2,2,SATURDAY,12,0,0,0,0,0,0,Industry: type 11,0.79021,0.445701,0.528093,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,1.0,6.0,0.0,-9.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
30611,135529,0,Cash loans,F,Y,N,0,405000.0,675000.0,30631.5,675000.0,,Working,Higher education,Married,House / apartment,0.010006,-16721,-1482,-1432.0,-265,0.0,1,1,0,1,0,0,Accountants,2.0,2,1,SATURDAY,11,0,0,0,0,0,0,Industry: type 9,,0.71603,0.546023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,4.0,0.0,-128.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
871,100999,0,Cash loans,M,Y,N,0,450000.0,2125953.0,81108.0,1984500.0,Unaccompanied,Commercial associate,Higher education,Single / not married,With parents,0.072508,-10121,-1354,-1393.0,-2672,2.0,1,1,0,1,1,0,High skill tech staff,1.0,1,1,FRIDAY,16,0,0,0,0,0,0,Business Entity Type 1,0.386333,0.683097,,0.1962,0.0974,0.9806,0.7348,0.046,0.32,0.1379,0.4583,0.5,0.0188,0.16,0.1721,0.0,0.0017,0.2006,0.0951,0.9806,0.7452,0.0464,0.3222,0.1379,0.4583,0.5,0.0188,0.1754,0.1788,0.0,0.0012,0.1988,0.0985,0.9806,0.7383,0.0463,0.32,0.1379,0.4583,0.5,0.0191,0.1633,0.1752,0.0,0.0012,reg oper account,block of flats,0.1356,Panel,No,0.0,0.0,0.0,0.0,-4.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
153082,277437,0,Cash loans,F,N,Y,0,180000.0,679500.0,24201.0,679500.0,Unaccompanied,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,0.0228,-17676,-2535,-472.0,-1229,,1,1,0,1,1,0,Private service staff,1.0,2,2,FRIDAY,11,0,0,0,0,0,0,Services,,0.181508,0.7463,0.1907,0.1166,0.9985,0.9796,,0.2,0.1724,0.375,0.4167,0.0904,0.1555,0.2344,,0.0107,0.1943,0.121,0.9985,0.9804,,0.2014,0.1724,0.375,0.4167,0.0925,0.1699,0.2442,,0.0113,0.1926,0.1166,0.9985,0.9799,,0.2,0.1724,0.375,0.4167,0.092,0.1582,0.2386,,0.0109,reg oper account,block of flats,0.1867,"Stone, brick",No,1.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
188110,318071,0,Revolving loans,M,N,Y,1,180000.0,270000.0,13500.0,270000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.020246,-16568,-2155,-3417.0,-122,,1,1,0,1,1,0,High skill tech staff,3.0,3,3,FRIDAY,13,0,0,0,0,0,0,Business Entity Type 3,,0.501046,0.725276,0.1082,,0.9955,,,,0.2414,0.1667,,,,,,,0.1103,,0.9955,,,,0.2414,0.1667,,,,,,,0.1093,,0.9955,,,,0.2414,0.1667,,,,,,,reg oper account,block of flats,0.087,"Stone, brick",No,0.0,0.0,0.0,0.0,-1043.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
278046,422174,0,Cash loans,M,Y,Y,0,252000.0,1762110.0,48586.5,1575000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.006008,-19355,-1731,-4277.0,-2751,9.0,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,11,0,0,0,0,1,1,Construction,,0.288642,0.636376,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-552.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


### 3. Correlação entre as features

In [192]:
def correlation_color(value):
    if value == 1:
        color = 'gold'
    elif abs(value) > 0.75:
        color = 'royalblue'
    elif value < 0:
        color = 'red'
    else:
        color = 'green'
    return 'color: %s' % color

In [193]:
pd.set_option('display.max_rows', None)
correlation = train_set.corr().style.applymap(correlation_color)
correlation

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,1.0,-0.002024,-0.001221,0.001372,-0.001609,-0.001063,-0.001312,-0.000942,-0.001882,0.002071,-0.001187,0.00071,0.000981,0.003136,-0.002049,0.000281,0.004034,0.002148,-0.000863,-0.003137,-0.000763,-0.000786,0.000946,0.000397,0.000776,0.002279,-0.000118,-0.001267,-0.000359,-9.5e-05,0.003092,0.000274,0.000393,-0.00559,0.003596,0.00731,-0.000823,0.004035,-0.004504,0.0046,0.005596,0.001168,0.002482,0.001134,-0.002142,-0.0005,0.000243,-0.005069,0.003892,0.006794,-0.000352,0.003914,-0.004811,0.004272,0.004256,0.001,0.001921,0.001086,-0.001827,-0.001862,0.000784,-0.005241,0.003624,0.007266,-0.000409,0.004046,-0.004133,0.004287,0.005188,0.001618,0.00253,0.001632,-0.002441,-0.001136,0.001014,0.000243,0.001163,0.00026,0.001735,-0.000299,0.000209,-0.003957,-0.005857,-0.001409,0.003358,-0.003075,0.001067,0.001086,-0.001243,-0.004128,0.000702,-6.9e-05,-0.000309,0.00327,-0.00142,-9.3e-05,0.00053,-0.000847,0.001247,0.002815,-0.000585,-0.001942,0.000252,-0.000607,0.001854,0.005662
TARGET,-0.002024,1.0,0.019642,-0.020385,-0.031533,-0.012582,-0.04066,-0.036909,0.077632,-0.04464,0.041712,0.052218,0.036835,0.000598,0.045673,0.029074,0.000512,-0.023587,-1.1e-05,0.009801,0.058058,0.060598,-0.022704,0.006011,0.006303,0.002366,0.044483,0.051017,0.033188,-0.154135,-0.158824,-0.180096,-0.028946,-0.02331,-0.011198,-0.022327,-0.020627,-0.03436,-0.019233,-0.044281,-0.033305,-0.00984,-0.023828,-0.033383,-0.002949,-0.012936,-0.02685,-0.02009,-0.010345,-0.021977,-0.01793,-0.032071,-0.017558,-0.043344,-0.032158,-0.009462,-0.022209,-0.031312,-0.001535,-0.01235,-0.028737,-0.02268,-0.011609,-0.022363,-0.020712,-0.033986,-0.019226,-0.043838,-0.032772,-0.010387,-0.023392,-0.033315,-0.002569,-0.012798,-0.03265,0.008703,0.032613,0.008589,0.032088,0.053825,0.008772,0.044548,-0.002536,-0.001123,-0.028355,-0.002006,-0.00762,-0.003793,-0.001195,-0.003962,-0.000598,-0.011088,-0.009514,-0.005737,-0.012169,-0.00272,-0.008237,-0.000343,-0.001244,0.002427,0.001335,0.003512,0.000275,-0.012397,-0.002196,0.019169
CNT_CHILDREN,-0.001221,0.019642,1.0,0.025701,0.002881,0.021247,-0.000991,-0.026918,0.331833,-0.240403,0.18393,-0.029351,0.007759,0.001164,0.24133,0.054704,-0.002713,-0.029915,0.02291,0.878804,0.02548,0.02493,-0.007067,-0.013321,0.008592,0.014845,0.021868,0.070409,0.068292,-0.139433,-0.017404,-0.04202,-0.013528,-0.009431,0.007781,0.031814,0.002389,-0.006763,-0.009089,-0.008641,-0.0066,-0.003715,-0.00787,-0.009431,0.004291,0.000556,-0.012625,-0.009721,0.006711,0.031408,0.00234,-0.006014,-0.007615,-0.008722,-0.006083,-0.002982,-0.007314,-0.009044,0.004211,0.00082,-0.013156,-0.009509,0.007121,0.031815,0.002826,-0.006416,-0.008925,-0.008505,-0.00619,-0.003267,-0.006913,-0.009404,0.004382,0.000787,-0.007798,0.01636,-0.00029,0.015985,-0.001003,-0.005158,0.000663,0.058563,-0.003622,-0.017716,-0.15752,-0.002415,0.050594,-0.001918,-0.002329,-0.005119,0.001633,0.005831,-0.005074,0.003138,0.011008,-0.000713,0.00531,-0.000356,0.001322,-0.002129,4.3e-05,0.000196,-0.001255,-0.009388,-0.007181,-0.042203
AMT_INCOME_TOTAL,0.001372,-0.020385,0.025701,1.0,0.357562,0.435325,0.364242,0.171751,0.060355,-0.145206,0.066522,0.021762,-0.12721,0.000819,0.144617,-0.038207,-0.020141,0.00327,0.087453,0.03307,-0.194235,-0.208422,0.078376,0.070569,0.14187,0.13231,0.00909,0.016074,0.020054,0.083831,0.145754,-0.066601,0.101516,0.055962,0.015463,0.042071,0.088932,0.139472,0.011244,0.179103,0.140101,-0.006186,0.105384,0.120164,0.03031,0.078203,0.087351,0.041127,0.014917,0.03699,0.074413,0.127176,0.00098,0.171566,0.132046,-0.013727,0.09083,0.104515,0.025537,0.06495,0.099237,0.052979,0.015419,0.041628,0.087013,0.136647,0.009193,0.177395,0.138553,-0.007306,0.103214,0.117901,0.028881,0.074243,0.124444,-0.027297,-0.029278,-0.027128,-0.029182,-0.046999,-0.001448,-0.040541,-0.000119,0.004564,-0.10277,0.006936,0.166581,0.04491,-0.000206,0.003544,0.001091,0.051193,0.046669,0.023187,0.016738,0.006281,0.005306,0.006831,0.001324,-0.0003,0.001632,0.005068,0.005168,0.062008,0.013163,0.027036
AMT_CREDIT,-0.001609,-0.031533,0.002881,0.357562,1.0,0.770359,0.986955,0.097953,-0.055252,-0.067502,0.010038,-0.006741,-0.094738,0.001606,0.06616,-0.021872,0.023976,0.026688,0.019223,0.064167,-0.102131,-0.111017,0.05089,0.023056,0.052601,0.053394,-0.027407,-0.018162,0.001254,0.167989,0.132473,0.045875,0.060286,0.041331,0.006499,0.036643,0.050339,0.080349,0.014001,0.104794,0.079595,0.005929,0.059052,0.072824,0.01386,0.039871,0.052874,0.032721,0.005283,0.034081,0.043185,0.074875,0.008781,0.10171,0.076396,0.001933,0.05147,0.064825,0.010615,0.034127,0.058656,0.0395,0.005974,0.03631,0.049209,0.079038,0.012864,0.10418,0.079089,0.005214,0.057185,0.07174,0.01276,0.037783,0.074149,-0.000702,-0.021488,-0.000661,-0.023755,-0.073785,0.009645,0.096557,-0.002194,-0.011339,-0.046796,-0.004959,0.083729,0.022309,-0.003072,0.028218,0.001502,0.052741,0.048107,0.031575,0.06239,0.011302,0.034515,0.021769,0.032664,-0.015925,-0.003843,0.006114,-0.001935,0.054155,0.01551,-0.04968
AMT_ANNUITY,-0.001063,-0.012582,0.021247,0.435325,0.770359,1.0,0.77535,0.117095,0.009566,-0.105318,0.039087,0.011685,-0.098027,0.000168,0.104482,-0.025251,0.022208,0.01157,0.07394,0.075526,-0.128484,-0.141539,0.051109,0.041336,0.080677,0.075795,-0.006503,0.002112,0.011454,0.120596,0.126774,0.031494,0.075991,0.046145,0.013609,0.035165,0.057093,0.101733,0.013687,0.132377,0.101352,0.008598,0.074147,0.090557,0.020475,0.051852,0.065603,0.035516,0.013132,0.032006,0.048065,0.093331,0.006521,0.128272,0.096049,0.003151,0.064066,0.079882,0.015538,0.042943,0.073744,0.044104,0.013233,0.034753,0.056311,0.099999,0.0123,0.131273,0.100285,0.007575,0.071805,0.089075,0.019677,0.049119,0.092114,-0.01187,-0.023452,-0.011587,-0.024215,-0.063902,0.00458,0.102753,-0.001142,-0.004693,-0.073746,-0.004132,0.131168,0.033336,-0.003472,-0.003764,-0.000451,0.024961,0.034671,0.015757,0.006809,0.003604,-0.010289,0.005176,0.012892,-0.01686,0.003953,0.003006,0.013476,0.039238,0.009561,-0.013983
AMT_GOODS_PRICE,-0.001312,-0.04066,-0.000991,0.364242,0.986955,0.77535,1.0,0.101619,-0.05304,-0.06584,0.012074,-0.009614,-0.104689,0.001564,0.06446,0.000343,0.020996,0.042694,0.019625,0.06221,-0.103992,-0.112264,0.060574,0.025236,0.053797,0.053499,-0.027706,-0.019556,-7.1e-05,0.175015,0.140432,0.050056,0.064779,0.046251,0.007477,0.042663,0.05115,0.083719,0.017826,0.110123,0.082142,0.013075,0.062171,0.078,0.015106,0.043709,0.057284,0.037436,0.006273,0.040052,0.043771,0.078328,0.012665,0.106967,0.078738,0.008849,0.05407,0.070015,0.01157,0.037972,0.06316,0.044338,0.007043,0.042272,0.05001,0.082447,0.016733,0.109467,0.081548,0.012419,0.060284,0.076915,0.013929,0.041634,0.078937,-0.000549,-0.022551,-0.000547,-0.024578,-0.076661,0.011446,0.075196,-0.001861,-0.004907,-0.050575,-0.000817,0.08142,0.022141,-0.002044,0.034433,0.001968,0.052631,0.047906,0.032398,0.058453,0.011679,0.033009,0.0205,0.03334,-0.014268,-0.003291,0.006328,-0.001759,0.056327,0.015887,-0.052335
REGION_POPULATION_RELATIVE,-0.000942,-0.036909,-0.026918,0.171751,0.097953,0.117095,0.101619,1.0,-0.030404,-0.003382,-0.052156,-0.003835,-0.081586,0.00149,0.00345,-0.015024,-0.012543,0.09126,0.038903,-0.025705,-0.532844,-0.531486,0.171033,0.000648,0.055935,0.080127,-0.050418,-0.043247,-0.014754,0.101479,0.198595,-0.00589,0.205467,0.099546,-0.006774,-0.055003,0.162871,0.28055,0.036625,0.322333,0.293579,-0.0515,0.192911,0.212695,0.025893,0.078004,0.173883,0.066255,-0.007163,-0.062653,0.128436,0.251236,0.015436,0.303513,0.274465,-0.061374,0.160769,0.17937,0.017268,0.051839,0.200754,0.095084,-0.006751,-0.056329,0.157778,0.273925,0.032995,0.317932,0.290075,-0.05281,0.187316,0.208327,0.023173,0.068591,0.202933,-0.009375,0.007467,-0.008782,0.002727,-0.044572,-0.002112,-0.083606,0.006696,0.016205,0.000586,0.001711,0.087742,0.038103,-0.000661,0.025717,0.000808,0.029038,0.031321,0.00843,0.006269,0.00647,0.013253,0.001156,0.000744,0.004158,-0.003658,-0.000332,-0.002581,0.07884,-0.000441,-1e-06
DAYS_BIRTH,-0.001882,0.077632,0.331833,0.060355,-0.055252,0.009566,-0.05304,-0.030404,1.0,-0.615473,0.33188,0.270902,0.007282,-0.003451,0.619504,0.173095,-0.015977,-0.043145,0.086757,0.278884,0.009365,0.008432,0.091464,0.067227,0.096825,0.069691,0.182104,0.242249,0.157873,-0.600012,-0.092279,-0.204244,0.003219,-0.00548,0.000217,0.027228,0.009552,-0.002772,-0.010992,0.001144,0.001769,0.002558,0.009724,-0.002469,0.000996,0.004054,0.00338,-0.005403,0.000377,0.026457,0.010765,-0.002146,-0.009891,0.000777,0.001027,0.00376,0.009681,-0.002101,0.001722,0.00428,0.003565,-0.005295,0.000261,0.027541,0.01035,-0.002669,-0.010429,0.001226,0.002015,0.003341,0.010278,-0.002081,0.000839,0.004814,-0.000474,0.007609,0.000344,0.007229,0.001943,0.082662,-0.00107,0.110863,-0.00337,0.016436,-0.407413,0.000924,0.111947,0.017816,5.1e-05,0.043657,0.000284,0.027111,0.032061,0.011893,0.024133,0.006978,0.043548,0.00337,0.009233,0.025883,0.005395,0.003978,-1.2e-05,0.002286,-0.012064,-0.072844
DAYS_EMPLOYED,0.002071,-0.04464,-0.240403,-0.145206,-0.067502,-0.105318,-0.06584,-0.003382,-0.615473,1.0,-0.211192,-0.271333,0.029482,0.000915,-0.999749,-0.233728,0.013469,0.015928,-0.061621,-0.233676,0.033272,0.03491,-0.091249,-0.036136,-0.107352,-0.095593,-0.090577,-0.254442,-0.217898,0.288099,-0.021045,0.111481,-0.013783,-0.000824,0.009922,-0.006633,-0.013009,-0.007494,0.005622,-0.015983,-0.015858,-0.008927,-0.01846,-0.010793,-0.001662,-0.010012,-0.011797,0.00021,0.009739,-0.00596,-0.013462,-0.006209,0.006642,-0.01461,-0.01483,-0.007903,-0.017275,-0.009213,-0.002342,-0.009156,-0.013565,-0.00112,0.009663,-0.007129,-0.0136,-0.007505,0.005539,-0.015796,-0.016262,-0.009121,-0.018611,-0.010934,-0.001701,-0.010183,-0.014008,0.005923,0.016765,0.006027,0.013542,0.024598,-0.002692,-0.249541,0.002142,-0.020646,0.599076,-0.002014,-0.121442,-0.023934,-0.001881,-0.028896,-0.000917,-0.025496,-0.02335,-0.014359,-0.043186,-0.006708,-0.0404,-0.009969,-0.009906,-0.008067,-0.004436,-0.000671,0.001549,-0.035635,0.01537,0.051682


In [194]:
# corr_list = []

# for col in correlation.columns:  
#     value = train_set.corrwith(col, axis=0)
#     if abs(value) > 0.75:
#             corr_list.append(col)
            
# print(corr_list)

### 4. Separando a variável dependente: TARGET

In [195]:
x_train = train_set.drop(columns=["TARGET"])
y_train = train_set["TARGET"]
x_test = test_set.drop(columns=["TARGET"])
y_test = test_set["TARGET"]

In [196]:
y_train.head(10)

123473    0
10118     0
64716     1
234940    0
236051    0
30611     0
871       0
153082    0
188110    0
278046    0
Name: TARGET, dtype: int64

### 5. Limpando o dataset (NaN)

In [197]:
x_train.isnull().sum().sort_values(ascending = False)
x_test.isnull().sum().sort_values(ascending = False)

COMMONAREA_MEDI                 42936
COMMONAREA_AVG                  42936
COMMONAREA_MODE                 42936
NONLIVINGAPARTMENTS_MODE        42646
NONLIVINGAPARTMENTS_MEDI        42646
NONLIVINGAPARTMENTS_AVG         42646
FONDKAPREMONT_MODE              42009
LIVINGAPARTMENTS_MEDI           42003
LIVINGAPARTMENTS_MODE           42003
LIVINGAPARTMENTS_AVG            42003
FLOORSMIN_AVG                   41688
FLOORSMIN_MEDI                  41688
FLOORSMIN_MODE                  41688
YEARS_BUILD_MEDI                40805
YEARS_BUILD_MODE                40805
YEARS_BUILD_AVG                 40805
OWN_CAR_AGE                     40742
LANDAREA_AVG                    36461
LANDAREA_MEDI                   36461
LANDAREA_MODE                   36461
BASEMENTAREA_AVG                35928
BASEMENTAREA_MODE               35928
BASEMENTAREA_MEDI               35928
EXT_SOURCE_1                    34854
NONLIVINGAREA_MEDI              33739
NONLIVINGAREA_AVG               33739
NONLIVINGARE

In [198]:
#pd.set_option('display.max_rows', 10)
#pd.set_option('display.max_columns', 20)

In [199]:
x_train.describe(include=['O'])

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE
count,246008,246008,246008,246008,244962,246008,246008,246008,246008,168902,246008,246008,77722,122469,120829,129317
unique,2,3,2,2,7,8,5,6,6,18,7,58,4,3,7,2
top,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,Laborers,TUESDAY,Business Entity Type 3,reg oper account,block of flats,Panel,No
freq,222490,161855,162183,170843,198741,126929,174522,157320,218389,44201,43128,54311,58964,120320,52873,127434


In [200]:
columns_with_null_values = []
    
for column in train_set:
    if train_set[column].isnull().any():
        columns_with_null_values.append(column)

len(columns_with_null_values)

67

In [201]:
x_train[columns_with_null_values].isnull().sum().sort_values(ascending = False)

COMMONAREA_MEDI                 171929
COMMONAREA_AVG                  171929
COMMONAREA_MODE                 171929
NONLIVINGAPARTMENTS_AVG         170868
NONLIVINGAPARTMENTS_MODE        170868
NONLIVINGAPARTMENTS_MEDI        170868
FONDKAPREMONT_MODE              168286
LIVINGAPARTMENTS_AVG            168196
LIVINGAPARTMENTS_MEDI           168196
LIVINGAPARTMENTS_MODE           168196
FLOORSMIN_AVG                   166954
FLOORSMIN_MEDI                  166954
FLOORSMIN_MODE                  166954
YEARS_BUILD_AVG                 163683
YEARS_BUILD_MEDI                163683
YEARS_BUILD_MODE                163683
OWN_CAR_AGE                     162187
LANDAREA_AVG                    146129
LANDAREA_MEDI                   146129
LANDAREA_MODE                   146129
BASEMENTAREA_MODE               144015
BASEMENTAREA_AVG                144015
BASEMENTAREA_MEDI               144015
EXT_SOURCE_1                    138524
NONLIVINGAREA_AVG               135943
NONLIVINGAREA_MODE       

In [202]:
drop_columns = []
columns_to_analyze = []

for column in columns_with_null_values:
    if x_train[column].isnull().sum() > len(x_train)/2:
        drop_columns.append(column)
    else:
        columns_to_analyze.append(column)
        
print(drop_columns)

['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE']


In [203]:
len(x_train.columns)

121

In [204]:
x_train = x_train.drop(columns = drop_columns)
len(x_train.columns)

80

In [205]:
x_test = x_test.drop(columns = drop_columns)
len(x_test.columns)

80

In [206]:
x_train[columns_to_analyze].isnull().sum().sort_values(ascending = False)

FLOORSMAX_MEDI                  122532
FLOORSMAX_AVG                   122532
FLOORSMAX_MODE                  122532
YEARS_BEGINEXPLUATATION_AVG     120081
YEARS_BEGINEXPLUATATION_MODE    120081
YEARS_BEGINEXPLUATATION_MEDI    120081
TOTALAREA_MODE                  118818
EMERGENCYSTATE_MODE             116691
OCCUPATION_TYPE                  77106
EXT_SOURCE_3                     48824
AMT_REQ_CREDIT_BUREAU_QRT        33211
AMT_REQ_CREDIT_BUREAU_YEAR       33211
AMT_REQ_CREDIT_BUREAU_HOUR       33211
AMT_REQ_CREDIT_BUREAU_DAY        33211
AMT_REQ_CREDIT_BUREAU_WEEK       33211
AMT_REQ_CREDIT_BUREAU_MON        33211
NAME_TYPE_SUITE                   1046
OBS_30_CNT_SOCIAL_CIRCLE           805
DEF_30_CNT_SOCIAL_CIRCLE           805
OBS_60_CNT_SOCIAL_CIRCLE           805
DEF_60_CNT_SOCIAL_CIRCLE           805
EXT_SOURCE_2                       529
AMT_GOODS_PRICE                    224
AMT_ANNUITY                         10
DAYS_LAST_PHONE_CHANGE               1
CNT_FAM_MEMBERS          

In [207]:
x_train[columns_to_analyze].describe()

Unnamed: 0,AMT_ANNUITY,AMT_GOODS_PRICE,CNT_FAM_MEMBERS,EXT_SOURCE_2,EXT_SOURCE_3,YEARS_BEGINEXPLUATATION_AVG,FLOORSMAX_AVG,YEARS_BEGINEXPLUATATION_MODE,FLOORSMAX_MODE,YEARS_BEGINEXPLUATATION_MEDI,FLOORSMAX_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,245998.0,245784.0,246007.0,245479.0,197184.0,125927.0,123476.0,125927.0,123476.0,125927.0,123476.0,127190.0,245203.0,245203.0,245203.0,245203.0,246007.0,212797.0,212797.0,212797.0,212797.0,212797.0,212797.0
mean,27120.236898,538983.1,2.152642,0.5143846,0.511034,0.977713,0.226512,0.977049,0.22251,0.977718,0.226118,0.102577,1.421834,0.143155,1.404787,0.100113,-962.679249,0.00648,0.006729,0.034211,0.266644,0.266921,1.901126
std,14510.664192,370052.7,0.908734,0.1910533,0.194838,0.059673,0.144689,0.06495,0.143716,0.060396,0.145097,0.107539,2.41552,0.447389,2.394182,0.363496,827.636821,0.08424,0.106693,0.204336,0.911958,0.834133,1.870273
min,1615.5,40500.0,1.0,8.173617e-08,0.000527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16506.0,238500.0,2.0,0.3922228,0.37065,0.9767,0.1667,0.9767,0.1667,0.9767,0.1667,0.0414,0.0,0.0,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24903.0,450000.0,2.0,0.5658916,0.53707,0.9821,0.1667,0.9816,0.1667,0.9816,0.1667,0.0687,0.0,0.0,0.0,0.0,-756.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,34654.5,679500.0,3.0,0.663744,0.669057,0.9866,0.3333,0.9866,0.3333,0.9866,0.3333,0.1278,2.0,0.0,2.0,0.0,-273.0,0.0,0.0,0.0,0.0,0.0,3.0
max,258025.5,4050000.0,20.0,0.8549997,0.89601,1.0,1.0,1.0,1.0,1.0,1.0,1.0,348.0,34.0,344.0,24.0,0.0,3.0,9.0,8.0,27.0,261.0,25.0


In [208]:
# Listas com colunas de moda e mediana
mode_list = []
median_list = []

for column in columns_to_analyze:
    suffix = column[-4:]
    if suffix == "MODE":
        mode_list.append(column)
    elif suffix == "MEDI":
        median_list.append(column)

In [209]:
# MODA
mode_list.remove('EMERGENCYSTATE_MODE')

for column in mode_list:
    x_train[column] = x_train[column].fillna(x_train[column].mode()[0])
    x_test[column] = x_test[column].fillna(x_test[column].mode()[0])

    columns_to_analyze.remove(column)

In [210]:
# MEDIANA
for column in median_list:
    x_train[column] = x_train[column].fillna(x_train[column].median())
    x_test[column] = x_test[column].fillna(x_test[column].median())
    columns_to_analyze.remove(column)

In [211]:
# Lista com colunas médias
avg_list = []

for column in columns_to_analyze:
    suffix = column[-3:]
    prefix = column[:3]
    if suffix == "AVG" or prefix == "AMT" or prefix == "EXT" or x_train[column].isnull().sum() < 1000:
        avg_list.append(column)        

In [212]:
# MÉDIA
for column in avg_list:
    x_train[column] = x_train[column].fillna(x_train[column].mean())
    x_test[column] = x_test[column].fillna(x_test[column].mean())
    columns_to_analyze.remove(column)

In [213]:
x_train[columns_to_analyze].isnull().sum().sort_values(ascending = False)

EMERGENCYSTATE_MODE    116691
OCCUPATION_TYPE         77106
NAME_TYPE_SUITE          1046
dtype: int64

In [214]:
x_train['NAME_TYPE_SUITE'] = x_train['NAME_TYPE_SUITE'].fillna('Unaccompanied')
x_train['EMERGENCYSTATE_MODE'] = x_train['EMERGENCYSTATE_MODE'].fillna('Undefined')
x_train['OCCUPATION_TYPE'] = x_train['OCCUPATION_TYPE'].fillna('Other')

x_test['NAME_TYPE_SUITE'] = x_test['NAME_TYPE_SUITE'].fillna('Unaccompanied')
x_test['EMERGENCYSTATE_MODE'] = x_test['EMERGENCYSTATE_MODE'].fillna('Undefined')
x_test['OCCUPATION_TYPE'] = x_test['OCCUPATION_TYPE'].fillna('Other')

In [215]:
x_train.isnull().any()

SK_ID_CURR                      False
NAME_CONTRACT_TYPE              False
CODE_GENDER                     False
FLAG_OWN_CAR                    False
FLAG_OWN_REALTY                 False
CNT_CHILDREN                    False
AMT_INCOME_TOTAL                False
AMT_CREDIT                      False
AMT_ANNUITY                     False
AMT_GOODS_PRICE                 False
NAME_TYPE_SUITE                 False
NAME_INCOME_TYPE                False
NAME_EDUCATION_TYPE             False
NAME_FAMILY_STATUS              False
NAME_HOUSING_TYPE               False
REGION_POPULATION_RELATIVE      False
DAYS_BIRTH                      False
DAYS_EMPLOYED                   False
DAYS_REGISTRATION               False
DAYS_ID_PUBLISH                 False
FLAG_MOBIL                      False
FLAG_EMP_PHONE                  False
FLAG_WORK_PHONE                 False
FLAG_CONT_MOBILE                False
FLAG_PHONE                      False
FLAG_EMAIL                      False
OCCUPATION_T

### 6. Ajustando variáveis categóricas

In [216]:
categoricas = list(x_train.select_dtypes('object').columns)
categoricas

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'EMERGENCYSTATE_MODE']

In [217]:
# flags = []
# categories = []

# for column in categoricas:
#     prefix = column[:4]
#     suffix = column[-4:]
#     if prefix == 'CODE' or prefix == 'FLAG' or suffix == 'MODE':
#         flags.append(column)
#     else:
#         categories.append(column)

--------------------------------------------------------------------------------------------------------------------------------

DÚVIDAS

- jeito melhor de fazer o encoding das flags?
- usar one hot encoding?


--------------------------------------------------------------------------------------------------------------------------------


In [218]:
for column in categoricas:
    x_train["{0}_CAT".format(column)] = LabelEncoder().fit_transform(x_train[column])
    x_train = x_train.drop(columns = column, axis=1)
    
x_train.head(10)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_2,EXT_SOURCE_3,YEARS_BEGINEXPLUATATION_AVG,FLOORSMAX_AVG,YEARS_BEGINEXPLUATATION_MODE,FLOORSMAX_MODE,YEARS_BEGINEXPLUATATION_MEDI,FLOORSMAX_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE_CAT,CODE_GENDER_CAT,FLAG_OWN_CAR_CAT,FLAG_OWN_REALTY_CAT,NAME_TYPE_SUITE_CAT,NAME_INCOME_TYPE_CAT,NAME_EDUCATION_TYPE_CAT,NAME_FAMILY_STATUS_CAT,NAME_HOUSING_TYPE_CAT,OCCUPATION_TYPE_CAT,WEEKDAY_APPR_PROCESS_START_CAT,ORGANIZATION_TYPE_CAT,EMERGENCYSTATE_MODE_CAT
123473,243191,0,171000.0,555273.0,16366.5,463500.0,0.035792,-23349,365243,-3595.0,-4408,1,0,0,1,0,0,1.0,2,2,9,0,0,0,0,0,0,0.358568,0.563835,0.977713,0.226512,0.9871,0.1667,0.9816,0.1667,0.0,0.0,0.0,0.0,0.0,-2058.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,1,0,6,3,4,5,1,12,5,57,1
10118,111778,1,157500.0,198085.5,23638.5,171000.0,0.010032,-10921,-117,-4281.0,-3399,1,1,1,1,1,0,3.0,2,2,7,0,0,0,0,0,0,0.490305,0.595456,0.9742,0.1667,0.9742,0.1667,0.9742,0.1667,0.0645,1.0,0.0,1.0,0.0,-73.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0,1,0,1,6,7,4,1,1,8,2,4,0
64716,175057,0,135000.0,776304.0,25173.0,648000.0,0.035792,-23213,-2157,-5680.0,-5009,1,1,0,1,0,0,2.0,2,2,13,0,0,0,0,0,0,0.643404,0.706205,0.977713,0.226512,0.9871,0.1667,0.9816,0.1667,0.0,2.0,0.0,2.0,0.0,-1959.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0,0,1,1,1,6,7,3,0,1,4,0,42,1
234940,372147,1,164133.0,900000.0,36787.5,900000.0,0.030755,-10703,-2530,-2618.0,-2751,1,1,1,1,1,0,3.0,2,2,10,0,0,0,0,1,1,0.426431,0.506484,0.9841,0.3333,0.9841,0.3333,0.9841,0.3333,0.1222,0.0,0.0,0.0,0.0,-531.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0,0,1,1,1,6,1,4,1,1,6,5,47,0
236051,373412,0,225000.0,533668.5,21294.0,477000.0,0.025164,-15798,-3520,-8006.0,-5001,1,1,0,1,0,0,2.0,2,2,12,0,0,0,0,0,0,0.445701,0.528093,0.977713,0.226512,0.9871,0.1667,0.9816,0.1667,0.0,6.0,1.0,6.0,0.0,-9.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,0,1,0,1,5,1,4,1,1,8,2,16,1
30611,135529,0,405000.0,675000.0,30631.5,675000.0,0.010006,-16721,-1482,-1432.0,-265,1,1,0,1,0,0,2.0,2,1,11,0,0,0,0,0,0,0.71603,0.546023,0.977713,0.226512,0.9871,0.1667,0.9816,0.1667,0.0,4.0,0.0,4.0,0.0,-128.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,1,0,6,7,1,1,1,0,2,26,1
871,100999,0,450000.0,2125953.0,81108.0,1984500.0,0.072508,-10121,-1354,-1393.0,-2672,1,1,0,1,1,0,1.0,1,1,16,0,0,0,0,0,0,0.683097,0.511034,0.9806,0.4583,0.9806,0.4583,0.9806,0.4583,0.1356,0.0,0.0,0.0,0.0,-4.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1,0,6,1,1,3,5,6,0,3,0
153082,277437,0,180000.0,679500.0,24201.0,679500.0,0.0228,-17676,-2535,-472.0,-1229,1,1,0,1,1,0,1.0,2,2,11,0,0,0,0,0,0,0.181508,0.7463,0.9985,0.375,0.9985,0.375,0.9985,0.375,0.1867,1.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,6,1,4,3,1,13,0,43,0
188110,318071,1,180000.0,270000.0,13500.0,270000.0,0.020246,-16568,-2155,-3417.0,-122,1,1,0,1,1,0,3.0,3,3,13,0,0,0,0,0,0,0.501046,0.725276,0.9955,0.1667,0.9955,0.1667,0.9955,0.1667,0.087,0.0,0.0,0.0,0.0,-1043.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,1,6,7,1,1,1,6,0,5,0
278046,422174,0,252000.0,1762110.0,48586.5,1575000.0,0.006008,-19355,-1731,-4277.0,-2751,1,1,0,1,0,0,2.0,2,2,11,0,0,0,0,1,1,0.288642,0.636376,0.977713,0.226512,0.9871,0.1667,0.9816,0.1667,0.0,0.0,0.0,0.0,0.0,-552.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0,1,1,1,6,1,4,1,1,8,6,7,1


In [221]:
for column in categoricas:
    x_test["{0}_CAT".format(column)] = LabelEncoder().fit_transform(x_test[column])
    x_test = x_test.drop(columns = column, axis=1)
    
x_test.head(10)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_2,EXT_SOURCE_3,YEARS_BEGINEXPLUATATION_AVG,FLOORSMAX_AVG,YEARS_BEGINEXPLUATATION_MODE,FLOORSMAX_MODE,YEARS_BEGINEXPLUATATION_MEDI,FLOORSMAX_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE_CAT,CODE_GENDER_CAT,FLAG_OWN_CAR_CAT,FLAG_OWN_REALTY_CAT,NAME_TYPE_SUITE_CAT,NAME_INCOME_TYPE_CAT,NAME_EDUCATION_TYPE_CAT,NAME_FAMILY_STATUS_CAT,NAME_HOUSING_TYPE_CAT,OCCUPATION_TYPE_CAT,WEEKDAY_APPR_PROCESS_START_CAT,ORGANIZATION_TYPE_CAT,EMERGENCYSTATE_MODE_CAT
245895,384575,2,207000.0,465457.5,52641.0,418500.0,0.00963,-13297,-762,-637.0,-4307,1,1,0,1,0,0,4.0,2,2,11,0,0,0,0,1,1,0.604894,0.000527,0.977823,0.225365,0.9791,0.1667,0.9816,0.1667,0.0,0.0,0.0,0.0,0.0,-2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0,0,1,1,0,6,1,4,1,1,15,4,5,1
98194,214010,0,247500.0,1281712.5,48946.5,1179000.0,0.006852,-14778,-1141,-1610.0,-4546,1,1,0,1,0,1,1.0,3,3,10,0,0,0,0,0,0,0.425351,0.712155,0.997,0.4167,0.994,0.3333,0.997,0.4167,0.0754,2.0,0.0,2.0,0.0,-1071.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0,0,0,1,1,6,1,1,3,1,10,4,5,0
36463,142232,0,202500.0,495000.0,39109.5,495000.0,0.035792,-17907,-639,-2507.0,-1461,1,1,1,1,0,0,2.0,2,2,16,0,0,0,0,0,0,0.53176,0.207964,0.977823,0.225365,0.9791,0.1667,0.9816,0.1667,0.0,5.0,0.0,5.0,0.0,-1435.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0,0,0,1,0,6,7,4,1,1,15,5,42,1
249923,389171,0,247500.0,254700.0,24939.0,225000.0,0.04622,-19626,-6982,-11167.0,-3158,1,1,0,1,0,0,1.0,1,1,14,0,0,0,0,0,0,0.693521,0.614414,0.9846,0.625,0.9846,0.625,0.9846,0.625,0.1285,0.0,0.0,0.0,0.0,-2000.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,6,4,4,5,1,6,0,5,0
158389,283617,0,112500.0,308133.0,15862.5,234000.0,0.01885,-20327,-1105,-7299.0,-494,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,0,0,0.56069,0.636376,0.9717,0.1667,0.9717,0.1667,0.9717,0.1667,0.0765,0.0,0.0,0.0,0.0,-173.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,0,1,0,1,6,7,4,3,1,8,6,5,0
226343,362171,0,85500.0,152820.0,16456.5,135000.0,0.006629,-19130,365243,-8549.0,-959,1,0,0,1,1,0,2.0,2,2,6,0,0,0,0,0,0,0.519127,0.104795,0.977823,0.225365,0.9791,0.1667,0.9816,0.1667,0.0,3.0,0.0,3.0,0.0,-1011.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0,1,0,1,6,3,4,1,1,12,4,57,1
69571,180689,1,112500.0,900000.0,24750.0,900000.0,0.015221,-15722,-345,-492.0,-4672,1,1,0,1,1,0,3.0,2,2,12,0,0,0,1,0,1,0.547963,0.510129,0.977823,0.225365,0.9791,0.1667,0.9816,0.1667,0.0,0.0,0.0,0.0,0.0,-440.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,2.0,0.0,0,0,0,0,1,1,4,1,1,12,6,5,1
181469,310328,0,141606.0,810000.0,33120.0,810000.0,0.018801,-10801,-260,-263.0,-3152,1,1,1,1,0,0,1.0,2,2,10,0,0,0,0,0,0,0.012588,0.490258,0.9747,0.1667,0.9747,0.1667,0.9747,0.1667,0.0901,5.0,0.0,5.0,0.0,-222.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1,1,6,1,4,3,1,8,0,47,0
114736,233043,0,130500.0,781920.0,34443.0,675000.0,0.0228,-23516,365243,-1663.0,-4763,1,0,0,1,0,0,2.0,2,2,8,0,0,0,0,0,0,0.077535,0.633032,0.977823,0.225365,0.9791,0.1667,0.9816,0.1667,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,5.0,0,0,0,1,6,3,4,1,1,12,2,57,1
114007,232220,1,99000.0,900000.0,35824.5,900000.0,0.026392,-8755,-649,-8733.0,-1436,1,1,0,1,0,0,2.0,2,2,14,0,0,0,0,0,0,0.740082,0.510129,0.9762,0.1667,0.9762,0.1667,0.9762,0.1667,0.0478,3.0,0.0,3.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.006091,0.008083,0.034966,0.270401,0.259686,1.895366,0,0,0,1,6,7,4,2,1,8,6,51,0


--------------------------------------------------------------------------------------------------------------------------------

DÚVIDAS

- quais outras variáveis podem ser relevantes para incluirmos na análise?
- usar dataset de previous_applications?


--------------------------------------------------------------------------------------------------------------------------------


In [219]:
x_train.to_csv("datasets/train_dataset.csv", index=True)

### 7. Lasso Regression

In [257]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures



def experiment(msg, pipeline, X_train, y_train, X_test, y_test):
    pipeline.fit(X_train, y_train)
    
    model = pipeline.named_steps['lin_reg']
    print('{}: \nintercept = {},\ncoefs = {}'.format(msg, model.intercept_, model.coef_))
    
    y_pred = pipeline.predict(X_test)
    RMSE = np.sqrt(mean_squared_error(y_pred, y_test))
    print('RMSE: {}'.format(RMSE))
    print()
    
alpha = 1

# Test o fit da regularização lasso.
# Test o fit da regularização lasso.
poly_reg_lasso = Pipeline([
        ("std_scaler", StandardScaler()), 
        ("lin_reg", Lasso(alpha=alpha))
    ])
experiment('Regularização Lasso', reg_lasso, x_train, y_train, x_test, y_test)
    



Regularização Lasso: 
intercept = 0.08079412051640597,
coefs = [-0.  0. -0. -0. -0. -0. -0.  0. -0.  0.  0.  0.  0.  0.  0. -0. -0.  0.
  0.  0. -0.  0.  0.  0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
  0.  0.  0.  0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0.  0.  0.  0.  0. -0. -0.  0. -0.  0. -0. -0.  0.
  0.  0. -0.  0. -0.  0. -0.  0.]
RMSE: 0.27201597064542693



### 8. Forward Stepwise Selection

In [277]:
x_train_short = x_train[0:30000]
x_test_short = x_test[0:30000]
y_train_short = y_train[0:30000]
y_test_short = y_test[0:30000]

In [282]:
def processSubset(feature_set):
    # Fit model on feature_set and calculate RSS
    model = sm.OLS(y_train_short,x_train_short[list(feature_set)])
    regr = model.fit()
    RSS = ((regr.predict(x_test_short[list(feature_set)]) - y_test_short) ** 2).sum()
    return {"model":regr, "RSS":RSS}

In [279]:
def forward(predictors):

    # Pull out predictors we still need to process
    remaining_predictors = [p for p in x_train_short.columns if p not in predictors]
    
    tic = time.time()
    
    results = []
    
    for p in remaining_predictors:
        results.append(processSubset(predictors+[p]))
    
    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
    
    # Choose the model with the highest RSS
    best_model = models.loc[models['RSS'].argmin()]
    
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic), "seconds.")
    
    # Return the best model, along with some other useful information about the model
    return best_model

In [280]:
models_fwd = pd.DataFrame(columns=["RSS", "model"])

predictors = []

for i in range(1,len(x_train_short.columns)+1):    
    models_fwd.loc[i] = forward(predictors)
    predictors = models_fwd.loc[i]["model"].model.exog_names



Processed  80 models on 1 predictors in 1.2151122093200684 seconds.
Processed  79 models on 2 predictors in 1.3953793048858643 seconds.
Processed  78 models on 3 predictors in 1.510258436203003 seconds.
Processed  77 models on 4 predictors in 1.6325626373291016 seconds.
Processed  76 models on 5 predictors in 2.0422754287719727 seconds.
Processed  75 models on 6 predictors in 1.853034257888794 seconds.
Processed  74 models on 7 predictors in 2.284928560256958 seconds.
Processed  73 models on 8 predictors in 2.4488494396209717 seconds.
Processed  72 models on 9 predictors in 2.5578901767730713 seconds.
Processed  71 models on 10 predictors in 2.566617012023926 seconds.
Processed  70 models on 11 predictors in 2.7113425731658936 seconds.
Processed  69 models on 12 predictors in 2.927248001098633 seconds.
Processed  68 models on 13 predictors in 3.04858660697937 seconds.
Processed  67 models on 14 predictors in 3.26340651512146 seconds.
Processed  66 models on 15 predictors in 3.445952892

In [281]:
print(models_fwd.loc[10, "model"].summary())

                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     187.1
Date:                Mon, 23 Nov 2020   Prob (F-statistic):               0.00
Time:                        15:14:19   Log-Likelihood:                -2668.9
No. Observations:               30000   AIC:                             5358.
Df Residuals:                   29990   BIC:                             5441.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
REGION_RATING_CLIE

### 9. Backward Elimination

In [283]:
def backward(predictors):
    
    tic = time.time()
    
    results = []
    
    for combo in itertools.combinations(predictors, len(predictors)-1):
        results.append(processSubset(combo))
    
    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
    
    # Choose the model with the highest RSS
    best_model = models.loc[models['RSS'].argmin()]
    
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)-1, "predictors in", (toc-tic), "seconds.")
    
    # Return the best model, along with some other useful information about the model
    return best_model

In [284]:
models_bwd = pd.DataFrame(columns=["RSS", "model"], index = range(1,len(x_train_short.columns)))

tic = time.time()
predictors = x_train_short.columns

while(len(predictors) > 1):  
    models_bwd.loc[len(predictors)-1] = backward(predictors)
    predictors = models_bwd.loc[len(predictors)-1]["model"].model.exog_names

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

Processed  80 models on 79 predictors in 46.165106534957886 seconds.
Processed  79 models on 78 predictors in 37.325110912323 seconds.
Processed  78 models on 77 predictors in 34.28974366188049 seconds.
Processed  77 models on 76 predictors in 65.30765080451965 seconds.
Processed  76 models on 75 predictors in 59.45371603965759 seconds.
Processed  75 models on 74 predictors in 59.65544843673706 seconds.
Processed  74 models on 73 predictors in 58.366652727127075 seconds.
Processed  73 models on 72 predictors in 52.738723039627075 seconds.
Processed  72 models on 71 predictors in 60.850531339645386 seconds.
Processed  71 models on 70 predictors in 53.02118277549744 seconds.
Processed  70 models on 69 predictors in 52.958906412124634 seconds.
Processed  69 models on 68 predictors in 45.840383529663086 seconds.
Processed  68 models on 67 predictors in 36.911710262298584 seconds.
Processed  67 models on 66 predictors in 32.132951974868774 seconds.
Processed  66 models on 65 predictors in 2

In [285]:
print(models_bwd.loc[10, "model"].summary())

                                 OLS Regression Results                                
Dep. Variable:                 TARGET   R-squared (uncentered):                   0.129
Model:                            OLS   Adj. R-squared (uncentered):              0.129
Method:                 Least Squares   F-statistic:                              444.7
Date:                Mon, 23 Nov 2020   Prob (F-statistic):                        0.00
Time:                        15:37:23   Log-Likelihood:                         -2670.7
No. Observations:               30000   AIC:                                      5361.
Df Residuals:                   29990   BIC:                                      5444.
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

In [286]:
print("-----------------")
print("Foward Selection:")
print("-----------------")
print(models_fwd.loc[10, "model"].params)

-----------------
Foward Selection:
-----------------
REGION_RATING_CLIENT_W_CITY    0.010314
EXT_SOURCE_3                  -0.220968
FLAG_MOBIL                     0.238574
EXT_SOURCE_2                  -0.167734
CODE_GENDER_CAT                0.034494
NAME_EDUCATION_TYPE_CAT        0.008950
FLAG_DOCUMENT_3                0.018900
FLAG_OWN_CAR_CAT              -0.022638
DAYS_BIRTH                     0.000002
REG_CITY_NOT_WORK_CITY         0.015034
dtype: float64


In [287]:
print("-------------------")
print("Backward Selection:")
print("-------------------")
print(models_bwd.loc[10, "model"].params)

-------------------
Backward Selection:
-------------------
AMT_CREDIT                 1.614350e-07
AMT_GOODS_PRICE           -1.817205e-07
DAYS_BIRTH                 1.490737e-06
DAYS_EMPLOYED              7.218241e-07
FLAG_EMP_PHONE             2.802077e-01
EXT_SOURCE_2              -1.756071e-01
EXT_SOURCE_3              -2.194009e-01
CODE_GENDER_CAT            3.309777e-02
FLAG_OWN_CAR_CAT          -2.304141e-02
NAME_EDUCATION_TYPE_CAT    9.010953e-03
dtype: float64
