In [2]:
import pandas as pd
pd.options.display.max_rows = 300
pd.options.display.float_format = '{:,.4f}'.format

import numpy as np

from scipy import stats

import seaborn as sns
sns.set(style="white")

import matplotlib.pyplot as plt
%matplotlib inline  
from IPython.display import Image
from IPython.core.display import HTML 

import os

In [3]:
# Read dataframe from local computer and format columns

PATH = '/Users/brunofbessa/Documents/study/mestrado/MAI5003/home_credit'
os.chdir(PATH)

df = pd.read_csv('1.Data Prep/Dados Processados/modelling_table_2.csv')
df = df.rename(columns=str.lower)
df.drop('unnamed: 0', axis='columns', inplace=True)

df_default = df[df['target']==1]
df_non_default = df[df['target']==0]

# Read metadata information from excel

df_meta = pd.read_excel('metadados_base_final.xlsx') 
df_meta = df_meta.rename(columns=str.lower)
df_meta['variable'] = df_meta['variable'].str.lower()

df_meta_float_columns = df_meta[df_meta['type']=='FLOAT']['variable'].values.tolist()

In [142]:
df_feat_eng = pd.read_csv('1.Data Prep/Dados Processados/modelling_table_5.csv')
df_feat_eng = df_feat_eng.rename(columns=str.lower)
df_feat_eng.drop('unnamed: 0', axis='columns', inplace=True)
df_feat_eng.columns = df_feat_eng.columns.str.replace('[(,),.]', '_') 


df_fe_default = df_feat_eng[df_feat_eng['target']==1]
df_fe_non_default = df_feat_eng[df_feat_eng['target']==0]

df_fe_meta_float_columns = df_feat_eng.select_dtypes(include=[np.float]).columns.to_list()

In [143]:
# Select random sample from data frame because p-value is not precise for N > 500
np.random.seed(123)

df_sample = df.sample(n=5000)
df_default_sample = df_default.sample(n=5000)
df_non_default_sample = df_non_default.sample(n=5000)

df_fe_sample = df_feat_eng.sample(n=5000)
df_fe_default_sample = df_fe_default.sample(n=5000)
df_fe_non_default_sample = df_fe_non_default.sample(n=5000)

In [128]:
# Perform hypothesis test for sample (column)
def is_normal(dataframe: pd.core.frame.DataFrame) -> bool:
    # The null hypotesis: the sample comes from a normal distribution.
    alpha = 0.05
    _, p_normaltest = stats.normaltest(dataframe)
    _, p_shapiro = stats.shapiro(dataframe)
    _, p_kstest = stats.kstest(dataframe, 'norm')
    # print('p_normaltest: {},  p_shapiro: {}, , p_kstest: {}'.format(p_normaltest, p_shapiro, p_kstest))

    if p_normaltest < alpha and p_shapiro < alpha and p_kstest < alpha:
        # The null hipothesis can be rejected: i.e.: the sample is not normal.
        return False
    # The null hypothesis cannot be rejected: i.e.: the sample may come from a normal distribution.
    return True

# List not normal variables elegible to treatment
def list_non_normal_columns(dataframe: pd.core.frame.DataFrame, metadata: list) -> list:
    df_meta_float_columns_not_normal = []
    for col_name in dataframe.columns:
        if col_name in metadata:
            if is_normal(dataframe[col_name]) == False:
                df_meta_float_columns_not_normal.append(col_name)            
    return df_meta_float_columns_not_normal

# List not normal variables elegible to check if treatment was successfull
def list_normal_columns(dataframe: pd.core.frame.DataFrame, metadata: list) -> list:
    df_meta_float_columns_not_normal = []
    for col_name in dataframe.columns:
        if col_name in metadata:
            if is_normal(dataframe[col_name]) == True:
                df_meta_float_columns_not_normal.append(col_name)            
    return df_meta_float_columns_not_normal

In [130]:
# Transform numeric variables (not counts - another transformation will do that)
def boxcox_transformation(dataframe: pd.core.frame.DataFrame, metadata: list) -> pd.core.frame.DataFrame:
    for col_name in dataframe.columns:
        if col_name in metadata:
            if 'cnt' not in col_name:
                new_col = 'transf_box_cox_' + col_name
                dataframe[new_col] = stats.boxcox(dataframe[col], 0)
    return dataframe

In [144]:
boxcox_transformation(df_sample, df_meta_float_columns)
boxcox_transformation(df_default_sample, df_meta_float_columns)
boxcox_transformation(df_non_default_sample, df_meta_float_columns)

boxcox_transformation(df_fe_sample, df_fe_meta_float_columns)
boxcox_transformation(df_fe_default_sample, df_fe_meta_float_columns)
boxcox_transformation(df_fe_non_default_sample, df_fe_meta_float_columns)

pass

In [134]:
print('Two targets:')
print(list_normal_columns(df_sample, df_meta_float_columns))
print('target default:')
print(list_normal_columns(df_default_sample, df_meta_float_columns))
print('target non default:')
print(list_normal_columns(df_non_default_sample, df_meta_float_columns))

Two targets:




['amt_credit_sum_limit_consumer_credit_bureau', 'amt_credit_sum_limit_working_capital_bureau', 'amt_credit_sum_overdue_working_capital_bureau', 'amt_credit_sum_limit_microloan_bureau']
target default:
['amt_credit_sum_limit_consumer_credit_bureau', 'amt_credit_sum_limit_working_capital_bureau', 'amt_credit_sum_overdue_working_capital_bureau', 'amt_credit_sum_limit_microloan_bureau']
target non default:
['amt_credit_sum_limit_consumer_credit_bureau', 'amt_credit_sum_debt_working_capital_bureau', 'amt_credit_sum_limit_working_capital_bureau', 'amt_credit_sum_overdue_working_capital_bureau', 'amt_credit_sum_limit_microloan_bureau', 'amt_credit_sum_overdue_microloan_bureau']


In [145]:
print('Two targets:')
print(list_normal_columns(df_fe_sample, df_fe_meta_float_columns))
print('target default:')
print(list_normal_columns(df_fe_default_sample, df_fe_meta_float_columns))
print('target non default:')
print(list_normal_columns(df_fe_non_default_sample, df_fe_meta_float_columns))

Two targets:
['cnt_purpose_repair_previous', 'amt_credit_sum_limit_consumer_credit_bureau', 'amt_credit_sum_limit_working_capital_bureau', 'amt_credit_sum_overdue_working_capital_bureau', 'amt_credit_sum_limit_microloan_bureau', 'amt_credit_sum_overdue_microloan_bureau', 'delinquency_6_mths_working_capital_bureau', 'delinquency_6_12_mths_working_capital_bureau', 'delinquency_12_24_mths_working_capital_bureau', 'delinquency_24_36_mths_working_capital_bureau', 'delinquency_long_working_capital_bureau', 'delinquency_6_mths_microloan_bureau', 'delinquency_6_12_mths_microloan_bureau', 'delinquency_12_24_mths_microloan_bureau', 'delinquency_24_36_mths_microloan_bureau', 'delinquency_long_microloan_bureau', 'ui_consumer_credit_bureau', 'ui_working_capital_bureau', 'ui_microloan_bureau', 'max_previous_amt_annuity_', 'max_previous_amt_application_', 'max_previous_amt_credit_', 'max_previous_amt_down_payment_', 'max_previous_amt_goods_price_', 'max_previous_cnt_payment_', 'max_previous_days_deci

['cnt_purpose_repair_previous', 'amt_credit_sum_limit_consumer_credit_bureau', 'amt_credit_sum_limit_working_capital_bureau', 'amt_credit_sum_overdue_working_capital_bureau', 'amt_credit_sum_limit_microloan_bureau', 'amt_credit_sum_overdue_microloan_bureau', 'delinquency_6_mths_working_capital_bureau', 'delinquency_6_12_mths_working_capital_bureau', 'delinquency_12_24_mths_working_capital_bureau', 'delinquency_24_36_mths_working_capital_bureau', 'delinquency_long_working_capital_bureau', 'delinquency_6_mths_microloan_bureau', 'delinquency_6_12_mths_microloan_bureau', 'delinquency_12_24_mths_microloan_bureau', 'delinquency_24_36_mths_microloan_bureau', 'delinquency_long_microloan_bureau', 'ui_consumer_credit_bureau', 'ui_working_capital_bureau', 'ui_microloan_bureau', 'max_previous_amt_annuity_', 'max_previous_amt_application_', 'max_previous_amt_credit_', 'max_previous_amt_down_payment_', 'max_previous_amt_goods_price_', 'max_previous_cnt_payment_', 'max_previous_days_decision_', 'max_

In [146]:
df_fe_sample.columns.tolist()

['sk_id_curr',
 'cnt_children',
 'amt_income_total',
 'amt_credit',
 'amt_annuity',
 'amt_goods_price',
 'region_population_relative',
 'days_employed',
 'days_registration',
 'days_id_publish',
 'own_car_age',
 'flag_mobil',
 'flag_emp_phone',
 'flag_work_phone',
 'flag_cont_mobile',
 'flag_phone',
 'flag_email',
 'cnt_fam_members',
 'hour_appr_process_start',
 'reg_region_not_live_region',
 'reg_region_not_work_region',
 'live_region_not_work_region',
 'reg_city_not_live_city',
 'reg_city_not_work_city',
 'live_city_not_work_city',
 'ext_source_1',
 'ext_source_2',
 'ext_source_3',
 'apartments_avg',
 'basementarea_avg',
 'years_beginexpluatation_avg',
 'years_build_avg',
 'commonarea_avg',
 'elevators_avg',
 'entrances_avg',
 'floorsmax_avg',
 'floorsmin_avg',
 'landarea_avg',
 'livingapartments_avg',
 'livingarea_avg',
 'nonlivingapartments_avg',
 'nonlivingarea_avg',
 'apartments_mode',
 'basementarea_mode',
 'years_beginexpluatation_mode',
 'years_build_mode',
 'commonarea_mode',

In [152]:
df_fe_sample.shape

(5000, 1054)