# Setup

In [1]:
#!pip install --pre pycaret
#!pip install  boruta

In [2]:
# libs


from boruta import BorutaPy
from pycaret.datasets import get_data
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV, f_classif, chi2
import warnings
warnings.filterwarnings('ignore')

In [3]:
# dados
df_credit = get_data('credit')


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,90000,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
2,50000,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
3,50000,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
4,50000,1,1,2,37,0,0,0,0,0,...,19394.0,19619.0,20024.0,2500.0,1815.0,657.0,1000.0,1000.0,800.0,0


## Boruta

Para ilustrar o funcionamento dos métodos, vamos criar um dataset. Isso mesmo! Datasets sintéticos são legais para demonstrar esse tipo de coisa. Na prática, não conhecemos a verdadeira relação funcional entre as variáveis que estamos modelando, certo? Tampouco sabemos se as features que ali estão ajudam ou não o nosso modelo. A ideia de criar um dataset sintético é justamente ter a certeza de quais features têm relação com o target e quais não têm!. Dessa formaa, podemos checar a validade desse método.

vamos criar uma lista de variáveis.

In [4]:
variaveis_dataset = [
   'linear',            # 1
   'quadrado',          # 2
   'seno',              # 3
   'interacao1',        # 4
   'interacao2',        # 5
   'interacao3',        # 6
   'aleatoria_1',       # 7
   'aleatoria_2',       # 8
   'aleatoria_3',       # 9
   'aleatoria_4']       # 10

Cada variavel será uma distribuição normal e o dataset é composto por 20000 linhas.

In [5]:
np.random.seed(0)
X = pd.DataFrame(np.random.normal(size = (20000 , len(variaveis_dataset))) , columns =  variaveis_dataset)

Y será uma função de algumas colunas de X (todas, menos as aleatorias)

In [6]:
y = X['linear'] + X['quadrado']**2 + np.sin(3*X['seno']) + (X['interacao1'] * X['interacao2'] * X['interacao3'])

Assim como a análise exploratória e os testes de hipótese, vamos aplicar a seleção de features sempre nos dados de treino.

In [7]:
X_train , X_test, y_train , y_test = train_test_split(X, y , test_size=0.85 , random_state=0)

In [8]:
forest = RandomForestRegressor(n_jobs= -1 , max_depth = 5)

# fit boruta
boruta_selector = BorutaPy(forest , n_estimators = 50 , max_iter=10 , random_state=0)
boruta_selector.fit(np.array(X_train) , np.array(y_train))

In [9]:
# Terceiro filtro com as features selecionadas pelo boruta
cols_drop_boruta= [not x for x in boruta_selector.support_.tolist()] # apenas invertendo o vetor de true/false
cols_drop_boruta= X_train.loc[:,cols_drop_boruta].columns.tolist()

In [10]:
cols_drop_boruta

['seno',
 'interacao3',
 'aleatoria_1',
 'aleatoria_2',
 'aleatoria_3',
 'aleatoria_4']

## Recursive Feature Elimination

In [11]:
# 1 a Seleção Random Forest
rfecv_RFC = RFECV(estimator= forest , scoring = 'neg_mean_squared_error')
rfecv_RFC.fit(X_train,y_train)

In [12]:
print(rfecv_RFC.n_features_) # Número de features selecionadas como as mais importantes 
print(rfecv_RFC.support_) # Vetor de booleans. True se a variável deve ser mantida e False caso contrário
mask_RFC = rfecv_RFC.support_

2
[ True  True False False False False False False False False]


In [13]:
cols_drop_RFE= [not x for x in mask_RFC] # apenas invertendo o vetor de true/false
cols_drop_RFE= X_train.loc[:,cols_drop_RFE].columns.tolist()

In [14]:
cols_drop_RFE

['seno',
 'interacao1',
 'interacao2',
 'interacao3',
 'aleatoria_1',
 'aleatoria_2',
 'aleatoria_3',
 'aleatoria_4']

# Filter

O dataset credit se refere a clientes e seus empréstimos. A coluna Default indica se o cliente honrou com seus compromissos ou não.

In [15]:
X = df_credit.drop('default',axis=1)

In [16]:
Y = df_credit['default']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=123)

In [18]:
X_train_num = X_train.filter(regex='AGE|BILL|PAY|LIMIT_BAL')
X_train_cat = X_train.loc[:,['SEX', 'EDUCATION', 'MARRIAGE']]

In [19]:
X_train.dtypes

LIMIT_BAL      int64
SEX            int64
EDUCATION      int64
MARRIAGE       int64
AGE            int64
PAY_1          int64
PAY_2          int64
PAY_3          int64
PAY_4          int64
PAY_5          int64
PAY_6          int64
BILL_AMT1    float64
BILL_AMT2    float64
BILL_AMT3    float64
BILL_AMT4    float64
BILL_AMT5    float64
BILL_AMT6    float64
PAY_AMT1     float64
PAY_AMT2     float64
PAY_AMT3     float64
PAY_AMT4     float64
PAY_AMT5     float64
PAY_AMT6     float64
dtype: object

AS colunas com Sufixo PAY e BILL e a coluna LIMIT_BAL são numéricas. Assim, podemos testar as médias dessas features em relação aos grupos da variável target.

## ANOVA

In [20]:
selected_anova = f_classif(X_train_num , y_train)

In [21]:
selected_anova

(array([4.02240288e+02, 1.64865189e+01, 6.79545994e+00, 1.92800920e+03,
        1.22092348e+03, 9.69289080e+02, 8.52752744e+02, 7.43016875e+02,
        6.45306855e+02, 6.27048311e+00, 2.79522433e+00, 2.45270678e+00,
        6.69974724e-01, 4.92312883e-05, 4.29811079e-03, 8.71682739e+01,
        4.49881357e+01, 4.92571911e+01, 5.27855073e+01, 4.81984983e+01,
        5.50188473e+01]),
 array([1.93981945e-088, 4.92193470e-005, 9.14707068e-003, 0.00000000e+000,
        2.78819123e-258, 6.17498623e-207, 6.61875297e-183, 3.95964552e-160,
        1.00574685e-139, 1.22859699e-002, 9.45641585e-002, 1.17341237e-001,
        4.13072030e-001, 9.94401771e-001, 9.47728933e-001, 1.11846938e-020,
        2.04553566e-011, 2.33085494e-012, 3.88358380e-013, 3.99273620e-012,
        1.25071173e-013]))

In [22]:
p_values_num = pd.Series(selected_anova[1])

In [23]:
p_values_num.index = X_train_num.columns

p_values_num.sort_values(ascending=True , inplace=True)

In [24]:
p_values_num

PAY_1         0.000000e+00
PAY_2        2.788191e-258
PAY_3        6.174986e-207
PAY_4        6.618753e-183
PAY_5        3.959646e-160
PAY_6        1.005747e-139
LIMIT_BAL     1.939819e-88
PAY_AMT1      1.118469e-20
PAY_AMT6      1.250712e-13
PAY_AMT4      3.883584e-13
PAY_AMT3      2.330855e-12
PAY_AMT5      3.992736e-12
PAY_AMT2      2.045536e-11
MARRIAGE      4.921935e-05
AGE           9.147071e-03
BILL_AMT1     1.228597e-02
BILL_AMT2     9.456416e-02
BILL_AMT3     1.173412e-01
BILL_AMT4     4.130720e-01
BILL_AMT6     9.477289e-01
BILL_AMT5     9.944018e-01
dtype: float64

In [27]:
p_values_num = p_values_num[p_values_num<0.05]
# Selecionar apenas as variaveis diferentes de 0

In [28]:
p_values_num.index

Index(['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'LIMIT_BAL',
       'PAY_AMT1', 'PAY_AMT6', 'PAY_AMT4', 'PAY_AMT3', 'PAY_AMT5', 'PAY_AMT2',
       'MARRIAGE', 'AGE', 'BILL_AMT1'],
      dtype='object')

In [31]:
X_train_num_anova = X_train_num.loc[: , p_values_num.index]

In [32]:
X_train_num_anova

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,LIMIT_BAL,PAY_AMT1,PAY_AMT6,PAY_AMT4,PAY_AMT3,PAY_AMT5,PAY_AMT2,MARRIAGE,AGE,BILL_AMT1
17118,0,0,0,0,0,0,190000,2000.0,1400.0,2000.0,2000.0,25921.0,3000.0,2,27,14671.0
16570,5,4,3,2,2,-2,20000,0.0,0.0,0.0,0.0,0.0,0.0,2,24,21151.0
3801,1,2,2,2,2,2,110000,3000.0,3000.0,3000.0,0.0,2000.0,2000.0,1,27,46657.0
18325,0,0,0,0,0,0,210000,6149.0,2500.0,2500.0,4547.0,3000.0,4100.0,1,46,134700.0
7907,-1,-1,-1,-2,-1,-1,210000,1624.0,0.0,312.0,0.0,0.0,0.0,2,26,231.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15377,-1,-1,-1,-1,-1,-1,300000,4790.0,9196.0,0.0,456.0,14447.0,6462.0,1,39,6192.0
21602,-2,-2,-2,-2,-2,-2,310000,14000.0,3613.0,5539.0,6286.0,5478.0,6129.0,1,45,8918.0
17730,0,0,0,0,0,0,30000,2300.0,2000.0,1100.0,1718.0,1000.0,2390.0,2,23,28347.0
15725,-1,-1,-1,-1,-1,-1,320000,49619.0,128424.0,44242.0,82691.0,24010.0,143588.0,2,44,29227.0


## Chi2

In [33]:
selected_chi2 = chi2(X_train_cat, y_train)

In [34]:
selected_chi2

(array([6.59238042, 6.10639033, 2.87655167]),
 array([0.01024161, 0.01346939, 0.08987831]))

In [35]:
p_values_cat = pd.Series(selected_chi2[1])

In [36]:
p_values_cat.index = X_train_cat.columns

In [37]:
p_values_cat = p_values_cat[p_values_cat<0.05]

In [38]:
X_train_cat_chi2 = X_train_cat[p_values_cat.index]

In [39]:
p_values_cat

SEX          0.010242
EDUCATION    0.013469
dtype: float64

In [40]:
X_train_filtered = pd.concat([X_train_num_anova, X_train_cat_chi2], axis=1)

In [41]:
X_train_filtered

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,LIMIT_BAL,PAY_AMT1,PAY_AMT6,PAY_AMT4,PAY_AMT3,PAY_AMT5,PAY_AMT2,MARRIAGE,AGE,BILL_AMT1,SEX,EDUCATION
17118,0,0,0,0,0,0,190000,2000.0,1400.0,2000.0,2000.0,25921.0,3000.0,2,27,14671.0,2,1
16570,5,4,3,2,2,-2,20000,0.0,0.0,0.0,0.0,0.0,0.0,2,24,21151.0,1,3
3801,1,2,2,2,2,2,110000,3000.0,3000.0,3000.0,0.0,2000.0,2000.0,1,27,46657.0,1,2
18325,0,0,0,0,0,0,210000,6149.0,2500.0,2500.0,4547.0,3000.0,4100.0,1,46,134700.0,2,2
7907,-1,-1,-1,-2,-1,-1,210000,1624.0,0.0,312.0,0.0,0.0,0.0,2,26,231.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15377,-1,-1,-1,-1,-1,-1,300000,4790.0,9196.0,0.0,456.0,14447.0,6462.0,1,39,6192.0,2,2
21602,-2,-2,-2,-2,-2,-2,310000,14000.0,3613.0,5539.0,6286.0,5478.0,6129.0,1,45,8918.0,1,1
17730,0,0,0,0,0,0,30000,2300.0,2000.0,1100.0,1718.0,1000.0,2390.0,2,23,28347.0,2,2
15725,-1,-1,-1,-1,-1,-1,320000,49619.0,128424.0,44242.0,82691.0,24010.0,143588.0,2,44,29227.0,2,1
