# Seleção de features através do coeficiente de correlação de Pearson

## Importando as bibliotecas

In [1]:
import pandas as pd
from scipy import stats

## Carregando os dados

In [2]:
# Source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
# The CSV is read through a relative path, so the file must be present in the
# notebook's working directory.
data = pd.read_csv('credit-card-default.csv')

In [3]:
data.shape

(30000, 24)

In [4]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
data.isnull().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [6]:
# Split the frame into the feature matrix (x) and the target vector (y).
x = data.drop(columns=['default payment next month'])
y = data.loc[:, 'default payment next month']

In [7]:
x.shape

(30000, 23)

In [8]:
y.shape

(30000,)

In [9]:
y.value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

## Calcular a correlação entre os atributos

In [10]:
data.corr()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
LIMIT_BAL,1.0,0.024755,-0.219161,-0.108139,0.144713,-0.271214,-0.296382,-0.286123,-0.26746,-0.249411,...,0.293988,0.295562,0.290389,0.195236,0.178408,0.210167,0.203242,0.217202,0.219595,-0.15352
SEX,0.024755,1.0,0.014232,-0.031389,-0.090874,-0.057643,-0.070771,-0.066096,-0.060173,-0.055064,...,-0.02188,-0.017005,-0.016733,-0.000242,-0.001391,-0.008597,-0.002229,-0.001667,-0.002766,-0.039961
EDUCATION,-0.219161,0.014232,1.0,-0.143464,0.175061,0.105364,0.121566,0.114025,0.108793,0.09752,...,-0.000451,-0.007567,-0.009099,-0.037456,-0.030038,-0.039943,-0.038218,-0.040358,-0.0372,0.028006
MARRIAGE,-0.108139,-0.031389,-0.143464,1.0,-0.41417,0.019917,0.024199,0.032688,0.033122,0.035629,...,-0.023344,-0.025393,-0.021207,-0.005979,-0.008093,-0.003541,-0.012659,-0.001205,-0.006641,-0.024339
AGE,0.144713,-0.090874,0.175061,-0.41417,1.0,-0.039447,-0.050148,-0.053048,-0.049722,-0.053826,...,0.051353,0.049345,0.047613,0.026147,0.021785,0.029247,0.021379,0.02285,0.019478,0.01389
PAY_0,-0.271214,-0.057643,0.105364,0.019917,-0.039447,1.0,0.672164,0.574245,0.538841,0.509426,...,0.179125,0.180635,0.17698,-0.079269,-0.070101,-0.070561,-0.064005,-0.05819,-0.058673,0.324794
PAY_2,-0.296382,-0.070771,0.121566,0.024199,-0.050148,0.672164,1.0,0.766552,0.662067,0.62278,...,0.222237,0.221348,0.219403,-0.080701,-0.05899,-0.055901,-0.046858,-0.037093,-0.0365,0.263551
PAY_3,-0.286123,-0.066096,0.114025,0.032688,-0.053048,0.574245,0.766552,1.0,0.777359,0.686775,...,0.227202,0.225145,0.222327,0.001295,-0.066793,-0.053311,-0.046067,-0.035863,-0.035861,0.235253
PAY_4,-0.26746,-0.060173,0.108793,0.033122,-0.049722,0.538841,0.662067,0.777359,1.0,0.819835,...,0.245917,0.242902,0.239154,-0.009362,-0.001944,-0.069235,-0.043461,-0.03359,-0.026565,0.216614
PAY_5,-0.249411,-0.055064,0.09752,0.035629,-0.053826,0.509426,0.62278,0.686775,0.819835,1.0,...,0.271915,0.269783,0.262509,-0.006089,-0.003191,0.009062,-0.058299,-0.033337,-0.023027,0.204149


In [11]:
data.corr()["default payment next month"]

LIMIT_BAL                    -0.153520
SEX                          -0.039961
EDUCATION                     0.028006
MARRIAGE                     -0.024339
AGE                           0.013890
PAY_0                         0.324794
PAY_2                         0.263551
PAY_3                         0.235253
PAY_4                         0.216614
PAY_5                         0.204149
PAY_6                         0.186866
BILL_AMT1                    -0.019644
BILL_AMT2                    -0.014193
BILL_AMT3                    -0.014076
BILL_AMT4                    -0.010156
BILL_AMT5                    -0.006760
BILL_AMT6                    -0.005372
PAY_AMT1                     -0.072929
PAY_AMT2                     -0.058579
PAY_AMT3                     -0.056250
PAY_AMT4                     -0.056827
PAY_AMT5                     -0.055124
PAY_AMT6                     -0.053183
default payment next month    1.000000
Name: default payment next month, dtype: float64

In [12]:
stats.pearsonr(x["PAY_0"], y)

(0.32479372847861243, 0.0)

### A correlação calculada pelo pandas (`DataFrame.corr`) é o mesmo coeficiente de correlação de Pearson calculado por `scipy.stats.pearsonr`

In [13]:
data.corr()["default payment next month"].abs() > 0.1

LIMIT_BAL                      True
SEX                           False
EDUCATION                     False
MARRIAGE                      False
AGE                           False
PAY_0                          True
PAY_2                          True
PAY_3                          True
PAY_4                          True
PAY_5                          True
PAY_6                          True
BILL_AMT1                     False
BILL_AMT2                     False
BILL_AMT3                     False
BILL_AMT4                     False
BILL_AMT5                     False
BILL_AMT6                     False
PAY_AMT1                      False
PAY_AMT2                      False
PAY_AMT3                      False
PAY_AMT4                      False
PAY_AMT5                      False
PAY_AMT6                      False
default payment next month     True
Name: default payment next month, dtype: bool

In [14]:
# Keep the labels whose absolute Pearson correlation with the target exceeds 0.2.
# The mask is applied to the correlation Series itself rather than to
# data.columns, so labels and mask always line up even if corr() leaves some
# columns out of its result. The target's own label (correlation 1.0) is still
# part of the result here.
highly_corr_feat = (
    data.corr()['default payment next month']
    .abs()
    .loc[lambda corr: corr > .2]
    .index
)

In [15]:
highly_corr_feat

Index(['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
       'default payment next month'],
      dtype='object')

In [16]:
highly_corr_feat = highly_corr_feat.drop('default payment next month')

In [17]:
x_corr = x[highly_corr_feat]

In [18]:
x_corr

Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5
0,2,2,-1,-1,-2
1,-1,2,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,-1,0,-1,0,0
...,...,...,...,...,...
29995,0,0,0,0,0
29996,-1,-1,-1,-1,0
29997,4,3,2,-1,0
29998,1,-1,0,0,0


## Avaliação do desempenho de classificação com os datasets

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [20]:
def get_best_model(model_name, x, y, random_state=None):
    """Grid-search a tree-based classifier and print a summary of the search.

    Parameters
    ----------
    model_name : str
        'd_tree' selects a DecisionTreeClassifier searched over ``max_depth``;
        'r_forest' selects a RandomForestClassifier searched over
        ``n_estimators`` and ``max_depth``.
    x
        Feature data handed to ``GridSearchCV.fit``.
    y
        Target values handed to ``GridSearchCV.fit``.
    random_state : optional
        Forwarded to the estimator's constructor. The default (None) keeps the
        previous, unseeded behaviour; pass an int for repeatable searches.

    Returns
    -------
    None
        The results are only printed: best score, best parameters, and the
        fit/score times averaged over every parameter candidate.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == 'd_tree':
        params = {'max_depth': [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
        model = DecisionTreeClassifier(random_state=random_state)
    elif model_name == 'r_forest':
        params = {'n_estimators': [10, 50, 100, 200], 'max_depth': [None, 1, 3, 5, 7]}
        model = RandomForestClassifier(random_state=random_state)
    else:
        # Fail fast: previously this branch only printed a message and then
        # crashed further down because `model` and `params` were never bound.
        raise ValueError(
            "Oops! That was no valid model. Try again... "
            "(got {!r}; expected 'd_tree' or 'r_forest')".format(model_name)
        )

    # error_score=0. makes a parameter combination whose fit fails count as a
    # score of 0 instead of aborting the whole search.
    grid = GridSearchCV(model,
                        params,
                        error_score=0.)
    grid.fit(x, y)

    # best_score_ comes from the estimator's default scorer; the message labels
    # it as accuracy.
    print("Maior acurácia: {}".format(grid.best_score_))
    print("Melhores parâmetros: {}".format(grid.best_params_))
    # cv_results_ holds one mean time per candidate; .mean() averages them all.
    print("Tempo médio para o otimizador fitar os dados (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    print("Tempo médio para o otimizador realizar a predição (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

In [21]:
get_best_model("d_tree", x_corr, y)

Maior acurácia: 0.8213333333333332
Melhores parâmetros: {'max_depth': 3}
Tempo médio para o otimizador fitar os dados (s): 0.021
Tempo médio para o otimizador realizar a predição (s): 0.003


In [22]:
get_best_model("d_tree", x, y)

Maior acurácia: 0.8206333333333333
Melhores parâmetros: {'max_depth': 3}
Tempo médio para o otimizador fitar os dados (s): 0.395
Tempo médio para o otimizador realizar a predição (s): 0.003


In [23]:
get_best_model("r_forest", x_corr, y)

Maior acurácia: 0.8208333333333334
Melhores parâmetros: {'max_depth': 5, 'n_estimators': 100}
Tempo médio para o otimizador fitar os dados (s): 0.575
Tempo médio para o otimizador realizar a predição (s): 0.056


In [24]:
get_best_model("r_forest", x, y)

Maior acurácia: 0.8207333333333333
Melhores parâmetros: {'max_depth': 7, 'n_estimators': 50}
Tempo médio para o otimizador fitar os dados (s): 2.918
Tempo médio para o otimizador realizar a predição (s): 0.071


Atividade (1.25): realizar a seleção de características baseada no coeficiente de correlação de Pearson atendendo aos seguintes itens:
<ol>
        <li> Criar uma função chamada feature_selector que irá receber um limiar como parâmetro de entrada e deverá retornar um subconjunto dos dados referente àquelas features cuja correlação está acima do limiar passado; </li>
        <li> Estender a função get_best_model para abranger os classificadores svm e xgboost;</li>
        <li> Estender a função get_best_model para abranger a otimização Bayesiana (Scikit-Optimize);</li>
        <li> Apresentar uma tabela de comparação dos resultados da classificação considerando cada um dos otimizadores: acurácia de teste, kappa de teste e tempo médio do otimizador para ajustar (fit) os dados. Dica: a divisão treino/teste fica à sua escolha.</li>
</ol>