# Seleção de Características - Método Recursive Feature Elimination (RFE)

Recursive feature elimination (RFE) is a backward selection of the predictors. The technique begins by building a model on the entire set of predictors and computing an importance score for each predictor. The least important predictor(s) are then removed, the model is re-built, and the importance scores are computed again. In practice, the analyst specifies the number of predictor subsets to evaluate as well as each subset’s size. Therefore, the subset size is a tuning parameter for RFE. The subset size that optimizes the performance criterion is used to select the predictors based on the importance rankings. The optimal subset is then used to train the final model.

### Bibliotecas Necessárias

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualize
import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the normalized dataset; the first CSV row holds the column names
dataset = pd.read_csv('dataset/dataset-normalizado.csv', header=0)

# Predictor columns, in the order they should appear in the table
feature_order = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
    'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4',
    'Fedu_0', 'Fedu_1', 'Fedu_2', 'Fedu_3', 'Fedu_4',
    'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services',
    'Mjob_teacher',
    'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services',
    'Fjob_teacher',
    'reason_course', 'reason_home', 'reason_other', 'reason_reputation',
    'guardian_father', 'guardian_mother', 'guardian_other',
]

# Reorder so that the is_approved column ends up as the last column
dataset = dataset[feature_order + ['is_approved']]

In [19]:
# Report the table dimensions (shape is a (rows, columns) pair)
print("Rows: {}".format(dataset.shape[0]))
print("Columns: {}".format(dataset.shape[1]))

# Preview the first rows of the reordered table
dataset.head()

Rows: 1044
Columns: 52


Unnamed: 0,school,sex,age,address,famsize,Pstatus,traveltime,studytime,failures,schoolsup,...,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other,is_approved
0,0.0,0.0,0.428571,0.0,0.0,0.0,0.333333,0.333333,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.285714,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.333333,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.142857,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


### Aplicação da Seleção de Características

In [20]:
# Convert the DataFrame into a plain array of values
array = dataset.values

# Split into predictors and class label: the target (is_approved) is the
# last column, every column before it is a predictor
features = array[:, :-1]
target = array[:, -1]

# Classifier used by RFE to score the importance of each feature
model = LogisticRegression(solver='lbfgs')

# Build the RFE selector and choose how many features to keep.
# n_features_to_select is passed by keyword: recent scikit-learn releases
# no longer accept it as a positional argument.
selected_features = 10
rfe = RFE(estimator=model, n_features_to_select=selected_features)
fit = rfe.fit(features, target)

### Descrição das características selecionadas

In [21]:
# Names of the predictor columns, taken from the same DataFrame that produced
# the feature matrix so that names and RFE results stay aligned (every column
# except the trailing is_approved target)
columns = list(dataset.columns[:-1])

# Boolean mask of the kept features and their elimination ranking
# (selected features have rank 1)
features_selected = fit.support_
features_ranking = fit.ranking_

# Build a summary table with one row per predictor
dataset_features = pd.DataFrame(columns, columns=['Features'])
dataset_features['Features Selected'] = features_selected
dataset_features['Features Ranking'] = features_ranking

# Report how many features RFE kept
print("Number of features: {} features selected" .format(fit.n_features_))

Number of features: 10 features selected


In [23]:
import pandas

# One boolean per predictor: True when RFE kept the feature
support = list(fit.get_support())

# Single-column table holding the selection mask
df = pd.DataFrame(data=support, columns=["WRAPPER - RFE"])
df.head()

Unnamed: 0,WRAPPER - RFE
0,True
1,False
2,False
3,False
4,False


In [24]:
# Persist the RFE selection mask as a comma-separated file, without the index
df.to_csv(path_or_buf="results/rfe.csv", index=False, sep=',')

In [15]:
# Show the summary table ordered by ranking, lowest rank first
dataset_features.sort_values('Features Ranking')

Unnamed: 0,Features,Features Selected,Features Ranking
0,school,True,1
34,Mjob_at_home,True,1
33,Fedu_4,True,1
23,absences,True,1
20,Dalc,True,1
14,higher,True,1
9,schoolsup,True,1
8,failures,True,1
19,goout,True,1
7,studytime,True,1


## Criamos um novo dataset apenas com as características selecionadas e a coluna de target

In [16]:
# Features kept by RFE, followed by the target column
rfe_columns = [
    'school', 'Mjob_at_home', 'Fedu_4', 'absences', 'Dalc',
    'higher', 'schoolsup', 'failures', 'goout', 'studytime',
    'is_approved',
]

# Reduced dataset containing only the selected features and the target
fs_dataset = dataset[rfe_columns]

fs_dataset.head()

Unnamed: 0,school,Mjob_at_home,Fedu_4,absences,Dalc,higher,schoolsup,failures,goout,studytime,is_approved
0,0.0,1.0,1.0,0.08,0.0,1.0,1.0,0.0,0.75,0.333333,0.0
1,0.0,1.0,0.0,0.053333,0.0,1.0,0.0,0.0,0.5,0.333333,0.0
2,0.0,1.0,0.0,0.133333,0.25,1.0,1.0,1.0,0.25,0.333333,0.0
3,0.0,0.0,0.0,0.026667,0.0,1.0,0.0,0.0,0.25,0.666667,1.0
4,0.0,0.0,0.0,0.053333,0.0,1.0,0.0,0.0,0.25,0.333333,0.0


In [8]:
# Save the reduced dataset (selected features + target) without the index
fs_dataset.to_csv(path_or_buf="results/dataset-fs-rfe.csv", index=False)