# Seleção de Características - Método Recursive Feature Elimination (RFE)
----------------------------------------------------------------------------------------------------------------------------

##### Método do tipo CAMADA

# Bibliotecas Necessárias

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualize
import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Carga base de dados normalizada

In [2]:
# Carga da base original
dataset = pd.read_csv('dataset-normalizado.csv', header = 0)

# Reordena colocando a coluna is_approved no final do dataset
dataset = dataset[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
       'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
       'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'Medu_0',
       'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 'Fedu_2',
       'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other',
       'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course',
       'reason_home', 'reason_other', 'reason_reputation', 'guardian_father',
       'guardian_mother', 'guardian_other', 'is_approved']]


# Nome das características do dataset original
columns = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'traveltime', 'studytime', 'failures', 'schoolsup', 
            'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 
            'Dalc', 'Walc', 'health', 'absences', 'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 
            'Fedu_2', 'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 
            'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 
            'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other']

In [3]:
print("Rows: {}".format(len(dataset)))
print("Columns: {}" .format(len(dataset.columns)))

dataset.head()

Rows: 649
Columns: 52


Unnamed: 0,school,sex,age,address,famsize,Pstatus,traveltime,studytime,failures,schoolsup,...,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other,is_approved
0,0.0,0.0,0.428571,0.0,0.0,0.0,0.333333,0.333333,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.285714,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.333333,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.142857,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# Aplicação da seleção de características
Seleciona as k_features

In [4]:
k_features = 20

In [5]:
# Gera o array com os valores do dataset
array = dataset.values


# Separa o dataset original em atributos e classe
features = array[:, 0:51]
target = array[:, 51]


# Cria o classificador que é utilizado para avaliar o subconjunto de características
model = LogisticRegression(solver='lbfgs')


# Cria o modelo RFE e seleciona o número de características k
rfe = RFE(model, k_features)
fit = rfe.fit(features, target)


# Obtém as caracteristicas selecionadas e o seu ranking de priorização
features_selected = fit.support_
features_ranking = fit.ranking_


# Gera a tabela com as informações das características selecionadas
dataset_features = pd.DataFrame(columns, columns=['Feature'])
dataset_features['Features Selected'] = features_selected
dataset_features['Features Ranking'] = features_ranking


# Ordena o dataset_features pelo score
dataset_features = dataset_features.sort_values(by='Features Ranking')


# Gera a coluna com o id da coluna original
dataset_features['Id column'] = dataset_features.index


# Reinicia o número da linha
dataset_features.reset_index(drop=True, inplace=True)


# Seleciona as k_features com maior score
dataset_k_features = dataset_features.head(k_features).copy()


# Exibe a tabela com as características selecionadas
print("Número de características selecionadas: {}" .format(fit.n_features_))
dataset_k_features

Número de características selecionadas: 20


Unnamed: 0,Feature,Features Selected,Features Ranking,Id column
0,school,True,1,0
1,reason_reputation,True,1,47
2,reason_course,True,1,44
3,Fjob_services,True,1,42
4,Mjob_teacher,True,1,38
5,Fedu_0,True,1,29
6,Medu_3,True,1,27
7,guardian_mother,True,1,49
8,absences,True,1,23
9,Dalc,True,1,20


# Gera o dataset com as características selecionadas

In [6]:
# Obtém o nome das características selecionadas
array_features = dataset_k_features['Feature'].copy()


# Reduz o dataset para o número de características selecionadas
dataset_selected = fit.transform(features)  


# Converte o dataset_selected em um DataFrame
dataset_reduced = pd.DataFrame(dataset_selected, columns=[array_features])


# Adiciona a coluna is_approved
dataset_target = pd.DataFrame(target, columns=['is_approved'])
dataset_reduced['is_approved'] = dataset_target


# Exibe o dataset reduzido
print("Número de características selecionadas: {}" .format(fit.n_features_))
dataset_reduced.head()

Número de características selecionadas: 20


Feature,school,reason_reputation,reason_course,Fjob_services,Mjob_teacher,Fedu_0,Medu_3,guardian_mother,absences,Dalc,...,higher,nursery,guardian_other,paid,age,famsup,Pstatus,traveltime,failures,is_approved
0,0.0,0.428571,0.0,0.333333,0.0,0.0,0.0,1.0,1.0,0.0,...,0.125,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.285714,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0625,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.1875,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.142857,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# Descrição das características selecionadas
Abaixo contém todas as características ordenadas pela prioridade de seleção

In [7]:
print('Características ordenadas pela prioridade de seleção')
dataset_features

Características ordenadas pela prioridade de seleção


Unnamed: 0,Feature,Features Selected,Features Ranking,Id column
0,school,True,1,0
1,reason_reputation,True,1,47
2,reason_course,True,1,44
3,Fjob_services,True,1,42
4,Mjob_teacher,True,1,38
5,Fedu_0,True,1,29
6,Medu_3,True,1,27
7,guardian_mother,True,1,49
8,absences,True,1,23
9,Dalc,True,1,20


# Exporta o dataset reduzido com os características selecionadas

In [8]:
dataset_reduced.to_csv("dataset-fs-recursive-feature.csv", index=False)
print("Rows: {}".format(len(dataset_reduced)))
print("Columns: {}" .format(len(dataset_reduced.columns)))
dataset_reduced.head()

Rows: 649
Columns: 21


Feature,school,reason_reputation,reason_course,Fjob_services,Mjob_teacher,Fedu_0,Medu_3,guardian_mother,absences,Dalc,...,higher,nursery,guardian_other,paid,age,famsup,Pstatus,traveltime,failures,is_approved
0,0.0,0.428571,0.0,0.333333,0.0,0.0,0.0,1.0,1.0,0.0,...,0.125,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.285714,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0625,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.1875,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.142857,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
