# Seleção de Características - Método Feature Importance
----------------------------------------------------------------------------------------------------------------------


##### Método do tipo EMBUTIDO

# Bibliotecas Necessárias

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualize
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

# Carga base de dados normalizada

In [2]:
# Carga da base original
dataset = pd.read_csv('dataset-normalizado.csv', header = 0)

# Reordena colocando a coluna is_approved no final do dataset
dataset = dataset[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
       'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
       'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'Medu_0',
       'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 'Fedu_2',
       'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other',
       'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course',
       'reason_home', 'reason_other', 'reason_reputation', 'guardian_father',
       'guardian_mother', 'guardian_other', 'is_approved']]


# Nome das características do dataset original
columns = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'traveltime', 'studytime', 'failures', 'schoolsup', 
            'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 
            'Dalc', 'Walc', 'health', 'absences', 'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 
            'Fedu_2', 'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 
            'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 
            'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other']

In [3]:
print("Rows: {}".format(len(dataset)))
print("Columns: {}" .format(len(dataset.columns)))

dataset.head()

Rows: 649
Columns: 52


Unnamed: 0,school,sex,age,address,famsize,Pstatus,traveltime,studytime,failures,schoolsup,...,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other,is_approved
0,0.0,0.0,0.428571,0.0,0.0,0.0,0.333333,0.333333,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.285714,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.333333,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.142857,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# Aplicação da seleção de características
Seleciona as k_features

In [4]:
k_features = 20

In [5]:
# Gera o array com os valores do dataset
array = dataset.values

# Separa o dataset original em atributos e classe
features = array[:, 0:51]
target = array[:, 51]

# Cria o classificador que é utilizado para avaliar o subconjunto de características
# fit an Extra Trees model to the data
model = ExtraTreesClassifier(n_estimators = 10)
model.fit(features, target)


# Obtém as caracteristicas selecionadas e o seu ranking de priorização
features_importance = model.feature_importances_


# Gera a tabela com as informações das características selecionadas
dataset_features = pd.DataFrame(columns, columns=['Feature'])
dataset_features['Feature Importance'] = features_importance


# Ordenadas pela importância das características
dataset_features = dataset_features.sort_values(by='Feature Importance', ascending=False)


# Gera a coluna com o id da coluna original
dataset_features['Id column'] = dataset_features.index


# Reinicia o número da linha
dataset_features.reset_index(drop=True, inplace=True)


# Seleciona as k_features com maior score
dataset_k_features = dataset_features.head(k_features).copy()


# Exibe a tabela com as características selecionadas
print("Número de características selecionadas: {}" .format(k_features))
dataset_k_features

Número de características selecionadas: 20


Unnamed: 0,Feature,Feature Importance,Id column
0,guardian_other,0.223246,50
1,guardian_mother,0.206727,49
2,age,0.05678,2
3,failures,0.047443,8
4,absences,0.025092,23
5,Dalc,0.02112,20
6,activities,0.021082,12
7,goout,0.01807,19
8,freetime,0.017657,18
9,Mjob_services,0.016658,37


# Gera o dataset com as características selecionadas

In [6]:
# Obtém o nome das características selecionadas
array_features = dataset_k_features['Feature'].copy()


# Reduz o dataset para o número de características selecionadas
dataset_reduced = dataset[array_features].copy()


# Adiciona a coluna is_approved
dataset_target = pd.DataFrame(target, columns=['is_approved'])
dataset_reduced['is_approved'] = dataset_target


# Exibe o dataset reduzido
print("Número de características selecionadas: {}" .format(k_features))
dataset_reduced.head()

Número de características selecionadas: 20


Unnamed: 0,guardian_other,guardian_mother,age,failures,absences,Dalc,activities,goout,freetime,Mjob_services,...,studytime,nursery,famrel,romantic,sex,Fedu_2,traveltime,famsize,Pstatus,is_approved
0,1.0,0.0,0.428571,0.0,0.125,0.0,0.0,0.75,0.5,0.0,...,0.333333,1.0,0.75,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
1,0.0,1.0,0.285714,0.0,0.0625,0.0,0.0,0.5,0.5,0.0,...,0.333333,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.1875,0.25,0.0,0.25,0.5,0.0,...,0.333333,1.0,0.75,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.25,0.0,...,0.666667,1.0,0.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.142857,0.0,0.0,0.0,0.0,0.25,0.5,1.0,...,0.333333,1.0,0.75,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Descrição das características
Abaixo contém todas as características ordenadas pela prioridade de seleção

In [7]:
print('Características ordenadas pela prioridade de seleção')
dataset_features

Características ordenadas pela prioridade de seleção


Unnamed: 0,Feature,Feature Importance,Id column
0,guardian_other,0.223246,50
1,guardian_mother,0.206727,49
2,age,0.05678,2
3,failures,0.047443,8
4,absences,0.025092,23
5,Dalc,0.02112,20
6,activities,0.021082,12
7,goout,0.01807,19
8,freetime,0.017657,18
9,Mjob_services,0.016658,37


# Exporta o dataset reduzido com os características selecionadas

In [8]:
dataset_reduced.to_csv("dataset-fs-feature_importance.csv", index=False)
print("Rows: {}".format(len(dataset_reduced)))
print("Columns: {}" .format(len(dataset_reduced.columns)))
dataset_reduced.head()

Rows: 649
Columns: 21


Unnamed: 0,guardian_other,guardian_mother,age,failures,absences,Dalc,activities,goout,freetime,Mjob_services,...,studytime,nursery,famrel,romantic,sex,Fedu_2,traveltime,famsize,Pstatus,is_approved
0,1.0,0.0,0.428571,0.0,0.125,0.0,0.0,0.75,0.5,0.0,...,0.333333,1.0,0.75,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
1,0.0,1.0,0.285714,0.0,0.0625,0.0,0.0,0.5,0.5,0.0,...,0.333333,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.1875,0.25,0.0,0.25,0.5,0.0,...,0.333333,1.0,0.75,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.25,0.0,...,0.666667,1.0,0.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.142857,0.0,0.0,0.0,0.0,0.25,0.5,1.0,...,0.333333,1.0,0.75,0.0,0.0,0.0,0.0,0.0,1.0,0.0
