# Seleção de Características - Método Feature Importance

### Bibliotecas Utilizadas

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualize
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# Carga da base original
dataset = pd.read_csv('dataset-normalizado.csv', header = 0)

# Reordena colocando a coluna is_approved no final da tabela
dataset = dataset[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
       'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
       'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'Medu_0',
       'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 'Fedu_2',
       'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other',
       'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course',
       'reason_home', 'reason_other', 'reason_reputation', 'guardian_father',
       'guardian_mother', 'guardian_other', 'is_approved']]

In [3]:
print("Rows: {}".format(len(dataset)))
print("Columns: {}" .format(len(dataset.columns)))

dataset.head()

Rows: 1044
Columns: 52


Unnamed: 0,school,sex,age,address,famsize,Pstatus,traveltime,studytime,failures,schoolsup,...,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other,is_approved
0,0.0,0.0,0.428571,0.0,0.0,0.0,0.333333,0.333333,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.285714,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.333333,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.142857,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


### Aplicação da Seleção de Características

In [30]:
# Gera o array com os valores do dataset
array = dataset.values

# Separa o dataset original em atributos e classe
features = array[:, 0:51]
target = array[:, 51]

# Cria o classificador que é utilizado para avaliar o subconjunto de características
# fit an Extra Trees model to the data
model = ExtraTreesClassifier(n_estimators=10)
model.fit(features, target)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Descrição das características selecionadas

In [25]:
# Nome das características do dataset original
columns = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'traveltime', 'studytime', 'failures', 'schoolsup', 
            'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 
            'Dalc', 'Walc', 'health', 'absences', 'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 
            'Fedu_2', 'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 
            'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 
            'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other']


# Obtém as caracteristicas selecionadas e o seu ranking de priorização
features_importance = model.feature_importances_


# Gera a tabela com as informações das características selecionadas
dataset_features = pd.DataFrame(columns, columns=['Features'])
dataset_features['Feature Importance'] = features_importance


# Exibe a tabela de características, ordenadas pela importância das características
dataset_features.sort_values(by='Feature Importance', ascending=False)

Unnamed: 0,Features,Feature Importance
23,absences,0.11132
8,failures,0.070457
11,paid,0.051104
14,higher,0.040491
21,Walc,0.033648
0,school,0.03078
7,studytime,0.030471
2,age,0.029106
19,goout,0.027324
22,health,0.024918
