In [1]:
### Bibliotecas Utilizadas 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Carga da base original
dataset = pd.read_csv('dataset-normalizado.csv', header = 0)

# Reordena colocando a coluna is_approved no final da tabela
dataset = dataset[['school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
       'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid',
       'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'Medu_0',
       'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 'Fedu_2',
       'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other',
       'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course',
       'reason_home', 'reason_other', 'reason_reputation', 'guardian_father',
       'guardian_mother', 'guardian_other', 'is_approved']]

In [3]:
len(dataset.columns)

52

## Biblioteca para seleção de características

In [4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier

In [5]:
X = dataset.drop(['is_approved'], axis=1)
y = dataset.is_approved
feature_name = X.columns.tolist()

In [6]:
clf = DecisionTreeClassifier()

### Definimos que o threshold a ser considerado deve ser a mediana e o seletor deve selecionar as 10 melhores características

In [7]:
dt_selector = SelectFromModel(clf, threshold='median', max_features=10)

### Parâmetros utilizados no seletor de características

In [8]:
dt_selector.get_params()

{'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__presort': 'deprecated',
 'estimator__random_state': None,
 'estimator__splitter': 'best',
 'estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 'max_features': 10,
 'norm_order': 1,
 'prefit': Fals

In [9]:
dt_selector.fit(X, y)

SelectFromModel(estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 presort='deprecated',
                                                 random_state=None,
                                                 splitter='best'),
                max_feat

In [10]:
dt_support = dt_selector.get_support()

In [11]:
print(dt_support)

[False False  True False False False False  True  True False False  True
 False False  True False False False  True  True False  True  True  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False]


In [12]:
dt_threshold = dt_selector.threshold_
print(dt_threshold)

0.010579260031289953


In [13]:
dt_coef = dt_selector.estimator_.feature_importances_
print(dt_coef)

[0.01262789 0.0138429  0.0320128  0.00725656 0.00640024 0.00353181
 0.02001899 0.04294269 0.13103857 0.02086143 0.02272182 0.04753319
 0.00981059 0.01365213 0.0364111  0.00670274 0.0195834  0.01648933
 0.06009732 0.03328945 0.0273948  0.03252535 0.03254968 0.1451266
 0.00160537 0.00656472 0.01510884 0.00745525 0.01895676 0.
 0.01302303 0.         0.00065404 0.01350341 0.02949574 0.00623673
 0.00905493 0.00392424 0.0030586  0.01057926 0.00787566 0.00971975
 0.01060659 0.00149495 0.00842819 0.00526269 0.         0.00945243
 0.00379343 0.00777809 0.00194595]


### Apenas as características com `feature_importance >= dt_threshold` foram selecionadas

In [14]:
dt_features = X.loc[:,dt_support].columns.tolist()
print(str(len(dt_features)), 'selected features')
dt_features

10 selected features


['age',
 'studytime',
 'failures',
 'paid',
 'higher',
 'freetime',
 'goout',
 'Walc',
 'health',
 'absences']

## Criamos um novo dataset apenas com as características selecionadas e a coluna de target

In [15]:
fs_5_dataset = dataset[['age',
 'studytime',
 'failures',
 'paid',
 'higher',
 'freetime',
 'goout',
 'health',
 'absences',
 'Mjob_at_home',
 'is_approved']]

fs_5_dataset.head()

Unnamed: 0,age,studytime,failures,paid,higher,freetime,goout,health,absences,Mjob_at_home,is_approved
0,0.428571,0.333333,0.0,0.0,1.0,0.5,0.75,0.5,0.08,1.0,0.0
1,0.285714,0.333333,0.0,0.0,1.0,0.5,0.5,0.5,0.053333,1.0,0.0
2,0.0,0.333333,1.0,1.0,1.0,0.5,0.25,0.5,0.133333,1.0,0.0
3,0.0,0.666667,0.0,1.0,1.0,0.25,0.25,1.0,0.026667,0.0,1.0
4,0.142857,0.333333,0.0,1.0,1.0,0.5,0.25,1.0,0.053333,0.0,0.0


In [16]:
## Logistic Regression

# lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), '1.25*median')
# lr_selector.fit(X, y)
# embeded_lr_support = embeded_lr_selector.get_support()
# print(embeded_lr_support)

# embeded_lr_threshold = embeded_lr_selector.threshold_
# # print(embeded_lr_threshold)

# embeded_lr_coef = embeded_lr_selector.estimator_.coef_
# print(embeded_lr_coef)

# embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
# print(str(len(embeded_lr_feature)), 'selected features')

# embeded_lr_feature