# **Setup**

## Import notebooks

In [1]:
%%capture
# Note the python import here
import reuse, sys

# This is the Ipython hook
sys.meta_path.append(reuse.NotebookFinder())
from dataset_balancing import X_train, X_test, y_train, y_test

## Import libraries

In [2]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np

# **Feature selection**

## Filter Methods

### Variance Threshold

In [3]:
selector = VarianceThreshold()
selector.fit_transform(X_train)
selector.variances_

array([3.90469751e-01, 1.47093600e+06, 6.50225850e+00, 2.49570622e-01,
       1.95351134e-01, 1.20550821e-01, 2.19134909e-01, 9.99990000e+04,
       4.35600000e+03, 5.83263907e-01, 8.03582675e-02, 1.92937907e-01,
       1.21978296e-01, 4.63944757e-04, 1.04427468e-01, 2.42466115e-01,
       1.90537237e-01, 2.86054459e-02, 1.26143237e-01, 9.51511485e-02,
       4.44419928e-02])

### SelectKBest

In [4]:
selector = SelectKBest(f_classif, k=21)
selector.fit(X_train, y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

temp_train = X_train[cols_names]
temp_test = X_test[cols_names]

scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
values_standardized_train = scaler.fit_transform(temp_train.values)
values_standardized_test = scaler.fit_transform(temp_test.values)
temp_train = pd.DataFrame(values_standardized_train, columns = temp_train.columns)
temp_test = pd.DataFrame(values_standardized_test, columns = temp_test.columns)

clf_model = SVC()

clf_model.fit(temp_train,y_train)
score = clf_model.score(temp_test,y_test)
print(score)

********************* 0 age
********************  1 fnlwgt
*******************   2 education-num
******************    3 marital-status
*****************     4 occupation
****************      5 race
***************       6 sex
**************        7 capital-gain
*************         8 capital-loss
************          9 hours-per-week
***********           10 native-country
**********            11 workclass_Private
*********             12 workclass_gov
********              13 workclass_others
*******               14 workclass_selfempl
******                15 relationship_ Husband
*****                 16 relationship_ Not-in-family
****                  17 relationship_ Other-relative
***                   18 relationship_ Own-child
**                    19 relationship_ Unmarried
*                     20 relationship_ Wife
0.8359229747675962


In [5]:
pd.set_option('display.max_columns',50)
X_train.head()

Unnamed: 0,age,fnlwgt,education-num,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,workclass_Private,workclass_gov,workclass_others,workclass_selfempl,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife
0,1,77516,13,1,0,0,0,2174,0,1,0,0,1,0,0,0,1,0,0,0,0
1,1,83311,13,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
2,1,215646,9,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0
3,1,234721,7,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0
4,0,338409,13,0,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1


## Wrapper Methods

### Recursive Feature Elimination

In [6]:
model = LogisticRegression(solver='lbfgs',max_iter=5000)
rfe = RFE(model,step=1)
fit = rfe.fit(X_train,y_train)
print('Optimal number of features: {}'.format(rfe.n_features_))
print(fit.support_)
print(fit.ranking_)
cols = rfe.get_support(indices=True)
cols_names = list(X_train.columns[cols])
cols
selected_features = pd.DataFrame({'Feature':list(X_train.columns),'Ranking':rfe.ranking_})
tabela = selected_features.sort_values(by='Ranking')
tabela.to_latex()
tabela

Optimal number of features: 10
[ True False False  True  True False  True False False  True False False
 False  True False  True False  True  True False  True]
[ 1 12  2  1  1  9  1 11 10  1  3  6  7  1  8  1  5  1  1  4  1]


Unnamed: 0,Feature,Ranking
0,age,1
18,relationship_ Own-child,1
17,relationship_ Other-relative,1
15,relationship_ Husband,1
13,workclass_others,1
9,hours-per-week,1
20,relationship_ Wife,1
4,occupation,1
3,marital-status,1
6,sex,1


## Embedded methods

### Principal Component Analysis

In [7]:
pca=PCA()
fit=pca.fit(X_train)
print(fit.explained_variance_ratio_)

[9.95095424e-01 4.89001982e-03 1.45549767e-05 5.72084675e-10
 7.14526794e-11 4.15036059e-11 3.33964274e-11 2.31616435e-11
 1.76309685e-11 1.31882121e-11 1.20419291e-11 1.12753088e-11
 1.03728446e-11 8.53501662e-12 6.31137939e-12 4.51555007e-12
 2.99302136e-12 6.93257553e-13 5.50341227e-14 3.73904504e-41
 1.04040741e-42]


### Feature Importance

In [8]:
model = ExtraTreesClassifier(n_estimators=21)
model.fit(X_train, y_train)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d - %s (%f)" % (f + 1, indices[f], X_train.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature 1 - fnlwgt (0.399386)
2. feature 2 - education-num (0.112862)
3. feature 7 - capital-gain (0.085794)
4. feature 3 - marital-status (0.074855)
5. feature 4 - occupation (0.062269)
6. feature 15 - relationship_ Husband (0.050354)
7. feature 9 - hours-per-week (0.043479)
8. feature 0 - age (0.041123)
9. feature 8 - capital-loss (0.029579)
10. feature 18 - relationship_ Own-child (0.019238)
11. feature 16 - relationship_ Not-in-family (0.017224)
12. feature 6 - sex (0.014859)
13. feature 20 - relationship_ Wife (0.011242)
14. feature 5 - race (0.008214)
15. feature 10 - native-country (0.007552)
16. feature 19 - relationship_ Unmarried (0.006917)
17. feature 11 - workclass_Private (0.004773)
18. feature 14 - workclass_selfempl (0.004261)
19. feature 12 - workclass_gov (0.003700)
20. feature 17 - relationship_ Other-relative (0.002174)
21. feature 13 - workclass_others (0.000145)
