# **Setup**

## Import notebooks

In [1]:
%%capture
# Note the python import here
import reuse, sys

# This is the Ipython hook
sys.meta_path.append(reuse.NotebookFinder())
from dataset_balancing import X_train, X_test, y_train, y_test

## Import libraries

In [2]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np

# **Feature selection**

## Filter Methods

### Variance Threshold

In [3]:
selector = VarianceThreshold()
selector.fit_transform(X_train)
selector.variances_

array([3.52729416e-01, 1.44055700e+06, 6.73852724e+00, 2.38703153e-01,
       2.24836271e-01, 1.08825049e-01, 1.95080065e-01, 9.99990000e+04,
       3.90000000e+03, 5.75279633e-01, 7.50358515e-02, 2.05433348e-01,
       1.28875976e-01, 3.32867282e-04, 1.17931456e-01, 2.49172327e-01,
       1.65654163e-01, 2.13662344e-02, 9.04472097e-02, 7.13395278e-02,
       5.80981246e-02])

### SelectKBest

In [4]:
selector = SelectKBest(f_classif, k=21)
selector.fit(X_train, y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

temp_train = X_train[cols_names]
temp_test = X_test[cols_names]

scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
values_standardized_train = scaler.fit_transform(temp_train.values)
values_standardized_test = scaler.fit_transform(temp_test.values)
temp_train = pd.DataFrame(values_standardized_train, columns = temp_train.columns)
temp_test = pd.DataFrame(values_standardized_test, columns = temp_test.columns)

clf_model = SVC(random_state=0)

clf_model.fit(temp_train,y_train)
score = clf_model.score(temp_test,y_test)
print(score)

********************* 0 age
********************  1 fnlwgt
*******************   2 education-num
******************    3 marital-status
*****************     4 occupation
****************      5 race
***************       6 sex
**************        7 capital-gain
*************         8 capital-loss
************          9 hours-per-week
***********           10 native-country
**********            11 workclass_Private
*********             12 workclass_gov
********              13 workclass_others
*******               14 workclass_selfempl
******                15 relationship_ Husband
*****                 16 relationship_ Not-in-family
****                  17 relationship_ Other-relative
***                   18 relationship_ Own-child
**                    19 relationship_ Unmarried
*                     20 relationship_ Wife
0.7693227091633467


## Wrapper Methods

### Recursive Feature Elimination

In [5]:
model = LogisticRegression(solver='lbfgs',max_iter=5000, random_state=0)
rfe = RFE(model,step=1)
fit = rfe.fit(X_train,y_train)
print('Optimal number of features: {}'.format(rfe.n_features_))
print(fit.support_)
print(fit.ranking_)
cols = rfe.get_support(indices=True)
cols_names = list(X_train.columns[cols])
cols
selected_features = pd.DataFrame({'Feature':list(X_train.columns),'Ranking':rfe.ranking_})
tabela = selected_features.sort_values(by='Ranking')
tabela.to_latex()
tabela

Optimal number of features: 10
[ True False False  True  True False  True False False  True False False
 False  True False  True False  True  True False  True]
[ 1 12  2  1  1  9  1 11 10  1  3  5  4  1  6  1  8  1  1  7  1]


Unnamed: 0,Feature,Ranking
0,age,1
18,relationship_ Own-child,1
17,relationship_ Other-relative,1
15,relationship_ Husband,1
13,workclass_others,1
9,hours-per-week,1
20,relationship_ Wife,1
4,occupation,1
3,marital-status,1
6,sex,1


## Embedded methods

### Principal Component Analysis

In [6]:
pca=PCA(random_state=0)
fit=pca.fit(X_train)
print(fit.explained_variance_ratio_)

[9.90435135e-01 9.54483595e-03 2.00282945e-05 5.88927030e-10
 6.80909694e-11 4.06950846e-11 3.04007114e-11 2.34911629e-11
 1.60073833e-11 1.38173897e-11 1.23285073e-11 1.03572695e-11
 9.71851162e-12 6.73610605e-12 5.71120937e-12 4.22135527e-12
 2.20138933e-12 5.58508122e-13 3.92330257e-14 5.60404743e-42
 1.01975070e-42]


### Feature Importance

In [7]:
model = ExtraTreesClassifier(n_estimators=21, random_state=0)
model.fit(X_train, y_train)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d - %s (%f)" % (f + 1, indices[f], X_train.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature 1 - fnlwgt (0.330851)
2. feature 2 - education-num (0.115385)
3. feature 3 - marital-status (0.097309)
4. feature 15 - relationship_ Husband (0.065748)
5. feature 7 - capital-gain (0.064775)
6. feature 4 - occupation (0.064027)
7. feature 0 - age (0.052612)
8. feature 9 - hours-per-week (0.048835)
9. feature 16 - relationship_ Not-in-family (0.030828)
10. feature 18 - relationship_ Own-child (0.026898)
11. feature 8 - capital-loss (0.021018)
12. feature 20 - relationship_ Wife (0.017708)
13. feature 6 - sex (0.015783)
14. feature 5 - race (0.010138)
15. feature 19 - relationship_ Unmarried (0.009578)
16. feature 10 - native-country (0.009058)
17. feature 11 - workclass_Private (0.006268)
18. feature 14 - workclass_selfempl (0.004944)
19. feature 12 - workclass_gov (0.004864)
20. feature 17 - relationship_ Other-relative (0.003096)
21. feature 13 - workclass_others (0.000276)


## Drop irrelevant features

In [8]:
X_train = X_train.drop(['fnlwgt', 'relationship_ Not-in-family', 'workclass_selfempl'],1)
X_test = X_test.drop(['fnlwgt', 'relationship_ Not-in-family', 'workclass_selfempl'],1)