# **Setup**

## Import notebooks

In [1]:
%%capture
# Note the python import here
import reuse, sys

# This is the Ipython hook
sys.meta_path.append(reuse.NotebookFinder())
from visualization import training, test

## Import libraries

In [2]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np

## Define X_train, y_train, X_test, y_test

In [3]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']

In [4]:
#pd.crosstab(training['occupation'], training['salary-classification'])

# **Feature selection**

## Filter Methods

### Variance Threshold

In [5]:
selector = VarianceThreshold()
selector.fit_transform(X_train)
selector.variances_

array([3.90469751e-01, 1.47093600e+06, 6.50225850e+00, 2.49570622e-01,
       1.95351134e-01, 1.20550821e-01, 2.19134909e-01, 9.99990000e+04,
       4.35600000e+03, 5.83263907e-01, 8.03582675e-02, 1.92937907e-01,
       1.21978296e-01, 4.63944757e-04, 1.04427468e-01, 2.42466115e-01,
       1.90537237e-01, 2.86054459e-02, 1.26143237e-01, 9.51511485e-02,
       4.44419928e-02])

In [6]:
X_train

Unnamed: 0,age,fnlwgt,education-num,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,...,workclass_Private,workclass_gov,workclass_others,workclass_selfempl,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife
0,1,77516,13,1,0,0,0,2174,0,1,...,0,1,0,0,0,1,0,0,0,0
1,1,83311,13,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,1,215646,9,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
3,1,234721,7,0,0,1,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
4,0,338409,13,0,1,1,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,257302,12,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
32557,1,154374,9,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
32558,2,151910,9,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
32559,0,201490,9,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


### SelectKBest

In [7]:
selector = SelectKBest(f_classif, k=21)
selector.fit(X_train, y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

X_train = X_train[cols_names]
X_test = X_test[cols_names]

scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
values_standardized_train = scaler.fit_transform(X_train.values)
values_standardized_test = scaler.fit_transform(X_test.values)
X_train = pd.DataFrame(values_standardized_train, columns = X_train.columns)
X_test = pd.DataFrame(values_standardized_test, columns = X_test.columns)

clf_model = SVC()

clf_model.fit(X_train,y_train)
score = clf_model.score(X_test,y_test)
print(score)

X_train

********************* 0 age
********************  1 fnlwgt
*******************   2 education-num
******************    3 marital-status
*****************     4 occupation
****************      5 race
***************       6 sex
**************        7 capital-gain
*************         8 capital-loss
************          9 hours-per-week
***********           10 native-country
**********            11 workclass_Private
*********             12 workclass_gov
********              13 workclass_others
*******               14 workclass_selfempl
******                15 relationship_ Husband
*****                 16 relationship_ Not-in-family
****                  17 relationship_ Other-relative
***                   18 relationship_ Own-child
**                    19 relationship_ Unmarried
*                     20 relationship_ Wife
0.8359229747675962


Unnamed: 0,age,fnlwgt,education-num,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,...,workclass_Private,workclass_gov,workclass_others,workclass_selfempl,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife
0,0.5,0.043338,0.800000,1.0,0.0,0.0,0.0,0.021740,0.0,0.333333,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.5,0.047277,0.800000,0.0,1.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.5,0.137244,0.533333,1.0,0.0,0.0,0.0,0.000000,0.0,0.333333,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.5,0.150212,0.400000,0.0,0.0,1.0,0.0,0.000000,0.0,0.333333,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.220703,0.800000,0.0,1.0,1.0,1.0,0.000000,0.0,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30157,0.0,0.165563,0.733333,0.0,0.0,0.0,1.0,0.000000,0.0,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30158,0.5,0.095589,0.533333,0.0,0.0,0.0,0.0,0.000000,0.0,0.333333,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30159,1.0,0.093914,0.533333,1.0,0.0,0.0,1.0,0.000000,0.0,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
30160,0.0,0.127620,0.533333,1.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Wrapper Methods

### Recursive Feature Elimination

In [8]:
model = LogisticRegression(solver='lbfgs',max_iter=5000)
rfe = RFE(model,step=1)
fit = rfe.fit(X_train,y_train)
print('Optimal number of features: {}'.format(rfe.n_features_))
print(fit.support_)
print(fit.ranking_)
cols = rfe.get_support(indices=True)
cols_names = list(X_train.columns[cols])
cols
selected_features = pd.DataFrame({'Feature':list(X_train.columns),'Ranking':rfe.ranking_})
tabela = selected_features.sort_values(by='Ranking')
tabela.to_latex()
tabela

Optimal number of features: 10
[ True  True  True False False False False  True  True  True False False
 False  True False  True False False  True False  True]
[ 1  1  1  2  4 12  3  1  1  1  6 10  9  1 11  1  8  5  1  7  1]


Unnamed: 0,Feature,Ranking
0,age,1
18,relationship_ Own-child,1
15,relationship_ Husband,1
13,workclass_others,1
9,hours-per-week,1
8,capital-loss,1
7,capital-gain,1
20,relationship_ Wife,1
2,education-num,1
1,fnlwgt,1


## Embedded methods

### Principal Component Analysis

In [9]:
pca=PCA()
fit=pca.fit(X_train)
print(fit.explained_variance_ratio_)

[2.93344089e-01 1.40114212e-01 9.75515281e-02 8.81556660e-02
 7.45648421e-02 5.91115540e-02 5.39834678e-02 4.73938185e-02
 3.31258099e-02 3.04396041e-02 2.30376430e-02 2.27484563e-02
 1.51128669e-02 9.06851392e-03 3.83205425e-03 3.49433735e-03
 2.37451159e-03 2.26928837e-03 2.77737014e-04 1.88013746e-31
 4.92785749e-33]


### Feature Importance

In [10]:
model = ExtraTreesClassifier(n_estimators=21)
model.fit(X_train, y_train)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d - %s (%f)" % (f + 1, indices[f], X_train.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature 1 - fnlwgt (0.397747)
2. feature 2 - education-num (0.111239)
3. feature 3 - marital-status (0.098822)
4. feature 7 - capital-gain (0.086502)
5. feature 4 - occupation (0.066446)
6. feature 9 - hours-per-week (0.041729)
7. feature 15 - relationship_ Husband (0.041689)
8. feature 0 - age (0.040968)
9. feature 8 - capital-loss (0.029335)
10. feature 16 - relationship_ Not-in-family (0.015490)
11. feature 18 - relationship_ Own-child (0.011492)
12. feature 6 - sex (0.010826)
13. feature 20 - relationship_ Wife (0.009970)
14. feature 5 - race (0.008277)
15. feature 10 - native-country (0.008097)
16. feature 19 - relationship_ Unmarried (0.006413)
17. feature 11 - workclass_Private (0.004811)
18. feature 14 - workclass_selfempl (0.004089)
19. feature 12 - workclass_gov (0.004080)
20. feature 17 - relationship_ Other-relative (0.001859)
21. feature 13 - workclass_others (0.000117)
