# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Load train and test dataset

In [2]:
training = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

# Remove unnecessary columns

In [3]:
training = training.drop("education", 1)
test = test.drop("education", 1)

# Replace all '?' by NaN and drop rows containing the latter

In [4]:
for data in training:
    training[data].replace(' ?', np.nan, inplace=True)
    
for data in test:
    test[data].replace(' ?', np.nan, inplace=True)

training.dropna(inplace=True)
test.dropna(inplace=True)

# Turning non-numerical values into numerical

In [5]:
ord_enc = OrdinalEncoder()
for i in ['workclass','marital-status','occupation','relationship','race','sex','native-country','salary-classification']:
    training[i] = ord_enc.fit_transform(training[[i]]).astype(int)
    test[i] = ord_enc.fit_transform(test[[i]]).astype(int)

In [6]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']

# Filter Methods

## Variance Threshold

In [7]:
selector = VarianceThreshold()
selector.fit_transform(X_train)
selector.variances_

array([7.30000000e+01, 9.09941854e-01, 1.47093600e+06, 6.50225850e+00,
       2.24397640e+00, 1.30000000e+01, 2.56419912e+00, 6.96716725e-01,
       2.19134909e-01, 9.99990000e+04, 4.35600000e+03, 9.80000000e+01,
       3.72743370e+01])

## SelectKBest

In [8]:
selector = SelectKBest(f_classif, k=13)
selector.fit(X_train, y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

X_train = X_train[cols_names]
X_test = X_test[cols_names]

scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
values_standardized_train = scaler.fit_transform(X_train.values)
values_standardized_test = scaler.fit_transform(X_test.values)
X_train = pd.DataFrame(values_standardized_train, columns = X_train.columns)
X_test = pd.DataFrame(values_standardized_test, columns = X_test.columns)

clf_model = SVC()

clf_model.fit(X_train,y_train)
score = clf_model.score(X_test,y_test)
print(score)

************* 0 age
************  1 workclass
***********   2 fnlwgt
**********    3 education-num
*********     4 marital-status
********      5 occupation
*******       6 relationship
******        7 race
*****         8 sex
****          9 capital-gain
***           10 capital-loss
**            11 hours-per-week
*             12 native-country
0.8446879150066401


# Wrapper Methods

## Recursive Feature Elimination

In [13]:
model = LogisticRegression(solver='lbfgs',max_iter=5000)
rfe = RFE(model,step=1)
fit = rfe.fit(X_train,y_train)
print('Optimal number of features: {}'.format(rfe.n_features_))
print(fit.support_)
print(fit.ranking_)
cols = rfe.get_support(indices=True)
cols_names = list(X_train.columns[cols])
cols
selected_features = pd.DataFrame({'Feature':list(X_train.columns),'Ranking':rfe.ranking_})
tabela = selected_features.sort_values(by='Ranking')
tabela.to_latex()
tabela

Optimal number of features: 6
[ True False False  True  True False False False False  True  True  True
 False]
[1 4 3 1 1 8 5 6 2 1 1 1 7]


Unnamed: 0,Feature,Ranking
0,age,1
3,education-num,1
4,marital-status,1
9,capital-gain,1
10,capital-loss,1
11,hours-per-week,1
8,sex,2
2,fnlwgt,3
1,workclass,4
6,relationship,5


# Embedded methods

## Principal Component Analysis

In [None]:
pca=PCA()
fit=pca.fit(X_train)
print(fit.explained_variance_ratio_)

## Feature Importance

In [None]:
model = ExtraTreesClassifier(n_estimators=13)
model.fit(X_train, y_train)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d - %s (%f)" % (f+1, indices[f], X_train.columns[indices[f]], importances[indices[f]]))