# Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.svm import SVC

# Load train and test dataset

In [2]:
training = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

# Remove unnecessary columns

In [3]:
training = training.drop("education", 1)
test = test.drop("education", 1)

# Replace all '?' by NaN and drop rows containing the latter

In [4]:
for data in training:
    training[data].replace(' ?', np.nan, inplace=True)
    
for data in test:
    test[data].replace(' ?', np.nan, inplace=True)

training.dropna(inplace=True)
test.dropna(inplace=True)

# Turning non-numerical values into numerical

In [5]:
ord_enc = OrdinalEncoder()
for i in ['workclass','marital-status','occupation','relationship','race','sex','native-country','salary-classification']:
    training[i] = ord_enc.fit_transform(training[[i]]).astype(int)
    test[i] = ord_enc.fit_transform(test[[i]]).astype(int)

In [6]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']

# **Filter methods**

# Variance threshold

In [7]:
selector = VarianceThreshold()
selector.fit_transform(X_train)
selector.variances_

array([7.30000000e+01, 9.09941854e-01, 1.47093600e+06, 6.50225850e+00,
       2.24397640e+00, 1.30000000e+01, 2.56419912e+00, 6.96716725e-01,
       2.19134909e-01, 9.99990000e+04, 4.35600000e+03, 9.80000000e+01,
       3.72743370e+01])

# SelectKBest

In [13]:
selector = SelectKBest(f_classif, k=3)
selector.fit(X_train, y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

clf_model = SVC()

clf_model.fit(X_train,y_train)
score = clf_model.score(X_test,y_test)
print(score)

*** 0 age
**  3 education-num
*   6 relationship
0.7908366533864541


# **Embedded methods**

# Feature Importance

In [9]:
model = ExtraTreesClassifier(n_estimators=13)
model.fit(X_train,y_train)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d - %s (%f)" % (f+1, indices[f], X_train.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature 2 - fnlwgt (0.164730)
2. feature 0 - age (0.162880)
3. feature 3 - education-num (0.125213)
4. feature 11 - hours-per-week (0.091299)
5. feature 9 - capital-gain (0.091088)
6. feature 4 - marital-status (0.087145)
7. feature 5 - occupation (0.076997)
8. feature 6 - relationship (0.067462)
9. feature 1 - workclass (0.040426)
10. feature 8 - sex (0.034084)
11. feature 10 - capital-loss (0.029511)
12. feature 12 - native-country (0.015384)
13. feature 7 - race (0.013781)
