# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.svm import SVC

# Load train and test dataset

In [2]:
training = pd.read_csv("training.csv")
columns = ['age','workclass','fnlgwt','education','education num','marital-status',
           'occupation','relationship','race','sex','capital-gain','capital-loss',
           'hours-per-week','native-country','salary-classification']
training=pd.read_csv("training.csv", names=columns)
test=pd.read_csv("test.csv", names=columns)

# Remove unnecessary columns

In [3]:
training = training.drop("education", 1)
test = test.drop("education", 1)

# Replace all '?' by NaN and drop rows containing the latter

In [4]:
for data in training:
    training[data].replace(' ?', np.nan, inplace=True)
    
for data in test:
    test[data].replace(' ?', np.nan, inplace=True)

training.dropna(inplace=True)
test.dropna(inplace=True)

# Turning non-numerical values into numerical

In [5]:
ord_enc = OrdinalEncoder()
for i in ['workclass','marital-status','occupation','relationship','race','sex','native-country','salary-classification']:
    training[i] = ord_enc.fit_transform(training[[i]]).astype(int)
    test[i] = ord_enc.fit_transform(test[[i]]).astype(int)

In [6]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']
train_target = training['salary-classification']
train_data = training.drop('salary-classification',1)

# Filter Methods

## Variance Threshold

In [7]:
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

In [8]:
selector = VarianceThreshold()
selector.fit_transform(X_train)
selector.variances_

array([7.30000000e+01, 9.09941854e-01, 1.47093600e+06, 6.50225850e+00,
       2.24397640e+00, 1.30000000e+01, 2.56419912e+00, 6.96716725e-01,
       2.19134909e-01, 9.99990000e+04, 4.35600000e+03, 9.80000000e+01,
       3.72743370e+01])

## SelectKBest

In [9]:
selector = SelectKBest(f_classif, k=3)
selector.fit(X_train, y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

clf_model = SVC()

clf_model.fit(X_train,y_train)
score = clf_model.score(X_test,y_test)
print(score)

*** 0 age
**  3 education num
*   6 relationship
0.7908366533864541


# Wrapper Methods

## Recursive Feature Elimination

In [10]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [32]:
model = LogisticRegression(solver='lbfgs',max_iter=5000)
rfe = RFE(model,step=1)
fit = rfe.fit(training,train_target)
print('Optimal number of features: {}'.format(rfe.n_features_))
print(fit.support_)
print(fit.ranking_)
cols = rfe.get_support(indices=True)
cols_names = list(training.columns[cols])
cols
selected_features = pd.DataFrame({'Feature':list(training.columns),'Ranking':rfe.ranking_})
selected_features.sort_values(by='Ranking')

Optimal number of features: 7
[False  True False  True  True False  True  True  True False False False
 False  True]
[2 1 8 1 1 6 1 1 1 7 5 3 4 1]


Unnamed: 0,Feature,Ranking
1,workclass,1
3,education num,1
4,marital-status,1
6,relationship,1
7,race,1
8,sex,1
13,salary-classification,1
0,age,2
11,hours-per-week,3
12,native-country,4


# Embedded methods

## Principal Component Analysis

In [12]:
from sklearn.decomposition import PCA

In [13]:
pca=PCA()
fit=pca.fit(train_data)
print(fit.explained_variance_ratio_)

[9.95095391e-01 4.89001984e-03 1.45550751e-05 1.57081135e-08
 1.21494136e-08 3.31342302e-09 1.45294851e-09 5.43252921e-10
 2.14460071e-10 1.71928522e-10 8.03390640e-11 5.95177689e-11
 1.20979930e-11]


## Feature Importance

In [14]:
from sklearn.ensemble import ExtraTreesClassifier

In [16]:
model = ExtraTreesClassifier(n_estimators=13)
model.fit(train_data, train_target)
model.feature_importances_
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(train_data.shape[1]):
    print("%d. feature %d - %s (%f)" % (f+1, indices[f], train_data.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature 2 - fnlgwt (0.164596)
2. feature 0 - age (0.151930)
3. feature 3 - education num (0.128076)
4. feature 11 - hours-per-week (0.095413)
5. feature 6 - relationship (0.095317)
6. feature 9 - capital-gain (0.091756)
7. feature 5 - occupation (0.077219)
8. feature 4 - marital-status (0.064008)
9. feature 1 - workclass (0.041455)
10. feature 8 - sex (0.030236)
11. feature 10 - capital-loss (0.028553)
12. feature 12 - native-country (0.015853)
13. feature 7 - race (0.015588)
