# Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.svm import SVC

# Load train and test dataset

In [4]:
training = pd.read_csv("training.csv")
columns = ['age','workclass','fnlgwt','education','education num','marital-status',
           'occupation','relationship','race','sex','capital-gain','capital-loss',
           'hours-per-week','native-country','salary-classification']
training=pd.read_csv("training.csv", names=columns)
test=pd.read_csv("test.csv", names=columns)

# Remove unnecessary columns

In [5]:
training = training.drop("education", 1)
test = test.drop("education", 1)

# Replace all '?' by NaN and drop rows containing the latter

In [6]:
for data in training:
    training[data].replace(' ?', np.nan, inplace=True)
    
for data in test:
    test[data].replace(' ?', np.nan, inplace=True)

training.dropna(inplace=True)
test.dropna(inplace=True)

# Turning non-numerical values into numerical

In [7]:
ord_enc = OrdinalEncoder()
for i in ['workclass','marital-status','occupation','relationship','race','sex','native-country','salary-classification']:
    training[i] = ord_enc.fit_transform(training[[i]]).astype(int)
    test[i] = ord_enc.fit_transform(test[[i]]).astype(int)

In [8]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']
train_target = training['salary-classification']
train_data = training.drop('salary-classification',1)

# Filter Methods

## Variance Threshold

In [9]:
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

In [10]:
selector = VarianceThreshold()
selector.fit_transform(X_train)
selector.variances_

array([7.30000000e+01, 9.09941854e-01, 1.47093600e+06, 6.50225850e+00,
       2.24397640e+00, 1.30000000e+01, 2.56419912e+00, 6.96716725e-01,
       2.19134909e-01, 9.99990000e+04, 4.35600000e+03, 9.80000000e+01,
       3.72743370e+01])

## SelectKBest

In [11]:
selector = SelectKBest(f_classif, k=3)
selector.fit(X_train, y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

clf_model = SVC()

clf_model.fit(X_train,y_train)
score = clf_model.score(X_test,y_test)
print(score)

*** 0 age
**  3 education num
*   6 relationship
0.7908366533864541


# Wrapper Methods

## Recursive Feature Elimination

In [12]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [13]:
model = LogisticRegression(solver='lbfgs',max_iter=5000)
rfe = RFE(model,10)
fit = rfe.fit(training,train_target)
print(fit.support_)
print(fit.ranking_)
cols = rfe.get_support(indices=True)
cols_names = list(training.columns[cols])
cols



[ True  True False  True  True False  True  True  True False False  True
  True  True]
[1 1 5 1 1 3 1 1 1 4 2 1 1 1]


array([ 0,  1,  3,  4,  6,  7,  8, 11, 12, 13], dtype=int64)

# Embedded methods

## Principal Component Analysis

In [14]:
from sklearn.decomposition import PCA

In [15]:
pca=PCA()
fit=pca.fit(train_data)
print(fit.explained_variance_ratio_)

[9.95095391e-01 4.89001984e-03 1.45550751e-05 1.57081135e-08
 1.21494136e-08 3.31342302e-09 1.45294851e-09 5.43252921e-10
 2.14460071e-10 1.71928522e-10 8.03390640e-11 5.95177689e-11
 1.20979930e-11]


## Feature Importance

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

In [17]:
model = ExtraTreesClassifier(n_estimators=19)
model.fit(train_data, train_target)
model.feature_importances_

array([0.1584243 , 0.03823948, 0.16483512, 0.12344247, 0.0600643 ,
       0.07664454, 0.09212139, 0.01330109, 0.03602807, 0.09886301,
       0.03034297, 0.09338521, 0.01430804])