In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
# plt.style.use('seaborn-colorblind')
# %matplotlib inline
from feature_selection import filter_method as ft

## Load Dataset

In [2]:
from sklearn.datasets import load_breast_cancer

# Keep the raw sklearn bunch under its own name so that `data` only ever
# refers to the assembled DataFrame.
cancer = load_breast_cancer()

# Stack the feature matrix and the label vector column-wise, then label the
# columns with the feature names followed by 'target'.
data = pd.DataFrame(
    np.c_[cancer['data'], cancer['target']],
    columns=np.append(cancer['feature_names'], ['target']),
)

In [3]:
# Preview the first five rows to sanity-check the assembled frame.
data.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [4]:
# Separate the predictors from the label column, then hold out 20% of the
# rows for testing (random_state fixed so the split is reproducible).
features = data.drop(columns=['target'])
labels = data['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)
(X_train.shape, X_test.shape)

((455, 30), (114, 30))

## Variance method
Removes features that show the same value for the majority or all of the observations (constant/quasi-constant features).

In [5]:
# Baseline check: the untouched training features contain no constant or
# quasi-constant column (the call reports 0 hits at the 0.9 threshold).
quasi_constant_feature = ft.constant_feature_detect(
    data=X_train,
    threshold=0.9,
)

0  variables are found to be almost constant


In [6]:
# Let's create a dummy variable that helps us do the demonstration:
# floor(worst smoothness * 10) collapses most rows onto a single value.
X_train['dummy'] = np.floor(X_train['worst smoothness']*10)
# More than 92% of the observations in `dummy` show one value, 1.0.
# Series / int is already true division, so no float cast is needed; the
# `np.float` alias used previously was deprecated in NumPy 1.20 and removed
# in 1.24, where it raises AttributeError.
X_train.dummy.value_counts() / len(X_train)

1.0    0.923077
0.0    0.068132
2.0    0.008791
Name: dummy, dtype: float64

In [7]:
# Re-run the detector now that `dummy` has been added to the training
# features, and display the list of flagged column names.
quasi_constant_feature = ft.constant_feature_detect(
    threshold=0.9,
    data=X_train,
)
quasi_constant_feature

1  variables are found to be almost constant


['dummy']

In [8]:
# Drop the quasi-constant column(s) that were flagged by the detector.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell idempotent: running it a second time no longer raises KeyError
# for a column that has already been removed.
X_train = X_train.drop(columns=quasi_constant_feature, errors='ignore')
print(X_train.shape)

(455, 30)


## Correlation method
Remove features that are highly correlated with each other.

In [9]:
# Collect the groups of mutually correlated features, using 0.9 as the
# correlation threshold.
corr = ft.corr_feature_detect(
    threshold=0.9,
    data=X_train,
)

# Print every correlated feature group, each followed by an empty line.
for group in corr:
    print(group, '\n')

          feature1         feature2      corr
0   mean perimeter      mean radius  0.998185
6   mean perimeter        mean area  0.986692
14  mean perimeter  worst perimeter  0.970507
19  mean perimeter     worst radius  0.969520
33  mean perimeter       worst area  0.941920 

           feature1      feature2      corr
12  perimeter error  radius error  0.978323
30  perimeter error    area error  0.944995 

          feature1             feature2      corr
36  mean concavity  mean concave points  0.914627 

        feature1       feature2      corr
38  mean texture  worst texture  0.908182 

                feature1             feature2      corr
40  worst concave points  mean concave points  0.906312 



We can then decide which feature(s) to remove from each correlated group.

## Mutual Information Filter
Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y.

In [10]:
# Keep the 3 features ranked highest by mutual information with the target.
mi = ft.mutual_info(
    X=X_train,
    y=y_train,
    select_k=3,
)
print(mi)

Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')


In [11]:
# A fractional select_k requests a share of the features rather than a
# count: here, the top 20% by mutual information.
mi = ft.mutual_info(
    y=y_train,
    X=X_train,
    select_k=0.2,
)
print(mi)

Index(['mean perimeter', 'mean concave points', 'worst radius',
       'worst perimeter', 'worst area', 'worst concave points'],
      dtype='object')


## Chi-Square Filter
Computes chi-squared statistics between each non-negative feature and the class label.

In [12]:
# Keep the 3 features ranked highest by the chi-square test.
chi = ft.chi_square_test(
    X=X_train,
    y=y_train,
    select_k=3,
)
print(chi)

Index(['mean area', 'area error', 'worst area'], dtype='object')


In [13]:
# A fractional select_k requests a share of the features rather than a
# count: here, the top 20% by the chi-square test.
chi = ft.chi_square_test(
    y=y_train,
    X=X_train,
    select_k=0.2,
)
print(chi)

Index(['mean perimeter', 'mean area', 'area error', 'worst radius',
       'worst perimeter', 'worst area'],
      dtype='object')


## Univariate ROC-AUC or MSE
Builds one decision tree per feature to predict the target, then makes predictions and ranks the features according to a machine learning metric (ROC-AUC or MSE).

In [18]:
# Score every feature on its own by ROC-AUC, fitting on the training split
# and evaluating on the held-out split.
# NOTE(review): the printed result also lists scores below 0.8, so the
# threshold does not appear to filter the returned series - confirm its
# role against feature_selection.filter_method.
uni_roc_auc = ft.univariate_roc_auc(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    threshold=0.8,
)
print(uni_roc_auc)

worst perimeter            0.917275
worst area                 0.895840
worst radius               0.893458
worst concave points       0.863131
mean concavity             0.856939
mean radius                0.849000
mean area                  0.839314
worst concavity            0.831375
mean perimeter             0.829628
mean concave points        0.826453
area error                 0.812321
worst compactness          0.742299
radius error               0.740235
mean compactness           0.734360
perimeter error            0.680534
worst texture              0.647666
worst fractal dimension    0.640997
concavity error            0.640203
worst symmetry             0.620991
concave points error       0.618133
compactness error          0.607336
mean symmetry              0.591775
mean texture               0.573357
texture error              0.568593
worst smoothness           0.565100
mean smoothness            0.557637
fractal dimension error    0.542077
smoothness error           0

In [17]:
# Score every feature on its own by MSE, fitting on the training split and
# evaluating on the held-out split.
# NOTE(review): the printed result lists values on both sides of 0.4, so
# the threshold does not appear to filter the returned series - confirm its
# role against feature_selection.filter_method.
uni_mse = ft.univariate_mse(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    threshold=0.4,
)
print(uni_mse)

mean fractal dimension     0.491228
symmetry error             0.480750
fractal dimension error    0.456140
smoothness error           0.449561
texture error              0.412281
worst smoothness           0.403265
mean smoothness            0.399123
mean texture               0.396930
mean symmetry              0.363060
compactness error          0.361842
concave points error       0.357456
worst fractal dimension    0.355263
worst symmetry             0.350877
worst texture              0.333333
concavity error            0.333333
perimeter error            0.300439
mean compactness           0.258772
worst compactness          0.254386
radius error               0.245614
area error                 0.179825
mean perimeter             0.166667
mean concave points        0.166667
worst concavity            0.162281
mean radius                0.146930
mean concavity             0.142544
mean area                  0.140351
worst concave points       0.123782
worst area                 0