In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


from sklearn.feature_selection import *
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import *
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.pipeline import make_pipeline

from get_data import *

In [2]:
# Select features according having k highest-score values or statistics
class FeaturesWithSelectKBest(TransformerMixin, BaseEstimator):
    
    def __init__(self, score_func=f_classif):
        self.score_func = score_func     # the method to use (For Regression lookup scikit_learn.feature_selection)
        return None
    
    def fit(self, df, y):
        num_features = int(2*df.shape[1]/3 + 0.5)
        self.selector = SelectKBest(self.score_func, k=num_features)

         # fit transform based on score_func and number of features to select
        self.selector = self.selector.fit(df, y)
        return self
    
    def transform(self, df, y=None):
        # call the fit_transform method and get data using best features 
        return self.selector.transform(df) 

# Select features with high correlation with target and low correlation with other features
class FeaturesWithCorrelations(TransformerMixin, BaseEstimator):
    
    def __init__(self, pval_X_y = 0.5, pval_X_X=0.75):
        self.pval_X_y = pval_X_y   # cut-off value for feature-target correlation
        self.pval_X_X = pval_X_X   # cut-off value for feature-feature correlation
        return None
    
    def fit(self, df, y):
        df2 = pd.DataFrame(df) # This ensures the column names are k=0,1,2 same as array indexing df[:, k]
                
        # Select feature_index that are correlated (wrt pval) with the target variable
        self.corr_X_y = [col for col in df2.columns if np.abs(pearsonr(df2[col], y)[0]) > self.pval_X_y] 

        # Compute feature/feature correlation for each feature
        corr_matrix = df2.corr().abs()
             
        # select feature_index with low feature/feature correlation
        self.corr_X_X = [col for col in corr_matrix if (corr_matrix[col][0:col-1] < self.pval_X_X).all()]
        return self
    
    def transform(self, df, y=None):
        # select features with high feature-target correlation and low feature-feature-correlation
        keep_index  = np.array(list(set(self.corr_X_y) & set(self.corr_X_X)))
        if len(keep_index) == 0: # Either self>corr_X_y is empty or self.corr_X_X.empty
            keep_index  = np.array(list(set(self.corr_X_y) | set(self.corr_X_X)))  # Select that which is not empty
        return df[:, keep_index]        
    
    
class RecursiveFeatureElimination(TransformerMixin, BaseEstimator):
    
    def __init__(self, num_features, model=LogisticRegression()):
        self.model = model               # model to use for feature elimination
        self.num_features = num_features # num_features to select
        return None
    
    def fit(self, df, y):               
        # fit the data and get the best features
        rfecv = RFECV(self.model, min_features_to_select=self.num_features)
        self.rfecv = rfecv.fit(df, y.ravel())  
        return self
        
    def transform(self, df, y=None):
        # Get the best features 
        selections = self.rfecv.get_support()
        return df[:, selections]
    
    

class FeaturesWithSelectFromModel(TransformerMixin, BaseEstimator):
    
    def __init__(self, base_model = ExtraTreesClassifier(n_estimators=50)):
        self.base_model = base_model
        self.selector = SelectFromModel(estimator=self.base_model)
        return None
        
    def fit(self, df, y):
        # fit transform based on score_func and number of features to select
        self.selector = self.selector.fit(df, y) 
        return self
    
    def transform(self, df, y=None):    
        # Get the best features after fit       
        return self.selector.transform(df)

In [3]:
def experiment(df, y):
    # Make pipeline with StandardScaler for each feature_selection method
    # The standardScaler is used in the event we have some features with high cardinality
    selectKBest_f_classif = make_pipeline(StandardScaler(),\
                                           FeaturesWithSelectKBest(),\
                                           LogisticRegression())
      
    selectKBest_mutual_info_classif = make_pipeline(StandardScaler(),\
                                           FeaturesWithSelectKBest(score_func=mutual_info_classif),\
                                           LogisticRegression())
    selectFromModel = make_pipeline(StandardScaler(),\
                                    FeaturesWithSelectFromModel(),\
                                    LogisticRegression())
    features_with_correlations = make_pipeline(StandardScaler(),\
                                              FeaturesWithCorrelations(),\
                                              LogisticRegression())
    recursive_feature_elimination = make_pipeline(StandardScaler(),\
                                              RecursiveFeatureElimination(num_features=5),\
                                              LogisticRegression())
    
    # Loop over each model and perform a cross validation test
    models = [selectKBest_f_classif, selectKBest_mutual_info_classif,\
             recursive_feature_elimination, features_with_correlations,\
              selectFromModel]
    score_type = 'accuracy'

    CV = RepeatedKFold(n_splits=10, n_repeats=10)
    cv_results = np.array([cross_val_score(model, df, y, cv=CV, scoring=score_type) for model in models]).T
    cv_results = np.mean(cv_results, axis=0).reshape((1,-1))[0]
    return cv_results

In [4]:
if __name__=='__main__':
    
    column = ['Breat_cancer', 'Wine', 'Iris', 'Glass']
    index = ['f_classif', 'mutual_info', 'selectFromModel','feature_correlations', 'RFE']

    # Execute the experiments
    breast_cancer_X, breast_cancer_y = breast_cancer_data()
    breast_cancer_result = experiment(breast_cancer_X, breast_cancer_y)
    
    wine_data_X, wine_data_y = load_wine(return_X_y=True)
    wine_result = experiment(wine_data_X, wine_data_y)
    
    iris_data_X, iris_data_y = load_iris(return_X_y=True)
    iris_result = experiment(iris_data_X, iris_data_y)  
    
    glass_data_X, glass_data_y = glass_data()
    glass_result = experiment(glass_data_X, glass_data_y)  
    
    all_results = np.array([breast_cancer_result, wine_result, \
                            iris_result, glass_result]).T

    df = pd.DataFrame(all_results, index=index, columns=column)
df

Unnamed: 0,Breat_cancer,Wine,Iris,Glass
f_classif,0.955797,0.966176,0.954667,0.878333
mutual_info,0.950942,0.971961,0.954,0.900563
selectFromModel,0.957089,0.987647,0.953333,0.895303
feature_correlations,0.944195,0.936144,0.958,0.598636
RFE,0.947201,0.98085,0.953333,0.811558
