In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import chi2
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
from ucimlrepo import fetch_ucirepo 
warnings.filterwarnings("ignore")
from implementation import *
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Eperiments on artificial data set

In [4]:
def generate_data(n,p,k):
    X = norm.rvs(size=(n,p))
    decision = chi2.median(df=k,loc=0.5)
    Y = np.zeros(n)
    for i in range(n):
        if sum(X[i,:k]**2) > decision:
            Y[i] = 1
        else:
            Y[i] = 0
    X= KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform').fit_transform(X)
    return X,Y 


In [5]:
X,Y = generate_data(500,10,5)

In [144]:
def perform_experiment_artificial(L,n,p,k):
    odds = np.zeros((L,4)) 
    for i in range(L):
        print(i)
        X,Y = generate_data(n,p,k)
        odds[i,0] = (forward_selection_jmi(X,Y,k) < k).sum()/k
        odds[i,1] = (forward_selection_cife(X,Y,k) < k).sum()/k
        odds[i,2] = (rfc_selection(X,Y,k) < k).sum()/k
        odds[i,3] = (boruta_selection(X,Y) < k).sum()/k 
    return odds

In [None]:
#Test different n values
#For L=20
L=20
n_values = [50,100,300,500,800]
methods = ['JMI','CIFE','RFC Variable Importance','Boruta']
probas = [perform_experiment_artificial(L,n,50,10) for n in n_values]
fig ,ax = plt.subplots(ncols=4,figsize=(20,5))
for i in range(2):
    ax[i].boxplot([probas_val[:,i] for probas_val in probas])
    ax[i].set_xticklabels(n_values)
    ax[i].set_ylabel('Probabilities of correct ordering')
    ax[i].set_xlabel('n')
    ax[i].set_title(f'Method: {methods[i]}')
plt.show()

In [None]:
#Test different p values
p_values = [15,30,50]
probas2 = [perform_experiment_artificial(20,500,p,10) for p in p_values]
fig ,ax = plt.subplots(ncols=4,figsize=(20,5))
for i in range(2):
    ax[i].boxplot([probas_val[:,i] for probas_val in probas])
    ax[i].set_xticklabels(n_values)
    ax[i].set_ylabel('Probabilities of correct ordering')
    ax[i].set_xlabel('n')
    ax[i].set_title(f'Method: {methods[i]}')
plt.show()

In [None]:
#Test different k values
k_values = [10,20,35]
probas3 = [perform_experiment_artificial(20,500,50,k) for k in k_values]
fig ,ax = plt.subplots(ncols=4,figsize=(20,5))
for i in range(2):
    ax[i].boxplot([probas_val[:,i] for probas_val in probas])
    ax[i].set_xticklabels(n_values)
    ax[i].set_ylabel('Probabilities of correct ordering')
    ax[i].set_xlabel('n')
    ax[i].set_title(f'Method: {methods[i]}')
plt.show()

# Real-world datasets experiments


In [143]:
def perform_experiment_real(X,Y,k_vals):
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3,random_state=42)
    k_methods = [forward_selection_jmi,forward_selection_cife,rfc_selection]
    rfc = RandomForestClassifier(n_estimators=100)
    features = {}
    binary = len(np.unique(Y)) == 2
    rfc.fit(X_train,Y_train)
    Y_pred = rfc.predict(X_test)
    scores = {'accuracy_score':[],'f1_score':[],'roc_auc_score':[]}
    scores['accuracy_score'].append(accuracy_score(Y_test,Y_pred))
    if binary:
        scores['f1_score'].append(f1_score(Y_test,Y_pred))
        scores['roc_auc_score'].append(roc_auc_score(Y_test,rfc.predict_proba(X_test)[:,1]))
    else:
        scores['f1_score'].append(f1_score(Y_test,Y_pred,average='macro'))
        scores['roc_auc_score'].append(roc_auc_score(Y_test,rfc.predict_proba(X_test),multi_class='ovr'))
    results_df = pd.DataFrame(scores,index=['All Features'])
    print("\n For All Features")
    print(results_df)
    for k_val in k_vals:
        scores = {'accuracy_score':[],'f1_score':[],'roc_auc_score':[]}
        features[k_val] = [forward_selection_jmi(X,Y,k_val),forward_selection_cife(X,Y,k_val),rfc_selection(X,Y,k_val)]
        for i in range(len(k_methods)):
            rfc.fit(X_train[:,features[k_val][i]],Y_train)
            Y_pred = rfc.predict(X_test[:,features[k_val][i]])
            scores['accuracy_score'].append(accuracy_score(Y_test,Y_pred))
            if binary:
                scores['f1_score'].append(f1_score(Y_test,Y_pred))
                scores['roc_auc_score'].append(roc_auc_score(Y_test,rfc.predict_proba(X_test[:,features[k_val][i]])[:,1]))
            else:
                scores['f1_score'].append(f1_score(Y_test,Y_pred,average='macro'))
                scores['roc_auc_score'].append(roc_auc_score(Y_test,rfc.predict_proba(X_test[:,features[k_val][i]]),multi_class='ovr'))
        results_df = pd.DataFrame(scores,index=k_methods)
        print("\n For k = ",k_val)
        print(results_df)
    features_boruta = boruta_selection(X,Y)
    rfc.fit(X_train[:,features_boruta],Y_train)
    Y_pred = rfc.predict(X_test[:,features_boruta])
    scores = {'accuracy_score':[],'f1_score':[],'roc_auc_score':[]}
    scores['accuracy_score'].append(accuracy_score(Y_test,Y_pred))
    if binary:
        scores['f1_score'].append(f1_score(Y_test,Y_pred))
        scores['roc_auc_score'].append(roc_auc_score(Y_test,rfc.predict_proba(X_test[:,features_boruta])[:,1]))
    else:
        scores['f1_score'].append(f1_score(Y_test,Y_pred,average='macro'))
        scores['roc_auc_score'].append(roc_auc_score(Y_test,rfc.predict_proba(X_test[:,features_boruta]),multi_class='ovr'))
    results_df = pd.DataFrame(scores,index=['Boruta'])
    print("\n For Boruta")
    print(results_df)
    

Breast Cancer Wisconsin

In [133]:
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
X_bcw = breast_cancer_wisconsin_diagnostic.data.features 
Y_bcw = breast_cancer_wisconsin_diagnostic.data.targets 
X_bcw = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform').fit_transform(X_bcw)
Y_bcw =  np.where(Y_bcw == 'M', 0, 1).flatten()
print(X_bcw.shape)

(569, 30)


In [134]:
perform_experiment_real(X_bcw,Y_bcw,[5,10,15])


 For All Features
              accuracy_score  f1_score  roc_auc_score
All Features         0.97076  0.977169       0.995444

 For k =  5
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.964912  0.972222       0.989785
CIFE                           0.935673  0.949309       0.989565
RFC Variable Importance        0.953216  0.962963       0.991917

 For k =  10
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.959064  0.968037       0.994782
CIFE                           0.959064  0.967442       0.993754
RFC Variable Importance        0.959064  0.967742       0.988316

 For k =  15
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.959064  0.968037       0.995664
CIFE                           0.953216  0.963303       0.993901
RFC Variable Importance        0.964912  0.972727       0.994929

 For Boruta
        accuracy_score  f1_score  roc_a

Red Wine Quality

In [135]:
pd_wine = pd.read_csv('data/winequality-red.csv')
Y_wine = pd_wine['quality']
X_wine = pd_wine.drop(columns=['quality'])
X_wine = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform').fit_transform(X_wine)
print(X_wine.shape)

(1599, 11)


In [136]:
perform_experiment_real(X_wine,Y_wine,[4,6,8])


 For All Features
              accuracy_score  f1_score  roc_auc_score
All Features        0.685417  0.390376       0.820561

 For k =  4
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.572917  0.303693       0.746146
CIFE                           0.575000  0.315595       0.754404
RFC Variable Importance        0.566667  0.283674       0.757592

 For k =  6
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.608333  0.397515       0.817667
CIFE                           0.635417  0.375329       0.806983
RFC Variable Importance        0.618750  0.317977       0.795188

 For k =  8
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.645833  0.387671       0.833138
CIFE                           0.643750  0.342299       0.800906
RFC Variable Importance        0.652083  0.380667       0.821552

 For Boruta
        accuracy_score  f1_score  roc_auc

Heart Failure

In [137]:
pd_heart = pd.read_csv('data/heart.csv')
X_heart = pd_heart.drop(columns=['HeartDisease'])
le = LabelEncoder()
heart_categorical = []
for col in X_heart.columns:
    if X_heart[col].dtype == 'object':
        heart_categorical.append(col)
        X_heart[col] = le.fit_transform(X_heart[col])
X_heart_new_categories = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform').fit_transform(X_heart.drop(columns=heart_categorical))
X_heart = np.concatenate((X_heart_new_categories,X_heart[heart_categorical].to_numpy()),axis=1)
Y_heart = pd_heart['HeartDisease'].to_numpy()
print(X_heart.shape)

(918, 11)


In [138]:
perform_experiment_real(X_heart,Y_heart,[4,6,8])


 For All Features
              accuracy_score  f1_score  roc_auc_score
All Features        0.884058       0.9       0.943815

 For k =  4
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.807971  0.831746       0.902221
CIFE                           0.818841  0.845679       0.890625
RFC Variable Importance        0.807971  0.830671       0.900479

 For k =  6
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.826087  0.846154       0.912375
CIFE                           0.800725  0.825397       0.888556
RFC Variable Importance        0.822464  0.842444       0.910306

 For k =  8
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.847826  0.867925       0.927673
CIFE                           0.858696  0.876190       0.922501
RFC Variable Importance        0.840580  0.860759       0.914906

 For Boruta
        accuracy_score  f1_score  roc_auc

House Prices

In [139]:
df_house = pd.read_csv('data/kc_house_data.csv')
Y_house = df_house['price'].to_numpy().reshape(-1,1)
X_house = df_house.drop(columns=['price','id','date'])
Y_house = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile').fit_transform(Y_house).flatten()
X_house = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform').fit_transform(X_house)
print(X_house.shape)


(21613, 18)


In [140]:
perform_experiment_real(X_house,Y_house,[5,10,15])


 For All Features
              accuracy_score  f1_score  roc_auc_score
All Features        0.641888  0.640781       0.897039

 For k =  5
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.615052  0.619147       0.883127
CIFE                           0.618600  0.619693       0.879350
RFC Variable Importance        0.591764  0.592899       0.863827

 For k =  10
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.636952  0.636372       0.890066
CIFE                           0.621221  0.620701       0.878298
RFC Variable Importance        0.635410  0.634609       0.888956

 For k =  15
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.647131  0.645683       0.896572
CIFE                           0.629550  0.628755       0.883768
RFC Variable Importance        0.640191  0.638771       0.894605

 For Boruta
        accuracy_score  f1_score  roc_a

Adult income

In [141]:
adult = fetch_ucirepo(id=2) 
X_adult = adult.data.features 
Y_adult = adult.data.targets
Y_adult = np.where(Y_adult == '<=50K', 0, 1).flatten()
le = LabelEncoder()
adult_categorical = []
for col in X_adult.columns:
    if X_adult[col].dtype == 'object':
        adult_categorical.append(col)
        X_adult[col] = le.fit_transform(X_adult[col])
X_adult_new_categories = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform').fit_transform(X_adult.drop(columns=adult_categorical))
X_adult = np.concatenate((X_adult_new_categories,X_adult[adult_categorical].to_numpy()),axis=1)
print(X_adult.shape)


(48842, 14)


In [142]:
perform_experiment_real(X_adult,Y_adult,[4,6,8,10])


 For All Features
              accuracy_score  f1_score  roc_auc_score
All Features         0.63905  0.618811       0.691685

 For k =  4
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.643964  0.611628       0.693039
CIFE                           0.646489  0.617712       0.701705
RFC Variable Importance        0.625947  0.602970       0.668674

 For k =  6
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.637753  0.612781       0.689032
CIFE                           0.637480  0.615852       0.683038
RFC Variable Importance        0.626561  0.605024       0.672669

 For k =  8
                         accuracy_score  f1_score  roc_auc_score
JMI                            0.639869  0.622829       0.683263
CIFE                           0.633522  0.612610       0.680002
RFC Variable Importance        0.627858  0.608908       0.673945

 For k =  10
                         accuracy_score 