In [1]:
import sys
sys.path.append('..')

In [2]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
%load_ext autoreload
%autoreload 2
from src.feature_selector import FeatureSelector, discretize, conditional_mutual_information
from src.data_generators import DataSetGenerator, draw_function
from src.jmi import JMI
from src.minimax import MiniMax
from src.mifs import MIFS
from src.lasso import LassoFS
from src.rffs import RandomForestFS

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Communities and Crime

In [5]:
# fetch dataset 
communities_and_crime = fetch_ucirepo(id=183) 
  
# data (as pandas dataframes) 
X = communities_and_crime.data.features.replace('?',np.NaN) 
y = communities_and_crime.data.targets.replace('?',np.NaN)

In [6]:
df = pd.concat([y, X], axis=1)._get_numeric_data()

In [7]:
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)
df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [10]:
jmi = JMI(df, unique_th=10, stopping_n_features=10, stopping_criterium=0.01)
jmi.run_fs()

Feature selected:  PctKids2Par
Criterium value:  0.3928704420992558
Feature selected:  numbUrban
Criterium value:  0.028607721586031164
Feature selected:  racePctWhite
Criterium value:  0.11862176903928084


In [11]:
mm = MiniMax(df, unique_th=10, stopping_n_features=10)
mm.run_fs()

Feature selected:  PctKids2Par
Criterium value:  0.3928704420992558
Feature selected:  state
Criterium value:  0.17422674714897293
Feature selected:  racePctWhite
Criterium value:  0.16500282526798876
Feature selected:  PctBornSameState
Criterium value:  0.14104335562801837


In [12]:
mifs = MIFS(df, unique_th=10, stopping_n_features=20)
mifs.run_fs()

Feature selected:  PctKids2Par
Criterium value:  0.3928704420992558
Feature selected:  numbUrban
Criterium value:  0.028607721586031164
Feature selected:  agePct12t21
Criterium value:  -0.01660668275779889
Feature selected:  MedNumBR
Criterium value:  -0.024884946623184634
Feature selected:  fold
Criterium value:  -0.047616985645613806
Feature selected:  PctVacMore6Mos
Criterium value:  -0.08715037872787607
Feature selected:  NumStreet
Criterium value:  -0.13335897820482764
Feature selected:  PctWorkMomYoungKids
Criterium value:  -0.15832009921978235
Feature selected:  pctWFarmSelf
Criterium value:  -0.19804972709335084
Feature selected:  indianPerCap
Criterium value:  -0.24287120162185177
Feature selected:  PctEmplManu
Criterium value:  -0.2686142204078272
Feature selected:  NumImmig
Criterium value:  -0.3136957335080716
Feature selected:  pctWRetire
Criterium value:  -0.3565173659675882
Feature selected:  LandArea
Criterium value:  -0.3832340123253762
Feature selected:  PctUsePubTran

In [None]:
rffs = RandomForestFS(df, unique_th=10)
lasso.run_fs()

In [None]:
lasso = LassoFS(df, unique_th=10)
lasso.run_fs()

In [158]:
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant

In [149]:
features_sets = [jmi, mm, mifs, lasso]

In [161]:
model.rsquared_adj

0.6202679607243053

In [164]:
for features_set in features_sets:
    X = df[features_set.features]
    y = df["ViolentCrimesPerPop"]
    model = OLS(y, add_constant(X)).fit()
    print(f"R-sqaured adjusted for {type(features_set).__name__} = {model.rsquared_adj} and bic = {model.bic} with {len(features_set.features)} features.")

R-sqaured adjusted for JMI = 0.6196955003635882 and bic = -2052.296049957855 with 3 features.
R-sqaured adjusted for MiniMax = 0.6098143709808939 and bic = -1994.5536138953855 with 4 features.
R-sqaured adjusted for MIFS = 0.6091666548819471 and bic = -1885.7849954247445 with 20 features.
R-sqaured adjusted for LassoFS = 0.6793037773347836 and bic = -1748.3323553967452 with 101 features.


# Myocardial infarction complications

In [59]:
# fetch dataset 
myocardial_infarction_complications = fetch_ucirepo(id=579) 
  
# data (as pandas dataframes) 
X = myocardial_infarction_complications.data.features 
y = myocardial_infarction_complications.data.targets 

In [60]:
X

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,DLIT_AG,...,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
0,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,2.0,...,3.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
3,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
4,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,77.0,0,0.0,4.0,2.0,1.0,,2.0,0.0,7.0,...,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1696,70.0,0,0.0,6.0,2.0,1.0,,2.0,0.0,7.0,...,0.0,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1697,55.0,1,3.0,6.0,2.0,2.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1698,79.0,0,2.0,2.0,2.0,1.0,,2.0,0.0,7.0,...,1.0,,,1.0,0.0,1.0,1.0,1.0,0.0,0.0


In [61]:
df = pd.concat([y["FIBR_PREDS"], X], axis=1)._get_numeric_data()
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)
df = pd.DataFrame(imp.transform(df), columns=df.columns)

KeyboardInterrupt: 

In [45]:
jmi = JMI(df, unique_th=10, stopping_n_features=10)
jmi.run_fs()

Feature selected:  n_r_ecg_p_05
Criterium value:  0.018802109613468974
Feature selected:  n_r_ecg_p_08
Criterium value:  0.002397284234167213
Feature selected:  AGE
Criterium value:  0.0033374311822512662
Feature selected:  nr_07
Criterium value:  0.001003000395104792
Feature selected:  nr_03
Criterium value:  0.0023836997190714226
Feature selected:  nr_11
Criterium value:  0.0016590257413470022
Feature selected:  zab_leg_03
Criterium value:  0.0014311822898791361
Feature selected:  R_AB_3_n
Criterium value:  0.0015987008744733381
Feature selected:  LID_S_n
Criterium value:  0.0015562594218809158
Feature selected:  n_r_ecg_p_01
Criterium value:  0.001654024826511852


# spambase

In [43]:
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets 

In [57]:
df = pd.concat([y, X], axis=1)._get_numeric_data()
# imp = IterativeImputer(max_iter=10, random_state=0)
# imp.fit(df)
# df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [58]:
df

Unnamed: 0,Class,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,...,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0,1.0,,,,,,,,,,...,,,,,,,,,,
1,1.0,,,,,,,,,,...,,,,,,,,,,
2,1.0,,,,,,,,,,...,,,,,,,,,,
3,1.0,,,,,,,,,,...,,,,,,,,,,
4,1.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,,0.31,0.0,0.62,0.0,0.00,0.31,0.0,0.0,0.0,...,0.0,0.000,0.232,0.0,0.000,0.0,0.0,1.142,3.0,88.0
4597,,0.00,0.0,0.00,0.0,0.00,0.00,0.0,0.0,0.0,...,0.0,0.000,0.000,0.0,0.353,0.0,0.0,1.555,4.0,14.0
4598,,0.30,0.0,0.30,0.0,0.00,0.00,0.0,0.0,0.0,...,0.0,0.102,0.718,0.0,0.000,0.0,0.0,1.404,6.0,118.0
4599,,0.96,0.0,0.00,0.0,0.32,0.00,0.0,0.0,0.0,...,0.0,0.000,0.057,0.0,0.000,0.0,0.0,1.147,5.0,78.0
