In [1]:
import sys
sys.path.append('..')

In [2]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
%load_ext autoreload
%autoreload 2
from src.feature_selector import FeatureSelector, discretize, conditional_mutual_information
from src.data_generators import DataSetGenerator, draw_function
from src.experiment import Experiment
from src.jmi import JMI
from src.minimax import MiniMax
from src.mifs import MIFS
from src.lasso import LassoFS
from src.rffs import RandomForestFS

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant

In [46]:
def generate_fs_list(df, stopping_features=100):
    jmi = JMI(df, unique_th=10, stopping_n_features=10, stopping_criterium=0.01)
    mm = MiniMax(df, unique_th=10, stopping_n_features=min(10, stopping_features))
    mifs = MIFS(df, unique_th=10, stopping_n_features=min(20, stopping_features))
    rffs = RandomForestFS(df)
    lasso = LassoFS(df, unique_th=10)
    return [jmi, mm, mifs, lasso, rffs]

# Communities and Crime

In [5]:
# fetch dataset 
communities_and_crime = fetch_ucirepo(id=183) 
  
# data (as pandas dataframes) 
X = communities_and_crime.data.features.replace('?',np.NaN) 
y = communities_and_crime.data.targets.replace('?',np.NaN)

In [6]:
df = pd.concat([y, X], axis=1)._get_numeric_data()
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)
df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [8]:
features_sets = generate_fs_list(df)

In [10]:
communities_and_crime_experiment = Experiment(features_sets, OLS, df, "ViolentCrimesPerPop")
communities_and_crime_experiment.fit_fs()

----JMI----
Feature selected:  PctKids2Par
Criterium value:  0.3928704420992558
Feature selected:  numbUrban
Criterium value:  0.028607721586031164
Feature selected:  racePctWhite
Criterium value:  0.11862176903928084
----MiniMax----
Feature selected:  PctKids2Par
Criterium value:  0.3928704420992558
Feature selected:  state
Criterium value:  0.17422674714897293
Feature selected:  racePctWhite
Criterium value:  0.16500282526798876
Feature selected:  PctBornSameState
Criterium value:  0.14104335562801837
----MIFS----
Feature selected:  PctKids2Par
Criterium value:  0.3928704420992558
Feature selected:  numbUrban
Criterium value:  0.028607721586031164
Feature selected:  agePct12t21
Criterium value:  -0.01660668275779889
Feature selected:  MedNumBR
Criterium value:  -0.024884946623184634
Feature selected:  fold
Criterium value:  -0.047616985645613806
Feature selected:  PctVacMore6Mos
Criterium value:  -0.08715037872787607
Feature selected:  NumStreet
Criterium value:  -0.13335897820482764

In [12]:
communities_and_crime_experiment.print_results()

R-squared adjusted for JMI = 0.6196955003635882                   and bic = -2052.296049957855 with 3 features.
R-squared adjusted for MiniMax = 0.6098143709808939                   and bic = -1994.5536138953855 with 4 features.
R-squared adjusted for MIFS = 0.6091666548819471                   and bic = -1885.7849954247454 with 20 features.
R-squared adjusted for LassoFS = 0.6793037773347834                   and bic = -1748.3323553967452 with 101 features.
R-squared adjusted for RandomForestFS = 0.6220978857114035                   and bic = -1998.9984924909695 with 13 features.


# Myocardial infarction complications

In [18]:
# fetch dataset 
myocardial_infarction_complications = fetch_ucirepo(id=579) 
  
# data (as pandas dataframes) 
X = myocardial_infarction_complications.data.features 
y = myocardial_infarction_complications.data.targets 

In [19]:
df = pd.concat([y["FIBR_PREDS"], X], axis=1)._get_numeric_data()
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)
df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [21]:
features_sets = generate_fs_list(df)

In [23]:
myocardial_experiment = Experiment(features_sets, OLS, df, "FIBR_PREDS")
myocardial_experiment.fit_fs()
myocardial_experiment.print_results()

----JMI----
Feature selected:  nr_07
Criterium value:  0.0010030003951053085
----MiniMax----
Feature selected:  IBS_POST
Criterium value:  0.00928721594008113
----MIFS----
----LassoFS----


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

----RandomForestFS----


# spambase

In [27]:
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets 

In [28]:
df = pd.concat([y, X], axis=1)._get_numeric_data()
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)
df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [29]:
features_sets = generate_fs_list(df)

In [32]:
spam_experiment = Experiment(features_sets, OLS, df, "Class")
spam_experiment.fit_fs()
spam_experiment.print_results()

----JMI----
Feature selected:  word_freq_receive
Criterium value:  0.02370615943894136
----MiniMax----
Feature selected:  word_freq_receive
Criterium value:  0.02753022204397636
----MIFS----
----LassoFS----
----RandomForestFS----
R-squared adjusted for JMI = 0.3352438893555246                   and bic = 4634.187293128869 with 5 features.
R-squared adjusted for MiniMax = 0.3352438893555246                   and bic = 4634.187293128869 with 5 features.
R-squared adjusted for MIFS = 0.4467601406026084                   and bic = 3900.778508544669 with 20 features.
R-squared adjusted for LassoFS = 0.5544149965500782                   and bic = 3179.843937482965 with 57 features.
R-squared adjusted for RandomForestFS = 0.3561848474710567                   and bic = 4620.669495731263 with 23 features.


# spambase

In [34]:
gcd = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
X = gcd.data.features 
y = gcd.data.targets 

In [42]:
df = pd.concat([y, X], axis=1)._get_numeric_data()
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)
df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [47]:
features_sets = generate_fs_list(df, stopping_features=7)

In [48]:
gcd_experiment = Experiment(features_sets, OLS, df, "class")
gcd_experiment.fit_fs()
gcd_experiment.print_results()

----JMI----
Feature selected:  Attribute2
Criterium value:  0.025258400886859246
Feature selected:  Attribute18
Criterium value:  -0.0074120013975928585
Feature selected:  Attribute11
Criterium value:  -0.008662909018615687
Feature selected:  Attribute8
Criterium value:  -0.010064534372552636
----MiniMax----
Feature selected:  Attribute2
Criterium value:  0.025258400886859246
Feature selected:  Attribute5
Criterium value:  0.03921148996443606
----MIFS----
Feature selected:  Attribute2
Criterium value:  0.025258400886859246
Feature selected:  Attribute18
Criterium value:  -0.0074120013975928585
Feature selected:  Attribute11
Criterium value:  -0.017701851244556487
Feature selected:  Attribute16
Criterium value:  -0.03529821930781121
Feature selected:  Attribute8
Criterium value:  -0.040903654658092946
Feature selected:  Attribute13
Criterium value:  -0.19497814628480728
Feature selected:  Attribute5
Criterium value:  -0.36799158972929663
----LassoFS----
----RandomForestFS----
R-squared 

# Diabetes

In [81]:
diabetes = fetch_ucirepo(id=296) 
  
# data (as pandas dataframes) 
X = diabetes.data.features 
y = diabetes.data.targets 

  df = pd.read_csv(data_url)


In [86]:
y.loc[y['readmitted'] == 'NO','readmitted'] = 0
y.loc[y['readmitted'] != 0,'readmitted'] = 1
y = y.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[y['readmitted'] == 'NO','readmitted'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[y['readmitted'] != 0,'readmitted'] = 1


In [87]:
df = pd.concat([y, X], axis=1)._get_numeric_data()
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)
df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [88]:
df

Unnamed: 0,readmitted,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,0.0,6.0,25.0,1.0,1.0,41.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,1.0,1.0,7.0,3.0,59.0,0.0,18.0,0.0,0.0,0.0,9.0
2,0.0,1.0,1.0,7.0,2.0,11.0,5.0,13.0,2.0,0.0,1.0,6.0
3,0.0,1.0,1.0,7.0,2.0,44.0,1.0,16.0,0.0,0.0,0.0,7.0
4,0.0,1.0,1.0,7.0,1.0,51.0,0.0,8.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1.0,1.0,3.0,7.0,3.0,51.0,0.0,16.0,0.0,0.0,0.0,9.0
101762,0.0,1.0,4.0,5.0,5.0,33.0,3.0,18.0,0.0,0.0,1.0,9.0
101763,0.0,1.0,1.0,7.0,1.0,53.0,0.0,9.0,1.0,0.0,0.0,13.0
101764,0.0,2.0,3.0,7.0,10.0,45.0,2.0,21.0,0.0,0.0,1.0,9.0


In [89]:
features_sets = generate_fs_list(df, stopping_features=10)

In [90]:
diabetes_experiment = Experiment(features_sets, OLS, df, "readmitted")
diabetes_experiment.fit_fs()
diabetes_experiment.print_results()

----JMI----
Feature selected:  discharge_disposition_id
Criterium value:  0.013511359657251091
Feature selected:  number_inpatient
Criterium value:  0.011784981136142252
Feature selected:  number_outpatient
Criterium value:  -4.372121659966142e-05
Feature selected:  number_diagnoses
Criterium value:  0.0011413492145203883
----MiniMax----
Feature selected:  discharge_disposition_id
Criterium value:  0.013511359657251091
Feature selected:  number_inpatient
Criterium value:  0.014152356765805634
----MIFS----
Feature selected:  discharge_disposition_id
Criterium value:  0.013511359657251091
Feature selected:  number_inpatient
Criterium value:  0.011784981136142252
Feature selected:  number_outpatient
Criterium value:  -0.0009050781506386274
Feature selected:  number_emergency
Criterium value:  -0.0022161961351000264
Feature selected:  num_procedures
Criterium value:  -0.005585019862521185
Feature selected:  number_diagnoses
Criterium value:  -0.014375284738845306
Feature selected:  num_lab