## MODIFIED 1

In [10]:
def result_rskf1(d, pipeline, mod_disp_name, n_splits=5, n_repeats=3):
    """Score `pipeline` with repeated stratified k-fold CV and summarise it.

    The first column of `d` is used as the target and every remaining
    column as a predictor.

    Parameters
    ----------
    d : object exposing a 2-D ``.values`` array (e.g. a DataFrame)
    pipeline : estimator passed unchanged to ``cross_validate``
    mod_disp_name : label used as the top level of the summary's columns
    n_splits, n_repeats : forwarded to ``RepeatedStratifiedKFold``

    Returns
    -------
    (summary, cv_output)
        `summary` has one row per entry of the ``cross_validate`` output and
        the columns ``(mod_disp_name, 'mean')`` / ``(mod_disp_name, 'std')``;
        `cv_output` is the raw ``cross_validate`` dictionary.
    """
    # target = column 0, predictors = everything after it
    raw = d.values
    y = raw[:, :1].ravel()
    x = raw[:, 1:]

    # fixed seed so the fold assignment is repeatable
    folds = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=1)

    # built-in metrics are referenced by name; F2 needs an explicit scorer
    metrics = {name: name for name in ('accuracy', 'precision', 'recall', 'f1')}
    metrics['f2'] = make_scorer(fbeta_score, beta=2)

    cv_output = cross_validate(
        pipeline, x, y,
        cv=folds,
        scoring=metrics,
        return_train_score=True,
        n_jobs=-1)

    # collapse each per-fold array into its mean and standard deviation
    rows = [(key, mean(vals), std(vals)) for key, vals in cv_output.items()]
    summary = pd.DataFrame(rows, columns=['metric', 'mean', 'std'])
    summary = summary.set_index('metric')
    summary.index.name = None
    summary.columns = pd.MultiIndex.from_product(
        [[mod_disp_name], summary.columns])

    return summary, cv_output

## MODIFIED 2

In [10]:
def result_rskf2(d, pipeline, mod_disp_name, n_splits=5, n_repeats=3):
    """Cross-validate `pipeline` on `d` and tabulate mean/std per metric.

    Column 0 of ``d.values`` is treated as the target; the other columns
    are the predictors. Evaluation uses ``RepeatedStratifiedKFold`` with a
    fixed ``random_state`` of 1.

    Parameters
    ----------
    d : object exposing a 2-D ``.values`` array (e.g. a DataFrame)
    pipeline : estimator handed to ``cross_validate``
    mod_disp_name : label placed on the top level of the summary columns
    n_splits, n_repeats : settings for ``RepeatedStratifiedKFold``

    Returns
    -------
    (table, scores)
        `table` is indexed by the keys of the ``cross_validate`` output and
        has the two-level columns ``(mod_disp_name, 'mean'|'std')``;
        `scores` is the unmodified ``cross_validate`` dictionary.
    """
    matrix = d.values
    predictors = matrix[:, 1:]
    target = matrix[:, :1].ravel()

    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=1)

    # values are either a predefined metric name or a scorer callable
    scoring = dict(
        accuracy='accuracy',
        precision='precision',
        recall='recall',
        f1='f1',
        f2=make_scorer(fbeta_score, beta=2),
    )

    scores = cross_validate(
        pipeline, predictors, target,
        cv=splitter, scoring=scoring,
        return_train_score=True, n_jobs=-1)

    # one (name, mean, std) record per array returned by cross_validate
    records = [(name, mean(arr), std(arr)) for name, arr in scores.items()]
    table = pd.DataFrame(records, columns=['metric', 'mean', 'std'])
    table = table.set_index('metric')
    table.index.name = None
    table.columns = pd.MultiIndex.from_product(
        [[mod_disp_name], table.columns])

    return table, scores

# MODIFIED

In [12]:
def summary_by_mod2(pred_mods, scalers, features, **n_splits_and_repeats):
    """Cross-validate every scaler / feature-selector / model combination.

    For each combination a three-step ``Pipeline`` (steps ``'s'``, ``'fs'``
    and ``'m'``) is built and handed to ``result_rskf``; progress and
    runtimes are printed along the way.

    Parameters
    ----------
    pred_mods, scalers, features : sequences of 2-item sequences
        Each item is read as ``item[0]`` (display name, printed) and
        ``item[1]`` (the object placed in the pipeline).
    **n_splits_and_repeats
        Cross-validation settings. ``n_splits=...`` and ``n_repeats=...``
        are looked up by name, so the order they are passed in does not
        matter. For backward compatibility, exactly two keyword arguments
        under other names are still accepted and read in the order given
        (splits first, repeats second).

    Returns
    -------
    list
        Nested as ``results[scaler][feature][pred_mod]``; each leaf is the
        first element of the value returned by ``result_rskf``.

    Raises
    ------
    ValueError
        If the settings are not given by name and the number of keyword
        arguments is not exactly two. This is now raised before any model
        is fitted.

    Notes
    -----
    ``x``, ``y`` and ``result_rskf`` are not parameters: they are read from
    the enclosing (module / notebook) scope and must exist before calling.
    """
    def _mmss(seconds):
        # "%M:%S" only shows minutes within the hour, so durations of an
        # hour or more wrap around (unchanged from the original output).
        return time.strftime("%M:%S", time.gmtime(seconds))

    # Resolve the CV settings once, by name when possible, so that
    # `n_repeats=3, n_splits=5` is not silently swapped.
    if {'n_splits', 'n_repeats'} <= n_splits_and_repeats.keys():
        n_splits = n_splits_and_repeats['n_splits']
        n_repeats = n_splits_and_repeats['n_repeats']
    else:
        # legacy behaviour: rely on keyword order
        n_splits, n_repeats = n_splits_and_repeats.values()

    # Divisors for the averages; never zero so empty inputs do not crash.
    models_per_fs = max(len(pred_mods), 1)
    models_per_scaler = max(len(features) * len(pred_mods), 1)

    time_0 = time.time()  # for all methods in pipeline

    results = []
    for scaler in scalers:
        time_1 = time.time()  # for each scaler
        print(f'Scaler: {scaler[0]}\n')

        results_features = []
        for feature in features:
            time_2 = time.time()  # for each feature selection model
            print(f'FS Model: {feature[0]}\n')

            results_pred_mods = []
            for pred_mod in pred_mods:
                time_3 = time.time()  # for each prediction model
                print(f'Prediction Model: {pred_mod[0]}\n')

                # define pipeline
                pipeline = Pipeline(
                    [('s', scaler[1]), ('fs', feature[1]), ('m', pred_mod[1])])

                # fit models; only the summary table (element 0) is kept
                results_model = result_rskf(
                    x, y, pipeline, pred_mod[0], n_splits, n_repeats)[0]

                # collect results - innermost
                print(f'Model {pred_mod[0]} Runtime: {_mmss(time.time()-time_3)}')
                results_pred_mods.append(results_model)

            # collect results - middle
            print(f'FS Model {feature[0]} Avg Runtime per Model: {_mmss((time.time()-time_2)/models_per_fs)}\n\n')
            results_features.append(results_pred_mods)

        # collect results - outermost
        # Averaged over every model fitted under this scaler, so the value
        # matches the "per Model" label.
        print(f'Scaler {scaler[0]} Avg Runtime per Model: {_mmss((time.time()-time_1)/models_per_scaler)}\n\n')
        results.append(results_features)

    print(f'Total Runtime: {_mmss(time.time()-time_0)} min')
    return results

# ANOVA F-Classifier

In [None]:
# Compute the ANOVA F-value for the provided sample.
from sklearn.feature_selection import f_classif

In [None]:
# Run the ANOVA F-test on x_train against y_train: per f_classif's documented
# return value, f holds the F-statistics and p the matching p-values.
f, p = f_classif(x_train, y_train)

In [139]:
# Lookup table: column position in x_train -> feature name, where the
# feature names are the columns of d after its first column.
ref = {
    i: d.iloc[:, 1:].columns[i]
    for i in np.arange(x_train.shape[1])
}

In [None]:
# Compute the ANOVA F-value for the provided sample.
# Per f_classif's documented return value, f receives the F-statistics and
# p the matching p-values.
from sklearn.feature_selection import f_classif
f, p = f_classif(x_train, y_train)

In [None]:
# Single-column table of the F-test p-values, with rows labelled by the
# column names of d (its first column excluded).
pval_ftest = pd.DataFrame(
    p,
    index=d.iloc[:, 1:].columns,
    columns=['p-val'],
)

In [None]:
# pval_ftest.style.set_table_attributes('style="font-size: 15px"')

In [None]:
# Show only the rows whose p-value is below 0.16.
pval_ftest[pval_ftest['p-val']<.16]

In [None]:
# Integer positions 0..n-1, where n is the size of x_train's last axis;
# used as x-coordinates for the bar chart of feature scores.
X_indices = np.arange(x_train.shape[-1])

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selection: score every feature with the ANOVA F-test
# and keep the 7 highest-scoring ones (k=7).
selector = SelectKBest(f_classif, k=7).fit(x_train, y_train)

# Convert the p-values to -log10(p) and rescale so the largest score is 1.
scores = -np.log10(selector.pvalues_)
scores = scores / scores.max()

In [None]:
# Bar chart of the normalised -log10(p) score of every feature.
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(
    X_indices - .45,
    scores,
    width=.8,
    label=r'Univariate score ($-Log(p_{value})$)',
)
ax.set_xlabel('Features')
# one tick per feature, labelled with the column names of d (target excluded)
ax.set_xticks(X_indices)
ax.set_xticklabels(d.iloc[:, 1:].columns, rotation=30)
plt.show()

In [None]:
# ANOVA feature selection for numeric input and categorical output
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# selector that keeps the 8 features with the best ANOVA F-scores
fs = SelectKBest(score_func=f_classif, k=8)
# fit on the training data, then reduce x_train to the selected columns
x_selected = fs.fit(x_train, y_train).transform(x_train)
# the second dimension shows how many columns were kept
print(x_selected.shape)

For categorical predictors, we will use the chi-squared test and mutual information (also called information gain, a measure from information theory). Mutual information is agnostic to data types.

# Filtered Data Anova

In [None]:
# Restrict the data to the retained variables; column 0 is the target and
# the remaining columns are the predictors.
d_values = d[var_to_keep].values
y = d_values[:, :1].ravel()
x = d_values[:, 1:]

# Shuffled, stratified 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=y,
)

In [None]:
# Lookup table: column position in x_train -> feature name.
# x_train is built from d[var_to_keep], so the names must come from that
# same filtered frame; taking them from the full `d` would pair positions
# with the wrong variables.
ref = {}
for i in np.arange(x_train.shape[1]):
    ref[i]=d[var_to_keep].iloc[:,1:].columns[i]

In [None]:
# Compute the ANOVA F-value for the provided sample.
# Per f_classif's documented return value, f receives the F-statistics and
# p the matching p-values.
from sklearn.feature_selection import f_classif
f, p = f_classif(x_train, y_train)

In [None]:
# Single-column table of the F-test p-values, with rows labelled by the
# column names of the filtered frame d[var_to_keep] (first column excluded).
pval_ftest = pd.DataFrame(
    p,
    index=d[var_to_keep].iloc[:, 1:].columns,
    columns=['p-val'],
)

In [None]:
# pval_ftest.style.set_table_attributes('style="font-size: 15px"')

In [None]:
# Display the full table of p-values.
pval_ftest

In [None]:
# Show only the rows whose p-value is below 0.16.
pval_ftest[pval_ftest['p-val']<.16]

In [None]:
# Integer positions 0..n-1, where n is the size of x_train's last axis;
# used as x-coordinates for the bar chart of feature scores.
X_indices = np.arange(x_train.shape[-1])

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selection: score every feature with the ANOVA F-test
# and keep the 7 highest-scoring ones (k=7).
selector = SelectKBest(f_classif, k=7).fit(x_train, y_train)

# Convert the p-values to -log10(p) and rescale so the largest score is 1.
scores = -np.log10(selector.pvalues_)
scores = scores / scores.max()

In [None]:
# Bar chart of the normalised -log10(p) score of every retained feature.
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(
    X_indices - .45,
    scores,
    width=.8,
    label=r'Univariate score ($-Log(p_{value})$)',
)
ax.set_xlabel('Features')
# one tick per feature, labelled with the columns of d[var_to_keep]
# (target column excluded)
ax.set_xticks(X_indices)
ax.set_xticklabels(d[var_to_keep].iloc[:, 1:].columns, rotation=30)
plt.show()

In [None]:
# ANOVA feature selection for numeric input and categorical output
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# selector that keeps the 8 features with the best ANOVA F-scores
fs = SelectKBest(score_func=f_classif, k=8)
# fit on the training data, then reduce x_train to the selected columns
x_selected = fs.fit(x_train, y_train).transform(x_train)
# the second dimension shows how many columns were kept
print(x_selected.shape)

# RFE

In [None]:
# report which features were selected by RFE
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# load data
# `d` stays a DataFrame so its column names remain available afterwards;
# the raw array goes into `d_values`, as in the other cells of this notebook.
d = pd.read_csv('source/d_num.csv')
d_values = d.values
x = d_values[:,1:]
y = d_values[:,:1].ravel()

# split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, shuffle=True, stratify=y)

# define RFE
# The tree is seeded so the resulting feature ranking is reproducible, in
# line with the fixed random_state used for the split.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=8)

# fit RFE
rfe.fit(x_train, y_train)

In [None]:
# (Re)load the numeric dataset as a DataFrame; the ranking table built next
# reads its column names.
d = pd.read_csv('source/d_num.csv')

In [None]:
# Pair every predictor column of d (first column excluded) with its RFE
# rank, then order the rows by ascending rank and renumber them.
d_RFE = (
    pd.DataFrame(
        list(zip(d.columns[1:], rfe.ranking_)),
        columns=['Variable', 'Ranking'],
    )
    .sort_values(by='Ranking')
    .reset_index(drop=True)
)

In [None]:
# Display the RFE ranking table.
d_RFE