# Feature Selection

In [1]:
# to handle paths
import sys
import os
from pathlib import Path

# to handle datasets
import pandas as pd
import numpy as np

# for iterating
from itertools import product

# Limit DataFrame previews to 5 printed columns (wider frames are elided with "...").
# Uses the documented top-level `pd.set_option` instead of going through `pd.pandas`.
pd.set_option('display.max_columns', 5)

In [2]:
# variables
project_name = "RF+Clust"
# Root folder of the project. The RF_CLUST_PROJECT_FOLDER environment variable, when set,
# overrides the default below so the notebook is not tied to one machine's directory layout.
project_folder = os.environ.get(
    "RF_CLUST_PROJECT_FOLDER", f"C:/Users/anani/Downloads/{project_name}"
)
# Make the project-local `variables` module importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run in the same kernel.
if project_folder not in sys.path:
    sys.path.append(project_folder)

# experiment configuration, defined in <project_folder>/variables.py
from variables import suite_name, features, target, transformation, n_folds, budgets, algorithms
from variables import model_name, explainer_type

In [3]:
# Base directories derived from the project folder and the configured suite.
# Preprocessed datasets; note that this value ends with a trailing "/".
data_folder = "{}/Data/{}/preprocessed/".format(project_folder, suite_name)
# Results tree of the `features=all` configuration for the configured model.
results_folder = (
    f"{project_folder}/Results/{suite_name}"
    f"/transformation={transformation}_features=all"
    f"/{model_name}"
)

In [4]:
def get_topn_features(path: str, n: int, verbose: bool = True) -> list:
    """
    Read a feature-importance CSV and return the names of the top-n features.

    Parameters
    ----------
    path : str
        Location of the CSV file (anything `pd.read_csv` accepts). The file
        must contain the columns `feature_name` and `rank`.
    n : int
        Rank cut-off: every row whose `rank` is <= n is selected.
    verbose : bool, default True
        When True, print a preview (first rows) of the table that was read.

    Returns
    -------
    list
        The `feature_name` values of the selected rows, in file order.
        The length equals n only if each rank 1..n occurs exactly once.

    Raises
    ------
    KeyError
        If `feature_name` or `rank` is missing from the file.
    """
    feature_importance = pd.read_csv(path)
    if verbose:
        print("Preview feature importance:")
        print(feature_importance.head())

    # fail with a message that names the file instead of a bare column KeyError
    missing = [col for col in ('feature_name', 'rank') if col not in feature_importance.columns]
    if missing:
        raise KeyError(f"feature importance file {path!r} is missing column(s): {missing}")

    # single .loc lookup (row mask + column) instead of chained indexing
    is_top_n = feature_importance['rank'] <= n
    return feature_importance.loc[is_top_n, 'feature_name'].tolist()

In [5]:
# For every (algorithm, budget) pair and every fold: load the all-features data,
# keep only the top-n columns according to that fold's SHAP importance file,
# and write the reduced data set to its own directory.
for algorithm_name, budget in product(algorithms, budgets):
    print(f"Algorithm name: {algorithm_name}, budget: {budget}")

    run_tag = f"algorithm_name={algorithm_name}_budget={budget}"

    # input directories (data and results of the all-features configuration).
    # Path joins are used because data_folder ends with "/", so plain string
    # concatenation with another "/" would put "//" into every data path.
    data_folder_all_temp = Path(data_folder) / f"transformation={transformation}_features=all" / run_tag
    results_folder_all_temp = Path(results_folder) / run_tag

    # output directory for the reduced data; created if it does not exist yet
    data_folder_selected_temp = (
        Path(data_folder) / f"transformation={transformation}_features={features}" / f"{model_name}" / run_tag
    )
    data_folder_selected_temp.mkdir(parents=True, exist_ok=True)

    # folds are numbered 1..n_folds in the file names
    for fold_number in range(1, n_folds + 1):
        print(f"fold_number: {fold_number}")

        # load data; the X files carry (f_id, i_id) as a two-level row index
        X_train = pd.read_csv(data_folder_all_temp / f"X_train_fold={fold_number}.csv",
                              dtype={"f_id": int, "i_id": int}, index_col=['f_id', 'i_id'])
        X_test = pd.read_csv(data_folder_all_temp / f"X_test_fold={fold_number}.csv",
                             dtype={"f_id": int, "i_id": int}, index_col=['f_id', 'i_id'])
        # the y files are read and written back unchanged
        y_train = pd.read_csv(data_folder_all_temp / f"y_train_fold={fold_number}.csv")
        y_test = pd.read_csv(data_folder_all_temp / f"y_test_fold={fold_number}.csv")

        # get topn features from the importance file computed on this fold's train set
        shap_importance_path = results_folder_all_temp / "shap" / f"shap_importance_set=train_fold={fold_number}.csv"
        topn_features = get_topn_features(path=str(shap_importance_path), n=features)
        print("\n")
        print(f"topn_features: {topn_features}")
        print(f"len topn_features: {len(topn_features)}")

        # the same columns are selected from train and test
        X_train = X_train[topn_features]
        X_test = X_test[topn_features]
        print("preview data: ")
        print(X_train.head(3))
        print(X_train.shape)

        # save; reset_index turns f_id / i_id back into ordinary columns
        X_train.reset_index().to_csv(data_folder_selected_temp / f"X_train_fold={fold_number}.csv", index=False)
        X_test.reset_index().to_csv(data_folder_selected_temp / f"X_test_fold={fold_number}.csv", index=False)
        y_train.to_csv(data_folder_selected_temp / f"y_train_fold={fold_number}.csv", index=False)
        y_test.to_csv(data_folder_selected_temp / f"y_test_fold={fold_number}.csv", index=False)

Algorithm name: DE1, budget: 5000
fold_number: 1
Preview feature importance:
                      feature_name  feature_importance  rank
0     ela_meta.lin_simple.coef.min            0.432102     1
1                       ic.eps.max            0.355355     2
2  ela_meta.quad_w_interact.adj_r2            0.106794     3
3    ela_meta.lin_simple.intercept            0.053418     4
4             ela_level.qda_mda_10            0.035930     5


topn_features: ['ela_meta.lin_simple.coef.min', 'ic.eps.max', 'ela_meta.quad_w_interact.adj_r2', 'ela_meta.lin_simple.intercept', 'ela_level.qda_mda_10', 'ic.eps.ratio', 'disp.ratio_mean_02', 'ic.eps.s', 'ela_distr.number_of_peaks', 'disp.ratio_mean_10']
len topn_features: 10
preview data: 
           ela_meta.lin_simple.coef.min  ic.eps.max  ...  \
f_id i_id                                            ...   
2    1                        -0.133855   -0.037239  ...   
     2                        -0.133875   -0.032532  ...   
     3                 

Preview feature importance:
                      feature_name  feature_importance  rank
0     ela_meta.lin_simple.coef.min            0.485266     1
1                       ic.eps.max            0.389903     2
2  ela_meta.quad_w_interact.adj_r2            0.186615     3
3                     ic.eps.ratio            0.089965     4
4             ela_level.qda_mda_10            0.041374     5


topn_features: ['ela_meta.lin_simple.coef.min', 'ic.eps.max', 'ela_meta.quad_w_interact.adj_r2', 'ic.eps.ratio', 'ela_level.qda_mda_10', 'ic.eps.s', 'ela_level.lda_qda_10', 'ela_meta.lin_w_interact.adj_r2', 'ic.m0', 'ela_level.lda_mda_10']
len topn_features: 10
preview data: 
           ela_meta.lin_simple.coef.min  ic.eps.max  ...     ic.m0  \
f_id i_id                                            ...             
1    1                        -0.134107   -0.236516  ... -0.402759   
     2                        -0.134107   -0.236516  ... -0.769508   
     3                        -0.134107   -0.23

Preview feature importance:
                      feature_name  feature_importance  rank
0     ela_meta.lin_simple.coef.min            0.497088     1
1                       ic.eps.max            0.355449     2
2  ela_meta.quad_w_interact.adj_r2            0.181009     3
3                     ic.eps.ratio            0.097598     4
4   ela_meta.lin_w_interact.adj_r2            0.019293     5


topn_features: ['ela_meta.lin_simple.coef.min', 'ic.eps.max', 'ela_meta.quad_w_interact.adj_r2', 'ic.eps.ratio', 'ela_meta.lin_w_interact.adj_r2', 'ela_level.qda_mda_10', 'ela_level.lda_qda_10', 'ela_meta.lin_simple.coef.max', 'ic.m0', 'ela_meta.lin_simple.intercept']
len topn_features: 10
preview data: 
           ela_meta.lin_simple.coef.min  ic.eps.max  ...     ic.m0  \
f_id i_id                                            ...             
1    1                         -0.13411   -0.236775  ... -0.381621   
     2                         -0.13411   -0.236775  ... -0.752231   
     3            

Preview feature importance:
                      feature_name  feature_importance  rank
0                       ic.eps.max            0.559500     1
1    ela_meta.lin_simple.intercept            0.088185     2
2  ela_meta.quad_w_interact.adj_r2            0.084068     3
3                     ic.eps.ratio            0.078366     4
4               ela_distr.skewness            0.067003     5


topn_features: ['ic.eps.max', 'ela_meta.lin_simple.intercept', 'ela_meta.quad_w_interact.adj_r2', 'ic.eps.ratio', 'ela_distr.skewness', 'ela_meta.lin_simple.coef.min', 'ic.eps.s', 'ela_level.mmce_lda_10', 'nbc.nn_nb.cor', 'ela_meta.lin_simple.coef.max']
len topn_features: 10
preview data: 
           ic.eps.max  ela_meta.lin_simple.intercept  ...  nbc.nn_nb.cor  \
f_id i_id                                             ...                  
1    1      -0.226713                      -0.179615  ...       2.078675   
     2      -0.226713                      -0.179614  ...       1.277996   
     3   

Preview feature importance:
                      feature_name  feature_importance  rank
0                       ic.eps.max            0.539798     1
1     ela_meta.lin_simple.coef.max            0.212762     2
2                     ic.eps.ratio            0.077637     3
3               ela_distr.skewness            0.074618     4
4  ela_meta.quad_w_interact.adj_r2            0.056553     5


topn_features: ['ic.eps.max', 'ela_meta.lin_simple.coef.max', 'ic.eps.ratio', 'ela_distr.skewness', 'ela_meta.quad_w_interact.adj_r2', 'ic.eps.s', 'ela_meta.lin_simple.intercept', 'ela_level.mmce_mda_10', 'ela_level.mmce_lda_10', 'nbc.nn_nb.mean_ratio']
len topn_features: 10
preview data: 
           ic.eps.max  ela_meta.lin_simple.coef.max  ...  \
f_id i_id                                            ...   
1    1      -0.225098                     -0.167831  ...   
     2      -0.225098                     -0.167831  ...   
     3      -0.225098                     -0.167831  ...   

           e

Preview feature importance:
                      feature_name  feature_importance  rank
0                       ic.eps.max            0.664439     1
1    ela_meta.lin_simple.intercept            0.110515     2
2  ela_meta.quad_w_interact.adj_r2            0.079552     3
3                     ic.eps.ratio            0.076117     4
4               ela_distr.skewness            0.060215     5


topn_features: ['ic.eps.max', 'ela_meta.lin_simple.intercept', 'ela_meta.quad_w_interact.adj_r2', 'ic.eps.ratio', 'ela_distr.skewness', 'ela_level.mmce_lda_10', 'nbc.dist_ratio.coeff_var', 'nbc.nb_fitness.cor', 'nbc.nn_nb.cor', 'nbc.nn_nb.sd_ratio']
len topn_features: 10
preview data: 
           ic.eps.max  ela_meta.lin_simple.intercept  ...  nbc.nn_nb.cor  \
f_id i_id                                             ...                  
1    1      -0.236775                      -0.180905  ...       2.086393   
     2      -0.236775                      -0.180904  ...       1.281668   
     3      -

Algorithm name: DE3, budget: 5000
fold_number: 1
Preview feature importance:
                   feature_name  feature_importance  rank
0  ela_meta.lin_simple.coef.min            0.031737     1
1                      ic.eps.s            0.025710     2
2  ela_meta.lin_simple.coef.max            0.023979     3
3          ela_level.qda_mda_10            0.023337     4
4             disp.diff_mean_25            0.023096     5


topn_features: ['ela_meta.lin_simple.coef.min', 'ic.eps.s', 'ela_meta.lin_simple.coef.max', 'ela_level.qda_mda_10', 'disp.diff_mean_25', 'ic.eps.max', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'ela_level.lda_qda_25', 'ela_level.lda_mda_25']
len topn_features: 10
preview data: 
           ela_meta.lin_simple.coef.min  ic.eps.s  ...  ela_level.lda_qda_25  \
f_id i_id                                          ...                         
2    1                        -0.133855  1.430562  ...              0.470187   
     2                        -0.133875  1.405680  

Preview feature importance:
                   feature_name  feature_importance  rank
0                    ic.eps.max            0.076546     1
1  ela_meta.lin_simple.coef.min            0.052694     2
2                  ic.eps.ratio            0.051610     3
3           disp.diff_median_25            0.044896     4
4      nbc.dist_ratio.coeff_var            0.037737     5


topn_features: ['ic.eps.max', 'ela_meta.lin_simple.coef.min', 'ic.eps.ratio', 'disp.diff_median_25', 'nbc.dist_ratio.coeff_var', 'disp.diff_mean_02', 'ela_level.lda_qda_50', 'ela_meta.quad_w_interact.adj_r2', 'disp.ratio_mean_10', 'ela_level.mmce_lda_50']
len topn_features: 10
preview data: 
           ic.eps.max  ela_meta.lin_simple.coef.min  ...  disp.ratio_mean_10  \
f_id i_id                                            ...                       
1    1      -0.236516                     -0.134107  ...           -1.501127   
     2      -0.236516                     -0.134107  ...           -0.955021   
     3   

Preview feature importance:
                      feature_name  feature_importance  rank
0  ela_meta.quad_w_interact.adj_r2            0.047261     1
1                       ic.eps.max            0.039682     2
2                     ic.eps.ratio            0.038512     3
3     ela_meta.lin_simple.coef.max            0.034330     4
4                disp.diff_mean_05            0.033061     5


topn_features: ['ela_meta.quad_w_interact.adj_r2', 'ic.eps.max', 'ic.eps.ratio', 'ela_meta.lin_simple.coef.max', 'disp.diff_mean_05', 'disp.ratio_mean_02', 'ela_level.lda_qda_50', 'disp.ratio_median_02', 'disp.ratio_mean_10', 'ela_level.lda_mda_25']
len topn_features: 10
preview data: 
           ela_meta.quad_w_interact.adj_r2  ic.eps.max  ...  \
f_id i_id                                               ...   
1    1                            0.682424   -0.236775  ...   
     2                            0.682424   -0.236775  ...   
     3                            0.682424   -0.236775  ...   

 

fold_number: 23
Preview feature importance:
                   feature_name  feature_importance  rank
0          disp.ratio_median_05            0.035500     1
1           disp.diff_median_02            0.034805     2
2           disp.diff_median_25            0.033974     3
3  ela_meta.lin_simple.coef.max            0.027045     4
4                    ic.eps.max            0.026786     5


topn_features: ['disp.ratio_median_05', 'disp.diff_median_02', 'disp.diff_median_25', 'ela_meta.lin_simple.coef.max', 'ic.eps.max', 'disp.diff_mean_10', 'ela_meta.lin_simple.intercept', 'ic.eps.s', 'disp.diff_mean_25', 'ela_level.lda_qda_50']
len topn_features: 10
preview data: 
           disp.ratio_median_05  disp.diff_median_02  ...  disp.diff_mean_25  \
f_id i_id                                             ...                      
1    1                -1.357938            -1.300925  ...          -1.545420   
     2                -0.904673            -0.921819  ...          -0.752806   
     3