# Data Preprocessing

In [1]:
# to handle paths
import sys
import os
from pathlib import Path

# to handle datasets
import pandas as pd
import numpy as np

# for iterating
from itertools import product

# for train test split
from sklearn.model_selection import LeaveOneGroupOut, cross_validate

# feature scaling
from sklearn.preprocessing import StandardScaler

# feature selection
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures
import networkx as nx
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# for creating the preprocessing pipeline
from sklearn.pipeline import Pipeline

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# to save the trained scaler class
import joblib

# limit how many columns are shown when a DataFrame is printed;
# use the documented top-level pd.set_option rather than the pd.pandas self-reference
pd.set_option('display.max_columns', 10)

In [2]:
# variables
project_name = "RF+Clust"
# Root folder of the project. Set the RF_CLUST_PROJECT_FOLDER environment variable to
# run the notebook from another location; when it is unset or empty, the original
# hard-coded location is used, so existing setups keep working unchanged.
project_folder = os.environ.get("RF_CLUST_PROJECT_FOLDER") or f"C:/Users/anani/Downloads/{project_name}"
# make the `variables` module inside the project folder importable; the membership
# check keeps sys.path free of duplicate entries when this cell is re-run
if project_folder not in sys.path:
    sys.path.append(project_folder)

# settings read from the project's `variables` module (must come after the sys.path update)
from variables import suite_name, features, target, transformation, n_folds, budgets, algorithms
from variables import model_name, explainer_type

In [3]:
# input location: the fold files are read from the suite's "original/folds" folder
data_folder = "{}/Data/{}/original/folds".format(project_folder, suite_name)

In [4]:
# output location: one folder per (transformation, features) setting, created if missing
folds_folder = "{root}/Data/{suite}/preprocessed/transformation={transformation}_features={features}".format(
    root=project_folder,
    suite=suite_name,
    transformation=transformation,
    features=features,
)
os.makedirs(folds_folder, exist_ok=True)

In [5]:
def _read_fold_csv(folder, file_stem, fold_number):
    """Read one fold CSV file and index it by (f_id, i_id).

    Parameters
    ----------
    folder : str
        Directory that contains the fold files.
    file_stem : str
        File name prefix, e.g. "X_train" or "y_test".
    fold_number : int
        Fold number used in the file name.

    Returns
    -------
    pandas.DataFrame
        Content of ``{folder}/{file_stem}_fold={fold_number}.csv`` with the
        columns ``f_id`` and ``i_id`` parsed as integers and used as index.
    """
    return pd.read_csv(
        f"{folder}/{file_stem}_fold={fold_number}.csv",
        dtype={"f_id": int, "i_id": int},
        index_col=["f_id", "i_id"],
    )


# For every (algorithm, budget) combination and every fold: fit the preprocessing
# pipeline on the training split, apply it to both splits, and save the fitted
# pipeline together with the preprocessed features and the untouched targets.
for algorithm_name, budget in product(algorithms, budgets):
    print("\n")
    print(f"Algorithm name: {algorithm_name}")
    print(f"budget: {budget}")

    # input and output folders depend only on (algorithm, budget), so build them once
    run_name = f"algorithm_name={algorithm_name}_budget={budget}"
    source_folder = f"{data_folder}/{run_name}"

    # create directory
    folds_folder_temp = f"{folds_folder}/{run_name}"
    os.makedirs(folds_folder_temp, exist_ok=True)

    for fold_number in range(1, n_folds+1):
        # load fold data
        X_train = _read_fold_csv(source_folder, "X_train", fold_number)
        y_train = _read_fold_csv(source_folder, "y_train", fold_number)
        print("Preview fold data:")
        print(f"Shape: {X_train.shape}")
        print(y_train.head())

        X_test = _read_fold_csv(source_folder, "X_test", fold_number)
        y_test = _read_fold_csv(source_folder, "y_test", fold_number)
        print(f"Shape: {X_test.shape}")
        print(y_test.head())

        # create preprocessing pipeline: drop constant and duplicated features, scale features
        pipe = Pipeline([
            # tol=1: only strictly constant features are removed (no quasi-constant ones)
            ("drop_constant", DropConstantFeatures(tol=1)),
            ("drop_duplicated", DropDuplicateFeatures()),
            ("std_scaler", StandardScaler())
        ])

        # fit pipeline to train data only; the test split is transformed with the
        # features and scaling parameters learned here
        pipe.fit(X_train)

        constant_features = list(pipe.named_steps['drop_constant'].features_to_drop_)
        duplicate_features = list(pipe.named_steps['drop_duplicated'].features_to_drop_)
        # columns that remain after both selection steps; looked up once and reused
        # to label the transformed train and test data
        final_features = list(pipe.named_steps['drop_duplicated'].get_feature_names_out())

        # transform train and test data, restoring the original index and the column names
        X_train_preprocessed = pd.DataFrame(pipe.transform(X_train), index=X_train.index, columns=final_features)
        X_test_preprocessed = pd.DataFrame(pipe.transform(X_test), index=X_test.index, columns=final_features)

        print("\n")
        print(f"constant_features: {constant_features}")
        print(f"duplicate_features: {duplicate_features}")
        print(f"final_features: {final_features}")
        print(f"final_features len: {len(final_features)}")

        print("preview data: ")
        print(X_train_preprocessed.head(3))
        print(X_train_preprocessed.shape)

        print(X_test_preprocessed.head())
        print(X_test_preprocessed.shape)

        # save preprocessing pipeline
        joblib.dump(pipe, f"{folds_folder_temp}/preprocessing_pipeline_fold={fold_number}.joblib")

        # save fold data (index levels f_id / i_id are written back as regular columns)
        X_train_preprocessed.reset_index().to_csv(f"{folds_folder_temp}/X_train_fold={fold_number}.csv", index=False)
        X_test_preprocessed.reset_index().to_csv(f"{folds_folder_temp}/X_test_fold={fold_number}.csv", index=False)

        y_train.reset_index().to_csv(f"{folds_folder_temp}/y_train_fold={fold_number}.csv", index=False)
        y_test.reset_index().to_csv(f"{folds_folder_temp}/y_test_fold={fold_number}.csv", index=False)



Algorithm name: DE1
budget: 5000
Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
2    1          0.491186
     2          0.397307
     3          1.019078
     4          0.395285
     5          0.554339
Shape: (5, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', '

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
4    1          1.694975
     2          1.710077
     3          1.660660
     4          1.658542
     5          1.684316


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
6    1          1.454080
     2          1.494976
     3          1.389206
     4          1.449244
     5          1.383148


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
9    1          2.520785
     2          2.512130
     3          2.540581
     4          2.508416
     5          2.272684


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
12   1          5.508481
     2          4.721602
     3          5.298826
     4          5.438039
     5          5.013253


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Shape: (5, 64)
           log_precision
f_id i_id               
15   1          1.794616
     2          1.799602
     3          1.798891
     4          1.866167
     5          1.864154


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.lda_mda_25', 'ela_level.lda_mda_50', 'ela_level.lda_qda_10', 'ela_level.lda_qda_25', 'ela_level.lda_qda_50', 'ela_level.mmce_lda_10', 'ela_level.mmce_lda_25', 'ela_level.mmce_lda_50', 'ela_level.mmce_mda_10', 'el

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
17   1          0.497532
     2          0.615320
     3          0.452977
     4          0.392524
     5          0.611013


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
19   1          0.810203
     2          0.803341
     3          0.824606
     4          0.824604
     5          0.806598


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
22   1          0.804091
     2          0.589283
     3          0.847805
     4          0.881655
     5          0.968421


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.004468
     2          0.001517
     3          0.002879
     4          0.002733
     5          0.003954
Shape: (5, 64)
           log_precision
f_id i_id               
24   1          1.903305
     2          1.898678
     3          1.898648
     4          1.925837
     5          1.903056


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
2    1          3.516764
     2          3.686549
     3          3.793223
     4          3.651296
     5          3.888664


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
4    1          1.803461
     2          1.845953
     3          1.746316
     4          1.919497
     5          1.811070


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

           disp.diff_mean_02  disp.diff_mean_05  disp.diff_mean_10  \
f_id i_id                                                            
5    1              0.610041           0.634696           0.704033   
     2              0.487651           0.546865           0.616124   
     3              0.546584           0.612693           0.653106   
     4              0.531404           0.617875           0.661067   
     5              0.533301           0.626922           0.726109   

           disp.diff_mean_25  disp.diff_median_02  ...  pca.expl_var.cor_init  \
f_id i_id                                          ...                          
5    1              0.816204             0.614551  ...              -1.343457   
     2              0.826233             0.492291  ...              -1.343457   
     3              0.827175             0.572946  ...              -1.343457   
     4              0.835147             0.537872  ...              -1.343457   
     5              0.8

Shape: (5, 64)
           log_precision
f_id i_id               
9    1          1.976266
     2          2.065614
     3          2.119345
     4          1.991850
     5          2.275320


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.lda_mda_25', 'ela_level.lda_mda_50', 'ela_level.lda_qda_10', 'ela_level.lda_qda_25', 'ela_level.lda_qda_50', 'ela_level.mmce_lda_10', 'ela_level.mmce_lda_25', 'ela_level.mmce_lda_50', 'ela_level.mmce_mda_10', 'el

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
11   1          1.745279
     2          1.632252
     3          1.718699
     4          1.855124
     5          1.777732


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
13   1          2.261792
     2          2.484236
     3          2.384000
     4          2.317258
     5          2.309676


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
15   1          1.639018
     2          1.595568
     3          1.740277
     4          1.646804
     5          1.648417


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Shape: (5, 64)
           log_precision
f_id i_id               
17   1          0.465672
     2          0.422499
     3          0.443723
     4          0.500240
     5          0.396804


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.lda_mda_25', 'ela_level.lda_mda_50', 'ela_level.lda_qda_10', 'ela_level.lda_qda_25', 'ela_level.lda_qda_50', 'ela_level.mmce_lda_10', 'ela_level.mmce_lda_25', 'ela_level.mmce_lda_50', 'ela_level.mmce_mda_10', 'el

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
19   1          0.608799
     2          0.615020
     3          0.588108
     4          0.619335
     5          0.618266


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
21   1          0.793255
     2          0.931707
     3          0.535112
     4          0.869051
     5          0.986941


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1          0.229386
     2          0.401374
     3          0.485594
     4          0.328702
     5          0.401022
Shape: (5, 64)
           log_precision
f_id i_id               
23   1          0.477246
     2          0.467838
     3          0.465995
     4          0.454770
     5          0.479685


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l



constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.lda_mda_25', 'ela_level.lda_mda_50', 'ela_level.lda_qda_10', 'ela_level.lda_qda_25', 'ela_level.lda_qda_50', 'ela_level.mmce_lda_10', 'ela_level.mmce_lda_25', 'ela_level.mmce_lda_50', 'ela_level.mmce_mda_10', 'ela_level.mmce_mda_25', 'ela_level.mmce_mda_50', 'ela_level.mmce_qda_10', 'ela_level.mmce_qda_25', 'ela_level.mmce_qda_50', 'ela_level.qda_mda_10', 'ela_level.qda_mda_25', 'ela_level.qda_mda_5

           disp.diff_mean_02  disp.diff_mean_05  disp.diff_mean_10  \
f_id i_id                                                            
1    1             -1.401136          -1.449504          -1.470875   
     2             -1.029651          -1.003992          -0.931178   
     3             -1.054458          -0.994642          -0.969182   

           disp.diff_mean_25  disp.diff_median_02  ...  pca.expl_var.cor_init  \
f_id i_id                                          ...                          
1    1             -1.500174            -1.300569  ...              -1.293993   
     2             -0.794075            -0.956865  ...              -1.293993   
     3             -0.860973            -0.985355  ...              -1.293993   

           pca.expl_var.cov_init  pca.expl_var_PC1.cor_init  \
f_id i_id                                                     
1    1                 -0.390418                   0.646846   
     2                 -0.390418                   1.2



constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.lda_mda_25', 'ela_level.lda_mda_50', 'ela_level.lda_qda_10', 'ela_level.lda_qda_25', 'ela_level.lda_qda_50', 'ela_level.mmce_lda_10', 'ela_level.mmce_lda_25', 'ela_level.mmce_lda_50', 'ela_level.mmce_mda_10', 'ela_level.mmce_mda_25', 'ela_level.mmce_mda_50', 'ela_level.mmce_qda_10', 'ela_level.mmce_qda_25', 'ela_level.mmce_qda_50', 'ela_level.qda_mda_10', 'ela_level.qda_mda_25', 'ela_level.qda_mda_5



constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.lda_mda_25', 'ela_level.lda_mda_50', 'ela_level.lda_qda_10', 'ela_level.lda_qda_25', 'ela_level.lda_qda_50', 'ela_level.mmce_lda_10', 'ela_level.mmce_lda_25', 'ela_level.mmce_lda_50', 'ela_level.mmce_mda_10', 'ela_level.mmce_mda_25', 'ela_level.mmce_mda_50', 'ela_level.mmce_qda_10', 'ela_level.mmce_qda_25', 'ela_level.mmce_qda_50', 'ela_level.qda_mda_10', 'ela_level.qda_mda_25', 'ela_level.qda_mda_5



constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.lda_mda_25', 'ela_level.lda_mda_50', 'ela_level.lda_qda_10', 'ela_level.lda_qda_25', 'ela_level.lda_qda_50', 'ela_level.mmce_lda_10', 'ela_level.mmce_lda_25', 'ela_level.mmce_lda_50', 'ela_level.mmce_mda_10', 'ela_level.mmce_mda_25', 'ela_level.mmce_mda_50', 'ela_level.mmce_qda_10', 'ela_level.mmce_qda_25', 'ela_level.mmce_qda_50', 'ela_level.qda_mda_10', 'ela_level.qda_mda_25', 'ela_level.qda_mda_5

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1      5.546488e-09
     2      5.915447e-09
     3      5.604998e-09
     4      5.107313e-09
     5      5.791580e-09
Shape: (5, 64)
           log_precision
f_id i_id               
12   1          0.624383
     2          0.508860
     3          0.496855
     4          0.743854
     5          1.159697


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1      5.546488e-09
     2      5.915447e-09
     3      5.604998e-09
     4      5.107313e-09
     5      5.791580e-09
Shape: (5, 64)
           log_precision
f_id i_id               
14   1          0.000566
     2          0.000646
     3          0.000548
     4          0.000535
     5          0.000796


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1      5.546488e-09
     2      5.915447e-09
     3      5.604998e-09
     4      5.107313e-09
     5      5.791580e-09
Shape: (5, 64)
           log_precision
f_id i_id               
16   1          1.073127
     2          1.108161
     3          1.085776
     4          1.078580
     5          1.075910


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1      5.546488e-09
     2      5.915447e-09
     3      5.604998e-09
     4      5.107313e-09
     5      5.791580e-09
Shape: (5, 64)
           log_precision
f_id i_id               
18   1          0.269370
     2          0.338946
     3          0.251053
     4          0.290216
     5          0.366839


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1      5.546488e-09
     2      5.915447e-09
     3      5.604998e-09
     4      5.107313e-09
     5      5.791580e-09
Shape: (5, 64)
           log_precision
f_id i_id               
20   1          0.228122
     2          0.225408
     3          0.242118
     4          0.201996
     5          0.199300


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1      5.546488e-09
     2      5.915447e-09
     3      5.604998e-09
     4      5.107313e-09
     5      5.791580e-09
Shape: (5, 64)
           log_precision
f_id i_id               
22   1          0.470557
     2          0.470557
     3          0.782672
     4          0.475349
     5          0.928092


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l

Preview fold data:
Shape: (115, 64)
           log_precision
f_id i_id               
1    1      5.546488e-09
     2      5.915447e-09
     3      5.604998e-09
     4      5.107313e-09
     5      5.791580e-09
Shape: (5, 64)
           log_precision
f_id i_id               
24   1          1.717523
     2          1.692746
     3          1.721075
     4          1.722035
     5          1.713971


constant_features: ['pca.expl_var.cor_x', 'pca.expl_var.cov_x']
duplicate_features: ['pca.expl_var_PC1.cov_x']
final_features: ['disp.diff_mean_02', 'disp.diff_mean_05', 'disp.diff_mean_10', 'disp.diff_mean_25', 'disp.diff_median_02', 'disp.diff_median_05', 'disp.diff_median_10', 'disp.diff_median_25', 'disp.ratio_mean_02', 'disp.ratio_mean_05', 'disp.ratio_mean_10', 'disp.ratio_mean_25', 'disp.ratio_median_02', 'disp.ratio_median_05', 'disp.ratio_median_10', 'disp.ratio_median_25', 'ela_distr.kurtosis', 'ela_distr.number_of_peaks', 'ela_distr.skewness', 'ela_level.lda_mda_10', 'ela_level.l