In [None]:
# Setup and Mount Google Drive
# Mounts the user's Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Navigate to the project folder
# NOTE: the text after %cd below is a placeholder and must be replaced with the
# Drive path of the folder that contains this notebook before running the cell.
# Presumably this is also what makes the `configuration` module importable later
# (TODO confirm).
%cd #YOUR PATH TO THE NOTEBOOK IN GOOGLE COLAB

# Import dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pull the experiment settings from the project-local `configuration` module
# and expose them under upper-case constant names used by the rest of the notebook.
from configuration import experiment_id as EXPERIMENT_ID
from configuration import data_root as DATA_ROOT
from configuration import data_path as DATA_PATH

# Display the configured data path as the cell output (sanity check).
DATA_PATH

### ELA cleaning

In [None]:
# Define ELA feature cleaning
def clean_ela_features(data_root, columns, filename="ela_feats-05D-n050.csv"):
  """Load raw ELA features, keep the selected ones and average them per problem instance.

  Args:
    data_root: Directory that contains the raw ELA feature CSV file.
    columns: Iterable of ELA feature column names to keep (list, tuple, Index, ...).
    filename: Name of the CSV file inside ``data_root``. Defaults to the file
      this notebook was originally written for.

  Returns:
    DataFrame with one row per (f_id, i_id) pair; every selected feature holds
    the mean over all rows of the raw file that share that pair.

  Raises:
    KeyError: If an id column ("fid", "iid") or a requested feature column is
      not present in the CSV file.
  """
  id_columns = ["fid", "iid"]
  # Materialise the selection so any iterable works, not only a list
  feature_columns = list(columns)
  # Read data
  filepath = f"{data_root}/{filename}"
  df = pd.read_csv(filepath)
  # Preview only the first rows instead of dumping the whole frame
  print(df.head())
  print(df.shape)
  # Fail early with a message that names the file and the missing columns
  missing = [name for name in id_columns + feature_columns if name not in df.columns]
  if missing:
    raise KeyError(f"Columns missing from {filepath}: {missing}")
  # Select ELA features
  df = df[id_columns + feature_columns]
  print(f"Number of feature calculation runs per problem instance: {df.groupby(['fid', 'iid']).size().unique()}")
  # Calculate the final ELA feature value over multiple runs using MEAN aggregation
  df = df.groupby(id_columns, as_index=False).mean()
  # Rename id columns
  df = df.rename(columns={"fid": "f_id", "iid": "i_id"})
  return df

In [None]:
# List of ELA features to keep. The entries that are commented out are excluded
# because they contain NaNs or are not relevant (e.g. the cost of calculating a feature).
columns = [
 'cm_angle.dist_ctr2best.mean',
 'cm_angle.dist_ctr2worst.mean',
 'cm_angle.angle.mean',
#  'cm_angle.costs_runtime',
 'cm_grad.mean',
#  'cm_grad.costs_runtime',
 'disp.ratio_mean_02',
 'disp.ratio_mean_05',
 'disp.ratio_mean_10',
 'disp.ratio_mean_25',
 'disp.ratio_median_02',
 'disp.ratio_median_05',
 'disp.ratio_median_10',
 'disp.ratio_median_25',
 'disp.diff_mean_02',
 'disp.diff_mean_05',
 'disp.diff_mean_10',
 'disp.diff_mean_25',
 'disp.diff_median_02',
 'disp.diff_median_05',
 'disp.diff_median_10',
 'disp.diff_median_25',
#  'disp.costs_runtime',
 'ela_conv.conv_prob',
 'ela_conv.lin_prob',
 'ela_conv.lin_dev.orig',
 'ela_conv.lin_dev.abs',
#  'ela_conv.costs_runtime',
 'ela_curv.grad_norm.min',
 'ela_curv.grad_norm.lq',
 'ela_curv.grad_norm.mean',
 'ela_curv.grad_norm.med',
 'ela_curv.grad_norm.uq',
 'ela_curv.grad_norm.max',
 'ela_curv.grad_norm.sd',
 'ela_curv.grad_scale.nas',
 'ela_curv.hessian_cond.nas',
#  'ela_curv.costs_fun_evals',
#  'ela_curv.costs_runtime',
 'ela_distr.skewness',
 'ela_distr.kurtosis',
 'ela_distr.number_of_peaks',
#  'ela_distr.costs_runtime',
 'ela_level.mmce_lda_10',
 'ela_level.mmce_qda_10',
 'ela_level.mmce_mda_10',
 'ela_level.lda_qda_10',
 'ela_level.lda_mda_10',
 'ela_level.qda_mda_10',
 'ela_level.mmce_lda_25',
 'ela_level.mmce_qda_25',
 'ela_level.mmce_mda_25',
 'ela_level.lda_qda_25',
 'ela_level.lda_mda_25',
 'ela_level.qda_mda_25',
 'ela_level.mmce_lda_50',
 'ela_level.mmce_qda_50',
 'ela_level.mmce_mda_50',
 'ela_level.lda_qda_50',
 'ela_level.lda_mda_50',
 'ela_level.qda_mda_50',
#  'ela_level.costs_runtime',
 'ela_local.n_loc_opt.abs',
 'ela_local.n_loc_opt.rel',
 'ela_local.best2mean_contr.orig',
 'ela_local.basin_sizes.avg_best',
 'ela_local.basin_sizes.avg_non_best',
 'ela_local.basin_sizes.avg_worst',
#  'ela_local.fun_evals.min',
#  'ela_local.fun_evals.lq',
#  'ela_local.fun_evals.mean',
#  'ela_local.fun_evals.median',
#  'ela_local.fun_evals.uq',
#  'ela_local.fun_evals.max',
#  'ela_local.fun_evals.sd',
#  'ela_local.costs_fun_evals',
#  'ela_local.costs_runtime',
 'ela_meta.lin_simple.adj_r2',
 'ela_meta.lin_simple.intercept',
 'ela_meta.lin_simple.coef.min',
 'ela_meta.lin_simple.coef.max',
 'ela_meta.lin_simple.coef.max_by_min',
 'ela_meta.lin_w_interact.adj_r2',
 'ela_meta.quad_simple.adj_r2',
 'ela_meta.quad_simple.cond',
 'ela_meta.quad_w_interact.adj_r2',
#  'ela_meta.costs_runtime',
 'ic.h.max',
 'ic.eps.s',
 'ic.eps.max',
 'ic.eps.ratio',
 'ic.m0',
#  'ic.costs_runtime',
 'nbc.nn_nb.sd_ratio',
 'nbc.nn_nb.mean_ratio',
 'nbc.nn_nb.cor',
 'nbc.dist_ratio.coeff_var',
 'nbc.nb_fitness.cor',
#  'nbc.costs_runtime'
 ]
# Show how many features were selected (displayed as the cell output).
len(columns)

In [None]:
# Run the ELA cleaning step on the raw feature file found under DATA_ROOT
df = clean_ela_features(data_root=DATA_ROOT, columns=columns)

# Persist the cleaned features (without the row index) next to the other experiment data
cleaned_csv_path = f"{DATA_PATH}/X.csv"
df.to_csv(cleaned_csv_path, index=False)

### EDA

In [None]:
# Exploratory Data Analysis (EDA)
def plot_feature_distribution(df, columns, title, data_path):
  """Draw one horizontal boxplot per feature and save the figure as PNG and PDF.

  Args:
    df: DataFrame that contains the feature columns.
    columns: Names of the feature columns to plot.
    title: Base file name (without extension) of the saved figure files.
    data_path: Directory the ``.png`` and ``.pdf`` files are written to.

  Returns:
    None. The figure is written to disk, shown, and then closed.
  """
  fig, ax = plt.subplots(figsize=(7, 20))
  # Long format: one row per (feature name, value) pair, feature names on the y axis
  sns.boxplot(data=df[columns].melt(), x="value", y="variable", color='white', ax=ax)
  # Save; use a tight bounding box for BOTH formats so labels that extend past
  # the default margins are kept in the PNG as well as in the PDF
  fig.savefig(f"{data_path}/{title}.png", dpi=300, bbox_inches='tight')
  fig.savefig(f"{data_path}/{title}.pdf", bbox_inches='tight')
  # Show and release the figure so repeated calls do not accumulate open figures
  plt.show()
  plt.close(fig)

# Scale features
def scale_features(df, columns):
  """Min-max scale the given columns to [0, 1] and return them as a new DataFrame.

  Only ``columns`` are present in the result; the row index of ``df`` is kept.
  """
  selected = df[columns]
  min_max = MinMaxScaler(feature_range=(0, 1))
  return pd.DataFrame(
    min_max.fit_transform(selected),
    columns=columns,
    index=df.index,
  )

In [None]:
# Summary statistics per feature, one feature per row
df.describe().T

In [None]:
# Min-max scaled copy of the features (the original frame is left untouched)
scaled_df = scale_features(df, columns)

# Plot the feature distribution before and after scaling
for frame, figure_name in (
  (df, "X_ELA_distribution_original"),
  (scaled_df, "X_ELA_distribution_scaled"),
):
  plot_feature_distribution(frame, columns, figure_name, DATA_PATH)

In [None]:
# Feature correlation
def heatmap(df: pd.DataFrame, figsize, font_scale, path, **kwargs):
  """Plot ``df`` as a heatmap and save it as ``{path}.png`` and ``{path}.pdf``.

  Args:
    df: Rectangular data to draw, e.g. a correlation matrix.
    figsize: Figure size passed to ``plt.figure``.
    font_scale: Font scaling passed to ``sns.set``.
    path: Output path WITHOUT extension; ``.png`` and ``.pdf`` are appended.
    **kwargs: Forwarded unchanged to ``sns.heatmap``.

  Returns:
    None. The figure is written to disk, shown, and then closed.
  """
  # Set up figure (note: sns.set changes the plotting style globally)
  sns.set(style="white", font_scale=font_scale)
  fig = plt.figure(figsize=figsize)
  # Plot
  ax = sns.heatmap(df, **kwargs)
  # No colorbar exists when the caller passes cbar=False; only style it if present
  cbar = ax.collections[0].colorbar
  if cbar is not None:
    cbar.ax.tick_params(labelsize=25)
  plt.tight_layout()
  # save
  plt.savefig(f"{path}.png", dpi=300)
  plt.savefig(f"{path}.pdf")

  plt.show()
  # Close the figure created above explicitly rather than "whatever is current"
  plt.close(fig)

In [None]:
# Explore feature correlation: one heatmap per correlation method
for method in ("spearman", "pearson"):
  correlation = df.set_index(["f_id", "i_id"]).corr(method=method)
  heatmap(
    df=correlation,
    figsize=(25, 20),
    font_scale=1,
    path=f"{DATA_PATH}/X_ELA_correlation_{method}",
    vmin=-1,
    vmax=1,
    annot=False,
    center=0,
  )