In [None]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')
# Uncomment the next line and replace the placeholder with the notebook folder
# (presumably required so the local `configuration` module can be imported):
# %cd #YOUR PATH TO THE NOTEBOOK FOLDER IN GOOGLE COLAB

In [None]:
!pip install mlxtend

In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.model_selection import StratifiedKFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from configuration import data_path as DATA_PATH
from configuration import results_root as RESULTS_ROOT
from configuration import results_path as RESULTS_PATH
from configuration import hyperparameter_tuning as HYPERPARAMETER_TUNING
from configuration import model_name as MODEL_NAME
from configuration import n_splits as N_SPLITS
from configuration import n_features as N_FEATURES

In [None]:
# Show the configured data/results paths and the hyperparameter-tuning flag for this run.
DATA_PATH, RESULTS_PATH, HYPERPARAMETER_TUNING

In [None]:
# Show the configured model name, number of CV splits and target number of features.
MODEL_NAME, N_SPLITS, N_FEATURES

In [None]:
def load_training_data(path: str):
    """
    Read the training features and targets from a directory.

    Both files are parsed with the ("f_id", "i_id") column pair as a MultiIndex.
    Args:
        path (str): Directory that contains "X_train.csv" and "y_train.csv".
    Returns:
        X_train (pd.DataFrame): Input features read from "X_train.csv".
        y_train (pd.DataFrame): Target values read from "y_train.csv".
    """
    key_columns = ("f_id", "i_id")
    csv_files = (f"{path}/X_train.csv", f"{path}/y_train.csv")
    # Same parsing options for both files; only the file name differs.
    features, targets = [
        pd.read_csv(csv_file, index_col=list(key_columns)) for csv_file in csv_files
    ]
    return features, targets

def load_hyperparameters(model_name: str, results_root: str, hyperparameter_tuning: bool) -> dict:
    """
    Return the hyperparameters the model should be built with.

    Args:
        model_name (str): Model identifier, e.g. 'random_forest' or 'multitask_elastic_net'.
        results_root (str): Root directory under which tuning results are stored.
        hyperparameter_tuning (bool): When true, read the saved best parameters from
            "<results_root>/<model_name>/hyperparameter_tuning=True-feature_selection=False/best_params.joblib";
            otherwise use the built-in defaults for the model.
    Returns:
        dict: The hyperparameters for the model.
    Raises:
        ValueError: If defaults are requested for an unsupported model name.
    """
    if not hyperparameter_tuning:
        # Built-in defaults: only the random forest gets a fixed seed.
        if model_name == "random_forest":
            return {"random_state": 42}
        if model_name == "multitask_elastic_net":
            return {}
        raise ValueError("Unknown model name")

    tuned_params_file = f"{results_root}/{model_name}/hyperparameter_tuning=True-feature_selection=False/best_params.joblib"
    return joblib.load(tuned_params_file)

def init_model(params: dict, model_name: str):
    """
    Initialize a machine learning model based on the model name and provided parameters.

    The caller's ``params`` dict is left untouched: the default ``random_state``
    for the random forest is added to an internal copy only.
    Args:
        params (dict): A dictionary containing the hyperparameters for the model.
        model_name (str): A string specifying the name of the model to be initialized.
                          Supported values are "multitask_elastic_net" and "random_forest".
    Returns:
        model: The initialized machine learning model, either MultiTaskElasticNet or RandomForestRegressor.
    Raises:
        ValueError: If ``model_name`` is not one of the supported values.
    """
    # Shallow copy so adding a default below never leaks into the caller's dict.
    model_params = dict(params)

    if model_name == "multitask_elastic_net":
        return MultiTaskElasticNet(**model_params)
    if model_name == "random_forest":
        # Ensure random_state is set for RandomForest if not provided
        model_params.setdefault("random_state", 42)
        return RandomForestRegressor(**model_params)
    raise ValueError("Unknown model name")

In [None]:
def run_feature_selection(results_path):
    """
    Run forward sequential feature selection and save the per-step report.

    The training data location, hyperparameter source, model type, number of
    cross-validation splits and target number of features are taken from the
    notebook-level constants DATA_PATH, RESULTS_ROOT, HYPERPARAMETER_TUNING,
    MODEL_NAME, N_SPLITS and N_FEATURES.

    Args:
        results_path (str): Directory where "feature_selection_report.csv" is written.
            It is created if it does not exist yet.
    Returns:
        sfs: The fitted SequentialFeatureSelector.
        selected_features (pd.DataFrame): The selector's metric report, one row per
            subset size, with an added "n" column (number of features), "avg_score"
            sign-flipped to a positive error, sorted by "n".
        model: The estimator instance that was passed to the selector.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    # Create the output directory before the expensive search starts, so a
    # missing folder cannot make the final save fail after all the work is done.
    os.makedirs(results_path, exist_ok=True)

    # Load training data
    X_train, y_train = load_training_data(path=DATA_PATH)
    # Load previously tuned hyperparameters or set default parameters
    params = load_hyperparameters(model_name=MODEL_NAME, results_root=RESULTS_ROOT, hyperparameter_tuning=HYPERPARAMETER_TUNING)
    print(f"\n Loading hyperparameters: {params}")
    # Initialize model
    model = init_model(params=params, model_name=MODEL_NAME)
    # Initialize cross-validation; folds are stratified on the "f_id" index level
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    cv_folds = list(skf.split(X_train, X_train.reset_index()["f_id"]))
    # Forward feature selection
    sfs = SFS(model,
              k_features=N_FEATURES,
              forward=True,
              floating=False,
              scoring='neg_mean_absolute_error',
              verbose=2,
              n_jobs=-1,
              cv=cv_folds)

    # The MultiIndex is dropped for fitting (presumably so the precomputed fold
    # indices line up with a plain 0..n-1 row index).
    sfs = sfs.fit(X_train.reset_index(drop=True), y_train.reset_index(drop=True))

    # Build the feature selection report
    selected_features = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
    selected_features["n"] = selected_features["feature_names"].apply(len)
    # Scoring is the negated MAE; flip the sign so the column holds the MAE itself.
    selected_features["avg_score"] = -selected_features["avg_score"]
    # sort_values returns a new frame, so the result has to be kept.
    selected_features = selected_features.sort_values(by="n", ascending=True)
    selected_features.to_csv(f"{results_path}/feature_selection_report.csv", index=False)

    return sfs, selected_features, model

In [None]:
# Run the forward feature selection; the report CSV is written under RESULTS_PATH.
sfs, selected_features, model = run_feature_selection(results_path=RESULTS_PATH)

In [None]:
# Display the estimator returned by run_feature_selection.
model

In [None]:
# Display the feature selection report returned by run_feature_selection.
selected_features

In [None]:
def plot_feature_selection_scores(selected_features, save_path):
    """
    Draw the mean score against the number of selected features and save the figure.

    The plot shows a line of 'avg_score' over 'n', gray error bars of size
    'std_dev', and a dashed red vertical line at the 'n' of the row with the
    lowest 'avg_score'.

    Args:
        selected_features (pd.DataFrame): Feature selection results with the columns
                                          'n' (number of features), 'avg_score' (mean score, e.g. MAE)
                                          and 'std_dev' (standard deviation).
        save_path (str): Directory in which "feature_selection_scores.png" is saved.

    Returns:
        None: The figure is written as PNG and then displayed.
    """
    sns.set_theme(context='notebook', style='whitegrid', font_scale=1)
    fig, ax = plt.subplots(figsize=(7, 5))

    # Rank the rows so the best (lowest) score comes first.
    ranked = selected_features.sort_values(by="avg_score", ascending=True)
    n_features = ranked['n']
    mean_scores = ranked['avg_score']

    # Score curve with standard-deviation error bars on top of it.
    sns.lineplot(data=ranked, ax=ax, x="n", y="avg_score")
    ax.errorbar(n_features, mean_scores,
                yerr=ranked['std_dev'], fmt='none', capsize=5, color='gray')

    # Mark the subset size that achieved the lowest score.
    ax.axvline(x=n_features.iloc[0], color='red', linestyle='--')

    # Cosmetics
    sns.despine(fig=fig)
    ax.set_xlabel("Number of features")
    ax.set_ylabel("MAE")

    # Write the PNG first, then render the figure in the notebook.
    fig.savefig(f"{save_path}/feature_selection_scores.png", dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Plot the score per number of features and save the PNG under RESULTS_PATH.
plot_feature_selection_scores(selected_features=selected_features, save_path=RESULTS_PATH)