In [None]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')
# Uncomment the line below and replace the placeholder with the path of the
# notebook folder in Google Colab:
# %cd <YOUR PATH TO THE NOTEBOOK FOLDER IN GOOGLE COLAB>

In [None]:
import random

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.base import clone
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.multioutput import MultiOutputRegressor
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential

# Seed each random number source imported above with the same value (42).
# Python's built-in random module
random.seed(42)
# NumPy's global random generator
np.random.seed(42)
# TensorFlow's global seed
tf.random.set_seed(42)

In [None]:
from configuration import data_path as DATA_PATH
from configuration import results_root as RESULTS_ROOT
from configuration import model_eval_path as MODEL_EVAL_PATH
from configuration import n_splits as N_SPLITS
from configuration import batchsize as BATCH_SIZE
from configuration import epochs as EPOCHS

In [None]:
# Display settings: show every column and allow up to 1000 characters per cell
# when DataFrames are rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 1000)

In [None]:
# Show the paths imported from `configuration` as the cell output.
DATA_PATH, RESULTS_ROOT, MODEL_EVAL_PATH

In [None]:
def load_data(path: str):
    """
    Read the train and test splits from CSV files.
    Args:
        path (str): Directory containing X_train.csv, y_train.csv, X_test.csv and y_test.csv.
    Returns:
        tuple: (X_train, y_train, X_test, y_test), each a pd.DataFrame indexed by
               the columns "f_id" and "i_id".
    """
    index_columns = ["f_id", "i_id"]

    def read_split(name):
        # Every split shares the same file layout and index columns.
        return pd.read_csv(f"{path}/{name}.csv", index_col=index_columns)

    return tuple(read_split(name) for name in ("X_train", "y_train", "X_test", "y_test"))

In [None]:
def load_hyperparameters(model_name: str, results_root: str, hyperparameter_tuning: bool) -> dict:
    """
    Return the hyperparameters to use for a model.
    Args:
        model_name (str): Model identifier; the built-in defaults cover 'random_forest',
                          'multitask_elastic_net' and 'neural_network'.
        results_root (str): Root directory under which tuning results are stored.
        hyperparameter_tuning (bool): If true, read the tuned parameters from
                                      best_params.joblib instead of using the defaults.
    Returns:
        dict: The hyperparameters for the model.
    Raises:
        ValueError: If defaults are requested for a model name that has none.
    """
    # Tuned parameters are read from disk as stored; the model name is not checked here.
    if hyperparameter_tuning:
        tuned_file = f"{results_root}/{model_name}/hyperparameter_tuning=True-feature_selection=False/best_params.joblib"
        return joblib.load(tuned_file)

    # Built on every call so each caller receives its own dict.
    defaults = {
        "random_forest": {"random_state": 42},
        "multitask_elastic_net": {},
        "neural_network": {
            'n_layers': 1,
            'n_units_l0': 32,
            'dropout_l0': 0.2,
        },
    }
    if model_name not in defaults:
        raise ValueError("Unknown model name")
    return defaults[model_name]

def init_model(params: dict, model_name: str, num_features, num_targets):
    """
    Initialize a machine learning model based on the model name and provided parameters.
    Args:
        params (dict): Hyperparameters for the model. The dict is never modified.
                       For "neural_network" it must contain 'n_layers' and, for every
                       layer index i below n_layers, 'n_units_l{i}' and 'dropout_l{i}'.
        model_name (str): One of "multitask_elastic_net", "random_forest" or "neural_network".
        num_features: Number of input features; only used to size the neural network input.
        num_targets: Number of regression targets; only used to size the neural network output.
    Returns:
        model: A MultiTaskElasticNet, a RandomForestRegressor, or a compiled Keras
               Sequential model.
    Raises:
        ValueError: If model_name is not one of the supported names.
        KeyError: If a required neural network parameter is missing from params.
    """
    if model_name == "multitask_elastic_net":
        return MultiTaskElasticNet(**params)
    elif model_name == "random_forest":
        # Default the seed to 42 when the caller did not choose one. A merged copy is
        # passed on so the caller's dict is left untouched.
        return RandomForestRegressor(**{"random_state": 42, **params})
    elif model_name == "neural_network":
        model = Sequential()
        model.add(Input(shape=(num_features,)))  # Input layer
        # Add one Dense + Dropout pair per hidden layer, sized from the per-layer parameters
        for i in range(params['n_layers']):
            model.add(Dense(params[f'n_units_l{i}'], activation="relu", kernel_initializer="normal"))
            model.add(Dropout(rate=params[f'dropout_l{i}'], seed=42))
        model.add(Dense(num_targets))  # Output layer: one unit per target
        # Compile the model
        model.compile(
            loss="mean_absolute_error",
            optimizer="adam"
        )
        model.summary()  # Display the model summary
        return model
    else:
        raise ValueError("Unknown model name")

In [None]:
def _fit_model(model, model_name, X, y):
    """Fit `model` in place; the neural network also gets the configured batch size and epochs."""
    if model_name == "neural_network":
        model.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=False)
    else:
        model.fit(X, y)


def _unfitted_copy(model, model_name):
    """Return a fresh, untrained copy of `model` to be trained on a single fold."""
    if model_name == "neural_network":
        # Keras `fit` continues from the current weights, so every fold needs a newly
        # initialised network with its own optimizer state.
        fold_model = tf.keras.models.clone_model(model)
        optimizer = type(model.optimizer).from_config(model.optimizer.get_config())
        fold_model.compile(loss=model.loss, optimizer=optimizer)
        return fold_model
    return clone(model)


def _score_records(targets, r2_values, mae_values, **extra):
    """Build one R2 record and one MAE record per target; `extra` keys are added to each record."""
    records = []
    for target, r2_value, mae_value in zip(targets, r2_values, mae_values):
        records.append({**extra, 'Target': target, 'Scorer': 'R2', 'Score': r2_value})
        records.append({**extra, 'Target': target, 'Scorer': 'MAE', 'Score': mae_value})
    return records


def evaluate_model(model, model_name, X_train, y_train, X_test, y_test, n_splits=None, cv=None):
    """
    Evaluate a multi-target regression model with cross-validation and on the test set.

    Each fold trains a fresh, untrained copy of `model`, so no fold starts from
    parameters learned on another fold. Afterwards `model` itself is fitted on the
    full training set and scored on the test set; it is left fitted on return.

    Parameters:
    -----------
    model : estimator object
        The regression model to evaluate (scikit-learn estimator or compiled Keras model).

    model_name : str
        Name of the model. "neural_network" selects the Keras code path (batch size and
        epochs passed to `fit`, Keras cloning); any other value is treated as a
        scikit-learn estimator.

    X_train : pandas.DataFrame
        Training data features. The index must contain a level named "f_id", which is
        used as the stratification label for the splitter.

    y_train : pandas.DataFrame
        Training data targets, one column per target.

    X_test : pandas.DataFrame or numpy.ndarray
        Test data features.

    y_test : pandas.DataFrame or numpy.ndarray
        Test data targets.

    n_splits : int, optional
        Number of folds used when `cv` is None. Defaults to 5.

    cv : cross-validation splitter, optional
        Object with a `split(X, y)` method. If None, a shuffled StratifiedKFold with
        `n_splits` folds and random_state=42 is used.

    Returns:
    --------
    cv_scores : pandas.DataFrame
        Columns 'Target', 'Scorer', 'Score': the mean over folds of R2 and MAE per target.

    test_scores : pandas.DataFrame
        Columns 'Target', 'Scorer', 'Score': R2 and MAE per target on the test set.
    """
    if cv is None:
        cv = StratifiedKFold(
            n_splits=5 if n_splits is None else n_splits,
            shuffle=True,
            random_state=42,
        )

    targets = y_train.columns
    # Both result tables are ordered alphabetically by scorer name, then by target name.
    sort_order = dict(by=['Scorer', 'Target'], ascending=[True, True])

    # Cross-validation loop
    cv_records = []
    strata = X_train.index.get_level_values("f_id")
    for fold, (train_index, val_index) in enumerate(cv.split(X_train, strata)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train a fresh copy so this fold never sees parameters fitted on other folds
        fold_model = _unfitted_copy(model, model_name)
        _fit_model(fold_model, model_name, X_train_fold, y_train_fold)
        y_pred_fold = fold_model.predict(X_val_fold)

        cv_records.extend(_score_records(
            targets,
            r2_score(y_val_fold, y_pred_fold, multioutput='raw_values'),
            mean_absolute_error(y_val_fold, y_pred_fold, multioutput='raw_values'),
            Fold=fold + 1,
        ))

    # Average each (target, scorer) pair over the folds
    cv_scores = (
        pd.DataFrame(cv_records)
        .groupby(['Target', 'Scorer'])['Score'].mean().reset_index()
        .sort_values(**sort_order)
    )

    # Fit the caller's model on the full training set and score it on the test set
    _fit_model(model, model_name, X_train, y_train)
    y_pred = model.predict(X_test)

    test_scores = pd.DataFrame(_score_records(
        targets,
        r2_score(y_test, y_pred, multioutput='raw_values'),
        mean_absolute_error(y_test, y_pred, multioutput='raw_values'),
    )).sort_values(**sort_order)

    return cv_scores, test_scores

In [None]:
def load_selected_features(path, n=None):
    """
    Read the feature names chosen by a previous feature selection run.

    The report `feature_selection_report.csv` inside `path` is loaded and one of its
    rows is picked: the row whose `n` column equals `n` when `n` is given, otherwise
    the first row holding the minimum `avg_score`. The `feature_names` entry of that
    row is a string representation of a collection of names, which is turned back
    into a Python list.

    Args:
        path (str): Directory containing feature_selection_report.csv.
        n (int, optional): Value of the report's `n` column to select. If omitted, the
                           row with the minimum `avg_score` is used.

    Returns:
        list: A list of selected feature names.

    Raises:
        IndexError: If no row of the report matches the selection.
    """
    report = pd.read_csv(f"{path}/feature_selection_report.csv")

    # Build the row filter first, then pull the single matching entry
    if n is None:
        row_mask = report['avg_score'] == report["avg_score"].min()
    else:
        row_mask = report['n'] == n

    serialized_names = report.loc[row_mask, 'feature_names'].to_numpy()[0]

    # NOTE(review): eval executes whatever expression the CSV cell contains. Only load
    # reports produced by this pipeline; consider ast.literal_eval if the stored format
    # is always a plain literal - TODO confirm.
    return list(eval(serialized_names))

In [None]:
def save_model(model, file_path):
    """
    Serialize a model to disk with joblib.

    Args:
        model: The object to store.
        file_path (str): Destination file path.
    """
    joblib.dump(value=model, filename=file_path)

In [None]:
def model_performance_comparison(data_path, results_root, model_eval_path):
    """
    Train, evaluate and save a baseline plus every model/configuration combination.

    A mean-predicting baseline is evaluated first. Then 'multitask_elastic_net',
    'random_forest' and 'neural_network' are each evaluated with default parameters,
    with tuned parameters, and with tuned parameters on the selected feature subset
    (the last configuration is skipped for the neural network). Every fitted model is
    saved to `model_eval_path`, and the combined score tables are printed and written
    to cv_scores.csv and test_scores.csv in the same directory.

    Args:
        data_path (str): Directory containing the train/test CSV files.
        results_root (str): Root directory holding tuned hyperparameters and feature
                            selection reports.
        model_eval_path (str): Directory that receives the saved models and score tables.

    Returns:
        tuple: (final_cv_scores, final_test_scores), two DataFrames with the columns
               'Target', 'Scorer', 'Score', 'Model', 'Hyperparameter_Tuning' and
               'Feature_Selection'.
    """
    # Load training and test data
    X_train, y_train, X_test, y_test = load_data(path=data_path)

    # One splitter is shared by all evaluations; with a fixed random_state each call to
    # split() yields the same folds.
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

    # Evaluate and save the baseline model
    baseline_model = MultiOutputRegressor(DummyRegressor(strategy="mean"))
    baseline_cv_score, baseline_test_score = evaluate_model(baseline_model, 'baseline', X_train, y_train, X_test, y_test, cv=skf)
    save_model(baseline_model, f'{model_eval_path}/baseline.pkl')

    # Collect the scores of all models, starting with the baseline
    all_cv_scores = [baseline_cv_score.assign(Model='baseline', Hyperparameter_Tuning=False, Feature_Selection=False)]
    all_test_scores = [baseline_test_score.assign(Model='baseline', Hyperparameter_Tuning=False, Feature_Selection=False)]

    for model_name in ['multitask_elastic_net', 'random_forest', 'neural_network']:
        print(f'Model: {model_name}')
        for hyperparameter_tuning, feature_selection in [(False, False), (True, False), (True, True)]:
            # No feature-selection run is made for the neural network
            if model_name == 'neural_network' and feature_selection:
                continue

            print(f'Hyperparameter Tuning: {hyperparameter_tuning}, Feature Selection: {feature_selection}')

            # Load previously tuned hyperparameters or set default parameters
            params = load_hyperparameters(model_name=model_name, results_root=results_root, hyperparameter_tuning=hyperparameter_tuning)
            print(f"\nLoading hyperparameters: {params}")

            # Restrict the feature matrices to the selected columns when requested
            X_train_model, X_test_model = X_train, X_test
            if feature_selection:
                selected_features = load_selected_features(path=f"{results_root}/{model_name}/hyperparameter_tuning=True-feature_selection=True", n=None)
                X_train_model = X_train[selected_features]
                X_test_model = X_test[selected_features]

            # Initialize the model for the feature set it will actually be trained on
            model = init_model(params=params, model_name=model_name, num_features=X_train_model.shape[1], num_targets=y_train.shape[1])

            # Evaluate model
            cv_score, test_score = evaluate_model(model, model_name, X_train_model, y_train, X_test_model, y_test, cv=skf)

            # Tag the scores with the configuration that produced them
            run_labels = dict(
                Model=model_name,
                Hyperparameter_Tuning=hyperparameter_tuning,
                Feature_Selection=feature_selection,
            )
            all_cv_scores.append(cv_score.assign(**run_labels))
            all_test_scores.append(test_score.assign(**run_labels))

            # Save model
            save_model(model, f'{model_eval_path}/{model_name}_{hyperparameter_tuning}_{feature_selection}.pkl')

    # Concatenate all scores into final DataFrames
    final_cv_scores = pd.concat(all_cv_scores, ignore_index=True)
    final_test_scores = pd.concat(all_test_scores, ignore_index=True)

    # Print final scores
    print("Cross-Validation Scores for All Models:")
    print(final_cv_scores.sort_values(by='Score'))
    print("\nTest Scores for All Models:")
    print(final_test_scores.sort_values(by='Score'))

    # Save scores
    final_cv_scores.to_csv(f'{model_eval_path}/cv_scores.csv', index=False)
    final_test_scores.to_csv(f'{model_eval_path}/test_scores.csv', index=False)

    return final_cv_scores, final_test_scores

# Run the full comparison with the configured paths and keep both score tables.
final_cv_scores, final_test_scores = model_performance_comparison(DATA_PATH, RESULTS_ROOT, MODEL_EVAL_PATH)

In [None]:
# Render the cross-validation scores as LaTeX table source (shown as the cell output).
final_cv_scores.to_latex()

In [None]:
# Render the test-set scores as LaTeX table source (shown as the cell output).
final_test_scores.to_latex()