In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from collections import defaultdict
from sklearn.model_selection import KFold, train_test_split, GroupKFold
from tqdm.notebook import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor


# Apply seaborn's "whitegrid" style to any figures drawn later in the notebook.
sns.set_style("whitegrid")

In [2]:
# Number of cross-validation folds; also passed to tqdm as the progress-bar total.
n_splits = 5
# Group-aware splitter: the CV loops below pass participant_id as the group key,
# so a participant's visits are never divided between train and holdout.
gkf = GroupKFold(n_splits=n_splits)

In [3]:
# Load the preprocessed training table (the path suggests it is the output of the
# EDA/preprocessing step — TODO confirm). The cells below read the columns
# participant_id, visit_month, age_at_baseline and updrs_1..updrs_3 from it.
data=pd.read_csv("../3. EDA and data preprocessing/preprocessed_training_data.csv")

# The Global Mean Baseline Model

In [4]:
# Target columns scored by every baseline in this notebook.
updrs_cols = ['updrs_1', 'updrs_2', 'updrs_3']

# Result store: model name -> {target column -> list of per-fold MAE values}.
mae_results = {}
mae_results['Global_Mean'] = defaultdict(list)

# Participant-grouped splits: one (train_idx, test_idx) pair per fold.
cv_splits = gkf.split(data, groups=data["participant_id"])

for fold, (train_idx, test_idx) in enumerate(tqdm(cv_splits, total=n_splits, desc="Running CV")):
    train = data.iloc[train_idx]
    holdout = data.iloc[test_idx]

    # The "model" is simply the mean of each target over the training rows.
    global_mean_preds = train[updrs_cols].mean().to_dict()

    # Attach the constant prediction for each target to a copy of the holdout rows.
    holdout_preds_mean = holdout.copy()
    for col in global_mean_preds:
        holdout_preds_mean[f'pred_{col}'] = global_mean_preds[col]

    for col in updrs_cols:
        y_true = holdout_preds_mean[col]
        y_pred = holdout_preds_mean[f'pred_{col}']

        # Score only the rows where both the observed value and the prediction exist.
        scored_rows = y_true.notna() & y_pred.notna()
        fold_mae = mean_absolute_error(y_true[scored_rows], y_pred[scored_rows])
        mae_results['Global_Mean'][col].append(fold_mae)

# Report the MAE of each target averaged across folds.
print("\n")
for col, fold_scores in mae_results['Global_Mean'].items():
    mean_score = np.mean(fold_scores)
    print(f"Average MAE for {col}: {mean_score:.4f}")

Running CV:   0%|          | 0/5 [00:00<?, ?it/s]



Average MAE for updrs_1: 4.2773
Average MAE for updrs_2: 5.2968
Average MAE for updrs_3: 11.6654


# Mean by Visit Month Baseline Model

In [5]:

# Baseline: predict each UPDRS target with the training-fold mean for the same
# visit_month, evaluated with participant-grouped cross-validation.
mae_results['Mean_by_Visit_Month_Baseline_Model'] = defaultdict(list)

for train_idx, test_idx in tqdm(gkf.split(data, groups=data["participant_id"]), total=n_splits, desc="Running CV"):
    train, holdout = data.iloc[train_idx], data.iloc[test_idx]

    # Per-month target means, learned from this fold's training rows only.
    mean_by_month = train.groupby('visit_month')[updrs_cols].mean()

    # Fallback prediction, also computed from THIS fold's training rows. Reusing
    # a mean left over from another cell would depend on hidden notebook state
    # and would leak holdout participants' scores into the prediction.
    fold_global_mean = train[updrs_cols].mean()

    # Left-join the per-month means onto the holdout rows as pred_<target> columns.
    # merge() returns a new frame, so the holdout slice itself is not modified.
    test_preds_month_mean = holdout.merge(
        mean_by_month.rename(columns=lambda c: f'pred_{c}'),
        on='visit_month',
        how='left'
    )

    for col in updrs_cols:
        pred_col = f'pred_{col}'

        # A visit_month present in the holdout may be absent from the training
        # fold; those rows get NaN from the join and fall back to the fold mean.
        test_preds_month_mean[pred_col] = test_preds_month_mean[pred_col].fillna(fold_global_mean[col])

        actual_values = test_preds_month_mean[col]
        predicted_values = test_preds_month_mean[pred_col]

        # Score only the rows where both the observed value and the prediction exist.
        valid_indices = predicted_values.notna() & actual_values.notna()
        mae = mean_absolute_error(actual_values[valid_indices], predicted_values[valid_indices])
        mae_results['Mean_by_Visit_Month_Baseline_Model'][col].append(mae)


# Report the MAE of each target averaged across folds.
print("\n")
for col, scores in mae_results['Mean_by_Visit_Month_Baseline_Model'].items():
    avg_mae = np.mean(scores)
    print(f"Average MAE for {col}: {avg_mae:.4f}")





Running CV:   0%|          | 0/5 [00:00<?, ?it/s]



Average MAE for updrs_1: 4.2539
Average MAE for updrs_2: 5.2254
Average MAE for updrs_3: 11.5332


# Random Forest with Simple Features

In [6]:
# Features available without any per-participant history.
initial_feature_cols=["age_at_baseline", "visit_month"]

mae_results['Random_Forest_with_Simple_Features'] = defaultdict(list)


for train_idx, test_idx in tqdm(gkf.split(data, groups=data["participant_id"]), total=n_splits, desc="Running CV"):
    train_fold = data.iloc[train_idx]
    holdout_fold = data.iloc[test_idx]

    # Define features (X) and targets (y) for this fold
    X_train, y_train = train_fold[initial_feature_cols], train_fold[updrs_cols]
    X_holdout, y_holdout = holdout_fold[initial_feature_cols], holdout_fold[updrs_cols]

    # Keep only training rows where every target is observed, and align the
    # feature rows to the surviving index labels.
    y_train_clean = y_train.dropna()
    X_train_clean = X_train.loc[y_train_clean.index]

    # One forest per target via MultiOutputRegressor; fixed seed for repeatable runs.
    rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model = MultiOutputRegressor(rf)

    # Train the model on the training data for this fold
    model.fit(X_train_clean, y_train_clean)

    # --- Prediction and Evaluation ---
    # Predict on the holdout rows and wrap the array in a DataFrame that shares
    # the holdout index, so predictions and actuals align per column.
    predictions_array = model.predict(X_holdout)
    predictions_df = pd.DataFrame(predictions_array, index=X_holdout.index, columns=updrs_cols)

    # Calculate MAE for each target column, skipping rows whose actual value is missing
    for col in updrs_cols:
        actual_values = y_holdout[col]
        predicted_values = predictions_df[col]

        valid_indices = actual_values.notna()
        mae = mean_absolute_error(actual_values[valid_indices], predicted_values[valid_indices])

        # Store the result for this fold
        mae_results['Random_Forest_with_Simple_Features'][col].append(mae)

print("\n--- Cross-Validation Results for Random Forest Baseline ---")

# Report the MAE of each target averaged across folds.
for col, scores in mae_results['Random_Forest_with_Simple_Features'].items():
    avg_mae = np.mean(scores)
    print(f"Average MAE for {col}: {avg_mae:.4f}")

Running CV:   0%|          | 0/5 [00:00<?, ?it/s]


--- Cross-Validation Results for Random Forest Baseline ---
Average MAE for updrs_1: 4.3620
Average MAE for updrs_2: 5.3256
Average MAE for updrs_3: 11.5609


# Conclusion: 

Observe that the random forest model performs no better than the other two baselines, even though it uses `visit_month` and `age_at_baseline`, the features that our preliminary EDA found to have the largest impact on UPDRS score progression. This reveals the nature of prediction on time-series data: the participant's own starting point and trajectory are the most important features. In other words, the model needs to "memorize" past information in order to improve its predictions of the progression.

<br>

#### This also points the direction for our next step: to try tree models with lags.