<a href="https://colab.research.google.com/github/cgenevier/CSCI5622-HW4/blob/part-d-DLmodels/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Study 1: Designing explainable speech-based machine learning models of depression

To open this ipynb in Colab, click the "Open in Colab" button at the top of the ipynb in Github, or [follow this link](https://colab.research.google.com/github/cgenevier/CSCI5622-HW4/blob/main/main.ipynb).

Given that Colab doesn't automatically load any of the content (data or other functions) from the Github repo, running the code below will copy the repo into the workspace directory for use. To save this ipynb file back to Github, select **File > Save** (which should show the repo if you're signed in) or **File > Save a copy in Github** if it's in the menu.

Note that the content of the data files or any of the other file structures are not saved back to Github, so make sure that if you make changes to things there, that you put them in Github separately.

In [None]:
# Clone Github Repo into the temporary local environment so data can be accessed and manipulated
!git clone https://github.com/cgenevier/CSCI5622-HW4.git
%cd CSCI5622-HW4

In [None]:
# Import necessary libraries

# Helpers
import glob

# Pandas, seaborn, and numpy for data manipulation
import pandas as pd
pd.set_option("display.max_rows", None)
import statistics as stat
import seaborn as sns
import numpy as np
np.random.seed(42)

# Keras & TensorFlow for building the neural networks
import itertools, json, time
from itertools import count
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, regularizers, callbacks, backend as K
tf.random.set_seed(42)

# Feature extraction
!pip install vaderSentiment transformers torch
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import logging, BertTokenizer, BertModel
import torch

# Matplotlib for graphing
import matplotlib.pyplot as plt

# Disable progress bars (necessary for it to show up correctly in Github)
logging.set_verbosity_error()

##### Formatting the data - Depression Labels

In [3]:
# Import Depression Labels
# Columns: Participant_ID, PHQ_Score
depression_labels = pd.read_csv("data/DepressionLabels.csv")

# Rename Participant_ID to ParticipantID to match accoustic files below & force trimmed string type
depression_labels = depression_labels.rename(columns={"Participant_ID": "ParticipantID"})
depression_labels["ParticipantID"] = depression_labels["ParticipantID"].astype(str).str.strip()

##### Formatting the data - Text Features

In [None]:
# Import Text Dataset (for text feature extraction)
# Note: When comparing the E-DAIC_Transcripts files to the corresponding E-DAIC Acoustics files,
# it looks like the transcripts sometimes only contain partial data from the accoustics text column -
# for example, 386_Transcript.csv - so it seems to make sense to concatenate Text data in the
# Acoustics file for completeness.
rows = []
for p in glob.glob("data/E-DAIC_Acoustics/*_utterance_agg.csv"):
    df = pd.read_csv(p)
    df["ParticipantID"] = df["ParticipantID"].astype(str).str.strip()
    full_text = " ".join(df["Text"].dropna().astype(str))
    full_text = " ".join(full_text.split())  # collapse whitespace
    rows.append({"ParticipantID": df["ParticipantID"].iloc[0], "FullText": full_text})

# Columns: ParticipantID, FullText
text_df = pd.DataFrame(rows)
# Merge with labels. Columns: ParticipantID, FullText, PHQ_Score
lang_df = depression_labels.merge(text_df, on="ParticipantID", how="inner")

# Inspect results
lang_df.head()

##### Formatting the data - Acoustic Features

In [None]:
# Import Accoustic Dataset (for part c, d)

# Helper function for mean, standard dev, & IQR (interquartile range)
def summarize_cols(num_df: pd.DataFrame) -> pd.DataFrame:
    # mean and std per column (NaN-safe)
    mean_s = num_df.mean(numeric_only=True)
    std_s  = num_df.std(numeric_only=True)
    # IQR = Q3 - Q1 per column
    q75 = num_df.quantile(0.75, numeric_only=True)
    q25 = num_df.quantile(0.25, numeric_only=True)
    iqr_s = q75 - q25

    # assemble into a tidy (feature, stat) table
    stats = pd.concat(
        {"mean": mean_s, "std": std_s, "iqr": iqr_s},
        axis=1
    )  # index=feature, columns=[mean,std,iqr]

    # flatten to one row with columns like feature__mean
    wide = stats.stack().to_frame().T
    wide.columns = [f"{feat}__{stat}" for feat, stat in wide.columns]
    return wide

# Each file in E-DAIC_Acoustics contains utterance-level acoustic features for one participant.
rows_with_conf = []
rows_no_conf = []
for p in glob.glob("data/E-DAIC_Acoustics/*_utterance_agg.csv"):
    df = pd.read_csv(p)
    df["ParticipantID"] = df["ParticipantID"].astype(str).str.strip()

    # Include Confidence (column 5) + all acoustic features (6+)
    numeric_with_conf = df.columns[5:]
    df[numeric_with_conf] = df[numeric_with_conf].apply(pd.to_numeric, errors="coerce")

    agg_with_conf = summarize_cols(df[numeric_with_conf])
    agg_with_conf.insert(0, "ParticipantID", df["ParticipantID"].iloc[0])
    rows_with_conf.append(agg_with_conf)

    # Excludes Confidence - only include acoustic features
    numeric_no_conf = df.columns[6:]
    df[numeric_no_conf] = df[numeric_no_conf].apply(pd.to_numeric, errors="coerce")

    agg_no_conf = summarize_cols(df[numeric_no_conf])
    agg_no_conf.insert(0, "ParticipantID", df["ParticipantID"].iloc[0])
    rows_no_conf.append(agg_no_conf)

# Combine into one DataFrame each
acoustic_features_with_conf = pd.concat(rows_with_conf, ignore_index=True)
acoustic_features_no_conf = pd.concat(rows_no_conf, ignore_index=True)

# Merge with labels to add PHQ_Score
acoustic_df_with_confidence = depression_labels.merge(acoustic_features_with_conf, on="ParticipantID", how="inner")
acoustic_df_no_confidence   = depression_labels.merge(acoustic_features_no_conf, on="ParticipantID", how="inner")

# Reorder columns: ParticipantID, PHQ_Score, then features
cols = ["ParticipantID", "PHQ_Score"] + [c for c in acoustic_df_with_confidence.columns if c not in ["ParticipantID", "PHQ_Score"]]
acoustic_df_with_confidence = acoustic_df_with_confidence[cols]
cols = ["ParticipantID", "PHQ_Score"] + [c for c in acoustic_df_no_confidence.columns if c not in ["ParticipantID", "PHQ_Score"]]
acoustic_df_no_confidence = acoustic_df_no_confidence[cols]

# Inspect results
display(acoustic_df_with_confidence.head())
display(acoustic_df_no_confidence.head())

### (a) (2 points) Extracting language features.

**Syntactic vectorizers:** count vectorizer (e.g., CountVectorizer from sklearn) transforming
a collection of text documents into a numerical matrix of word or token counts; TFIDF vectorizer (e.g., TfidfVectorizer from sklearn) incorporating document-level weighting,
which emphasizes words significant to specific documentsâ€™ part-of-speech features counting
the distribution of part of speech tags over a document

In [None]:
# Use TfidfVectorizer from sklearn
vect = TfidfVectorizer(max_features=1000)
X_tfidf = vect.fit_transform(lang_df["FullText"])

# Convert sparse matrix to DataFrame
syntactic_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vect.get_feature_names_out()
)

# Add ParticipantID column & move to first column
syntactic_df["ParticipantID"] = lang_df["ParticipantID"].values
cols = ["ParticipantID"] + [c for c in syntactic_df.columns if c != "ParticipantID"]
syntactic_df = syntactic_df[cols]

# Add back in PHQ_Score & move to second column
syntactic_df = syntactic_df.merge(depression_labels, on="ParticipantID", how="inner")
cols = ["ParticipantID", "PHQ_Score"] + [c for c in syntactic_df.columns if c not in ["ParticipantID", "PHQ_Score"]]
syntactic_df = syntactic_df[cols]

# Inspect dataframe
syntactic_df.head()

**Semantic features:** sentiment scores (e.g., Vader, https://github.com/cjhutto/vaderSentiment),
topic distribution (using topic modeling), or named entities

In [None]:
# Using Vader to analyze sentiment of the text data
analyzer = SentimentIntensityAnalyzer()

# Apply Vader to the text data (creates 4 new columns)
vader_scores = lang_df["FullText"].apply(lambda x: pd.Series(analyzer.polarity_scores(str(x))))
semantic_df = pd.concat([lang_df, vader_scores], axis=1)

# Inspect dataframe
semantic_df.head()


**Advanced features:** word embeddings, such as Word2Vec or BERT (e.g., pytorch-pretrainedbert)) for capturing contextual meaning

In [None]:
# Use BERT to capture contextual meaning (note: takes about 4 minutes to run on T4)

# Load uncased base model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

# Loop through text data and get embeddings
embeddings = []
for text in lang_df["FullText"]:
    # Truncate long text (BERT max = 512 tokens)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # [CLS] token
    embeddings.append(cls_embedding)

# Convert list of embeddings (each 768-dim) to DataFrame
bert_df = pd.DataFrame(np.vstack(embeddings))
bert_df.columns = [f"bert_{i}" for i in range(bert_df.shape[1])]

# Add ParticipantID and PHQ_Score
bert_df = pd.concat([lang_df[["ParticipantID", "PHQ_Score"]].reset_index(drop=True), bert_df], axis=1)

# Inspect dataframe
bert_df.head()

**Combined dataset:** Combined the three dataframes above into one with all the features

In [None]:
# Merge all three on ParticipantID
text_feature_df = (
    syntactic_df
    .merge(semantic_df, on=["ParticipantID", "PHQ_Score"], how="outer")
    .merge(bert_df, on=["ParticipantID", "PHQ_Score"], how="outer")
)
text_feature_df.head()

# Optional: sort by ParticipantID for clarity
#merged_df = merged_df.sort_values("ParticipantID").reset_index(drop=True)

### (b) (2 points) Estimating depression severity with interpretable models using language features.

Question: In semantic feature extraction above, there are four features generated: neg, neu, pos, compound. They are inter-related because neg is the proportion of the document that is negative, neu is the proportion of the document that is neutral, pos is the proportion of the document that is positive, and compound is a normalized sentiment value that takes into account all 3. Should we remove some of these features because they're redundant?

### (c) (2 points) Estimating depression severity with interpretable models using acoustic features.

### (d) (2 points) Estimating depression severity with unimodal and multimodal deep learning models.

In [10]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [11]:
"""Helpers"""

# PyTorch dataset wrapper for tabular (numpy) features
class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (
            torch.from_numpy(self.X[idx]),
            torch.tensor(self.y[idx], dtype=torch.float32)
        )

# Basic MLP for regression
class MLPRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)

# Compute evaluation metrics
def compute_metrics(y_true, y_pred, global_max_phq):
    y_true = np.asarray(y_true, dtype=np.float32)
    y_pred = np.asarray(y_pred, dtype=np.float32)

    # Pearson correlation
    r, _ = pearsonr(y_true, y_pred)

    # Absolute relative error (average over participants)
    re = np.mean(np.abs(y_pred - y_true) / global_max_phq)

    return r, re

# Train the MLP on one fold of cross-validation
def train_one_fold(X_train, y_train, X_val, y_val, num_epochs=10, batch_size=32, lr=1e-3):

    # Regularization
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    # Build PyTorch datasets/loaders
    train_ds = TabularDataset(X_train_scaled.astype(np.float32), y_train)
    val_ds   = TabularDataset(X_val_scaled.astype(np.float32), y_val)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # Model, loss, optimizer
    model = MLPRegressor(input_dim=X_train.shape[1]).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        # Train
        for Xb, yb in train_loader:
            Xb = Xb.to(device)
            yb = yb.to(device)

            optimizer.zero_grad()
            preds = model(Xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * len(Xb)

        avg_loss = running_loss / len(train_ds)

        # Validate
        model.eval()
        val_preds, val_true = [], []

        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb = Xb.to(device)
                preds = model(Xb).cpu().numpy()
                val_preds.append(preds)
                val_true.append(yb.numpy())

        # Concatenate predictions across batches
        val_preds = np.concatenate(val_preds)
        val_true  = np.concatenate(val_true)

        # Compute metrics
        global_max_phq = y.max()
        r, re = compute_metrics(val_true, val_preds, global_max_phq)

        # Print epoch results
        print(f"Epoch {epoch+1:02d} | train_loss={avg_loss:.4f} | val_r={r:.3f} | val_RE={re:.3f}")

    return model, scaler


In [12]:
"""Language features only"""

df = text_feature_df.dropna(subset=["PHQ_Score"]).copy() # Keep only rows with PHQ scores
feature_cols = [c for c in text_feature_df.columns if c.startswith("bert_")] # Use BERT features

X = df[feature_cols].fillna(0.0).to_numpy().astype(np.float32) # Features
y = df["PHQ_Score"].to_numpy().astype(np.float32)              # Labels

print("X shape:", X.shape)
print("y shape:", y.shape)

kf = KFold(n_splits=5, shuffle=True, random_state=42) # 5-fold cross-validation
all_val_preds = np.zeros_like(y, dtype=np.float32)    # Store predictions for all validation folds
fold_results = []

global_max_phq = y.max() # For relative error normalization

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Fold {fold+1} ===")

    # Split
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Train on this fold
    model, scaler = train_one_fold(
        X_train, y_train, X_val, y_val,
        num_epochs=10,
        batch_size=32,
        lr=1e-3
    )

    # Predict on validation set
    X_val_scaled = scaler.transform(X_val).astype(np.float32)
    val_ds = TabularDataset(X_val_scaled, y_val)
    val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

    model.eval()
    val_preds = []
    with torch.no_grad():
        for Xb, _ in val_loader:
            Xb = Xb.to(device)
            preds = model(Xb).cpu().numpy()
            val_preds.append(preds)

    # Combine predictions across batches
    val_preds = np.concatenate(val_preds).astype(np.float32)
    all_val_preds[val_idx] = val_preds

    # Compute metrics
    r, re = compute_metrics(y_val, val_preds, global_max_phq)
    fold_results.append((r, re))

    # This fold results
    print(f"Fold {fold+1} summary: r={r:.3f}, RE={re:.3f}")

# Final results across all folds
overall_r, overall_re = compute_metrics(y, all_val_preds, global_max_phq)
print("\n=== Overall 5-fold CV results (language-only MLP) ===")
print(f"Pearson r: {overall_r:.3f}")
print(f"Mean absolute relative error: {overall_re:.3f}")
print("Per-fold (r, RE):", fold_results)


X shape: (134, 768)
y shape: (134,)

=== Fold 1 ===
Epoch 01 | train_loss=67.5614 | val_r=-0.192 | val_RE=0.184
Epoch 02 | train_loss=53.8038 | val_r=-0.121 | val_RE=0.170
Epoch 03 | train_loss=43.9864 | val_r=-0.111 | val_RE=0.165
Epoch 04 | train_loss=34.9317 | val_r=-0.107 | val_RE=0.162
Epoch 05 | train_loss=27.5896 | val_r=-0.088 | val_RE=0.165
Epoch 06 | train_loss=21.3995 | val_r=-0.068 | val_RE=0.173
Epoch 07 | train_loss=18.3062 | val_r=-0.034 | val_RE=0.180
Epoch 08 | train_loss=15.0482 | val_r=0.006 | val_RE=0.183
Epoch 09 | train_loss=11.6023 | val_r=0.009 | val_RE=0.186
Epoch 10 | train_loss=9.5905 | val_r=-0.008 | val_RE=0.187
Fold 1 summary: r=-0.008, RE=0.187

=== Fold 2 ===
Epoch 01 | train_loss=62.3974 | val_r=-0.437 | val_RE=0.266
Epoch 02 | train_loss=51.1440 | val_r=-0.468 | val_RE=0.250
Epoch 03 | train_loss=41.6718 | val_r=-0.435 | val_RE=0.238
Epoch 04 | train_loss=32.3926 | val_r=-0.398 | val_RE=0.230
Epoch 05 | train_loss=25.6756 | val_r=-0.377 | val_RE=0.229


In [13]:
"""Acoustic features only"""

df = acoustic_df_no_confidence.dropna(subset=["PHQ_Score"]).copy() # Keep only rows with PHQ scores
non_feature_cols = ["ParticipantID", "PHQ_Score"]
feature_cols = [c for c in df.columns if c not in non_feature_cols] # Input columns

X = df[feature_cols].fillna(0.0).to_numpy().astype(np.float32) # Features
y = df["PHQ_Score"].to_numpy().astype(np.float32)              # Labels

print("Acoustic X shape:", X.shape)
print("Acoustic y shape:", y.shape)

kf = KFold(n_splits=5, shuffle=True, random_state=42) # 5-fold cross-validation
all_val_preds = np.zeros_like(y, dtype=np.float32)    # Store predictions for all validation folds
fold_results = []

global_max_phq = y.max() # For relative error normalization

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Acoustic Fold {fold+1} ===")

    # Split
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Train on this fold
    model, scaler = train_one_fold(
        X_train, y_train, X_val, y_val,
        num_epochs=10,
        batch_size=32,
        lr=1e-3
    )

    # Predict on validation set
    X_val_scaled = scaler.transform(X_val).astype(np.float32)
    val_ds = TabularDataset(X_val_scaled, y_val)
    val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

    model.eval()
    val_preds = []
    with torch.no_grad():
        for Xb, _ in val_loader:
            Xb = Xb.to(device)
            preds = model(Xb).cpu().numpy()
            val_preds.append(preds)

    # Combine predictions across batches
    val_preds = np.concatenate(val_preds).astype(np.float32)
    all_val_preds[val_idx] = val_preds

    # Compute metrics
    r, re = compute_metrics(y_val, val_preds, global_max_phq)
    fold_results.append((r, re))

    # This fold results
    print(f"Fold {fold+1} summary: r={r:.3f}, RE={re:.3f}")

# Final results across all folds
overall_r, overall_re = compute_metrics(y, all_val_preds, global_max_phq)
print("\n=== Overall 5-fold CV results (acoustic-only MLP) ===")
print(f"Pearson r: {overall_r:.3f}")
print(f"Mean absolute relative error: {overall_re:.3f}")
print("Per-fold (r, RE):", fold_results)


Acoustic X shape: (134, 69)
Acoustic y shape: (134,)

=== Acoustic Fold 1 ===
Epoch 01 | train_loss=68.4185 | val_r=0.139 | val_RE=0.193
Epoch 02 | train_loss=63.7061 | val_r=0.094 | val_RE=0.183
Epoch 03 | train_loss=59.8291 | val_r=0.039 | val_RE=0.175
Epoch 04 | train_loss=56.1703 | val_r=0.001 | val_RE=0.169
Epoch 05 | train_loss=52.6099 | val_r=-0.009 | val_RE=0.163
Epoch 06 | train_loss=49.7511 | val_r=-0.014 | val_RE=0.159
Epoch 07 | train_loss=46.3375 | val_r=-0.012 | val_RE=0.154
Epoch 08 | train_loss=43.3980 | val_r=-0.016 | val_RE=0.152
Epoch 09 | train_loss=41.1712 | val_r=-0.015 | val_RE=0.151
Epoch 10 | train_loss=38.2755 | val_r=-0.015 | val_RE=0.152
Fold 1 summary: r=-0.015, RE=0.152

=== Acoustic Fold 2 ===
Epoch 01 | train_loss=64.5845 | val_r=0.121 | val_RE=0.280
Epoch 02 | train_loss=60.8403 | val_r=0.070 | val_RE=0.265
Epoch 03 | train_loss=56.8003 | val_r=0.049 | val_RE=0.253
Epoch 04 | train_loss=53.2129 | val_r=0.042 | val_RE=0.241
Epoch 05 | train_loss=50.1440 

In [14]:
"""Multimodal (language and acoustic features)"""

# Merge acoustic and language features
multi_df = (
    text_feature_df
    .merge(acoustic_df_no_confidence, on=["ParticipantID", "PHQ_Score"], how="inner")
)

df = multi_df.dropna(subset=["PHQ_Score"]).copy() # Keep only rows with PHQ scores

lang_cols = [c for c in multi_df.columns if c.startswith("bert_")]
acoustic_cols = [c for c in multi_df.columns
                 if c not in ["ParticipantID", "PHQ_Score", "FullText"]
                 and not c.startswith("bert_")] # Avoid double-counting language features
feature_cols = lang_cols + acoustic_cols # Concatenate both sets

X = df[feature_cols].fillna(0.0).to_numpy().astype(np.float32) # Features
y = df["PHQ_Score"].to_numpy().astype(np.float32)              # Labels

print("Multimodal X shape:", X.shape)
print("Multimodal y shape:", y.shape)

kf = KFold(n_splits=5, shuffle=True, random_state=42) # 5-fold cross-validation
all_val_preds = np.zeros_like(y, dtype=np.float32)    # Store predictions for all validation folds
fold_results = []

global_max_phq = y.max() # For relative error normalization

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=== Multimodal Fold {fold+1} ===")

    # Split
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Train on this fold
    model, scaler = train_one_fold(
        X_train, y_train, X_val, y_val,
        num_epochs=10,
        batch_size=32,
        lr=1e-3
    )

    # Predict on validation set
    X_val_scaled = scaler.transform(X_val).astype(np.float32)
    val_ds = TabularDataset(X_val_scaled, y_val)
    val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

    model.eval()
    val_preds = []
    with torch.no_grad():
        for Xb, _ in val_loader:
            Xb = Xb.to(device)
            preds = model(Xb).cpu().numpy()
            val_preds.append(preds)

    # Combine predictions across batches
    val_preds = np.concatenate(val_preds).astype(np.float32)
    all_val_preds[val_idx] = val_preds

    # Compute metrics
    r, re = compute_metrics(y_val, val_preds, global_max_phq)
    fold_results.append((r, re))

    # This fold results
    print(f"Fold {fold+1} summary: r={r:.3f}, RE={re:.3f}")

# Final results across all folds
overall_r, overall_re = compute_metrics(y, all_val_preds, global_max_phq)
print("\n=== Overall 5-fold CV results (multimodal MLP) ===")
print(f"Pearson r: {overall_r:.3f}")
print(f"Mean absolute relative error: {overall_re:.3f}")
print("Per-fold (r, RE):", fold_results)

Multimodal X shape: (134, 1841)
Multimodal y shape: (134,)

=== Multimodal Fold 1 ===
Epoch 01 | train_loss=71.1891 | val_r=-0.338 | val_RE=0.198
Epoch 02 | train_loss=50.1982 | val_r=-0.137 | val_RE=0.184
Epoch 03 | train_loss=34.7170 | val_r=-0.073 | val_RE=0.171
Epoch 04 | train_loss=21.4074 | val_r=-0.028 | val_RE=0.159
Epoch 05 | train_loss=12.2882 | val_r=0.001 | val_RE=0.156
Epoch 06 | train_loss=6.6352 | val_r=0.034 | val_RE=0.162
Epoch 07 | train_loss=4.3216 | val_r=0.076 | val_RE=0.165
Epoch 08 | train_loss=3.1201 | val_r=0.107 | val_RE=0.165
Epoch 09 | train_loss=2.6576 | val_r=0.116 | val_RE=0.163
Epoch 10 | train_loss=2.2221 | val_r=0.103 | val_RE=0.162
Fold 1 summary: r=0.103, RE=0.162

=== Multimodal Fold 2 ===
Epoch 01 | train_loss=61.8432 | val_r=-0.096 | val_RE=0.263
Epoch 02 | train_loss=43.2430 | val_r=-0.172 | val_RE=0.249
Epoch 03 | train_loss=30.6390 | val_r=-0.199 | val_RE=0.242
Epoch 04 | train_loss=17.8422 | val_r=-0.203 | val_RE=0.234
Epoch 05 | train_loss=11

### (e) (2 points) Explainable ML.

### (f) (Bonus, 2 points) Experimenting with transformers.