In [1]:
# Mount Google Drive to access datasets and files stored in your Google Drive account
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import numpy as np
import statistics
from sklearn.metrics import f1_score, accuracy_score
from keras import backend as K
from tensorflow.keras.callbacks import Callback
import pickle
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, GRU, GlobalAveragePooling1D, Dense, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
import math
from transformers import AutoTokenizer, AutoModel, Wav2Vec2FeatureExtractor, Wav2Vec2Model, Wav2Vec2Processor, Wav2Vec2Tokenizer
import random
# Start from a clean Keras state and seed every random number generator used in
# this notebook from one constant, so a fresh "Restart & Run All" is repeatable.
SEED = 999
K.clear_session()
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)  # torch models are used for feature extraction below

## data processing

In [None]:
# Dependencies (install once in the environment if they are missing):
# !pip install transformers
# !pip install torch

# Run the pretrained encoders on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# BERT checkpoint: tokenizer plus encoder used for the transcribed text
bert_model_name = "dbmdz/bert-base-italian-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)
bert_model = bert_model.to(device)

# Wav2Vec2 checkpoint: processor plus encoder used for the audio
w2v_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-italian"
w2v_processor = Wav2Vec2Processor.from_pretrained(w2v_model_name)
w2vec_model = Wav2Vec2Model.from_pretrained(w2v_model_name)
w2vec_model = w2vec_model.to(device)

def extract_features(file_path, text):
    """
    Extract audio and text features using pre-trained Wav2Vec2 and BERT models.

    Both forward passes run under ``torch.no_grad()``: the models are only used
    for inference here, and tensors produced with autograd enabled cannot be
    converted with ``.numpy()`` by the caller.

    Args:
        file_path: Audio input handed unchanged to ``w2v_processor`` together
            with ``sampling_rate=16000``.
            NOTE(review): the visible caller passes a path string; confirm the
            processor is meant to receive a path rather than a decoded waveform.
        text (str): Text data for feature extraction via BERT.

    Returns:
        tuple: ``(audio_last_hidden_state, text_last_hidden_state)``, the
        ``last_hidden_state`` tensors of the Wav2Vec2 and BERT outputs, left on
        ``device``.
    """
    with torch.no_grad():
        # Process audio input and extract features
        audio_input = w2v_processor(file_path, return_tensors="pt", padding=True, sampling_rate=16000)
        audio_features = w2vec_model(audio_input.input_values.to(device))

        # Tokenize and extract text features; the attention mask is forwarded so
        # that padded positions are ignored if a batch of texts is ever passed
        encoded_text = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        text_features = bert_model(
            encoded_text.input_ids.to(device),
            attention_mask=encoded_text.attention_mask.to(device),
        )

    return audio_features.last_hidden_state, text_features.last_hidden_state

# df_labels is the main dataframe containing all necessary metadata; it must be
# defined before this cell runs.
# Records are appended below, so the container has to be a list.
df_list = []

# Number of leading rows kept from each per-clip table read below
timesteps_audio = 499

for index, row in df_labels.iterrows():
    # Directory to the audio file stored in Google Drive
    file_url_audio = f".../{row['audio_clip_chunk']}.wav"
    # NOTE(review): a ".wav" path is parsed as CSV here; confirm the file really
    # holds a table of per-timestep values and not raw audio.
    df_audio = pd.read_csv(file_url_audio, header=None, skiprows=1)
    text = row['transcribed_text']

    # Extract audio and text features (audio_features is not used further in this cell)
    audio_features, text_features = extract_features(file_url_audio, text)

    # Largest multiple of timesteps_audio that fits in this clip
    # (computed but not used anywhere in this cell)
    step_number_audio = math.floor(len(df_audio) / timesteps_audio) * timesteps_audio

    # Compile extracted features and metadata into a structured list.
    # .detach() makes the NumPy conversion valid even if the tensor still tracks gradients.
    # NOTE(review): both feature blocks are flattened to 1-D here, while the model
    # cell declares inputs of shape (499, 512) and (333, 768); confirm a reshape or
    # padding step exists between the two.
    df_list.append([
        df_audio.iloc[0:timesteps_audio].values.flatten(),
        text_features.detach().cpu().numpy().flatten(),
        row['depression'],
        row['full_speaker_id'],
        row['session_number'],
    ])

# Convert the structured list into numpy arrays suitable for machine learning model inputs
x_audio_data = np.array([i[0] for i in df_list])  # Audio features
x_text_data = np.array([i[1] for i in df_list])   # Text features
y_data = np.array([i[2] for i in df_list])        # Depression labels (binary: 0 for control, 1 for depressed)
unique_clip_ID_No = np.array([i[3] for i in df_list])  # Grouping ID for majority voting (from 'full_speaker_id')
session_number = np.array([i[4] for i in df_list])     # Session number



## target model - multimodal depression detection

In [None]:
# Majority vote function

def majority_vote(y_test, y_pred, ID, tie_label=0):
    """
    Aggregate per-sample predictions into one label per ID by majority vote and
    score the aggregated labels.

    Args:
        y_test: True label of every sample.
        y_pred: Predicted label of every sample.
        ID: Group identifier of every sample; samples sharing an ID vote together.
        tie_label: Label assigned to a group whose most frequent predictions are
            tied. The original tie rule is not visible in this notebook, so it
            is exposed here and defaults to 0.

    Returns:
        tuple: ``(accuracy, F1)`` computed over the per-ID labels.

    Raises:
        ValueError: If one ID carries more than one distinct true label.
    """
    df = pd.DataFrame({'ID': ID, 'y_test': y_test, 'y_pred': y_pred})

    def majority_or_default(x):
        """Return the most frequent value of ``x``, or ``tie_label`` when the top count is shared."""
        counts = x.value_counts()
        # value_counts() is sorted by descending frequency, so the first entry is
        # the top count; more than one label reaching it means there is no majority.
        if (counts == counts.iloc[0]).sum() > 1:
            return tie_label
        return counts.index[0]

    grouped = df.groupby('ID')

    # Ensure y_test is consistent within each ID group
    y_test_grouped = grouped['y_test'].nunique()
    if any(y_test_grouped > 1):
        raise ValueError("Inconsistent y_test values within a single ID group.")

    y_test_MV = grouped['y_test'].first().tolist()  # Just take the first as they should all be the same
    majority_vote_labels = grouped['y_pred'].apply(majority_or_default).tolist()

    acc_MV = accuracy_score(y_test_MV, majority_vote_labels)
    F1 = f1_score(y_test_MV, majority_vote_labels)

    print('y_test_MV' , y_test_MV)
    print('majority_vote_labels', majority_vote_labels)

    return acc_MV, F1

In [None]:
# Early stopping
class CustomEarlyStopping(Callback):
    """
    Stop training once ``val_accuracy`` has been exactly unchanged for
    ``patience`` consecutive epochs, and roll the model back to the weights it
    had at the first epoch of that plateau.

    The plateau-start weights are kept in memory (``get_weights`` /
    ``set_weights``), so no checkpoint files are needed for the restore.

    Args:
        patience: Number of consecutive epochs with an identical
            ``val_accuracy`` that triggers the stop.
        restore_path: Optional path; when set, the restored model is saved
            there after the rollback.
    """

    def __init__(self, patience=10, restore_path=None):
        super().__init__()
        self.patience = patience
        self.wait = 0
        self.stopped_epoch = 0
        self.prev_val_accuracy = -1
        self.restore_path = restore_path
        # Weights captured at the first epoch of the current plateau
        self._plateau_weights = None

    def on_train_begin(self, logs=None):
        """Reset all counters so the same callback instance can be reused."""
        self.wait = 0
        self.stopped_epoch = 0
        self.prev_val_accuracy = -1
        self._plateau_weights = None

    def on_epoch_end(self, epoch, logs=None):
        """Update the plateau counter and stop/restore when it reaches ``patience``."""
        current = (logs or {}).get('val_accuracy')
        if current is None:
            # Without a validation metric there is nothing to compare; counting
            # "None == None" as a plateau would stop training spuriously.
            import warnings
            warnings.warn("CustomEarlyStopping: 'val_accuracy' is not available; skipping check.")
            return

        # Check if the current accuracy is the same as the previous one
        if current == self.prev_val_accuracy:
            self.wait += 1
        else:
            # A new value starts a new plateau: remember the weights of this epoch
            self.wait = 0
            self._plateau_weights = self.model.get_weights()

        # Update the previous validation accuracy
        self.prev_val_accuracy = current

        if self.wait >= self.patience:
            self.stopped_epoch = epoch
            self.model.stop_training = True
            # Roll back to the weights from the epoch where the plateau began
            if self._plateau_weights is not None:
                self.model.set_weights(self._plateau_weights)
            # Save the restored model to the specified path
            if self.restore_path:
                self.model.save(self.restore_path)

    def on_train_end(self, logs=None):
        """Report the stop epoch and the epoch whose weights were restored."""
        if self.stopped_epoch > 0:
            print(f"\nEarly stopped: epoch {self.stopped_epoch}. Restored model weights from epoch {self.stopped_epoch - self.patience}.")

In [None]:
# Cross-validation settings and per-fold result accumulators
num_folds = 5
# Distinct values of session_number (defined earlier); the folds partition these.
# NOTE(review): the name says "speaker" but the values are session numbers -
# confirm that splitting by session is what is intended.
speaker_ids = np.unique(session_number)
k_fold = KFold(n_splits=num_folds, shuffle=True)
training_index = []
testing_index = []
accuracies_best = []
F1_best = []
f1_MV_best_ = []
accuracies_MV_best = []
session_list = []

# Translate each fold's partition of the unique values into row positions
for fold_indexes_training, fold_indexes_test in k_fold.split(speaker_ids):
    fold_train = speaker_ids[fold_indexes_training]
    fold_test = speaker_ids[fold_indexes_test]

    # Boolean masks over all samples: does the sample's session belong to this side?
    result_train = np.isin(session_number, fold_train)
    result_test = np.isin(session_number, fold_test)

    # np.where returns a tuple; consumers read the positions from element [0]
    training_index.append(np.where(result_train))
    testing_index.append(np.where(result_test))

def _build_fusion_model():
    """Build and compile the two-branch (acoustic + lexical) classifier used in every fold."""
    # NOTE(review): inputs are declared as (499, 512) and (333, 768); confirm that
    # x_audio_data / x_text_data arrive in exactly these shapes.
    acoustic_input = Input(shape=(499, 512), name="acoustic_input")
    lexical_input = Input(shape=(333, 768), name="lexical_input")

    # Acoustic processing layers: two conv + pooling stages, then a BiGRU averaged over time
    x_a = acoustic_input
    for _ in range(2):
        x_a = Conv1D(filters=64, kernel_size=3, activation='relu')(x_a)
        x_a = MaxPooling1D(pool_size=2)(x_a)
    x_a = Bidirectional(GRU(64, return_sequences=True))(x_a)
    x_a = GlobalAveragePooling1D()(x_a)

    # Lexical processing layers: a BiGRU averaged over time
    x_l = lexical_input
    x_l = Bidirectional(GRU(64, return_sequences=True))(x_l)
    x_l = GlobalAveragePooling1D()(x_l)

    # Concatenation of features from both modalities, followed by a linear projection
    joint_representation = tf.keras.layers.concatenate([x_a, x_l])
    joint_representation_ = Dense(256, use_bias=False, activation=None)(joint_representation)

    # Dense layers and model output (two-way softmax)
    dense_layer = Dense(1024, activation='tanh')(joint_representation_)
    output_layer = Dense(2, activation='softmax')(dense_layer)
    fusion_model = Model(inputs=[acoustic_input, lexical_input], outputs=output_layer)

    fusion_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return fusion_model


# Per-fold test losses; this list is appended to below and must exist before the loop
loss_fold_best = []

# Loop through each fold
for i in range(num_folds):
    print(f'Processing fold number: {i + 1}')

    # Row positions of this fold's training and test samples
    train_rows = training_index[i][0]
    test_rows = testing_index[i][0]

    # Load data per fold
    x_train_audio, x_train_text = x_audio_data[train_rows], x_text_data[train_rows]
    y_train = y_data[train_rows]
    x_test_audio, x_test_text = x_audio_data[test_rows], x_text_data[test_rows]
    y_test = y_data[test_rows]
    session_no = session_number[test_rows]

    # Print test session numbers
    unique_sessions = np.unique(session_no)
    print(f'Test sessions for this fold: {unique_sessions}')
    session_list.append(unique_sessions)

    # IDs of the test samples, used to group predictions for the majority vote
    clip_id_testing = unique_clip_ID_No[test_rows]

    # A fresh, untrained model for every fold
    model = _build_fusion_model()

    # Training the model
    early_stopping_custom = CustomEarlyStopping(patience=5)
    model.fit([x_train_audio, x_train_text], y_train, epochs=200, batch_size=64, validation_split=0.2, callbacks=[early_stopping_custom])

    # Evaluating the model
    loss, accuracy = model.evaluate([x_test_audio, x_test_text], y_test)
    accuracies_best.append(accuracy)
    loss_fold_best.append(loss)

    # Predict the whole test set in one call and take the most probable class per
    # sample. This avoids one predict() call per sample and no longer reuses the
    # fold index `i` as an inner loop variable.
    test_probabilities = model.predict([np.array(x_test_audio), np.array(x_test_text)], verbose=0)
    y_test_pred_best = np.argmax(test_probabilities, axis=1)

    # Ensure the predicted labels and actual labels have the same shape
    assert y_test_pred_best.shape == y_test.shape, f"Shapes mismatch! y_test_pred_best: {y_test_pred_best.shape}, y_test: {y_test.shape}"

    # Calculate F1 score
    F1_score = f1_score(y_test, y_test_pred_best)
    F1_best.append(F1_score)

    # majority vote accuracy
    acc_MV_best, f1_MV_best = majority_vote(y_test, y_test_pred_best, clip_id_testing)
    accuracies_MV_best.append(acc_MV_best)
    f1_MV_best_.append(f1_MV_best)

# Report the mean (np.mean) and the sample standard deviation (statistics.stdev)
# of every metric collected across the folds
for metric_label, fold_scores in (
    ('accuracy per frame', accuracies_best),
    ('F1 score per frame', F1_best),
    ('accuracy per clip', accuracies_MV_best),
    ('F1 score per clip', f1_MV_best_),
):
    print(f'Average {metric_label}: {np.mean(fold_scores):.4f}, std: {statistics.stdev(fold_scores):.4f}')