In [None]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Add src directory to path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local imports (these rely on the sys.path entry added above)
# Import the aggregation function from utils script
from src.utils import aggregate_interview_sequences

from src.data_loader import load_androids_corpus
from src.foundation_model_extractor import extract_wav2vec2_sequences
from src.dl_cv_strategies import run_pytorch_cv_with_early_stopping

In [None]:
# Extract and Prepare All Sequence Data for DL Models
# Builds three participant-keyed sequence sets (reading, interview, combined)
# from cached wav2vec2 embeddings, extracting and caching them first if needed.

# Corpus location. Set the ANDROIDS_CORPUS_PATH environment variable to override
# the default on machines where the data lives elsewhere.
BASE_DATA_PATH = os.environ.get('ANDROIDS_CORPUS_PATH', 'E:/Dissertation_Data/Androids-Corpus')

SEQUENCES_READING_PATH = '../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl'
SEQUENCES_INTERVIEW_CLIPS_PATH = '../data/Processed_Features/features_wav2vec2_sequences_interview_clips.pkl'


def _load_or_extract_sequences(cache_path, source_df, description):
    """Return the sequence dict cached at `cache_path`, creating the cache if missing.

    Args:
        cache_path: Pickle file holding the dict returned by the extractor.
        source_df: Metadata frame passed to `extract_wav2vec2_sequences` when
            the cache does not exist yet.
        description: Human-readable name used in progress messages.

    Returns:
        The sequence dict, either loaded from the cache or freshly extracted.

    Raises:
        RuntimeError: If extraction ran but produced no sequences.
    """
    if os.path.exists(cache_path):
        print(f"{description} sequences already exist. Loading from file.")
        # pickle can execute arbitrary code on load: only open caches written by this project.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    print(f"{description} sequences not found. Extracting sequential embeddings...")
    sequences = extract_wav2vec2_sequences(source_df)
    if not sequences:
        # Fail here with a clear message instead of a FileNotFoundError on a later load.
        raise RuntimeError(
            f"Extraction produced no {description.lower()} sequences; nothing to cache at {cache_path}."
        )

    print(f"Saving {description.lower()} sequences to: {cache_path}")
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    with open(cache_path, 'wb') as f:
        pickle.dump(sequences, f)
    return sequences


# Load metadata
print("Loading corpus metadata...")
reading_df, interview_df = load_androids_corpus(BASE_DATA_PATH, verbose=False)

# One row per participant, taken from the reading-task metadata.
# NOTE(review): participants that appear only in interview_df get no row here;
# whether the training function drops them or fails is not visible from this
# notebook -- TODO confirm.
participant_metadata = reading_df[['unique_participant_id', 'label', 'fold']].drop_duplicates().reset_index(drop=True)

# Load (or extract and cache) both clip-level sequence sets in this cell so a
# fresh Restart & Run All does not depend on any later cell.
reading_sequences = _load_or_extract_sequences(SEQUENCES_READING_PATH, reading_df, "Reading task")
interview_clip_sequences = _load_or_extract_sequences(SEQUENCES_INTERVIEW_CLIPS_PATH, interview_df, "Interview clip")
print(f"\nLoaded {len(reading_sequences)} Reading sequences and {len(interview_clip_sequences)} Interview clip sequences.")

# Aggregate interview clips into session-level sequences using the util function
interview_session_sequences = aggregate_interview_sequences(interview_clip_sequences, interview_df)
print(f"Aggregated clips for {len(interview_session_sequences)} participants.")

# Remap reading sequences to be keyed by participant ID for consistency.
# Sequences whose filename is absent from the metadata are left out.
reading_participant_map = reading_df.set_index('filename')['unique_participant_id']
reading_session_sequences = {
    reading_participant_map[fname]: seq
    for fname, seq in reading_sequences.items()
    if fname in reading_participant_map.index
}

# Create combined sequences by stacking reading rows on top of interview rows.
# Only participants that have both a reading and an interview sequence are kept.
combined_session_sequences = {}
for participant_id in tqdm(participant_metadata['unique_participant_id'], desc="Creating Combined Sequences"):
    reading_seq = reading_session_sequences.get(participant_id)
    interview_seq = interview_session_sequences.get(participant_id)
    if reading_seq is not None and interview_seq is not None:
        combined_session_sequences[participant_id] = np.vstack([reading_seq, interview_seq])

# Store all prepared sequence sets in a final dictionary
sequence_sets = {
    'reading': reading_session_sequences,
    'interview': interview_session_sequences,
    'combined': combined_session_sequences
}
for set_name, set_sequences in sequence_sets.items():
    print(f"  {set_name}: {len(set_sequences)} participants")
print("\n--- All sequence datasets are now prepared and ready for training ---")

Loading corpus metadata...
Successfully loaded 112 Read task and 116 Interview task fold assignments.
Interview clip sequences already exist. Loading from file.

Loaded 111 Reading sequences and 857 Interview clip sequences.

Aggregating interview clips into single sequences per participant...


Aggregating Sequences: 0it [00:00, ?it/s]

Aggregated clips for 114 participants.


Creating Combined Sequences:   0%|          | 0/111 [00:00<?, ?it/s]


--- All sequence datasets are now prepared and ready for training ---


In [None]:
# Load or Extract Reading Task Sequences
# Makes sure the cached reading-task embeddings exist (extracting them when the
# file is missing), then loads them and prints a quick shape check.

# Define the path to the reading task sequences file
SEQUENCES_READING_PATH = '../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl'

# Check if the file exists before running extraction
if not os.path.exists(SEQUENCES_READING_PATH):
    print("Reading task sequences not found. Running extraction...")

    # Extraction needs the reading-task metadata frame created by an earlier cell
    if 'reading_df' in globals():
        print("\nExtracting sequential embeddings for the Reading Task...")

        # Call the extractor function
        reading_sequences = extract_wav2vec2_sequences(reading_df)

        # Save the new sequences to the pickle file (skipped when nothing was extracted)
        if reading_sequences:
            print(f"Saving new reading task sequences to: {SEQUENCES_READING_PATH}")
            with open(SEQUENCES_READING_PATH, 'wb') as f:
                pickle.dump(reading_sequences, f)
            print("Extraction and saving complete.")
    else:
        print("ERROR: 'reading_df' not found. Cannot run extraction.")
else:
    print(f"Reading task sequences already exist. Loading from file: {SEQUENCES_READING_PATH}")

# Load the data regardless of whether it was just created or already existed.
# Only the file access sits inside the try, so unrelated errors are not masked.
try:
    with open(SEQUENCES_READING_PATH, 'rb') as f:
        reading_sequences = pickle.load(f)
except FileNotFoundError:
    print(f"ERROR: Could not load reading sequences from {SEQUENCES_READING_PATH}")
else:
    print(f"\nSuccessfully loaded data for {len(reading_sequences)} reading files.")

    # Final Verification: an empty cache has no first entry to inspect
    if reading_sequences:
        first_filename, first_sequence = next(iter(reading_sequences.items()))
        print(f"Verified sequence shape for '{first_filename}': {first_sequence.shape}")
    else:
        print("WARNING: The reading sequences file is empty; nothing to verify.")

Reading task sequences already exist. Loading from file: ../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl

Successfully loaded data for 111 reading files.
Verified sequence shape for '01_CF56_1.wav': (4378, 768)


In [None]:
# Run CNN-LSTM Experiments for All Data Types
# For every prepared sequence set, either reuse the cached result pickle or run
# cross-validated training and cache the outcome. Everything ends up in `dl_results`.
# `run_pytorch_cv_with_early_stopping` comes from the notebook's import cell.

# Dictionary to store the results of the DL experiments
dl_results = {}

# Define experiment parameters
N_EPOCHS = 50
PATIENCE = 10
BATCH_SIZE = 8
LEARNING_RATE = 0.0001
# The 'participant_metadata' DataFrame should be loaded from a previous cell.

# Loop through the three prepared sequence sets
for name, seq_dict in sequence_sets.items():
    experiment_name = f'wav2vec2_cnn_lstm_{name}'
    # The cache key is the experiment name only: after changing any of the
    # parameters above, delete the pickle to force a re-run.
    results_save_path = f'../data/Processed_Features/results_{experiment_name}.pkl'

    if os.path.exists(results_save_path):
        print(f"\nLoading pre-computed results for {experiment_name.upper()}")
        with open(results_save_path, 'rb') as f:
            dl_results[experiment_name] = pickle.load(f)
        continue

    if not seq_dict:
        # Nothing to train on (e.g. no participant has both tasks in 'combined')
        print(f"\nSkipping {experiment_name.upper()}: no sequences available for this data type.")
        continue

    print(f"\n--- Running experiment: {experiment_name.upper()} ---")

    # Call training function, handles the data alignment internally.
    # Pass the full participant_metadata DataFrame for reliable label lookup.
    results_df, predictions = run_pytorch_cv_with_early_stopping(
        sequences_dict=seq_dict,
        metadata_df=participant_metadata,
        epochs=N_EPOCHS,
        patience=PATIENCE,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE
    )

    # Save results. Write to a temporary file and rename it into place, so an
    # interrupted dump never leaves a truncated pickle that the existence check
    # above would later treat as a valid cache.
    results_to_save = {'results_df': results_df, 'predictions': predictions}
    tmp_save_path = results_save_path + '.tmp'
    with open(tmp_save_path, 'wb') as f:
        pickle.dump(results_to_save, f)
    os.replace(tmp_save_path, results_save_path)
    print(f"Results saved to {results_save_path}")
    dl_results[experiment_name] = results_to_save

print("\n--- All Deep Learning experiments are now complete! ---")


--- Running experiment: WAV2VEC2_CNN_LSTM_READING ---


Running 5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  > Early stopping triggered at epoch 34
  > Early stopping triggered at epoch 19
  > Early stopping triggered at epoch 33
  > Early stopping triggered at epoch 28
  > Early stopping triggered at epoch 38
Results saved to ../data/Processed_Features/results_wav2vec2_cnn_lstm_reading.pkl

--- Running experiment: WAV2VEC2_CNN_LSTM_INTERVIEW ---


Running 5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  > Early stopping triggered at epoch 44
  > Early stopping triggered at epoch 37
  > Early stopping triggered at epoch 48
  > Early stopping triggered at epoch 31
Results saved to ../data/Processed_Features/results_wav2vec2_cnn_lstm_interview.pkl

--- Running experiment: WAV2VEC2_CNN_LSTM_COMBINED ---


Running 5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  > Early stopping triggered at epoch 35
  > Early stopping triggered at epoch 44
  > Early stopping triggered at epoch 35
  > Early stopping triggered at epoch 49
Results saved to ../data/Processed_Features/results_wav2vec2_cnn_lstm_combined.pkl

--- All Deep Learning experiments are now complete! ---


In [None]:
# Final Analysis - Compare SVM and DL Models
# Merges the SVM results saved by notebook 03 (when that pickle exists) with the
# DL results from this notebook and renders one summary table of fold statistics.

# SVM results must have been saved to this pickle file by notebook 03
SVM_RESULTS_PATH = '../data/Processed_Features/all_svm_results.pkl'
if not os.path.exists(SVM_RESULTS_PATH):
    print("Warning: SVM results file not found. Final comparison will only show DL models.")
    all_results = {}
else:
    with open(SVM_RESULTS_PATH, 'rb') as svm_file:
        all_results = pickle.load(svm_file)

# Add the new DL results to the main dictionary
all_results.update(dl_results)

# The plotting and analysis cells from notebook 03 can be re-run on `all_results`;
# here only the final summary table is generated.

# (column in each results_df, label used in the summary header)
METRIC_LABELS = (
    ('f1_score', 'F1-Score'),
    ('auc', 'AUC'),
    ('accuracy', 'Accuracy'),
)

final_summary_data = []
for exp_key, exp_payload in all_results.items():
    fold_scores = exp_payload['results_df']
    summary_row = {'Experiment': exp_key}
    for metric_column, metric_label in METRIC_LABELS:
        summary_row[f'Mean {metric_label}'] = fold_scores[metric_column].mean()
        summary_row[f'Std Dev {metric_label}'] = fold_scores[metric_column].std()
    final_summary_data.append(summary_row)

final_summary_df = pd.DataFrame(final_summary_data).set_index('Experiment')

# Means and standard deviations are shaded with opposite colour maps
mean_columns = [col for col in final_summary_df.columns if 'Mean' in col]
std_columns = [col for col in final_summary_df.columns if 'Std Dev' in col]

styled_summary = final_summary_df.style.background_gradient(cmap='viridis', subset=mean_columns, axis=0)
styled_summary = styled_summary.background_gradient(cmap='viridis_r', subset=std_columns, axis=0)
styled_summary = styled_summary.format("{:.3f}")
display(styled_summary)



Unnamed: 0_level_0,Mean F1-Score,Std Dev F1-Score,Mean AUC,Std Dev AUC,Mean Accuracy,Std Dev Accuracy
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
wav2vec2_cnn_lstm_reading,0.683,0.063,0.77,0.079,0.694,0.059
wav2vec2_cnn_lstm_interview,0.739,0.075,0.815,0.076,0.744,0.074
wav2vec2_cnn_lstm_combined,0.71,0.124,0.809,0.086,0.717,0.124
