In [1]:
import os
import sys
import pickle
import numpy as np

In [2]:
# Add the parent of the current working directory to sys.path so that the
# `src` package can be imported from this notebook.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.data_loader import load_androids_corpus
from src.foundation_model_extractor import extract_wav2vec2_sequences

# Load the data manifest to get the filepaths.
# The corpus location defaults to the original machine-specific path; set the
# ANDROIDS_CORPUS_PATH environment variable to run the notebook elsewhere
# without editing this cell.
BASE_DATA_PATH = os.environ.get('ANDROIDS_CORPUS_PATH', 'E:/Dissertation_Data/Androids-Corpus')
print("Loading corpus metadata...")
# Only the first returned object is used in this notebook.
# NOTE(review): the second return value is presumably the interview-task
# metadata (the loader reports both Read and Interview counts) -- confirm.
reading_df, _ = load_androids_corpus(BASE_DATA_PATH, verbose=False)

Loading corpus metadata...
Successfully loaded 112 Read task and 116 Interview task fold assignments.


In [None]:
# Extract and Save Wav2Vec2 Sequences
#
# Runs the (slow) sequence extraction once and caches the result as a pickle.
# On later runs the cached file is detected and the extraction is skipped.

MODEL_NAME = "facebook/wav2vec2-base-960h"
SEQUENCES_OUTPUT_PATH = '../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl'

# Check if the sequences file already exists to save time
if not os.path.exists(SEQUENCES_OUTPUT_PATH):
    print(f"Extracting sequences from {MODEL_NAME}...")

    # NOTE(review): MODEL_NAME is only used in the message above; it is not
    # passed to the extractor, so the model actually used is whatever
    # extract_wav2vec2_sequences defaults to -- confirm they agree.
    sequences_dict = extract_wav2vec2_sequences(reading_df)

    if sequences_dict:
        # Create the output directory first so the extraction result is not
        # lost to a FileNotFoundError when the cache file is opened.
        os.makedirs(os.path.dirname(SEQUENCES_OUTPUT_PATH), exist_ok=True)
        print(f"Saving sequence data to: {SEQUENCES_OUTPUT_PATH}")
        with open(SEQUENCES_OUTPUT_PATH, 'wb') as f:
            pickle.dump(sequences_dict, f)
        print("Save complete.")
    else:
        # Previously this case was silent, leaving no cache and no explanation.
        print("No sequences were extracted; nothing was saved.")
else:
    print(f"Sequence data already exists at: {SEQUENCES_OUTPUT_PATH}")

Sequence data already exists at: ../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl


In [None]:
# Verify the Saved Pickle File
#
# Reloads the cached sequences and prints the type and shape of one entry as
# a quick sanity check of the saved format.

if os.path.exists(SEQUENCES_OUTPUT_PATH):
    print(f"Loading saved sequence data from: {SEQUENCES_OUTPUT_PATH}")
    # pickle.load can execute arbitrary code; only load files produced by
    # this project, never files from untrusted sources.
    with open(SEQUENCES_OUTPUT_PATH, 'rb') as f:
        loaded_sequences = pickle.load(f)

    print(f"\nSuccessfully loaded data for {len(loaded_sequences)} files.")

    # Take the first key without materialising the whole key list; None means
    # the dictionary is empty and there is nothing to inspect.
    first_filename = next(iter(loaded_sequences), None)
    if first_filename is None:
        print("The saved sequences file contains no entries; nothing to inspect.")
    else:
        first_sequence = loaded_sequences[first_filename]

        print(f"\nExample -- File: '{first_filename}'")
        print(f"  - Sequence Type: {type(first_sequence)}")
        print(f"  - Sequence Shape: {first_sequence.shape}")
        print("This shape means [Number of Time Steps, Embedding Dimension], which is the correct format for an LSTM/CNN.")
else:
    print("Could not find the saved sequences file to verify.")

Loading saved sequence data from: ../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl

Successfully loaded data for 111 files.

Example -- File: '01_CF56_1.wav'
  - Sequence Type: <class 'numpy.ndarray'>
  - Sequence Shape: (3545, 768)
This shape means [Number of Time Steps, Embedding Dimension], which is the correct format for an LSTM/CNN.


In [None]:
import pandas as pd
import os
import sys
import pickle
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_curve, auc

# Setup Paths and Load Modules

# Make the parent of the current working directory importable so that the
# `src` package resolves from inside this notebook.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Project helpers: the corpus metadata loader and the deep-learning (dl)
# cross-validation routine with early stopping.
from src.data_loader import load_androids_corpus
from src.dl_cv_strategies import run_pytorch_cv_with_early_stopping

# Global plotting defaults: white grid theme, slightly larger fonts, wide figures.
sns.set(font_scale=1.1, style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 7)})

In [None]:
# Load Data
#
# Loads the cached wav2vec2 sequences and the corpus metadata. Both variables
# are initialised to None so that later cells can detect a failed load.

# Path to the saved sequence data
SEQUENCES_PATH = '../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl'

# Need the original metadata to get labels and participant IDs.
# Defaults to the original machine-specific location; set the
# ANDROIDS_CORPUS_PATH environment variable to run the notebook elsewhere.
BASE_DATA_PATH = os.environ.get('ANDROIDS_CORPUS_PATH', 'E:/Dissertation_Data/Androids-Corpus')

sequences_dict = None
metadata_df = None

try:
    print(f"Loading sequence data from: {SEQUENCES_PATH}")
    # pickle.load can execute arbitrary code; only load caches written by
    # this project's own extraction step.
    with open(SEQUENCES_PATH, 'rb') as f:
        sequences_dict = pickle.load(f)
    print(f"Successfully loaded {len(sequences_dict)} sequences.")

    # Load the metadata using the data loader (only the first returned object is used)
    print(f"Loading metadata from: {BASE_DATA_PATH}")
    metadata_df, _ = load_androids_corpus(BASE_DATA_PATH, verbose=False)
    print("Metadata loaded successfully.")
    # NOTE(review): the recorded run shows 111 sequences against 112 Read-task
    # fold assignments; presumably the CV routine aligns the two -- confirm.

except FileNotFoundError as e:
    # Only a missing file is handled here; sequences_dict / metadata_df keep
    # whatever was assigned before the failure (None if nothing loaded).
    print(f"ERROR: A required data file was not found. Please ensure you have run the sequence extraction.\n  - {e}")

Loading sequence data from: ../data/Processed_Features/features_wav2vec2_sequences_reading_task.pkl
Successfully loaded 111 sequences.
Loading metadata from: E:/Dissertation_Data/Androids-Corpus
Successfully loaded 112 Read task and 116 Interview task fold assignments.
Metadata loaded successfully.


In [None]:
# Run CNN-LSTM Experiment with Early Stopping
#
# Loads cached results when they exist; otherwise runs the cross-validation,
# caches the results, and finally displays the per-fold results table.

from src.dl_cv_strategies import run_pytorch_cv_with_early_stopping

# Define experiment parameters
N_EPOCHS = 50
PATIENCE = 10       # Stop if validation loss doesn't improve for 10 epochs
BATCH_SIZE = 8
LEARNING_RATE = 0.0001

# Define path for saving results
RESULTS_SAVE_PATH = '../data/Processed_Features/results_wav2vec2_cnn_lstm_early_stopping.pkl'

# Reset the result variables so that a failed run can neither raise NameError
# at the display step nor show results left over from an earlier execution.
cnn_lstm_results_df = None
cnn_lstm_predictions = None

if os.path.exists(RESULTS_SAVE_PATH):
    print(f"Loading pre-computed CNN-LSTM results from {RESULTS_SAVE_PATH}")
    # pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open(RESULTS_SAVE_PATH, 'rb') as f:
        loaded_results = pickle.load(f)
    cnn_lstm_results_df = loaded_results['results_df']
    cnn_lstm_predictions = loaded_results['predictions']
else:
    print("Running CNN-LSTM CV with Early Stopping. This will take a while...")

    if sequences_dict is not None and metadata_df is not None:
        cnn_lstm_results_df, cnn_lstm_predictions = run_pytorch_cv_with_early_stopping(
            sequences_dict=sequences_dict,
            metadata_df=metadata_df,
            epochs=N_EPOCHS,
            patience=PATIENCE,
            batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE
        )

        # Save the results so the experiment does not have to be re-run
        results_to_save = {
            'results_df': cnn_lstm_results_df,
            'predictions': cnn_lstm_predictions
        }
        with open(RESULTS_SAVE_PATH, 'wb') as f:
            pickle.dump(results_to_save, f)
        print(f"CNN-LSTM results saved to {RESULTS_SAVE_PATH}")
    else:
        print("Data not loaded. Cannot run experiment.")

# Display the results for this experiment (only if this cell produced any)
print("\n--- CNN-LSTM CV with Early Stopping Results ---")
if cnn_lstm_results_df is not None:
    display(cnn_lstm_results_df)
else:
    print("Results could not be loaded or generated.")

Running CNN-LSTM CV with Early Stopping. This will take a while...


Running 5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  > Early stopping triggered at epoch 33
  > Early stopping triggered at epoch 28
  > Early stopping triggered at epoch 34
  > Early stopping triggered at epoch 26
  > Early stopping triggered at epoch 39
CNN-LSTM results saved to ../data/Processed_Features/results_wav2vec2_cnn_lstm_early_stopping.pkl

--- CNN-LSTM CV with Early Stopping Results ---


Unnamed: 0,fold,accuracy,f1_score,precision,recall,auc
0,1,0.73913,0.734615,0.746032,0.734848,0.757576
1,2,0.772727,0.772257,0.775,0.772727,0.801653
2,3,0.727273,0.725,0.735043,0.727273,0.735537
3,4,0.681818,0.681159,0.683333,0.681818,0.793388
4,5,0.681818,0.675789,0.728571,0.7,0.875


In [None]:
# CNN-LSTM experiment whose results are cached under the "attention" file name.
#
# Loads cached results when they exist; otherwise runs the cross-validation,
# caches the results, and finally displays the per-fold results table.

from src.dl_cv_strategies import run_pytorch_cv_with_early_stopping

# Define experiment parameters
N_EPOCHS = 50
PATIENCE = 10       # Stop if validation loss doesn't improve for 10 epochs
BATCH_SIZE = 8
LEARNING_RATE = 0.0001

# Define path for saving results
RESULTS_SAVE_PATH = '../data/Processed_Features/results_wav2vec2_cnn_lstm_attention.pkl'

# Reset the result variables so that a failed run cannot display the results
# of a different experiment (these names are shared with other cells).
cnn_lstm_results_df = None
cnn_lstm_predictions = None

if os.path.exists(RESULTS_SAVE_PATH):
    print(f"Loading pre-computed CNN-LSTM results from {RESULTS_SAVE_PATH}")
    # pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open(RESULTS_SAVE_PATH, 'rb') as f:
        loaded_results = pickle.load(f)
    cnn_lstm_results_df = loaded_results['results_df']
    cnn_lstm_predictions = loaded_results['predictions']
else:
    print("Running CNN-LSTM CV with Early Stopping. This will take a while...")

    if sequences_dict is not None and metadata_df is not None:
        # NOTE(review): this call passes the same arguments as the other
        # CNN-LSTM experiment cells in this notebook; whatever distinguishes
        # the "attention" variant is not visible here -- presumably the model
        # inside src.dl_cv_strategies was changed between runs. Confirm,
        # otherwise this result cannot be reproduced from the notebook alone.
        cnn_lstm_results_df, cnn_lstm_predictions = run_pytorch_cv_with_early_stopping(
            sequences_dict=sequences_dict,
            metadata_df=metadata_df,
            epochs=N_EPOCHS,
            patience=PATIENCE,
            batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE
        )

        # Save the results so the experiment does not have to be re-run
        results_to_save = {
            'results_df': cnn_lstm_results_df,
            'predictions': cnn_lstm_predictions
        }
        with open(RESULTS_SAVE_PATH, 'wb') as f:
            pickle.dump(results_to_save, f)
        print(f"CNN-LSTM results saved to {RESULTS_SAVE_PATH}")
    else:
        print("Data not loaded. Cannot run experiment.")

# Display the results for this experiment (only if this cell produced any)
print("\n--- CNN-LSTM CV with Early Stopping Results ---")
if cnn_lstm_results_df is not None:
    display(cnn_lstm_results_df)
else:
    print("Results could not be loaded or generated.")

Running CNN-LSTM CV with Early Stopping. This will take a while...


Running 5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  > Early stopping triggered at epoch 46
  > Early stopping triggered at epoch 26
  > Early stopping triggered at epoch 27
  > Early stopping triggered at epoch 22
  > Early stopping triggered at epoch 30
CNN-LSTM results saved to ../data/Processed_Features/results_wav2vec2_cnn_lstm_attention.pkl

--- CNN-LSTM CV with Early Stopping Results ---


Unnamed: 0,fold,accuracy,f1_score,precision,recall,auc
0,1,0.73913,0.738636,0.746154,0.742424,0.75
1,2,0.590909,0.568627,0.614583,0.590909,0.85124
2,3,0.818182,0.816667,0.82906,0.818182,0.809917
3,4,0.818182,0.816667,0.82906,0.818182,0.867769
4,5,0.590909,0.590062,0.602564,0.6,0.691667


In [None]:
# CNN-LSTM experiment whose results are cached under the "attention_swish" file name.
#
# Loads cached results when they exist; otherwise runs the cross-validation,
# caches the results, and finally displays the per-fold results table.

from src.dl_cv_strategies import run_pytorch_cv_with_early_stopping

# Define experiment parameters
N_EPOCHS = 50
PATIENCE = 10       # Stop if validation loss doesn't improve for 10 epochs
BATCH_SIZE = 8
LEARNING_RATE = 0.0001

# Define path for saving results
RESULTS_SAVE_PATH = '../data/Processed_Features/results_wav2vec2_cnn_lstm_attention_swish.pkl'

# Reset the result variables so that a failed run cannot display the results
# of a different experiment (these names are shared with other cells).
cnn_lstm_results_df = None
cnn_lstm_predictions = None

if os.path.exists(RESULTS_SAVE_PATH):
    print(f"Loading pre-computed CNN-LSTM results from {RESULTS_SAVE_PATH}")
    # pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open(RESULTS_SAVE_PATH, 'rb') as f:
        loaded_results = pickle.load(f)
    cnn_lstm_results_df = loaded_results['results_df']
    cnn_lstm_predictions = loaded_results['predictions']
else:
    print("Running CNN-LSTM CV with Early Stopping. This will take a while...")

    if sequences_dict is not None and metadata_df is not None:
        # NOTE(review): this call passes the same arguments as the other
        # CNN-LSTM experiment cells in this notebook; whatever distinguishes
        # the "attention_swish" variant is not visible here -- presumably the
        # model inside src.dl_cv_strategies was changed between runs. Confirm,
        # otherwise this result cannot be reproduced from the notebook alone.
        cnn_lstm_results_df, cnn_lstm_predictions = run_pytorch_cv_with_early_stopping(
            sequences_dict=sequences_dict,
            metadata_df=metadata_df,
            epochs=N_EPOCHS,
            patience=PATIENCE,
            batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE
        )

        # Save the results so the experiment does not have to be re-run
        results_to_save = {
            'results_df': cnn_lstm_results_df,
            'predictions': cnn_lstm_predictions
        }
        with open(RESULTS_SAVE_PATH, 'wb') as f:
            pickle.dump(results_to_save, f)
        print(f"CNN-LSTM results saved to {RESULTS_SAVE_PATH}")
    else:
        print("Data not loaded. Cannot run experiment.")

# Display the results for this experiment (only if this cell produced any)
print("\n--- CNN-LSTM CV with Early Stopping Results ---")
if cnn_lstm_results_df is not None:
    display(cnn_lstm_results_df)
else:
    print("Results could not be loaded or generated.")

Running CNN-LSTM CV with Early Stopping. This will take a while...


Running 5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  > Early stopping triggered at epoch 24
  > Early stopping triggered at epoch 32
  > Early stopping triggered at epoch 27
  > Early stopping triggered at epoch 28
  > Early stopping triggered at epoch 16
CNN-LSTM results saved to ../data/Processed_Features/results_wav2vec2_cnn_lstm_attention_swish.pkl

--- CNN-LSTM CV with Early Stopping Results ---


Unnamed: 0,fold,accuracy,f1_score,precision,recall,auc
0,1,0.869565,0.868571,0.892857,0.875,0.810606
1,2,0.772727,0.772257,0.775,0.772727,0.801653
2,3,0.681818,0.681159,0.683333,0.681818,0.719008
3,4,0.772727,0.768421,0.794643,0.772727,0.876033
4,5,0.636364,0.636364,0.641667,0.641667,0.783333


In [None]:
# CNN-LSTM experiment with a data-augmentation pipeline passed to the CV routine.
#
# Loads cached results when they exist; otherwise runs the cross-validation,
# caches the results, and finally displays the per-fold results table.

from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch

# Define an augmentation pipeline; each transform is applied with probability 0.5
apply_augmentations = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
])
# NOTE(review): these audiomentations transforms are designed for raw audio
# samples, whereas sequences_dict holds pre-extracted wav2vec2 embedding
# sequences (the verification cell reports arrays of shape (time steps, 768)).
# Confirm how run_pytorch_cv_with_early_stopping applies `augmentations`;
# the recorded run of this cell falls to near-chance accuracy on several folds.

# Define experiment parameters
N_EPOCHS = 50
PATIENCE = 10
BATCH_SIZE = 8
LEARNING_RATE = 0.0001

# Define a new save path for this experiment
AUGMENTED_RESULTS_SAVE_PATH = '../data/Processed_Features/results_wav2vec2_cnn_lstm_attention_swish_aug.pkl'

# Reset the result variables so that a failed run cannot display results left
# over from an earlier execution of this cell.
cnn_lstm_aug_results_df = None
cnn_lstm_aug_predictions = None

# Run the Experiment
# Check if the results file already exists to save time
if os.path.exists(AUGMENTED_RESULTS_SAVE_PATH):
    print(f"Loading pre-computed augmented results from {AUGMENTED_RESULTS_SAVE_PATH}")
    # pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open(AUGMENTED_RESULTS_SAVE_PATH, 'rb') as f:
        loaded_results = pickle.load(f)
    cnn_lstm_aug_results_df = loaded_results['results_df']
    cnn_lstm_aug_predictions = loaded_results['predictions']
else:
    print("Running CNN-LSTM CV with Data Augmentation...")

    # The data-loading cell initialises both variables to None, so checking
    # only that the names exist would let a failed load through. Require real
    # values; globals().get also covers the case where that cell never ran.
    data_ready = (
        globals().get('sequences_dict') is not None
        and globals().get('metadata_df') is not None
    )
    if data_ready:
        # Call the training function, passing the augmentation pipeline as the extra argument
        cnn_lstm_aug_results_df, cnn_lstm_aug_predictions = run_pytorch_cv_with_early_stopping(
            sequences_dict=sequences_dict,
            metadata_df=metadata_df,
            epochs=N_EPOCHS,
            patience=PATIENCE,
            batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE,
            augmentations=apply_augmentations # Pass the pipeline to the function
        )

        # Save the results to a pickle file for later analysis
        results_to_save = {
            'results_df': cnn_lstm_aug_results_df,
            'predictions': cnn_lstm_aug_predictions
        }
        with open(AUGMENTED_RESULTS_SAVE_PATH, 'wb') as f:
            pickle.dump(results_to_save, f)
        print(f"Augmented results saved to {AUGMENTED_RESULTS_SAVE_PATH}")
    else:
        print("Data (sequences_dict or metadata_df) not loaded. Please run the data loading cell first.")

# Display the results for this experiment (only if this cell produced any)
print("\n--- CNN-LSTM CV with Augmentation Results ---")
if cnn_lstm_aug_results_df is not None:
    display(cnn_lstm_aug_results_df)
else:
    print("Results could not be loaded or generated.")

Running CNN-LSTM CV with Data Augmentation...


Running 5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s]

  > Early stopping triggered at epoch 21
  > Early stopping triggered at epoch 29
  > Early stopping triggered at epoch 17
  > Early stopping triggered at epoch 19
  > Early stopping triggered at epoch 32
Augmented results saved to ../data/Processed_Features/results_wav2vec2_cnn_lstm_attention_swish_aug.pkl

--- CNN-LSTM CV with Augmentation Results ---


Unnamed: 0,fold,accuracy,f1_score,precision,recall,auc
0,1,0.521739,0.342857,0.26087,0.5,0.613636
1,2,0.636364,0.633333,0.641026,0.636364,0.619835
2,3,0.5,0.333333,0.25,0.5,0.561983
3,4,0.5,0.333333,0.25,0.5,0.429752
4,5,0.681818,0.664488,0.690476,0.666667,0.708333
