In [1]:
import os
import sys
import pandas as pd

In [2]:
# Make the project root importable so that the 'src' package can be found.
# Notebooks live in 'notebooks/' and the hand-written modules live in 'src/',
# so it is the parent of the working directory that goes onto sys.path.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Import data loader and feature extraction functions from modules
from src.data_loader import load_androids_corpus

from src.mshds_extractor import extract_mshds_features
from src.opensmile_extractor import extract_opensmile_features
from src.foundation_model_extractor import extract_wav2vec2_embeddings

In [3]:
# Location of the Androids Corpus on disk.
# Set the ANDROIDS_CORPUS_PATH environment variable to point at a local copy
# of the dataset; when it is unset, the original default location is used.
BASE_CORPUS_PATH = os.environ.get('ANDROIDS_CORPUS_PATH', 'E:/Dissertation_Data/Androids-Corpus')

# Load the corpus; the loader's two results are unpacked into the reading-task
# and interview-task DataFrames used by the rest of the notebook.
print("Loading Androids Corpus...")
reading_df, interview_df = load_androids_corpus(BASE_CORPUS_PATH)

Loading Androids Corpus...
Successfully loaded 112 Read task and 116 Interview task fold assignments.

Processing Reading Task from: E:\Dissertation_Data\Androids-Corpus\Reading-Task\audio
Processed 111 files from Reading-Task.

Processing Interview Task clips from: E:\Dissertation_Data\Androids-Corpus\Interview-Task\audio_clip
Processed 866 clip files from Interview-Task (audio_clip).

--- Data Loading Complete ---


In [4]:
# Preview the reading-task DataFrame: its shape followed by the first rows
print("\n--- Reading Task Data ---")
if reading_df.empty:
    print("Reading DataFrame is empty.")
else:
    print(f"Shape: {reading_df.shape}")
    print(reading_df.head())


--- Reading Task Data ---
Shape: (111, 10)
  unique_participant_id original_id_nn    label  gender  age  education  \
0                  01_C             01  Control  Female   56          1   
1                  02_C             02  Control    Male   57          2   
2                  03_C             03  Control  Female   30          3   
3                  04_C             04  Control  Female   57          3   
4                  05_C             05  Control  Female   41          3   

                                            filepath       filename task_type  \
0  E:/Dissertation_Data/Androids-Corpus\Reading-T...  01_CF56_1.wav   Reading   
1  E:/Dissertation_Data/Androids-Corpus\Reading-T...  02_CM57_2.wav   Reading   
2  E:/Dissertation_Data/Androids-Corpus\Reading-T...  03_CF30_3.wav   Reading   
3  E:/Dissertation_Data/Androids-Corpus\Reading-T...  04_CF57_3.wav   Reading   
4  E:/Dissertation_Data/Androids-Corpus\Reading-T...  05_CF41_3.wav   Reading   

   fold  
0     1 

In [5]:
# Preview the interview-task DataFrame: its shape followed by the first rows
print("\n--- Interview Task Data ---")
if interview_df.empty:
    print("Interview DataFrame is empty.")
else:
    print(f"Shape: {interview_df.shape}")
    print(interview_df.head())


--- Interview Task Data ---
Shape: (866, 11)
  unique_participant_id original_id_nn    label  gender  age  education  \
0                  01_C             01  Control  Female   56          1   
1                  01_C             01  Control  Female   56          1   
2                  01_C             01  Control  Female   56          1   
3                  01_C             01  Control  Female   56          1   
4                  01_C             01  Control  Female   56          1   

                                            filepath          filename  \
0  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_1.wav   
1  E:/Dissertation_Data/Androids-Corpus\Interview...  01_CF56_1_10.wav   
2  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_2.wav   
3  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_3.wav   
4  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_4.wav   

  original_session_filename       task_type  fold  
0     

In [6]:
# Take the first five reading-task rows as an independent copy for quick tests
small_reading_df = reading_df.iloc[:5].copy()
test_filenames = small_reading_df['filename'].tolist()
print("Created a small test DataFrame with 5 files:")
print(test_filenames)

Created a small test DataFrame with 5 files:
['01_CF56_1.wav', '02_CM57_2.wav', '03_CF30_3.wav', '04_CF57_3.wav', '05_CF41_3.wav']


In [10]:
# Smoke test: run the MSHDS extractor on the 5-file subset (small_reading_df)
# before the full Reading Task set is processed further down.
print("\nExtracting MSHDS features for the small test set...")
# verbose=True presumably enables the progress bar seen in the cell output —
# confirm against src.mshds_extractor.
test_features_df = extract_mshds_features(small_reading_df, verbose=True)
print("Extraction complete.")


Extracting MSHDS features for the small test set...


Extracting MSHDS Features:   0%|          | 0/5 [00:00<?, ?it/s]

Extraction complete.


In [11]:
# Sanity-check the extractor output: 5 rows x 26 columns are expected
# (25 features plus the filename column).
print(f"\nShape of the output feature DataFrame: {test_features_df.shape}")

# Count missing values per column. Every count should be zero; a non-zero
# count means some files had errors during extraction.
print("\nNumber of missing values per feature:")
missing_per_feature = test_features_df.isnull().sum()
print(missing_per_feature)

# Render the feature table so the values can be inspected visually
print("\nHead of the feature DataFrame:")
display(test_features_df)


Shape of the output feature DataFrame: (5, 26)

Number of missing values per feature:
filename                    0
Speaking_Rate               0
Articulation_Rate           0
Phonation_Ratio             0
Pause_Rate                  0
Mean_Pause_Duration         0
mean_F0                     0
stdev_F0_Semitone           0
mean_dB                     0
range_ratio_dB              0
HNR_dB                      0
Spectral_Slope              0
Spectral_Tilt               0
Cepstral_Peak_Prominence    0
mean_F1_Loc                 0
std_F1_Loc                  0
mean_B1_Loc                 0
std_B1_Loc                  0
mean_F2_Loc                 0
std_F2_Loc                  0
mean_B2_Loc                 0
std_B2_Loc                  0
Spectral_Gravity            0
Spectral_Std_Dev            0
Spectral_Skewness           0
Spectral_Kurtosis           0
dtype: int64

Head of the feature DataFrame:


Unnamed: 0,filename,Speaking_Rate,Articulation_Rate,Phonation_Ratio,Pause_Rate,Mean_Pause_Duration,mean_F0,stdev_F0_Semitone,mean_dB,range_ratio_dB,...,mean_B1_Loc,std_B1_Loc,mean_F2_Loc,std_F2_Loc,mean_B2_Loc,std_B2_Loc,Spectral_Gravity,Spectral_Std_Dev,Spectral_Skewness,Spectral_Kurtosis
0,01_CF56_1.wav,3.221843,4.078672,0.789924,0.342143,0.614,135.486474,3.477464,63.951329,2.15558,...,255.16868,286.616317,1633.716904,488.536986,594.657072,716.782932,355.206497,337.788948,5.964434,70.606271
1,02_CM57_2.wav,3.402797,4.119856,0.825951,0.2534,0.686857,95.854561,3.35266,65.361857,2.016766,...,242.927026,253.067458,1614.980076,448.984263,492.187366,439.490258,363.889176,393.58486,6.691924,121.864942
2,03_CF30_3.wav,4.388955,5.117387,0.857656,0.177931,0.8,201.023993,2.08271,65.586717,2.360642,...,260.924023,288.58863,1704.68601,484.051893,662.627477,905.501959,460.918213,421.40373,5.051323,55.662546
3,04_CF57_3.wav,4.384482,5.207409,0.84197,0.259297,0.609455,173.83401,2.489167,68.230038,3.05306,...,147.257021,133.237838,1553.763156,398.784342,371.078992,485.221797,576.765021,477.447754,4.396186,39.211027
4,05_CF41_3.wav,4.676964,5.254031,0.890167,0.20116,0.546,197.308449,3.552315,69.091985,3.231895,...,179.709255,253.194893,1557.978158,543.890373,571.127516,632.10968,457.281058,344.699651,6.319892,83.131251


In [12]:
# Extract MSHDS features for the full Reading Task set, caching the result as a CSV.

# Final output path for the processed features.
MSHDS_READING_SAVE_PATH = '../data/Processed_Features/features_mshds_reading_task.csv'

# Only run the extraction when no cached feature file exists.
if not os.path.exists(MSHDS_READING_SAVE_PATH):

    print("\nExtracting MSHDS features for the full Reading Task set...")

    # Create the output directory before extracting, so a finished extraction
    # cannot be lost to a missing-folder error when the CSV is written.
    os.makedirs(os.path.dirname(MSHDS_READING_SAVE_PATH), exist_ok=True)

    # Run the feature extraction function on the reading_df DataFrame.
    mshds_reading_features_df = extract_mshds_features(reading_df)

    # Attach the features to the metadata. The left join on 'filename' keeps
    # every row of reading_df, including rows with no extracted features.
    full_reading_data = pd.merge(reading_df, mshds_reading_features_df, on='filename', how='left')

    print("\n--- Full feature extraction and merge complete. ---")
    print(f"Shape of final DataFrame: {full_reading_data.shape}")

    # Save the final DataFrame to a CSV for future use.
    print(f"\nSaving extracted features to: {MSHDS_READING_SAVE_PATH}")
    full_reading_data.to_csv(MSHDS_READING_SAVE_PATH, index=False)
    print("Done.")

else:
    print(f"\nMSHDS reading features already exist at: {MSHDS_READING_SAVE_PATH}. Skipping extraction.")
    # Load the existing file so 'full_reading_data' is available to later cells.
    # 'original_id_nn' holds zero-padded IDs such as '01'; it is read back as
    # text because type inference would turn it into integers and drop the
    # leading zeros, making the cached frame differ from a freshly built one.
    full_reading_data = pd.read_csv(MSHDS_READING_SAVE_PATH, dtype={'original_id_nn': str})
    print("Loaded existing file.")


MSHDS reading features already exist at: ../data/Processed_Features/features_mshds_reading_task.csv. Skipping extraction.
Loaded existing file.


In [13]:
# Path to the OpenSMILE executable.
# Set the OPENSMILE_EXE_PATH environment variable to point at a local install;
# when it is unset, the original default location is used.
OPENSMILE_EXE_PATH = os.environ.get('OPENSMILE_EXE_PATH', 'E:/tools/opensmile-3.0.2/bin/SMILExtract.exe')

# Path to the OpenSMILE config file, looked up inside the corpus directory
CONFIG_FILE_PATH = os.path.join(BASE_CORPUS_PATH, 'Androids.conf')

# Path for saving the processed features
OPENSMILE_OUTPUT_PATH = '../data/Processed_Features/features_opensmile_reading_task.csv'

In [14]:
# Extract OpenSMILE features for the Reading Task, or reuse the cached CSV.
if not os.path.exists(OPENSMILE_OUTPUT_PATH):
    print("\nExtracting OpenSMILE features for the Reading Task...")

    # A missing executable is fatal: stop here instead of printing a message
    # and then reporting completion with no result variable defined.
    if not os.path.exists(OPENSMILE_EXE_PATH):
        raise FileNotFoundError(f"FATAL ERROR: OpenSMILE executable not found at '{OPENSMILE_EXE_PATH}'.")

    # Create the output directory before extracting, so a finished extraction
    # cannot be lost to a missing-folder error when the CSV is written.
    os.makedirs(os.path.dirname(OPENSMILE_OUTPUT_PATH), exist_ok=True)

    opensmile_features_df = extract_opensmile_features(
        input_df=reading_df,
        opensmile_exe_path=OPENSMILE_EXE_PATH,
        config_file_path=CONFIG_FILE_PATH
    )
    if opensmile_features_df.empty:
        # Make the no-result case visible instead of skipping the save silently.
        print("WARNING: OpenSMILE extraction returned no features; nothing was saved.")
    else:
        # Left join on 'filename' keeps every row of the reading_df metadata.
        full_reading_data_opensmile = pd.merge(reading_df, opensmile_features_df, on='filename', how='left')
        print(f"Saving OpenSMILE features to: {OPENSMILE_OUTPUT_PATH}")
        full_reading_data_opensmile.to_csv(OPENSMILE_OUTPUT_PATH, index=False)
        print("OpenSMILE features saved.")
else:
    print(f"\nOpenSMILE features already exist at: {OPENSMILE_OUTPUT_PATH}. Loading from file.")
    # 'original_id_nn' holds zero-padded IDs such as '01'; it is read back as
    # text because type inference would turn it into integers and drop the
    # leading zeros.
    full_reading_data_opensmile = pd.read_csv(OPENSMILE_OUTPUT_PATH, dtype={'original_id_nn': str})

print("\n--- All feature extraction checks complete. ---")


OpenSMILE features already exist at: ../data/Processed_Features/features_opensmile_reading_task.csv. Loading from file.

--- All feature extraction checks complete. ---


In [15]:
# Model identifier and the CSV path where the Reading Task embeddings are cached.
# NOTE(review): in the visible part of this notebook MODEL_NAME is only used in
# a progress message and is not passed to extract_wav2vec2_embeddings — confirm
# that the extractor loads this same model.
MODEL_NAME = "facebook/wav2vec2-base-960h"
EMBEDDINGS_OUTPUT_PATH = '../data/Processed_Features/features_wav2vec2_reading_task.csv'

In [16]:
# Extract Wav2Vec2 embeddings for the Reading Task, or reuse the cached CSV.
if not os.path.exists(EMBEDDINGS_OUTPUT_PATH):
    print(f"\nExtracting embeddings from {MODEL_NAME} for the Reading Task...")

    # Create the output directory before extracting, so a finished extraction
    # cannot be lost to a missing-folder error when the CSV is written.
    os.makedirs(os.path.dirname(EMBEDDINGS_OUTPUT_PATH), exist_ok=True)

    # NOTE(review): MODEL_NAME is not passed to the extractor here — confirm
    # that extract_wav2vec2_embeddings uses the model named in the message.
    wav2vec2_features_df = extract_wav2vec2_embeddings(reading_df)

    if wav2vec2_features_df.empty:
        # Make the no-result case visible instead of skipping the save silently.
        print("WARNING: Wav2Vec2 extraction returned no embeddings; nothing was saved.")
    else:
        # Merge the features back with the original metadata; the left join on
        # 'filename' keeps every row of reading_df.
        full_reading_data_wav2vec2 = pd.merge(reading_df, wav2vec2_features_df, on='filename', how='left')

        print(f"Saving Wav2Vec2 embeddings to: {EMBEDDINGS_OUTPUT_PATH}")
        full_reading_data_wav2vec2.to_csv(EMBEDDINGS_OUTPUT_PATH, index=False)
        print("Embeddings saved.")
else:
    print(f"\nWav2Vec2 embeddings already exist at: {EMBEDDINGS_OUTPUT_PATH}. Loading from file.")
    # 'original_id_nn' holds zero-padded IDs such as '01'; it is read back as
    # text because type inference would turn it into integers and drop the
    # leading zeros.
    full_reading_data_wav2vec2 = pd.read_csv(EMBEDDINGS_OUTPUT_PATH, dtype={'original_id_nn': str})

print("\n--- All feature extraction tasks are complete. ---")


Wav2Vec2 embeddings already exist at: ../data/Processed_Features/features_wav2vec2_reading_task.csv. Loading from file.

--- All feature extraction tasks are complete. ---


In [None]:
# Interview Task Feature Extraction
# Banner marking the start of the interview-task section of the notebook.
print("\n--- Starting Interview Task Feature Extraction ---")

# Helper that the interview-task cells call with the clip-level features and
# interview_df in their "Aggregating clip-level features to session level" step.
from src.utils import aggregate_clip_features


--- Starting Interview Task Feature Extraction ---


In [17]:
# MSHDS features CSV for the interview task (clip-level features aggregated per session)
MSHDS_INTERVIEW_SAVE_PATH = '../data/Processed_Features/features_mshds_interview_task_aggregated.csv'
if not os.path.exists(MSHDS_INTERVIEW_SAVE_PATH):
    print("\nExtracting MSHDS clip-level features for Interview Task...")

    # Create the output directory before extracting, so a finished extraction
    # cannot be lost to a missing-folder error when the CSV is written.
    os.makedirs(os.path.dirname(MSHDS_INTERVIEW_SAVE_PATH), exist_ok=True)

    mshds_clips_df = extract_mshds_features(interview_df)

    print("Aggregating clip-level features to session level...")
    mshds_aggregated_df = aggregate_clip_features(mshds_clips_df, interview_df)

    mshds_aggregated_df.to_csv(MSHDS_INTERVIEW_SAVE_PATH, index=False)
    print(f"Saved aggregated MSHDS interview features to {MSHDS_INTERVIEW_SAVE_PATH}")
else:
    print("\nAggregated MSHDS interview features already exist. Skipping.")
    # Load the cached file so 'mshds_aggregated_df' is defined whichever
    # branch runs, as the reading-task cells do for their cached files.
    mshds_aggregated_df = pd.read_csv(MSHDS_INTERVIEW_SAVE_PATH)


Aggregated MSHDS interview features already exist. Skipping.


In [18]:
# OpenSMILE features CSV for the interview task (clip-level features aggregated per session)
OPENSMILE_INTERVIEW_SAVE_PATH = '../data/Processed_Features/features_opensmile_interview_task_aggregated.csv'
if not os.path.exists(OPENSMILE_INTERVIEW_SAVE_PATH):
    print("\nExtracting OpenSMILE clip-level features for Interview Task...")

    # Check for the executable before starting, as the reading-task OpenSMILE
    # cell does, so a bad path is reported clearly and up front.
    if not os.path.exists(OPENSMILE_EXE_PATH):
        raise FileNotFoundError(f"OpenSMILE executable not found at '{OPENSMILE_EXE_PATH}'.")

    # Create the output directory before extracting, so a finished extraction
    # cannot be lost to a missing-folder error when the CSV is written.
    os.makedirs(os.path.dirname(OPENSMILE_INTERVIEW_SAVE_PATH), exist_ok=True)

    opensmile_clips_df = extract_opensmile_features(
        input_df=interview_df,
        opensmile_exe_path=OPENSMILE_EXE_PATH,
        config_file_path=CONFIG_FILE_PATH
    )

    print("Aggregating clip-level features to session level...")
    opensmile_aggregated_df = aggregate_clip_features(opensmile_clips_df, interview_df)

    opensmile_aggregated_df.to_csv(OPENSMILE_INTERVIEW_SAVE_PATH, index=False)
    print(f"Saved aggregated OpenSMILE interview features to {OPENSMILE_INTERVIEW_SAVE_PATH}")
else:
    print("\nAggregated OpenSMILE interview features already exist. Skipping.")
    # Load the cached file so 'opensmile_aggregated_df' is defined whichever
    # branch runs, as the reading-task cells do for their cached files.
    opensmile_aggregated_df = pd.read_csv(OPENSMILE_INTERVIEW_SAVE_PATH)


Aggregated OpenSMILE interview features already exist. Skipping.


In [19]:
# Wav2Vec2 features CSV for the interview task (clip-level features aggregated per session)
WAV2VEC2_INTERVIEW_SAVE_PATH = '../data/Processed_Features/features_wav2vec2_interview_task_aggregated.csv'
if not os.path.exists(WAV2VEC2_INTERVIEW_SAVE_PATH):
    print("\nExtracting Wav2Vec2 clip-level features for Interview Task...")

    # Create the output directory before extracting, so a finished extraction
    # cannot be lost to a missing-folder error when the CSV is written.
    os.makedirs(os.path.dirname(WAV2VEC2_INTERVIEW_SAVE_PATH), exist_ok=True)

    wav2vec2_clips_df = extract_wav2vec2_embeddings(interview_df)

    print("Aggregating clip-level features to session level...")
    wav2vec2_aggregated_df = aggregate_clip_features(wav2vec2_clips_df, interview_df)

    wav2vec2_aggregated_df.to_csv(WAV2VEC2_INTERVIEW_SAVE_PATH, index=False)
    print(f"Saved aggregated Wav2Vec2 interview features to {WAV2VEC2_INTERVIEW_SAVE_PATH}")
else:
    print("\nAggregated Wav2Vec2 interview features already exist. Skipping.")
    # Load the cached file so 'wav2vec2_aggregated_df' is defined whichever
    # branch runs, as the reading-task cells do for their cached files.
    wav2vec2_aggregated_df = pd.read_csv(WAV2VEC2_INTERVIEW_SAVE_PATH)

print("\n--- Interview Task Feature Extraction Complete ---")


Aggregated Wav2Vec2 interview features already exist. Skipping.

--- Interview Task Feature Extraction Complete ---
