In [1]:
import os
import sys
import pandas as pd

In [None]:
# Put the parent of the current working directory on sys.path (the project
# root when the notebook is run from 'notebooks/'), so that the `src` package
# used by the imports below can be resolved.
# Layout: notebooks live in 'notebooks/', reusable modules live in 'src/'.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    # Guarded so that re-running this cell does not add duplicate entries
    sys.path.append(module_path)

# Import data loader and feature extraction functions from modules
from src.data_loader import load_androids_corpus

from src.mshds_extractor import extract_mshds_features
from src.opensmile_extractor import extract_opensmile_features
from src.foundation_model_extractor import extract_wav2vec2_embeddings

In [None]:
# Root directory of the Androids Corpus.
# The ANDROIDS_CORPUS_PATH environment variable overrides the location so the
# notebook can run on another machine without editing this cell; when it is
# unset, the original local path is used unchanged.
BASE_CORPUS_PATH = os.environ.get(
    'ANDROIDS_CORPUS_PATH',
    'E:/Dissertation_Data/Androids-Corpus',
)

# Load the corpus; the loader returns one DataFrame per task
# (Reading Task first, Interview Task second).
print("Loading Androids Corpus...")
reading_df, interview_df = load_androids_corpus(BASE_CORPUS_PATH)

Loading Androids Corpus...
Successfully loaded 112 Read task and 116 Interview task fold assignments.

Processing Reading Task from: E:\Dissertation_Data\Androids-Corpus\Reading-Task\audio
Processed 111 files from Reading-Task.

Processing Interview Task clips from: E:\Dissertation_Data\Androids-Corpus\Interview-Task\audio_clip
Processed 866 clip files from Interview-Task (audio_clip).

--- Data Loading Complete ---


In [10]:
# Preview the Reading Task metadata: its dimensions and first five rows
print("\n--- Reading Task Data ---")
if reading_df.empty:
    # Nothing was loaded for this task, so say so instead of printing a blank table
    print("Reading DataFrame is empty.")
else:
    print(f"Shape: {reading_df.shape}")
    print(reading_df.head())


--- Reading Task Data ---
Shape: (111, 10)
  unique_participant_id original_id_nn    label  gender  age  education  \
0                  01_C             01  Control  Female   56          1   
1                  02_C             02  Control    Male   57          2   
2                  03_C             03  Control  Female   30          3   
3                  04_C             04  Control  Female   57          3   
4                  05_C             05  Control  Female   41          3   

                                            filepath       filename task_type  \
0  E:/Dissertation_Data/Androids-Corpus\Reading-T...  01_CF56_1.wav   Reading   
1  E:/Dissertation_Data/Androids-Corpus\Reading-T...  02_CM57_2.wav   Reading   
2  E:/Dissertation_Data/Androids-Corpus\Reading-T...  03_CF30_3.wav   Reading   
3  E:/Dissertation_Data/Androids-Corpus\Reading-T...  04_CF57_3.wav   Reading   
4  E:/Dissertation_Data/Androids-Corpus\Reading-T...  05_CF41_3.wav   Reading   

   fold  
0     1 

In [5]:
# Preview the Interview Task metadata: its dimensions and first five rows
print("\n--- Interview Task Data ---")
if interview_df.empty:
    # Nothing was loaded for this task, so say so instead of printing a blank table
    print("Interview DataFrame is empty.")
else:
    print(f"Shape: {interview_df.shape}")
    print(interview_df.head())


--- Interview Task Data ---
Shape: (866, 11)
  unique_participant_id original_id_nn    label  gender  age  education  \
0                  01_C             01  Control  Female   56          1   
1                  01_C             01  Control  Female   56          1   
2                  01_C             01  Control  Female   56          1   
3                  01_C             01  Control  Female   56          1   
4                  01_C             01  Control  Female   56          1   

                                            filepath          filename  \
0  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_1.wav   
1  E:/Dissertation_Data/Androids-Corpus\Interview...  01_CF56_1_10.wav   
2  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_2.wav   
3  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_3.wav   
4  E:/Dissertation_Data/Androids-Corpus\Interview...   01_CF56_1_4.wav   

  original_session_filename       task_type  fold  
0     

In [None]:
# Take the first five Reading Task rows as a quick smoke-test sample.
# The explicit .copy() makes the sample independent of reading_df, which
# avoids pandas' SettingWithCopyWarning if the sample is modified later.
small_reading_df = reading_df.iloc[:5].copy()
print("Created a small test DataFrame with 5 files:")
print(small_reading_df['filename'].tolist())

Created a small test DataFrame with 5 files:
['01_CF56_1.wav', '02_CM57_2.wav', '03_CF30_3.wav', '04_CF57_3.wav', '05_CF41_3.wav']


In [None]:
# Smoke test: run the MSHDS extractor on the 5-file sample before committing
# to the full Reading Task run.
print("\nExtracting MSHDS features for the small test set...")
# NOTE(review): verbose=True presumably enables extra per-file logging - confirm
# against src/mshds_extractor. The full-run cell omits it and its output still
# shows a progress bar, so the flag does not appear to control the bar itself.
test_features_df = extract_mshds_features(small_reading_df, verbose=True)
print("Extraction complete!")


Extracting MSHDS features for the small test set...


Extracting MSHDS Features:   0%|          | 0/5 [00:00<?, ?it/s]

Extraction complete!


In [None]:
# Dimensions of the extracted feature table.
# Expected: 5 rows x 26 columns (25 features + 1 filename).
print(f"\nShape of the output feature DataFrame: {test_features_df.shape}")

# Per-column count of missing values. Every count should be zero; a non-zero
# count would indicate that extraction failed for at least one file.
print("\nNumber of missing values per feature:")
print(test_features_df.isna().sum())

# Render the feature table (all five test rows) for a visual sanity check
print("\nHead of the feature DataFrame:")
display(test_features_df)


Shape of the output feature DataFrame: (5, 26)

Number of missing values per feature:
filename                    0
Speaking_Rate               0
Articulation_Rate           0
Phonation_Ratio             0
Pause_Rate                  0
Mean_Pause_Duration         0
mean_F0                     0
stdev_F0_Semitone           0
mean_dB                     0
range_ratio_dB              0
HNR_dB                      0
Spectral_Slope              0
Spectral_Tilt               0
Cepstral_Peak_Prominence    0
mean_F1_Loc                 0
std_F1_Loc                  0
mean_B1_Loc                 0
std_B1_Loc                  0
mean_F2_Loc                 0
std_F2_Loc                  0
mean_B2_Loc                 0
std_B2_Loc                  0
Spectral_Gravity            0
Spectral_Std_Dev            0
Spectral_Skewness           0
Spectral_Kurtosis           0
dtype: int64

Head of the feature DataFrame:


Unnamed: 0,filename,Speaking_Rate,Articulation_Rate,Phonation_Ratio,Pause_Rate,Mean_Pause_Duration,mean_F0,stdev_F0_Semitone,mean_dB,range_ratio_dB,...,mean_B1_Loc,std_B1_Loc,mean_F2_Loc,std_F2_Loc,mean_B2_Loc,std_B2_Loc,Spectral_Gravity,Spectral_Std_Dev,Spectral_Skewness,Spectral_Kurtosis
0,01_CF56_1.wav,3.221843,4.078672,0.789924,0.342143,0.614,135.486474,3.477464,63.951329,2.15558,...,255.16868,286.616317,1633.716904,488.536986,594.657072,716.782932,355.206497,337.788948,5.964434,70.606271
1,02_CM57_2.wav,3.402797,4.119856,0.825951,0.2534,0.686857,95.854561,3.35266,65.361857,2.016766,...,242.927026,253.067458,1614.980076,448.984263,492.187366,439.490258,363.889176,393.58486,6.691924,121.864942
2,03_CF30_3.wav,4.388955,5.117387,0.857656,0.177931,0.8,201.023993,2.08271,65.586717,2.360642,...,260.924023,288.58863,1704.68601,484.051893,662.627477,905.501959,460.918213,421.40373,5.051323,55.662546
3,04_CF57_3.wav,4.384482,5.207409,0.84197,0.259297,0.609455,173.83401,2.489167,68.230038,3.05306,...,147.257021,133.237838,1553.763156,398.784342,371.078992,485.221797,576.765021,477.447754,4.396186,39.211027
4,05_CF41_3.wav,4.676964,5.254031,0.890167,0.20116,0.546,197.308449,3.552315,69.091985,3.231895,...,179.709255,253.194893,1557.978158,543.890373,571.127516,632.10968,457.281058,344.699651,6.319892,83.131251


In [None]:
# Destination CSV for the merged metadata + MSHDS features
output_path = '../data/features_mshds_reading_task.csv'

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, and failing here is far cheaper than failing
# after the full extraction has already run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Extract MSHDS features for every file in the Reading Task
print("\nExtracting MSHDS features for the FULL Reading Task set...")
mshds_reading_features_df = extract_mshds_features(reading_df)

# Attach the extracted features to the original metadata. A left join on
# 'filename' keeps every row of reading_df, so a file with no matching feature
# row would appear with NaN feature values rather than being dropped.
full_reading_data = pd.merge(reading_df, mshds_reading_features_df, on='filename', how='left')

print("\n--- Full feature extraction and merge complete! ---")
print(f"Shape of final DataFrame: {full_reading_data.shape}")

# Save the final DataFrame to a CSV for future use
print(f"\nSaving extracted features to: {output_path}")
full_reading_data.to_csv(output_path, index=False)
print("Done.")


Extracting MSHDS features for the FULL Reading Task set...


Extracting MSHDS Features:   0%|          | 0/111 [00:00<?, ?it/s]


--- Full feature extraction and merge complete! ---
Shape of final DataFrame: (111, 35)

Saving extracted features to: ../data/features_mshds_reading_task.csv
Done.


In [4]:
# --- OpenSMILE extraction settings ---

# Where the merged metadata + OpenSMILE features are written and cached
OPENSMILE_OUTPUT_PATH = '../data/Processed_Features/features_opensmile_reading_task.csv'

# OpenSMILE feature configuration file, located inside the corpus directory
CONFIG_FILE_PATH = os.path.join(BASE_CORPUS_PATH, 'Androids.conf')

# SMILExtract command-line executable used to run the extraction
OPENSMILE_EXE_PATH = 'E:/tools/opensmile-3.0.2/bin/SMILExtract.exe'

In [5]:
# Extract OpenSMILE features for the Reading Task, using the saved CSV as a
# cache: extraction only runs when the output file does not exist yet.
if not os.path.exists(OPENSMILE_OUTPUT_PATH):
    print("\nExtracting OpenSMILE features for the Reading Task...")
    if not os.path.exists(OPENSMILE_EXE_PATH):
        # Without the executable nothing can be extracted; note that
        # full_reading_data_opensmile is left undefined in this case.
        print(f"FATAL ERROR: OpenSMILE executable not found at '{OPENSMILE_EXE_PATH}'.")
    else:
        opensmile_features_df = extract_opensmile_features(
            input_df=reading_df,
            opensmile_exe_path=OPENSMILE_EXE_PATH,
            config_file_path=CONFIG_FILE_PATH
        )
        if not opensmile_features_df.empty:
            # Left join on 'filename' keeps every metadata row of reading_df
            full_reading_data_opensmile = pd.merge(reading_df, opensmile_features_df, on='filename', how='left')
            # to_csv does not create missing parent directories, so make sure
            # the output folder exists before writing.
            os.makedirs(os.path.dirname(OPENSMILE_OUTPUT_PATH), exist_ok=True)
            print(f"Saving OpenSMILE features to: {OPENSMILE_OUTPUT_PATH}")
            full_reading_data_opensmile.to_csv(OPENSMILE_OUTPUT_PATH, index=False)
            print("OpenSMILE features saved.")
        else:
            # Previously an empty result was skipped silently; report it so the
            # closing message below is not mistaken for a successful run.
            print("WARNING: OpenSMILE extraction returned no features; nothing was saved.")
else:
    print(f"\nOpenSMILE features already exist at: {OPENSMILE_OUTPUT_PATH}. Loading from file.")
    full_reading_data_opensmile = pd.read_csv(OPENSMILE_OUTPUT_PATH)

print("\n--- All feature extraction checks complete. ---")


Extracting OpenSMILE features for the Reading Task...
Using temporary directory for OpenSMILE outputs: C:\Users\ayush\AppData\Local\Temp\tmpfi2tohtn


Extracting OpenSMILE Features:   0%|          | 0/111 [00:00<?, ?it/s]

Saving OpenSMILE features to: ../data/Processed_Features/features_opensmile_reading_task.csv
OpenSMILE features saved.

--- All feature extraction checks complete. ---


In [4]:
# --- Wav2Vec2 embedding extraction settings ---

# Where the merged metadata + embeddings are written and cached
EMBEDDINGS_OUTPUT_PATH = '../data/Processed_Features/features_wav2vec2_reading_task.csv'

# Hugging Face model identifier of the pretrained checkpoint
MODEL_NAME = "facebook/wav2vec2-base-960h"

In [5]:
# Extract Wav2Vec2 embeddings for the Reading Task, using the saved CSV as a
# cache: extraction only runs when the output file does not exist yet.
if not os.path.exists(EMBEDDINGS_OUTPUT_PATH):
    print(f"\nExtracting embeddings from {MODEL_NAME} for the Reading Task...")
    # NOTE(review): MODEL_NAME is only used in the message above and is not
    # passed to the extractor, which therefore runs with its own default model.
    # Changing MODEL_NAME alone will not change the embeddings - confirm the
    # extractor's signature and pass the model name explicitly if supported.
    wav2vec2_features_df = extract_wav2vec2_embeddings(reading_df)

    if not wav2vec2_features_df.empty:
        # Left join on 'filename' keeps every metadata row of reading_df
        full_reading_data_wav2vec2 = pd.merge(reading_df, wav2vec2_features_df, on='filename', how='left')

        # to_csv does not create missing parent directories, so make sure the
        # output folder exists before writing.
        os.makedirs(os.path.dirname(EMBEDDINGS_OUTPUT_PATH), exist_ok=True)
        print(f"Saving Wav2Vec2 embeddings to: {EMBEDDINGS_OUTPUT_PATH}")
        full_reading_data_wav2vec2.to_csv(EMBEDDINGS_OUTPUT_PATH, index=False)
        print("Embeddings saved.")
    else:
        # Previously an empty result was skipped silently; report it so the
        # closing message below is not mistaken for a successful run.
        print("WARNING: Wav2Vec2 extraction returned no embeddings; nothing was saved.")
else:
    print(f"\nWav2Vec2 embeddings already exist at: {EMBEDDINGS_OUTPUT_PATH}. Loading from file.")
    full_reading_data_wav2vec2 = pd.read_csv(EMBEDDINGS_OUTPUT_PATH)

print("\n--- All feature extraction tasks are complete. ---")


Extracting embeddings from facebook/wav2vec2-base-960h for the Reading Task...
Using device: cpu


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracting wav2vec2-base-960h Embeddings:   0%|          | 0/111 [00:00<?, ?it/s]

Saving Wav2Vec2 embeddings to: ../data/Processed_Features/features_wav2vec2_reading_task.csv
Embeddings saved.

--- All feature extraction tasks are complete. ---
