In [2]:
import os
import pandas as pd
import glob
import re

In [None]:
# Setup file paths
# The corpus root defaults to the original hard-coded location, but it can be
# overridden per machine by setting the ANDROIDS_CORPUS_PATH environment variable,
# so the notebook does not have to be edited to run somewhere else.
# An unset or empty variable falls back to the default.
BASE_CORPUS_PATH = os.environ.get('ANDROIDS_CORPUS_PATH') or 'E:/Dissertation_Data/Androids-Corpus'
# Root of the Reading-Task recordings (scanned later for 'HC' and 'PT' subfolders).
READING_TASK_AUDIO_ROOT = os.path.join(BASE_CORPUS_PATH, 'Reading-Task', 'audio')
# Root of the Interview-Task clips (scanned later for one subfolder per session).
INTERVIEW_TASK_CLIPS_ROOT = os.path.join(BASE_CORPUS_PATH, 'Interview-Task', 'audio_clip')
# CSV holding the cross-validation fold assignments for both tasks.
FOLD_LIST_CSV_PATH = os.path.join(BASE_CORPUS_PATH, 'fold-lists.csv')

In [4]:
# Echo the resolved absolute locations so a wrong corpus root is spotted
# before any loading is attempted.
print("Base corpus path:", os.path.abspath(BASE_CORPUS_PATH))
print("Attempting to load fold list from:", os.path.abspath(FOLD_LIST_CSV_PATH))

Base corpus path: e:\MSc_Project\data\Androids_Corpus
Attempting to load fold list from: e:\MSc_Project\data\Androids_Corpus\fold-lists.csv


In [None]:
# Load and process fold-lists.csv
# Builds two lookups, each mapping a cleaned file/session name (no extension,
# no surrounding whitespace or single quotes) to its integer fold number:
#   read_task_file_to_fold_map       <- columns fold1 .. fold5
#   interview_task_file_to_fold_map  <- columns fold1.1 .. fold5.1
read_task_file_to_fold_map = {}
interview_task_file_to_fold_map = {}


def _populate_fold_map(fold_df, column_names, target_map, task_label):
    """Fill ``target_map`` with ``{cleaned_name: fold_number}`` from fold columns.

    Args:
        fold_df: DataFrame loaded from the fold-list CSV.
        column_names: Names of the columns to read; the fold number is the
            first run of digits in the part of the name before any '.'.
        target_map: Dict updated in place.
        task_label: Lower-case task name ('read' / 'interview'), used only in
            the warning messages.

    Missing columns and column names without digits are reported with a
    warning and skipped. If a name is listed under two different folds, a
    warning is printed and the later column wins, as it always has.
    """
    for col_name in column_names:
        if col_name not in fold_df:
            print(f"Warning: {task_label.capitalize()} task column '{col_name}' not found in fold_df.")
            continue

        # 'fold3.1' -> 'fold3' -> 3 ; plain 'fold3' is unaffected by the split.
        fold_number_match = re.search(r'(\d+)', col_name.split('.')[0])
        if not fold_number_match:
            print(f"Warning: Could not extract fold number from {task_label} column name: {col_name}")
            continue
        fold_number = int(fold_number_match.group(1))

        # Columns can have different lengths, so drop the NaN padding first.
        for filename_in_csv in fold_df[col_name].dropna().astype(str):
            # Strip the extension, then whitespace and any leading/trailing single quotes.
            filename_key = os.path.splitext(filename_in_csv)[0].strip().strip("'")
            previous_fold = target_map.get(filename_key)
            if previous_fold is not None and previous_fold != fold_number:
                # A name in two folds would leak data between CV splits; make it visible.
                print(f"Warning: '{filename_key}' is listed in fold {previous_fold} and "
                      f"fold {fold_number} for the {task_label} task; keeping fold {fold_number}.")
            target_map[filename_key] = fold_number


try:
    # header=1: the second row of the file holds the column names.
    fold_df = pd.read_csv(FOLD_LIST_CSV_PATH, header=1)

    read_fold_cols_actual = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
    # NOTE(review): the '.1' suffixes are presumably the names pandas gives the
    # second, repeated set of fold1..fold5 headers - confirm against the CSV.
    interview_fold_cols_actual = ['fold1.1', 'fold2.1', 'fold3.1', 'fold4.1', 'fold5.1']

    # Populate map for Reading Task folds
    _populate_fold_map(fold_df, read_fold_cols_actual, read_task_file_to_fold_map, 'read')

    # Populate map for Interview Task folds
    _populate_fold_map(fold_df, interview_fold_cols_actual, interview_task_file_to_fold_map, 'interview')

    print(f"\nLoaded {len(read_task_file_to_fold_map)} entries for Read task folds.")
    print(f"Loaded {len(interview_task_file_to_fold_map)} entries for Interview task folds.")

except FileNotFoundError:
    print(f"Error: {FOLD_LIST_CSV_PATH} (resolved to {os.path.abspath(FOLD_LIST_CSV_PATH)}) not found.")
except Exception as e:
    # Best-effort: report and continue. The maps stay (partially) empty, so the
    # later cells assign fold -1 to any file they cannot look up.
    print(f"An error occurred while processing {FOLD_LIST_CSV_PATH}: {e}")


Loaded 112 entries for Read task folds.
Loaded 116 entries for Interview task folds.


In [None]:
# Process Reading-Task Audio Files
# Scans the 'HC' and 'PT' subfolders of READING_TASK_AUDIO_ROOT, parses the
# metadata encoded in each .wav filename and collects one record per file.
reading_task_data = []
# Filename groups, in order: participant number, condition (P/C/X), gender (M/F),
# two digits stored as 'age', one digit stored as 'education'.
# This pattern is also reused by the Interview-Task cell.
filename_pattern = re.compile(r"(\d{1,2})_([PCX])([MF])(\d{2})_(\d)\.wav")
print(f"\nProcessing Reading Task from: {os.path.abspath(READING_TASK_AUDIO_ROOT)}")

for condition_folder in ['HC', 'PT']:
    condition_path = os.path.join(READING_TASK_AUDIO_ROOT, condition_folder)
    if not os.path.isdir(condition_path):
        print(f"Warning: Directory not found {condition_path}")
        continue

    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # of the resulting DataFrame reproducible across machines and runs.
    for audio_filename_with_ext in sorted(os.listdir(condition_path)):
        if not audio_filename_with_ext.endswith('.wav'):
            continue

        # fullmatch: the whole filename must follow the convention, not just a prefix.
        match = filename_pattern.fullmatch(audio_filename_with_ext)
        if not match:
            # Hidden files (leading '.') are skipped without a warning.
            if not audio_filename_with_ext.startswith('.'):
                print(f"Warning: Could not parse filename {audio_filename_with_ext} in Reading-Task")
            continue

        nn, cond_char, gen_char, age_s, edu_s = match.groups()
        filepath = os.path.join(condition_path, audio_filename_with_ext)
        # Fold keys are filenames without extension; -1 marks "not in the fold list".
        filename_key_for_fold_lookup = os.path.splitext(audio_filename_with_ext)[0]
        fold_number = read_task_file_to_fold_map.get(filename_key_for_fold_lookup, -1)

        reading_task_data.append({
            'filepath': filepath, 'filename': audio_filename_with_ext,
            'unique_participant_id': f"{nn}_{cond_char}", 'original_id_nn': nn,
            'label': "Patient" if cond_char == 'P' else "Control" if cond_char == 'C' else "Unknown",
            'gender': "Male" if gen_char == 'M' else "Female", 'age': int(age_s),
            'education': int(edu_s), 'task_type': 'Reading', 'fold': fold_number
        })

reading_df = pd.DataFrame(reading_task_data)
if not reading_df.empty:
    print(f"\nProcessed {len(reading_df)} files from Reading-Task.")
    print(reading_df.head())
    # Any rows counted under -1 here were not found in the fold list.
    print(f"\nValue counts for 'fold' in Reading Task:\n{reading_df['fold'].value_counts(dropna=False).sort_index()}")
else:
    print("\nNo data processed for Reading-Task. Check paths and file structure.")


Processing Reading Task from: e:\MSc_Project\data\Androids_Corpus\Reading-Task\audio

Processed 111 files from Reading-Task.
                                            filepath       filename  \
0  ../data/Androids_Corpus\Reading-Task\audio\HC\...  01_CF56_1.wav   
1  ../data/Androids_Corpus\Reading-Task\audio\HC\...  02_CM57_2.wav   
2  ../data/Androids_Corpus\Reading-Task\audio\HC\...  03_CF30_3.wav   
3  ../data/Androids_Corpus\Reading-Task\audio\HC\...  04_CF57_3.wav   
4  ../data/Androids_Corpus\Reading-Task\audio\HC\...  05_CF41_3.wav   

  unique_participant_id original_id_nn    label  gender  age  education  \
0                  01_C             01  Control  Female   56          1   
1                  02_C             02  Control    Male   57          2   
2                  03_C             03  Control  Female   30          3   
3                  04_C             04  Control  Female   57          3   
4                  05_C             05  Control  Female   41          3 

In [None]:
# Process Interview-Task Audio Files (from audio_clip)
# Each subfolder of INTERVIEW_TASK_CLIPS_ROOT is one session, named like a
# Reading-Task file without the '.wav' extension. Every .wav clip inside it
# becomes one record that carries the session's metadata and fold.
interview_task_data = []
print(f"\nProcessing Interview Task clips from: {os.path.abspath(INTERVIEW_TASK_CLIPS_ROOT)}")

if not os.path.isdir(INTERVIEW_TASK_CLIPS_ROOT):
    print(f"Warning: Directory not found {INTERVIEW_TASK_CLIPS_ROOT}")
else:
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # of the resulting DataFrame reproducible across machines and runs.
    for participant_session_folder_name in sorted(os.listdir(INTERVIEW_TASK_CLIPS_ROOT)):
        participant_session_path = os.path.join(INTERVIEW_TASK_CLIPS_ROOT, participant_session_folder_name)
        if not os.path.isdir(participant_session_path):
            continue

        # Reuse the Reading-Task filename pattern (defined in the Reading-Task cell)
        # by appending the extension it expects. fullmatch requires the whole
        # folder name to follow the convention, not just a prefix.
        match_folder = filename_pattern.fullmatch(participant_session_folder_name + ".wav")
        if not match_folder:
            # Hidden folders (leading '.') are skipped without a warning.
            if not participant_session_folder_name.startswith('.'):
                print(f"Warning: Could not parse interview session folder name: {participant_session_folder_name}")
            continue

        nn, cond_char, gen_char, age_s, edu_s = match_folder.groups()
        # The key for interview_task_file_to_fold_map is the session folder name;
        # -1 marks "not in the fold list". All clips of a session share the fold.
        fold_number = interview_task_file_to_fold_map.get(participant_session_folder_name, -1)

        for clip_filename_with_ext in sorted(os.listdir(participant_session_path)):
            if not clip_filename_with_ext.endswith('.wav'):
                continue
            clip_filepath = os.path.join(participant_session_path, clip_filename_with_ext)
            interview_task_data.append({
                'filepath': clip_filepath, 'filename': clip_filename_with_ext,
                'original_session_filename': participant_session_folder_name,
                'unique_participant_id': f"{nn}_{cond_char}", 'original_id_nn': nn,
                'label': "Patient" if cond_char == 'P' else "Control" if cond_char == 'C' else "Unknown",
                'gender': "Male" if gen_char == 'M' else "Female", 'age': int(age_s),
                'education': int(edu_s), 'task_type': 'Interview_Clip', 'fold': fold_number
            })

interview_df = pd.DataFrame(interview_task_data)
if not interview_df.empty:
    print(f"\nProcessed {len(interview_df)} clip files from Interview-Task (audio_clip).")
    print(interview_df.head())
    # Any rows counted under -1 here belong to sessions not found in the fold list.
    print(f"\nValue counts for 'fold' in Interview Task:\n{interview_df['fold'].value_counts(dropna=False).sort_index()}")
else:
    print("\nNo data processed for Interview-Task. Check paths and file structure.")

print("\n--- Data Loading Complete ---")


Processing Interview Task clips from: e:\MSc_Project\data\Androids_Corpus\Interview-Task\audio_clip

Processed 866 clip files from Interview-Task (audio_clip).
                                            filepath          filename  \
0  ../data/Androids_Corpus\Interview-Task\audio_c...   01_CF56_1_1.wav   
1  ../data/Androids_Corpus\Interview-Task\audio_c...  01_CF56_1_10.wav   
2  ../data/Androids_Corpus\Interview-Task\audio_c...   01_CF56_1_2.wav   
3  ../data/Androids_Corpus\Interview-Task\audio_c...   01_CF56_1_3.wav   
4  ../data/Androids_Corpus\Interview-Task\audio_c...   01_CF56_1_4.wav   

  original_session_filename unique_participant_id original_id_nn    label  \
0                 01_CF56_1                  01_C             01  Control   
1                 01_CF56_1                  01_C             01  Control   
2                 01_CF56_1                  01_C             01  Control   
3                 01_CF56_1                  01_C             01  Control   
4        

: 