In [5]:
import os
import shutil

# Summary of what this script does:
# - Organizes each subject folder in ../data/raw into subfolders: audio, video, text, clinical
# - Moves root-level files into the matching subfolder based on filename keywords
# - Moves Transcript CSVs into the text folder and copies them into the clinical folder
# - Moves recognized feature files out of 'features' into their subfolders, then deletes
#   the 'features' folder (anything still inside it is deleted along with it)
# - Deletes any .wav files located directly in the subject root

# Directory whose immediate subdirectories are treated as subject folders.
RAW_DATA_PATH = '../data/raw'

# Case-sensitive substrings that classify_file() looks for in a file name.
# A file name containing any entry of a list is assigned to that category.
audio_keywords = [
    'OpenSMILE',
    'BoAW',
    'MFCC',
    'vgg16',
    'densenet',
]
video_keywords = [
    'OpenFace',
    'BoVW',
    'CNN',
]
# The text and clinical lists currently hold the same keyword; they are kept as
# two separate lists so each category can be changed independently.
text_keywords = [
    'Transcript',
]
clinical_keywords = [
    'Transcript',
]

def classify_file(filename):
    """Return the category subfolder name for *filename*, or None.

    The name is tested against the module-level keyword lists in a fixed
    priority order: audio, video, text, clinical. Matching is a plain,
    case-sensitive substring test. The text and clinical categories
    additionally require the name to end with '.csv'.

    Returns one of 'audio', 'video', 'text', 'clinical', or None when no
    rule matches.
    """

    def mentions(keywords):
        # True when at least one keyword occurs somewhere in the file name.
        return any(word in filename for word in keywords)

    if mentions(audio_keywords):
        return 'audio'
    if mentions(video_keywords):
        return 'video'
    if mentions(text_keywords) and filename.endswith('.csv'):
        return 'text'
    # NOTE(review): while text_keywords and clinical_keywords contain the same
    # entries, every name that would match here has already returned 'text',
    # so this branch cannot be reached.
    if mentions(clinical_keywords) and filename.endswith('.csv'):
        return 'clinical'
    return None

# Restructure every subject directory found directly under RAW_DATA_PATH.
for subject_dir in os.listdir(RAW_DATA_PATH):
    subject_path = os.path.join(RAW_DATA_PATH, subject_dir)
    if not os.path.isdir(subject_path):
        # Plain files sitting next to the subject folders are left untouched.
        continue

    # Step 1: remove .wav files stored directly in the subject folder.
    for file in os.listdir(subject_path):
        file_path = os.path.join(subject_path, file)
        if not os.path.isfile(file_path):
            continue
        if file.lower().endswith('.wav'):
            os.remove(file_path)
            print(f"🗑️ Deleted .wav from root of subject folder: {file_path}")

    # Step 2: make sure the four category subfolders exist.
    for subfolder in ('audio', 'video', 'text', 'clinical'):
        os.makedirs(os.path.join(subject_path, subfolder), exist_ok=True)

    # Step 3: sort the remaining root-level files into their category folder.
    # The directory is listed again so files deleted in step 1 are not revisited.
    for file in os.listdir(subject_path):
        file_path = os.path.join(subject_path, file)
        if not os.path.isfile(file_path):
            continue
        target_type = classify_file(file)
        if not target_type:
            # Unrecognized files stay where they are.
            continue
        destination = os.path.join(subject_path, target_type, file)
        shutil.move(file_path, destination)
        if target_type == 'text':
            # Text files are additionally duplicated into the clinical folder.
            shutil.copy(destination, os.path.join(subject_path, 'clinical', file))

    # Step 4: pull recognized files out of 'features', then drop that folder.
    features_path = os.path.join(subject_path, 'features')
    if not os.path.exists(features_path):
        continue
    for root, _, files in os.walk(features_path):
        for feature_file in files:
            # NOTE(review): this compares *file* names, so it only skips a file
            # literally named 'processed'; a directory of that name is still
            # walked. Confirm which of the two was intended.
            if feature_file == 'processed':
                continue
            full_path = os.path.join(root, feature_file)
            target_type = classify_file(feature_file)
            if target_type:
                # Files are flattened: the nested directory layout is not kept.
                shutil.move(full_path, os.path.join(subject_path, target_type, feature_file))
    # NOTE(review): removes the whole tree, including any files classify_file()
    # did not recognize; removal errors are ignored.
    shutil.rmtree(features_path, ignore_errors=True)

print("✅ Dataset restructuring and cleanup completed.")


🗑️ Deleted .wav from root of subject folder: ../data/raw/302_P/302_AUDIO.wav
🗑️ Deleted .wav from root of subject folder: ../data/raw/301_P/301_AUDIO.wav
✅ Dataset restructuring and cleanup completed.
