# Collecting and Organizing Data

In [15]:
import csv
import os
import pandas as pd
from pathlib import Path

First we have to make a master dataframe with all the relevant data. This master dataframe will contain one entry for every single onset in every single wav file in the dataset. If an audio file contains multiple drum sounds, then there is one onset for each drum sound, and a single audio file will contribute multiple entries to the dataset. We will have to parse AVP and LVT separately and then combine them. 

In [None]:
# function to parse AVP csv, get the onset time, instrument label, onset phoneme, coda phoneme, dataset, participant id, subset, csv file path, wav file path
def parse_avp_csv(csv_path):
    """
    Parse one header-less AVP annotation CSV into a list of onset records.

    Each CSV row is read as
    ``onset_time, instrument_label[, onset_phoneme[, coda_phoneme]]``.
    Rows with fewer than two fields (for example blank lines) are skipped.

    Args:
        csv_path: Path to the annotation CSV. The participant id is the part
            of the file name before the first underscore, and the wav path is
            built from the CSV's directory and base name with a ``.wav``
            extension (the wav file itself is not checked for existence).

    Returns:
        A list of dicts, one per onset, each with the keys:
          - onset_time (float)
          - instrument_label (str)
          - onset_phoneme (str, '' when the column is absent)
          - coda_phoneme (str, '' when the column is absent)
          - dataset (str) = "AVP"
          - participant_id (str)
          - subset (str) = "personal"
          - csv_file_path (csv_path exactly as passed in)
          - wav_file_path (str)

    Raises:
        ValueError: If the first field of a row cannot be converted to float.
        OSError: If the CSV file cannot be opened.
    """
    # Metadata derived from the file location and name
    csv_dir = os.path.dirname(csv_path)
    base_name, _ = os.path.splitext(os.path.basename(csv_path))
    participant_id = base_name.split("_")[0]
    wav_file_path = os.path.join(csv_dir, base_name + ".wav")

    dataset = "AVP"
    subset = "personal"

    data = []
    # newline='' is what the csv module asks for: it lets the reader do its own
    # line-ending handling instead of having the text layer rewrite CR/LF
    # characters that appear inside quoted fields.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 2:
                continue  # skip empty or malformed lines

            data.append({
                'onset_time': float(row[0]),
                'instrument_label': row[1],
                # The phoneme columns are optional; default to an empty string
                'onset_phoneme': row[2] if len(row) > 2 else '',
                'coda_phoneme': row[3] if len(row) > 3 else '',
                'dataset': dataset,
                'participant_id': participant_id,
                'subset': subset,
                'csv_file_path': csv_path,
                'wav_file_path': wav_file_path,
            })

    return data

In [None]:
def collect_all_avp_data(root_dir):
    """
    Collect every AVP annotation CSV under ``<root_dir>/Personal`` into one DataFrame.

    Each non-hidden sub-directory of ``Personal`` is treated as a participant
    folder, and every ``*.csv`` file inside it is parsed with ``parse_avp_csv``.
    A file that fails to parse is reported with ``print`` and skipped, so one
    bad file does not abort the whole collection.

    Args:
        root_dir: Root directory of the AVP dataset (must contain ``Personal``).

    Returns:
        A DataFrame with one row per onset, sorted by participant, source CSV
        file name and onset time so rows from the same CSV stay together.
        If nothing could be collected, an empty DataFrame with the expected
        columns is returned instead of failing on the sort.

    Raises:
        OSError: If ``<root_dir>/Personal`` cannot be listed.
    """
    # Columns of the records built by parse_avp_csv; used for the empty result
    expected_columns = [
        'onset_time', 'instrument_label', 'onset_phoneme', 'coda_phoneme',
        'dataset', 'participant_id', 'subset', 'csv_file_path', 'wav_file_path',
    ]

    all_data = []
    personal_dir = os.path.join(root_dir, "Personal")

    # Walk through all participant directories in sorted order
    for participant_dir in sorted(os.listdir(personal_dir)):
        participant_path = os.path.join(personal_dir, participant_dir)

        # Skip if not a directory or hidden files
        if not os.path.isdir(participant_path) or participant_dir.startswith('.'):
            continue

        # Process CSV files in sorted order
        for file_name in sorted(os.listdir(participant_path)):
            if not file_name.endswith('.csv'):
                continue
            csv_path = os.path.join(participant_path, file_name)

            try:
                parsed_data = parse_avp_csv(csv_path)
                # Tag each record with its source file so it can be used as a sort key
                for entry in parsed_data:
                    entry['source_file'] = file_name
                all_data.extend(parsed_data)
            except Exception as e:
                # Best effort: report the problem file and carry on with the rest
                print(f"Error processing {csv_path}: {e}")

    # Without any records the frame would have no columns and the sort below
    # would raise KeyError, so return a well-formed empty frame instead.
    if not all_data:
        print(f"No AVP annotation rows found under {personal_dir}")
        return pd.DataFrame(columns=expected_columns)

    df = pd.DataFrame(all_data)

    # Sort to maintain grouping of entries from the same CSV file
    df = df.sort_values(['participant_id', 'source_file', 'onset_time'])

    # source_file was only needed as a sort key
    df = df.drop(columns='source_file')

    return df

In [None]:
def parse_lvt_csv(csv_path):
    """
    Parse one header-less LVT annotation CSV into a list of onset records.

    Works like ``parse_avp_csv`` but with LVT-specific handling:
      - the subset ("Frase" or "Improviso") is taken from the folder the CSV
        lives in,
      - the participant id is the CSV base name (e.g. "AFRP"),
      - the wav path is ``<base name>3.wav`` in the CSV's directory,
      - instrument labels "Kick", "Snare" and "HH" are mapped to "kd", "sd"
        and "hhc"; any other label is kept unchanged.

    Args:
        csv_path: Path to the LVT annotation CSV.

    Returns:
        A list of dicts, one per onset, with the keys onset_time (float),
        instrument_label, onset_phoneme, coda_phoneme, dataset ("LVT"),
        participant_id, subset, csv_file_path and wav_file_path. Missing
        phoneme columns become ''. Rows with fewer than two fields are skipped.

    Raises:
        ValueError: If the first field of a row cannot be converted to float.
        OSError: If the CSV file cannot be opened.
    """
    # Extract metadata from the file path
    csv_dir = os.path.dirname(csv_path)
    csv_file_name = os.path.basename(csv_path)      # e.g., "AFRP.csv"
    base_name, _ = os.path.splitext(csv_file_name)  # e.g., "AFRP"

    # Determine the subset from the nearest enclosing folder that is named
    # exactly "Frase" or "Improviso". A plain substring test on the whole path
    # mislabels Improviso files whenever some ancestor folder merely contains
    # the text "Frase" (e.g. ".../Frase_backup/Improviso/AFRP.csv").
    subset = None
    for folder in reversed(os.path.normpath(csv_dir).split(os.sep)):
        if folder in ("Frase", "Improviso"):
            subset = folder
            break
    if subset is None:
        # No exact folder match: keep the previous substring-based rule
        subset = "Frase" if "Frase" in csv_dir else "Improviso"

    participant_id = base_name

    # Build the wav path.
    # NOTE(review): the "3" suffix presumably selects one of several recordings
    # per participant - confirm against the dataset layout.
    wav_file_name = base_name + "3.wav"
    wav_file_path = os.path.join(csv_dir, wav_file_name)

    # Mapping for instrument labels
    instrument_map = {
        "Kick": "kd",
        "Snare": "sd",
        "HH": "hhc"  # Assuming all HH in LVT are closed hi-hats
    }

    data = []
    # newline='' is what the csv module asks for, so that line endings inside
    # quoted fields are left to the csv reader.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 2:
                continue  # skip empty or malformed lines

            original_label = row[1]

            # Phonemes are stored exactly as written in the CSV; no
            # standardization is applied at this stage.
            data.append({
                'onset_time': float(row[0]),
                'instrument_label': instrument_map.get(original_label, original_label),
                'onset_phoneme': row[2] if len(row) > 2 else '',
                'coda_phoneme': row[3] if len(row) > 3 else '',
                'dataset': "LVT",
                'participant_id': participant_id,
                'subset': subset,
                'csv_file_path': csv_path,
                'wav_file_path': wav_file_path,
            })

    return data

In [None]:
def collect_all_lvt_data(root_dir):
    """
    Collect every LVT annotation CSV under ``root_dir`` into one DataFrame.

    Looks in the ``Frase`` and ``Improviso`` sub-folders of ``root_dir``
    (a missing folder is skipped) and parses every non-hidden ``*.csv`` file
    with ``parse_lvt_csv``. A file that fails to parse is reported with
    ``print`` and skipped.

    Args:
        root_dir: Root directory of the LVT dataset.

    Returns:
        A DataFrame with one row per onset, sorted by subset, participant,
        source CSV file name and onset time. If nothing could be collected
        (for example because ``root_dir`` is wrong and both sub-folders are
        missing), an empty DataFrame with the expected columns is returned
        instead of failing on the sort.
    """
    # Columns of the records built by parse_lvt_csv; used for the empty result
    expected_columns = [
        'onset_time', 'instrument_label', 'onset_phoneme', 'coda_phoneme',
        'dataset', 'participant_id', 'subset', 'csv_file_path', 'wav_file_path',
    ]

    all_data = []

    # Process both Frase and Improviso folders
    for subset_dir in ["Frase", "Improviso"]:
        subset_path = os.path.join(root_dir, subset_dir)

        # Skip if directory doesn't exist
        if not os.path.isdir(subset_path):
            continue

        # Process CSV files in sorted order
        for file_name in sorted(os.listdir(subset_path)):
            if file_name.startswith('.') or not file_name.endswith('.csv'):
                continue
            csv_path = os.path.join(subset_path, file_name)

            try:
                parsed_data = parse_lvt_csv(csv_path)
                # Tag each record with its source file so it can be used as a sort key
                for entry in parsed_data:
                    entry['source_file'] = file_name
                all_data.extend(parsed_data)
            except Exception as e:
                # Best effort: report the problem file and carry on with the rest
                print(f"Error processing {csv_path}: {e}")

    # Without any records the frame would have no columns and the sort below
    # would raise KeyError, so return a well-formed empty frame instead.
    if not all_data:
        print(f"No LVT annotation rows found under {root_dir}")
        return pd.DataFrame(columns=expected_columns)

    df = pd.DataFrame(all_data)

    # Sort to maintain grouping
    df = df.sort_values(['subset', 'participant_id', 'source_file', 'onset_time'])

    # source_file was only needed as a sort key
    df = df.drop(columns='source_file')

    return df

In [None]:
def create_all_datasets(
    avp_dataset_path="../../AVP-LVT_Dataset/AVP_Dataset",
    lvt_dataset_path="../../AVP-LVT_Dataset/LVT_Dataset",
    output_dir="../data",
):
    """
    Build the AVP, LVT and combined (master) onset tables and save them as CSV.

    Args:
        avp_dataset_path: Root directory of the AVP dataset, passed to
            ``collect_all_avp_data``.
        lvt_dataset_path: Root directory of the LVT dataset, passed to
            ``collect_all_lvt_data``.
        output_dir: Directory that receives ``avp_dataset.csv``,
            ``lvt_dataset.csv`` and ``master_dataset.csv``. It is created if
            it does not exist.

    Returns:
        A tuple ``(avp_df, lvt_df, master_df)`` where ``master_df`` is the AVP
        rows followed by the LVT rows with a fresh integer index.

    Side effects:
        Writes the three CSV files and prints summary statistics.
    """
    # Collect data from both datasets
    print("Processing AVP dataset...")
    avp_df = collect_all_avp_data(avp_dataset_path)

    print("Processing LVT dataset...")
    lvt_df = collect_all_lvt_data(lvt_dataset_path)

    # Save individual datasets
    print("\nSaving individual datasets...")
    # Ensure the directory exists
    os.makedirs(output_dir, exist_ok=True)

    avp_df.to_csv(os.path.join(output_dir, 'avp_dataset.csv'), index=False)
    lvt_df.to_csv(os.path.join(output_dir, 'lvt_dataset.csv'), index=False)

    # Combine and save master dataset
    print("Creating and saving master dataset...")
    master_df = pd.concat([avp_df, lvt_df], ignore_index=True)
    master_df.to_csv(os.path.join(output_dir, 'master_dataset.csv'), index=False)

    # Print summary statistics
    print("\nDataset Summaries:")
    print(f"AVP Dataset: {len(avp_df)} events")
    print("\nAVP participants:", len(avp_df['participant_id'].unique()))
    print("AVP instrument distribution:")
    print(avp_df['instrument_label'].value_counts())

    print(f"\nLVT Dataset: {len(lvt_df)} events")
    print("LVT subsets:", lvt_df['subset'].unique())
    print("LVT participants:", len(lvt_df['participant_id'].unique()))
    print("LVT instrument distribution:")
    print(lvt_df['instrument_label'].value_counts())

    print(f"\nMaster Dataset: {len(master_df)} total events")
    print("Distribution by dataset:")
    print(master_df['dataset'].value_counts())

    return avp_df, lvt_df, master_df


In [21]:
# Build the AVP, LVT and combined DataFrames; this also writes the three CSV files under ../data.
avp_df, lvt_df, master_df = create_all_datasets()

Processing AVP dataset...
Processing LVT dataset...

Saving individual datasets...
Creating and saving master dataset...

Dataset Summaries:
AVP Dataset: 4873 events

AVP participants: 28
AVP instrument distribution:
instrument_label
kd     1447
sd     1253
hhc    1164
hho    1009
Name: count, dtype: int64

LVT Dataset: 841 events
LVT subsets: ['Frase' 'Improviso']
LVT participants: 40
LVT instrument distribution:
instrument_label
hhc    334
kd     329
sd     178
Name: count, dtype: int64

Master Dataset: 5714 total events
Distribution by dataset:
dataset
AVP    4873
LVT     841
Name: count, dtype: int64


Now we have master_dataset.csv and master_df, both of which contain the info for every single onset of every single sound in the dataset. 

In [22]:
def analyze_phonemes(avp_csv_path='../data/avp_dataset.csv',
                     lvt_csv_path='../data/lvt_dataset.csv'):
    """
    Print the unique onset and coda phonemes of the AVP and LVT datasets.

    Args:
        avp_csv_path: CSV file with the AVP onset table; must have the columns
            ``onset_phoneme`` and ``coda_phoneme``.
        lvt_csv_path: CSV file with the LVT onset table, same columns.

    Returns:
        None. The sorted unique phonemes are printed, one list per
        dataset/column combination.
    """
    def unique_sorted(column):
        # Empty phoneme strings are read back from CSV as NaN (a float);
        # mixing them with str values makes sorted() raise TypeError, so
        # missing values are dropped before sorting.
        return sorted(column.dropna().unique())

    # Read both datasets
    avp_df = pd.read_csv(avp_csv_path)
    lvt_df = pd.read_csv(lvt_csv_path)

    print("AVP Unique Onset Phonemes:")
    print(unique_sorted(avp_df['onset_phoneme']))
    print("\nAVP Unique Coda Phonemes:")
    print(unique_sorted(avp_df['coda_phoneme']))

    print("\nLVT Unique Onset Phonemes:")
    print(unique_sorted(lvt_df['onset_phoneme']))
    print("\nLVT Unique Coda Phonemes:")
    print(unique_sorted(lvt_df['coda_phoneme']))

# Run the analysis: prints the sorted unique onset and coda phonemes of each dataset
analyze_phonemes()

AVP Unique Onset Phonemes:
['!', 'dʒ', 'k', 'kg', 'kʃ', 'p', 's', 't', 'ts', 'tɕ', 'tʃ', 'tʒ', 'ʡʢ']

AVP Unique Coda Phonemes:
['I', 'a', 'e', 'h', 'i', 'o', 'u', 'x', 'æ', 'œ', 'ɐ', 'ɘ', 'ə', 'ɪ', 'ɯ', 'ʊ', 'ʌ']

LVT Unique Onset Phonemes:
['!', 'k', 'p', 's', 't', 'ts', 'tʃ', 'ʔ', 'ʡʢ']

LVT Unique Coda Phonemes:
['a', 'h', 'u', 'x', 'ʊ']
