# Audio Segmentation

Using the master dataset's onset times, cut each continuous audio recording into individual "boxemes" (isolated vocal percussion sounds).

In [3]:
import numpy as np
import librosa
import soundfile as sf
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import os


In [4]:
def _extract_padded_segment(y, start_sample, target_len):
    """Cut `target_len` frames from `y` starting at `start_sample`, zero-padding the tail.

    The window end is computed from the unclamped start, then both ends are
    clamped to the bounds of `y`. Returns None when the clamped window is empty.
    """
    end_sample = min(start_sample + target_len, len(y))
    start_sample = max(start_sample, 0)
    if end_sample <= start_sample:
        return None

    segment = y[start_sample:end_sample]
    missing = target_len - len(segment)
    if missing > 0:
        # Pad only along the first (frame) axis. A bare (0, missing) pair would
        # be applied to every axis and add spurious channels to 2-D audio.
        pad_width = [(0, missing)] + [(0, 0)] * (segment.ndim - 1)
        segment = np.pad(segment, pad_width, mode='constant')
    return segment


def segment_audio(master_df, output_dir, segment_duration=0.5,
                  info_csv_path='../segment_info/segment_info.csv'):
    """
    Segments audio files and saves them to disk.

    For every distinct ``wav_file_path`` in `master_df` the file is read once,
    and a fixed-length window starting at each row's ``onset_time`` is written
    to `output_dir` as its own WAV file. Windows that run past the end of the
    recording are zero-padded to the full length.

    Parameters
    ----------
    master_df : pandas.DataFrame
        Must provide the columns ``wav_file_path``, ``onset_time``,
        ``dataset``, ``participant_id``, ``instrument_label``,
        ``onset_phoneme`` and ``coda_phoneme``. The index label of each row is
        formatted with ``:04d`` into the output filename.
    output_dir : str or path-like
        Directory that receives the segment WAV files; created (including
        missing parents) if it does not exist.
    segment_duration : float, default 0.5
        Segment length, multiplied by the file's sample rate to get frames.
    info_csv_path : str or path-like, default '../segment_info/segment_info.csv'
        Where the segment metadata table is written as CSV. Its parent
        directory is created if needed.

    Returns
    -------
    pandas.DataFrame
        One row per written segment. The columns are always present, even
        when no segment could be extracted.

    Notes
    -----
    Any exception raised while handling a file is printed and the remaining
    rows of that file are skipped; processing continues with the next file.
    """
    info_columns = [
        'segment_path', 'instrument_label', 'participant_id', 'dataset',
        'original_wav', 'onset_time', 'onset_phoneme', 'coda_phoneme',
    ]

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    segment_info = []

    # Group by wav_file_path so each recording is read from disk only once
    for wav_path, group in tqdm(master_df.groupby('wav_file_path')):
        print(f"\nProcessing {wav_path}")

        try:
            # Use soundfile instead of librosa.load
            y, sr = sf.read(wav_path)
            # Loop-invariant for this file: segment length in frames
            target_len = int(segment_duration * sr)

            # Process each onset in this file
            for idx, row in group.iterrows():
                segment = _extract_padded_segment(
                    y, int(row['onset_time'] * sr), target_len)
                if segment is None:
                    # Onset lies outside the recording; nothing to write
                    continue

                segment_filename = (f"{row['dataset']}_{row['participant_id']}_"
                                    f"{row['instrument_label']}_{idx:04d}.wav")
                segment_path = output_dir / segment_filename
                sf.write(str(segment_path), segment, sr)

                segment_info.append({
                    'segment_path': str(segment_path),
                    'instrument_label': row['instrument_label'],
                    'participant_id': row['participant_id'],
                    'dataset': row['dataset'],
                    'original_wav': wav_path,
                    'onset_time': row['onset_time'],
                    'onset_phoneme': row['onset_phoneme'],
                    'coda_phoneme': row['coda_phoneme']
                })

        except Exception as e:
            # Best-effort: report the failure and move on to the next file
            print(f"Error processing {wav_path}: {str(e)}")
            continue

    # Explicit columns keep the schema stable even when nothing was extracted
    segment_df = pd.DataFrame(segment_info, columns=info_columns)

    info_csv_path = Path(info_csv_path)
    info_csv_path.parent.mkdir(parents=True, exist_ok=True)
    segment_df.to_csv(info_csv_path, index=False)

    return segment_df

In [5]:
# Load the onset annotations that drive the segmentation
master_df = pd.read_csv('../data/master_dataset.csv')

# Cut every recording into per-onset segments
print("Starting audio segmentation...")
segment_df = segment_audio(master_df, output_dir='../segments')

# Report how many segments were produced and how they break down
print("\nSegmentation Summary:")
print(f"Total segments extracted: {len(segment_df)}")
for heading, column in (
    ("\nInstrument distribution:", 'instrument_label'),
    ("\nDataset distribution:", 'dataset'),
):
    print(heading)
    print(segment_df[column].value_counts())

Starting audio segmentation...


  6%|▌         | 10/180 [00:00<00:01, 98.11it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_1/P1_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_1/P1_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_1/P1_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_1/P1_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_1/P1_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_10/P10_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_10/P10_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_10/P10_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_10/P10_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_10/P10_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal

 17%|█▋        | 31/180 [00:00<00:01, 99.88it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_12/P12_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_13/P13_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_13/P13_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_13/P13_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_13/P13_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_13/P13_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_14/P14_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_14/P14_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_14/P14_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_14/P14_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Datase

 29%|██▉       | 52/180 [00:00<00:01, 88.24it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_17/P17_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_17/P17_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_17/P17_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_17/P17_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_18/P18_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_18/P18_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_18/P18_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_18/P18_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_18/P18_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_19/P19_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Datase

 39%|███▉      | 70/180 [00:00<00:01, 84.45it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_2/P2_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_2/P2_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_2/P2_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_20/P20_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_20/P20_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_20/P20_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_20/P20_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_20/P20_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_21/P21_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_21/P21_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Pers

 49%|████▉     | 88/180 [00:01<00:01, 77.72it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_22/P22_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_22/P22_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_23/P23_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_23/P23_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_23/P23_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_23/P23_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_23/P23_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_24/P24_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_24/P24_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_24/P24_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Datase

 60%|██████    | 108/180 [00:01<00:00, 84.71it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_26/P26_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_26/P26_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_26/P26_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_26/P26_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_26/P26_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_27/P27_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_27/P27_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_27/P27_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_27/P27_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_27/P27_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Datase

 72%|███████▏  | 130/180 [00:01<00:00, 96.00it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_3/P3_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_4/P4_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_4/P4_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_4/P4_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_4/P4_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_4/P4_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_5/P5_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_5/P5_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_5/P5_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_5/P5_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participa

 79%|███████▉  | 142/180 [00:01<00:00, 101.05it/s]


Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_8/P8_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_8/P8_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_8/P8_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_8/P8_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_9/P9_HHclosed_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_9/P9_HHopened_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_9/P9_Improvisation_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_9/P9_Kick_Personal.wav

Processing ../../AVP-LVT_Dataset/AVP_Dataset/Personal/Participant_9/P9_Snare_Personal.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Frase/AFRP3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Frase/AZiP3.wav

Processing ../../AVP-LVT_Dataset/

100%|██████████| 180/180 [00:01<00:00, 96.49it/s] 



Processing ../../AVP-LVT_Dataset/LVT_Dataset/Frase/RicP3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Frase/RobP3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Frase/SofP3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Frase/ZgaP3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Frase/ZizP3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/AFRI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/AZiI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/BeaI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/BicI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/CatI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/CavI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/CraI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/IsaI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/JOlI3.wav

Processing ../../AVP-LVT_Dataset/LVT_Dataset/Improviso/JSiI3.wav

Processing ../../AVP-LVT_Data