In [None]:
import os
!pip install pyworld
import pyworld as pw
import numpy as np
import soundfile as sf
import librosa



In [None]:
def _fill_unvoiced_f0(f0):
    """Return a float64 copy of ``f0`` with unvoiced frames filled in.

    Frames that are NaN or exactly 0 are treated as unvoiced and replaced by
    linear interpolation between the surrounding voiced frames. Gaps at either
    end take the nearest voiced value (``np.interp`` clamps outside its range).
    If no frame is voiced at all, every frame is set to 0.
    """
    filled = np.array(f0, dtype=np.float64)  # always a fresh copy
    unvoiced = np.isnan(filled) | (filled == 0)
    if unvoiced.any():
        voiced_idx = np.flatnonzero(~unvoiced)
        if voiced_idx.size:
            filled[unvoiced] = np.interp(
                np.flatnonzero(unvoiced), voiced_idx, filled[voiced_idx]
            )
        else:
            # Nothing to interpolate from.
            filled[unvoiced] = 0.0
    # Defensive: guarantees the result never contains NaN.
    return np.nan_to_num(filled, nan=0.0)


def extract_features(y, sr, frame_length=2048, hop_length=None):
    """Extract frame-aligned F0, RMS energy and spectral envelope from a signal.

    F0 is estimated with ``librosa.pyin`` (search range C2..C7), RMS with
    ``librosa.feature.rms`` and the spectral envelope with ``pw.cheaptrick``.
    All three use the same ``frame_length`` / ``hop_length`` so that row ``i``
    of every output describes the same frame.

    Parameters
    ----------
    y : array-like
        Mono audio signal (1-D, non-empty). It is converted to a contiguous
        float64 array before analysis.
    sr : int
        Sample rate of ``y`` in Hz.
    frame_length : int, optional
        Analysis window length in samples (default 2048).
    hop_length : int or None, optional
        Hop between frames in samples. ``None`` (default) uses
        ``frame_length // 4``, i.e. 512 for the default frame length.

    Returns
    -------
    times_f0 : np.ndarray
        Frame timestamps in seconds.
    f0_clean : np.ndarray
        F0 contour with unvoiced frames filled by interpolation
        (all zeros if no frame was voiced).
    rms : np.ndarray
        RMS energy per frame.
    sp : np.ndarray
        Output of ``pw.cheaptrick``: one row per frame.
    hop_seconds : float
        Hop size in seconds (``hop_length / sr``).

    Raises
    ------
    ValueError
        If ``y`` is empty or not one-dimensional.
    """
    y = np.ascontiguousarray(y, dtype=np.float64)
    if y.ndim != 1 or y.size == 0:
        raise ValueError(
            f"extract_features expects a non-empty 1-D signal, got shape {y.shape}"
        )
    if hop_length is None:
        hop_length = frame_length // 4

    # === F0 via pYIN (the voicing outputs are not used) ===
    f0, _, _ = librosa.pyin(
        y, sr=sr,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        frame_length=frame_length,
        hop_length=hop_length,
    )

    # === Timestamps matching the F0 frames ===
    times_f0 = librosa.times_like(f0, sr=sr, hop_length=hop_length)

    # === Fill unvoiced (NaN / zero) frames ===
    f0_clean = _fill_unvoiced_f0(f0)

    # === RMS energy on the same frame grid ===
    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]

    # Truncate everything to the shortest track in case the frame counts differ.
    n_frames = min(len(f0_clean), len(rms))
    f0_clean = np.ascontiguousarray(f0_clean[:n_frames], dtype=np.float64)
    times_f0 = np.ascontiguousarray(times_f0[:n_frames], dtype=np.float64)
    rms = rms[:n_frames]

    # === Spectral envelope ===
    # The interpolated contour is what cheaptrick receives; unvoiced frames are
    # NOT zeroed here.
    sp = pw.cheaptrick(y, f0_clean, times_f0, sr)

    return times_f0, f0_clean, rms, sp, (hop_length / sr)

In [None]:
# Source recordings (VocalSet) and destination for the extracted features.
root_input = "/content/drive/MyDrive/data/VocalSet Singing Voice Dataset/FULL"
root_output = "/content/drive/MyDrive/data/audio_reps/VocalSet4"
os.makedirs(root_output, exist_ok=True)

# Techniques shared by the arpeggio and scale exercises.
_arpeggio_scale_techniques = (
    "straight",
    "vibrato",
    "fast_forte",
    "fast_piano",
    "slow_forte",
    "slow_piano",
)

# Exercise folder -> technique sub-folders to process; everything else is skipped.
include_folders = {
    "arpeggios": list(_arpeggio_scale_techniques),
    "scales": list(_arpeggio_scale_techniques),
    "long_tones": ["straight", "messa", "forte", "pp"],
}

In [None]:
# FRAME DATA GENERATION
# For every singer / exercise / technique folder listed in `include_folders`,
# trim leading and trailing silence from each .wav file, extract frame-level
# features and store them as a compressed .npz archive under `root_output`
# (mirroring the input directory layout). Files whose archive already exists
# are skipped, so the cell can be re-run to resume an interrupted job.

# Loop-invariant settings for the RMS-based silence trim.
frame_length = 2048
hop_length = 512
threshold = 0.005  # frames with RMS above this count as active

# 🚀 Traverse each singer (sorted so every run visits files in the same order)
for singer in sorted(os.listdir(root_input)):
    singer_path = os.path.join(root_input, singer)
    if not os.path.isdir(singer_path):
        continue

    for top_level, allowed_subdirs in include_folders.items():
        top_level_path = os.path.join(singer_path, top_level)
        if not os.path.isdir(top_level_path):
            continue

        for subdir in allowed_subdirs:
            subdir_path = os.path.join(top_level_path, subdir)
            if not os.path.isdir(subdir_path):
                continue

            for filename in sorted(os.listdir(subdir_path)):
                if not filename.endswith(".wav"):
                    continue

                file_path = os.path.join(subdir_path, filename)

                # 🗂 Construct output path (same relative path, extension stripped)
                rel_path = os.path.relpath(file_path, root_input)
                outname = os.path.splitext(rel_path)[0]
                full_output_path = os.path.join(root_output, outname)
                os.makedirs(os.path.dirname(full_output_path), exist_ok=True)

                features_path = full_output_path + "_features.npz"
                if os.path.exists(features_path):
                    print(f"⏩ Skipping {filename}: features already exist")
                    continue

                # 📥 Load full audio (sr=None keeps the file's native sample rate)
                y, sr = librosa.load(file_path, sr=None)
                rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
                times = librosa.frames_to_time(np.arange(len(rms)), sr=sr, hop_length=hop_length)

                # ✂️ Trim start/end silence based on threshold
                active_frames = np.where(rms > threshold)[0]
                if len(active_frames) < 3:
                    print(f"⚠️ Skipping {filename}: not enough active frames")
                    continue

                # Keep the samples between the first and last active frame times.
                start_sample = int(times[active_frames[0]] * sr)
                end_sample = int(times[active_frames[-1]] * sr)
                y_trimmed = y[start_sample:end_sample]

                # 🎯 Extract pitch, RMS, spectral envelope, and hop duration
                time_vals, f0_vals, rms_vals, M, hop = extract_features(y_trimmed, sr=sr)

                print(M.shape)

                # 💾 Save one .npz per file with arrays: time, f0, rms, M.
                # Write to a temporary name first and move it into place, so an
                # interrupted run never leaves a truncated archive that the
                # "already exists" check above would skip on the next run.
                # The temp name must end in ".npz", otherwise numpy appends it.
                tmp_path = full_output_path + "_features.tmp.npz"
                np.savez_compressed(tmp_path,
                    time=time_vals, f0=f0_vals, rms=rms_vals, M=M)
                os.replace(tmp_path, features_path)

                print(f"✅ Saved trimmed features for {filename}: {len(time_vals)} frames")

(489, 1025)
✅ Saved trimmed features for f9_arpeggios_straight_o.wav: 489 frames
(505, 1025)
✅ Saved trimmed features for f9_arpeggios_straight_e.wav: 505 frames
(500, 1025)
✅ Saved trimmed features for f9_arpeggios_straight_i.wav: 500 frames
(514, 1025)
✅ Saved trimmed features for f9_arpeggios_straight_a.wav: 514 frames
(527, 1025)
✅ Saved trimmed features for f9_arpeggios_straight_u.wav: 527 frames
(574, 1025)
✅ Saved trimmed features for f9_arpeggios_vibrato_i.wav: 574 frames
(553, 1025)
✅ Saved trimmed features for f9_arpeggios_vibrato_a.wav: 553 frames
(556, 1025)
✅ Saved trimmed features for f9_arpeggios_vibrato_e.wav: 556 frames
(550, 1025)
✅ Saved trimmed features for f9_arpeggios_vibrato_o.wav: 550 frames
(561, 1025)
✅ Saved trimmed features for f9_arpeggios_vibrato_u.wav: 561 frames
(155, 1025)
✅ Saved trimmed features for f9_arpeggios_c_fast_forte_o.wav: 155 frames
(128, 1025)
✅ Saved trimmed features for f9_arpeggios_f_fast_forte_a.wav: 128 frames
(145, 1025)
✅ Saved trimm