# TIMIT

In [None]:
import os
import numpy as np
import pyworld as pw
import soundfile as sf
import pysptk
import librosa 


def process_wav(wav_path):
    """Extract WORLD-based static and dynamic features from one audio file.

    The analysis settings are read from the module-level names
    ``frame_period`` (ms), ``order`` (mel-cepstral order) and ``alpha``
    (frequency-warping factor); they must be defined before this is called.

    Args:
        wav_path: Path to an audio file readable by ``soundfile``.

    Returns:
        ``(utt_id, features)``. ``utt_id`` is the file name without its
        extension. ``features`` is a 2-D array with one row per frame, laid
        out as ``[static | delta | delta-delta | vuv]`` where
        ``static = [mcc | log_f0 | mean_ap]``.
        ``(None, None)`` is returned when the file cannot be loaded, is not
        sampled at 16 kHz, or has too few frames for the delta window.
    """
    utt_id = os.path.splitext(os.path.basename(wav_path))[0]

    try:
        x, fs = sf.read(wav_path)
    except Exception as e:
        print(f"[ERROR] {utt_id} load failed: {e}")
        return None, None

    if fs != 16000:
        print(f"[SKIP] {utt_id}: sample rate {fs} != 16000")
        return None, None

    # WORLD analysis: F0 (refined by stonemask), spectral envelope, aperiodicity
    f0, timeaxis = pw.harvest(x, fs, frame_period=frame_period)
    f0 = pw.stonemask(x, f0, timeaxis, fs)
    sp = pw.cheaptrick(x, f0, timeaxis, fs)
    ap = pw.d4c(x, f0, timeaxis, fs)

    # Spectral envelope -> mel-cepstral coefficients
    mcc = pysptk.sp2mc(sp, order=order, alpha=alpha)

    # Collapse the aperiodicity to one value per frame (mean over frequency bins)
    ap_reduced = np.mean(ap, axis=1, keepdims=True)

    # log-F0; the small offset keeps unvoiced frames (f0 == 0) finite
    log_f0 = np.log(f0 + 1e-8).reshape(-1, 1)

    # Voiced/unvoiced flag (1 for voiced, 0 for unvoiced)
    vuv = (f0 > 0).astype(np.float32).reshape(-1, 1)

    # Static features, one row per frame: MCC + log-F0 + mean aperiodicity
    static_feat = np.hstack([mcc, log_f0, ap_reduced])

    # librosa's delta filter needs at least `width` samples along the axis it
    # differentiates; skip degenerate utterances instead of aborting the run.
    delta_width = 9  # librosa.feature.delta default
    n_frames = static_feat.shape[0]
    if n_frames < delta_width:
        print(
            f"[SKIP] {utt_id}: only {n_frames} frames, "
            f"need at least {delta_width} for delta features"
        )
        return None, None

    # Delta and delta-delta over TIME. librosa differentiates along the last
    # axis by default, which for this frame-major layout would be the feature
    # axis, so the time axis (0) is selected explicitly.
    delta = librosa.feature.delta(static_feat, width=delta_width, order=1, axis=0)
    deltadelta = librosa.feature.delta(static_feat, width=delta_width, order=2, axis=0)

    # Final feature matrix
    final_feat = np.hstack([static_feat, delta, deltadelta, vuv])

    return utt_id, final_feat


# Path and analysis settings
modes = ["TRAIN", "DEV", "TEST"]
out_dir = "timit_mcc_hubert"
os.makedirs(out_dir, exist_ok=True)

# process_wav reads frame_period, order and alpha as module-level names.
mcep_dim = 39
alpha = 0.42  # for 16 kHz audio
frame_period = 20.0  # ms
order = mcep_dim

# Index files: utt2feat.scp (key <TAB> feature path) and utt_list.txt (keys)
scp_path = os.path.join(out_dir, "utt2feat.scp")
utt_list_path = os.path.join(out_dir, "utt_list.txt")

# The index files are opened once for all splits. Reopening them in "w" mode
# inside the split loop truncates them, leaving only the last split's entries.
with open(scp_path, "w") as scp_f, open(utt_list_path, "w") as utt_f:
    for mode in modes:
        split_dir = f"kaldi/egs/timit/s5/data/TIMIT/{mode}"

        # Recursively process every wav file of this split (sorted so that the
        # index files are written in a reproducible order).
        for root, dirs, files in os.walk(split_dir):
            dirs.sort()
            for file_name in sorted(files):
                if not file_name.lower().endswith(".wav"):
                    continue
                wav_path = os.path.join(root, file_name)

                utt_id, feat = process_wav(wav_path)
                if utt_id is None:
                    continue

                # TIMIT file stems (e.g. SA1) repeat across speakers, so the
                # speaker directory name is part of the key to keep the index
                # entries unique and in line with the feature file name.
                speaker = os.path.basename(root)
                utt_key = f"{speaker}_{utt_id}"

                feat_path = os.path.join(out_dir, f"{utt_key}_feat.npy")
                np.save(feat_path, feat)

                scp_f.write(f"{utt_key}\t{feat_path}\n")
                utt_f.write(f"{utt_key}\n")

                print(f"[OK] {utt_key} saved, shape={feat.shape}")


# LibriSpeech

In [None]:
import os
import numpy as np
import pyworld as pw
import soundfile as sf
import pysptk
import librosa  # delta 계산용


def process_wav(wav_path):
    """Extract WORLD-based static and dynamic features from one audio file.

    The analysis settings are read from the module-level names
    ``frame_period`` (ms), ``order`` (mel-cepstral order) and ``alpha``
    (frequency-warping factor); they must be defined before this is called.

    Args:
        wav_path: Path to an audio file readable by ``soundfile``.

    Returns:
        ``(idx, features)``. ``idx`` is the file name without its extension.
        ``features`` is a 2-D array with one row per frame, laid out as
        ``[static | delta | delta-delta | vuv]`` where
        ``static = [mcc | log_f0 | mean_ap]``.
        ``(None, None)`` is returned when the file cannot be loaded, is not
        sampled at 16 kHz, or has too few frames for the delta window.
    """
    idx = os.path.splitext(os.path.basename(wav_path))[0]

    try:
        x, fs = sf.read(wav_path)
    except Exception as e:
        print(f"[ERROR] {idx} load failed: {e}")
        return None, None

    if fs != 16000:
        print(f"[SKIP] {idx}: sample rate {fs} != 16000")
        return None, None

    # WORLD analysis: F0 (refined by stonemask), spectral envelope, aperiodicity
    f0, timeaxis = pw.harvest(x, fs, frame_period=frame_period)
    f0 = pw.stonemask(x, f0, timeaxis, fs)
    sp = pw.cheaptrick(x, f0, timeaxis, fs)
    ap = pw.d4c(x, f0, timeaxis, fs)

    # Spectral envelope -> mel-cepstral coefficients
    mcc = pysptk.sp2mc(sp, order=order, alpha=alpha)

    # Collapse the aperiodicity to one value per frame (mean over frequency bins)
    ap_reduced = np.mean(ap, axis=1, keepdims=True)

    # log-F0; the small offset keeps unvoiced frames (f0 == 0) finite
    log_f0 = np.log(f0 + 1e-8).reshape(-1, 1)

    # Voiced/unvoiced flag (1 for voiced, 0 for unvoiced)
    vuv = (f0 > 0).astype(np.float32).reshape(-1, 1)

    # Static features, one row per frame: MCC + log-F0 + mean aperiodicity
    static_feat = np.hstack([mcc, log_f0, ap_reduced])

    # librosa's delta filter needs at least `width` samples along the axis it
    # differentiates; skip degenerate utterances instead of aborting the run.
    delta_width = 9  # librosa.feature.delta default
    n_frames = static_feat.shape[0]
    if n_frames < delta_width:
        print(
            f"[SKIP] {idx}: only {n_frames} frames, "
            f"need at least {delta_width} for delta features"
        )
        return None, None

    # Delta and delta-delta over TIME. librosa differentiates along the last
    # axis by default, which for this frame-major layout would be the feature
    # axis, so the time axis (0) is selected explicitly.
    delta = librosa.feature.delta(static_feat, width=delta_width, order=1, axis=0)
    deltadelta = librosa.feature.delta(static_feat, width=delta_width, order=2, axis=0)

    # Final feature matrix
    final_feat = np.hstack([static_feat, delta, deltadelta, vuv])

    return idx, final_feat


# Path and analysis settings
modes = ["test-clean", "dev-other", "test-other"]
out_dir = "librispeech_mcc"
os.makedirs(out_dir, exist_ok=True)

# process_wav reads frame_period, order and alpha as module-level names.
mcep_dim = 39
alpha = 0.42  # for 16 kHz audio
frame_period = 10.0  # ms
order = mcep_dim

# Index files: utt2feat.scp (key <TAB> feature path) and utt_list.txt (keys)
scp_path = os.path.join(out_dir, "utt2feat.scp")
utt_list_path = os.path.join(out_dir, "utt_list.txt")

# The index files are opened once for all subsets. Reopening them in "w" mode
# inside the subset loop truncates them, leaving only the last subset's entries.
with open(scp_path, "w") as scp_f, open(utt_list_path, "w") as utt_f:
    for mode in modes:
        split_dir = f"kaldi/egs/librispeech/s5/data/{mode}/LibriSpeech/{mode}"

        # Recursively process every flac file of this subset (sorted so that
        # the index files are written in a reproducible order).
        for root, dirs, files in os.walk(split_dir):
            dirs.sort()
            for file_name in sorted(files):
                if not file_name.lower().endswith(".flac"):
                    continue
                wav_path = os.path.join(root, file_name)

                idx, feat = process_wav(wav_path)
                if idx is None:
                    continue

                feat_path = os.path.join(out_dir, f"{idx}-feat.npy")
                np.save(feat_path, feat)

                scp_f.write(f"{idx}\t{feat_path}\n")
                utt_f.write(f"{idx}\n")

                print(f"[OK] {idx} saved, shape={feat.shape}")


# Vocoder Test Code

### Method1

In [None]:
# Sample TIMIT test utterance; the commented-out experiment cells that follow
# read `wav_path`, while the active Method cells assign their own value.
wav_path = "kaldi/egs/timit/s5/data/TIMIT/TEST/DR8/FCMH1/SA1.WAV"

In [None]:
# import os
# import numpy as np
# import pyworld as pw
# import soundfile as sf
# import pysptk

# x, fs = sf.read(wav_path)
# f0, timeaxis = pw.harvest(x, fs)
# sp = pw.cheaptrick(x, f0, timeaxis, fs)
# ap = pw.d4c(x, f0, timeaxis, fs)

# # 2. sp → mcep
# alpha = 0.42
# order = 40
# mcep = pysptk.sp2mc(sp, order=order, alpha=alpha)

In [None]:
# alpha = 0.42
# order = 24
# fftlen = 1024
# frame_period = 10.0

# x, fs = sf.read(wav_path)
# f0, timeaxis = pw.harvest(x, fs, frame_period=10.0)
# sp = pw.cheaptrick(x, f0, timeaxis, fs)
# ap = pw.d4c(x, f0, timeaxis, fs)

# #spectrum --> mcc
# mcep = pysptk.sp2mc(sp, order=order, alpha=alpha)

# #mcc --> spectrum
# sp = pysptk.mc2sp(mcep, alpha=alpha, fftlen=fftlen)

# # WORLD synthesis
# synthesized = pw.synthesize(f0, sp, ap, fs, frame_period)

# # save
# sf.write("synthesized.wav", synthesized.astype(np.float32), fs)

In [None]:
import numpy as np
import pyworld as pw
import pysptk
import soundfile as sf
import scipy.ndimage
import torch

# diffsptk MLPG import
from diffsptk import MLPG
import diffsptk

wav_path = "kaldi/egs/timit/s5/data/TIMIT/TEST/DR4/MLLL0/SX283.WAV"
x, fs = sf.read(wav_path)
frame_period = 10.0  # ms

# 1. WORLD analysis: F0 (refined by stonemask), spectral envelope, aperiodicity
f0, timeaxis = pw.harvest(x, fs, frame_period=frame_period)
f0 = pw.stonemask(x, f0, timeaxis, fs)
sp = pw.cheaptrick(x, f0, timeaxis, fs)
ap = pw.d4c(x, f0, timeaxis, fs)

# 2. Spectral envelope -> mel-cepstrum.
# Use the order defined in this cell; the previous code passed `order`, which
# is not defined here and only resolved through leftover kernel state.
alpha = 0.42
mcep_order = 24
mcc = pysptk.sp2mc(sp, order=mcep_order, alpha=alpha)

# 3. Delta / delta-delta calculation
def calculate_deltas(feats):
    """Compute first- and second-order dynamic features along the time axis.

    The regression windows are ``[-0.5, 0, 0.5]`` for the delta and
    ``[1, -2, 1]`` for the delta-delta, i.e. the windows ``diffsptk.MLPG``
    assumes when no ``seed`` is given, so the dynamics fed to MLPG are
    consistent with the windows it inverts. Frames beyond either end of the
    sequence are replicated from the edge frame.

    Args:
        feats: 2-D array with one row per frame (time along axis 0).

    Returns:
        ``(delta, delta_delta)``, each with the same shape as ``feats``.
    """
    padded = np.pad(feats, ((1, 1), (0, 0)), mode='edge')
    prev_frames = padded[:-2]   # x[t-1]
    centre_frames = padded[1:-1]  # x[t]
    next_frames = padded[2:]    # x[t+1]

    delta = (next_frames - prev_frames) / 2
    delta_delta = next_frames - 2 * centre_frames + prev_frames
    return delta, delta_delta


# Dynamic features of the mel-cepstrum
delta, delta_delta = calculate_deltas(mcc)

# MLPG input: [static | delta | delta-delta] per frame, with a leading batch axis
features = np.concatenate((mcc, delta, delta_delta), axis=1)
features_torch = torch.from_numpy(features).float()[None]  # shape: (1, T, 3*D)

# Run MLPG over the whole utterance
T = len(features)
mlpg = MLPG(size=T)
smoothed = mlpg(features_torch)  # shape: (1, T, D)

# Drop the batch axis and convert to numpy, coefficient-major
smoothed_mcc = smoothed[0].numpy().T  # shape: (D, T)

# Mel-cepstrum -> spectral envelope, one frame at a time
fftlen = 1024
frame_spectra = []
for frame_mcc in smoothed_mcc.T:
    frame_spectra.append(pysptk.mc2sp(frame_mcc, alpha=alpha, fftlen=fftlen))
sp_recon = np.array(frame_spectra)

# WORLD synthesis from the original F0/aperiodicity and the rebuilt envelope
wav = pw.synthesize(
    f0,
    sp_recon.astype(np.float64),
    ap,
    fs,
    frame_period=frame_period,
)

# Save the reconstructed waveform
sf.write("reconstructed_mlpg_gt.wav", wav, fs)


### Method2

In [None]:
import numpy as np
import pyworld as pw
import pysptk
import soundfile as sf
import scipy.ndimage
import torch

# diffsptk MLPG import
from diffsptk import MLPG
import diffsptk

wav_path = "kaldi/egs/timit/s5/data/TIMIT/TEST/DR8/FCMH1/SA1.WAV"
x, fs = sf.read(wav_path)
frame_period = 5.0  # ms

# WORLD analysis. The frame period is passed explicitly so that analysis and
# the later synthesis cannot drift apart if `frame_period` is edited.
f0, timeaxis = pw.harvest(x, fs, frame_period=frame_period)
f0 = pw.stonemask(x, f0, timeaxis, fs)
sp = pw.cheaptrick(x, f0, timeaxis, fs)
ap = pw.d4c(x, f0, timeaxis, fs)

# Spectral envelope -> mel-cepstrum.
# Use the order defined in this cell; the previous code passed `order`, which
# is not defined here and only resolved through leftover kernel state.
alpha = 0.42
mcep_order = 24
mcc = pysptk.sp2mc(sp, order=mcep_order, alpha=alpha)

def calculate_deltas(feats):
    """Compute first- and second-order dynamic features along the time axis.

    The regression windows are ``[-0.5, 0, 0.5]`` for the delta and
    ``[1, -2, 1]`` for the delta-delta, i.e. the windows ``diffsptk.MLPG``
    assumes when no ``seed`` is given, so the dynamics fed to MLPG are
    consistent with the windows it inverts. Frames beyond either end of the
    sequence are replicated from the edge frame.

    Args:
        feats: 2-D array with one row per frame (time along axis 0).

    Returns:
        ``(delta, delta_delta)``, each with the same shape as ``feats``.
    """
    padded = np.pad(feats, ((1, 1), (0, 0)), mode='edge')
    prev_frames = padded[:-2]   # x[t-1]
    centre_frames = padded[1:-1]  # x[t]
    next_frames = padded[2:]    # x[t+1]

    delta = (next_frames - prev_frames) / 2
    delta_delta = next_frames - 2 * centre_frames + prev_frames
    return delta, delta_delta


# Dynamic features of the mel-cepstrum
delta, delta_delta = calculate_deltas(mcc)

# MLPG input: [static | delta | delta-delta] per frame, with a leading batch axis
features = np.concatenate((mcc, delta, delta_delta), axis=1)
features_torch = torch.from_numpy(features).float()[None]

# Run MLPG over the whole utterance
T = len(features)
print(features.shape)
mlpg = MLPG(size=T)
smoothed = mlpg(features_torch)

# Drop the batch axis and convert to numpy, coefficient-major
smoothed_mcc = smoothed[0].numpy().T

# Mel-cepstrum -> spectral envelope, one frame at a time
fftlen = 1024
frame_spectra = []
for frame_mcc in smoothed_mcc.T:
    frame_spectra.append(pysptk.mc2sp(frame_mcc, alpha=alpha, fftlen=fftlen))
sp_recon = np.array(frame_spectra)

# WORLD synthesis from the original F0/aperiodicity and the rebuilt envelope
wav = pw.synthesize(
    f0,
    sp_recon.astype(np.float64),
    ap,
    fs,
    frame_period=frame_period,
)

# Save the reconstructed waveform
sf.write("reconstructed_mlpg.wav", wav, fs)
