<a href="https://colab.research.google.com/github/YosukeSugiura/-_-/blob/main/%E6%BC%94%E7%BF%92%E8%AA%B2%E9%A1%8C%EF%BC%91%EF%BC%9AWorld%E3%81%AB%E3%82%88%E3%82%8B%E5%A3%B0%E8%B3%AA%E5%A4%89%E6%8F%9B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""Voice_Conversion_experiments_online_demo.ipynb

Automatically generated by Colab.

# Voice Conversion (Teaching Notebook)

**Created:** 2025-09-28
**Author:** Yosuke Sugiura (Saitama University)
**Purpose:** Hands-on exercise on voice conversion using WORLD (Pitch/Formant/Aperiodicity) in Google Colab.
**Audience:** Undergraduate/Graduate audio/speech class
**Notes:** Tested on Colab (Python 3.12), Gradio 5.47.2, librosa 0.10.2.post1, pyworld 0.3.4.
"""

# =========================
# Setup (Colab)
# =========================
# Base build tools
!pip -q install -U pip setuptools wheel
!pip -q install -U soxr>=0.3.7

# NumPy pinned to satisfy TF (<2.2) & numba (<2.1), and keep OpenCV happy (>=2)
!pip -q install -U "numpy==2.0.2"

# Audio stack
!pip -q install -U librosa==0.10.2.post1 soundfile==0.12.1 pyworld==0.3.4

# Gradio 5.x (includes updated client); websockets >=14 to satisfy other libs too
!pip -q install -U "gradio==5.47.2" "websockets>=14,<16"

# 日本語フォント（図の日本語表示用）
!apt-get -y -qq install fonts-ipafont-gothic > /dev/null

# =========================
# Cell 2 — Core functions
# =========================
# WORLD-based voice conversion with:
# - Pitch (F0) shift in semitones
# - Formant shift via Mel-scale spectral-envelope warping
# - Aperiodicity control (single slider: breathiness b)
# - Robust loader: decode at native SR, resample to 24 kHz only if needed

import numpy as np
import librosa, soundfile as sf
import pyworld as pw
import matplotlib
matplotlib.rcParams["font.family"] = "IPAGothic"   # Japanese-capable font for figure text
matplotlib.rcParams["axes.unicode_minus"] = False  # workaround for minus-sign rendering with this font

SR = 24000  # working sample rate for WORLD

def _f64c(x):
    """Make array float64 & C-contiguous (pyworld is strict)."""
    return np.ascontiguousarray(x, dtype=np.float64)

def load_wav(path, target_sr=SR, do_trim=False):
    """
    Read an audio file as mono and return ``(samples, target_sr)``.

    The file is decoded at its own sample rate and resampled to ``target_sr``
    only when the two differ. With ``do_trim=True`` leading/trailing silence is
    removed first (``top_db=30``). The result is passed through
    ``librosa.util.normalize`` and returned as a float64, C-contiguous array.
    """
    # sr=None keeps the native sample rate instead of librosa's default.
    signal, native_sr = librosa.load(path, sr=None, mono=True)

    if do_trim:
        signal = librosa.effects.trim(signal, top_db=30)[0]

    needs_resample = native_sr != target_sr
    if needs_resample:
        signal = librosa.resample(
            signal, orig_sr=native_sr, target_sr=target_sr, res_type="soxr_hq"
        )

    return _f64c(librosa.util.normalize(signal)), target_sr

def world_analyze(x, sr):
    """
    Run WORLD analysis on waveform ``x`` sampled at ``sr``.

    Returns ``(f0, sp, ap, t)``: F0 from ``harvest`` (searched between 50 and
    600 Hz), spectral envelope from ``cheaptrick``, aperiodicity from ``d4c``
    and the frame time axis reported by ``harvest``. The first three are
    converted to float64, C-contiguous arrays.
    """
    signal = _f64c(x)
    f0, timeaxis = pw.harvest(signal, sr, f0_floor=50.0, f0_ceil=600.0)
    envelope = pw.cheaptrick(signal, f0, timeaxis, sr)   # [n_frame, n_bin], linear-freq envelope
    aperiodicity = pw.d4c(signal, f0, timeaxis, sr)      # [n_frame, n_bin], 0..1
    return _f64c(f0), _f64c(envelope), _f64c(aperiodicity), timeaxis

def world_synthesize(f0, sp, ap, sr):
    """
    Synthesize a waveform from WORLD parameters and peak-normalize it.

    Non-finite entries in the inputs are replaced before synthesis. The
    returned signal is float32, scaled so its absolute peak is (just under) 1.
    """
    def _finite(arr, nan, posinf, neginf):
        # Leave clean arrays untouched; only patch when something is NaN/inf.
        if np.isfinite(arr).all():
            return arr
        return np.nan_to_num(arr, nan=nan, posinf=posinf, neginf=neginf)

    f0 = _finite(_f64c(f0), nan=0.0, posinf=0.0, neginf=0.0)
    sp = _finite(_f64c(sp), nan=1e-6, posinf=1.0, neginf=1e-6)
    ap = _finite(_f64c(ap), nan=0.0, posinf=1.0, neginf=0.0)

    wave = pw.synthesize(f0, sp, ap, sr)
    # The small offset keeps the division defined for an all-zero signal.
    peak = float(np.abs(wave).max() + 1e-9)
    normalized = wave / peak
    return normalized.astype(np.float32)

def _spec_image_path_from_audio(y, sr, title="スペクトログラム"):
    """Create and save a spectrogram PNG from waveform y. Returns file path (PNG)."""
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        import librosa
        import librosa.display as lbd
        import numpy as np
        import tempfile

        # STFT magnitude -> dB
        D = np.abs(librosa.stft(y, n_fft=1024, hop_length=256, win_length=1024))
        Db = librosa.amplitude_to_db(D, ref=np.max)

        fig = plt.figure(figsize=(8, 3))
        ax = fig.add_subplot(111)
        lbd.specshow(Db, sr=sr, hop_length=256, x_axis="time", y_axis="linear", ax=ax)
        ax.set_title(title)
        ax.set_xlabel("時間 [s]")
        ax.set_ylabel("周波数 [Hz]")
        fig.tight_layout()

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
        fig.savefig(tmp.name, dpi=150)
        plt.close(fig)
        return tmp.name
    except Exception:
        return None

def shift_pitch(f0, semitone):
    """
    Return a copy of ``f0`` with positive (voiced) values scaled by
    ``2 ** (semitone / 12)``; entries that are not positive are left as-is.
    """
    scale = 2.0 ** (semitone / 12.0)
    shifted = f0.copy()          # never modify the caller's contour
    voiced = shifted > 0.0
    shifted[voiced] *= scale
    return _f64c(shifted)

# ----- Mel utilities -----
def hz2mel(f):
    """Convert frequency in Hz to mel: ``2595 * log10(1 + f / 700)``."""
    return 2595.0 * np.log10(1.0 + f / 700.0)
def mel2hz(m):
    """Convert mel to frequency in Hz: ``700 * (10 ** (m / 2595) - 1)``."""
    return 700.0 * (10.0**(m / 2595.0) - 1.0)

def warp_spectral_envelope_mel(sp, sr, formant_ratio, tilt_db_per_oct=0.0):
    """
    Shift formants by warping a WORLD spectral envelope along the Mel axis.

    * ``sp`` is a 2-D array ``[n_frame, n_bin]``; bins are treated as evenly
      spaced from 0 Hz to ``sr / 2``.
    * Each output bin reads the envelope at ``mel / formant_ratio``, linearly
      interpolated in log-amplitude so peak shapes are preserved.
    * An optional spectral tilt (dB per octave around 1 kHz) is applied when
      ``tilt_db_per_oct`` is non-zero; the class keeps it fixed at 0 for
      simplicity.
    * Every frame is rescaled so that its summed envelope matches the input
      frame, avoiding an overall level drift.

    Returns the warped envelope as a float64, C-contiguous array.
    """
    envelope = _f64c(sp)
    _, n_bins = envelope.shape

    # Bin frequencies, floored at 20 Hz before the Mel mapping.
    bin_hz = np.maximum(np.linspace(0.0, sr/2, n_bins), 20.0)

    # Source frequency that each output bin should read from.
    src_mel = hz2mel(bin_hz) / max(float(formant_ratio), 1e-6)
    src_hz = mel2hz(src_mel)

    # Fractional source bin index, clamped to the valid range, split into the
    # two neighbouring bins and the interpolation weight between them.
    src_pos = np.clip(src_hz / (sr/2) * (n_bins - 1), 0, n_bins - 1)
    idx_lo = np.floor(src_pos).astype(int)
    idx_hi = np.clip(idx_lo + 1, 0, n_bins - 1)
    frac = (src_pos - idx_lo)[None, :]

    log_env = np.log(np.maximum(envelope, 1e-12))
    warped = np.exp((1 - frac) * log_env[:, idx_lo] + frac * log_env[:, idx_hi])

    if abs(tilt_db_per_oct) > 1e-6:
        # Gain grows/shrinks by tilt_db_per_oct for every octave away from 1 kHz.
        octaves = np.log2(np.maximum(bin_hz / 1000.0, 1e-6))
        tilt_gain = 10.0 ** ((tilt_db_per_oct * octaves) / 20.0)
        warped *= tilt_gain[None, :]

    # Per-frame level matching against the unwarped envelope.
    power_in = np.sum(envelope, axis=1, keepdims=True) + 1e-9
    power_out = np.sum(warped, axis=1, keepdims=True) + 1e-9
    warped *= (power_in / power_out)
    return _f64c(warped)

# --- 非周期性（息っぽさ）b：ロジット1本化 ---
def adjust_aperiodicity_unified(ap, b, k=2.0):
    """
    Adjust aperiodicity with a single "breathiness" control.

    ``ap`` is clipped to ``[1e-6, 1 - 1e-6]``, mapped to logit space, offset
    by ``k * b`` and mapped back through the logistic function. ``b = 0``
    leaves the (clipped) values unchanged; positive ``b`` raises them and
    negative ``b`` lowers them. The result is clipped to ``[0, 1]``.
    """
    eps = 1e-6
    p = np.clip(ap, eps, 1 - eps)          # keep the logit finite
    shifted_logit = np.log(p / (1 - p)) + k * b
    adjusted = 1.0 / (1.0 + np.exp(-shifted_logit))
    return np.clip(adjusted, 0.0, 1.0)

def match_stats_to_target_mel(f0_src, sp_src, f0_tgt, sp_tgt, sr):
    """
    Estimate rough conversion parameters that move a source voice toward a
    target voice (not used in the UI).

    Returns ``(semitones, formant_ratio)``:
    * ``semitones`` - pitch offset between the median F0 of the two inputs,
      using only frames with F0 above 1 Hz; 0.0 if either has no such frame.
    * ``formant_ratio`` - ratio of the median per-frame Mel spectral centroids
      (target / source), clipped to ``[0.6, 1.8]``.
    """
    voiced_src = f0_src[f0_src > 1.0]
    voiced_tgt = f0_tgt[f0_tgt > 1.0]
    if len(voiced_src) and len(voiced_tgt):
        semi = 12*np.log2(np.median(voiced_tgt)/np.median(voiced_src))
    else:
        semi = 0.0

    def _median_mel_centroid(envelope):
        # Bin frequencies (floored at 20 Hz) expressed on the Mel axis.
        freqs = np.maximum(np.linspace(0, sr/2, envelope.shape[1]), 20.0)
        mel_axis = hz2mel(freqs)

        def _centroid(frame):
            weights = frame + 1e-9   # guard against all-zero frames
            return (mel_axis*weights).sum()/weights.sum()

        return float(np.median([_centroid(frame) for frame in envelope]))

    centroid_src = _median_mel_centroid(sp_src)
    centroid_tgt = _median_mel_centroid(sp_tgt)
    formant_ratio = float(np.clip(centroid_tgt/max(centroid_src, 1e-9), 0.6, 1.8))
    return float(semi), formant_ratio

# =========================
# Cell 1 — Reset server
# =========================
# Prevent event-loop conflicts on re-run in Colab.
import gradio as gr
try:
    # Best effort: close whatever Gradio apps are still around from an earlier
    # run of this notebook. A failure here is deliberately ignored so the
    # notebook can continue.
    gr.close_all()
except Exception:
    pass

# NOTE(review): presumably needed so Gradio can run its event loop inside the
# notebook's already-running loop — confirm it is still required.
import nest_asyncio
nest_asyncio.apply()

# =========================
# Cell 3 — Gradio UI (label → hint → slider; no Target; URL only)
# =========================
import gradio as gr
import tempfile, os, traceback, json
import soundfile as sf
import numpy as np

def _to_path(obj):
    if obj is None:
        return None
    if isinstance(obj, (str, os.PathLike)):
        return str(obj)
    if isinstance(obj, dict) and "name" in obj:
        return obj["name"]
    if isinstance(obj, (list, tuple)) and len(obj) > 0:
        v = obj[0]
        if isinstance(v, (str, os.PathLike)):
            return str(v)
        if isinstance(v, dict) and "name" in v:
            return v["name"]
    raise ValueError("Unsupported audio object from UI.")

def process(source_audio, pitch_semitone, formant_ratio, breathiness_b):
    """
    Gradio callback: convert the input voice with the given slider settings.

    Args:
        source_audio: Value of the input audio component (resolved to a file
            path via ``_to_path``).
        pitch_semitone: Pitch shift in semitones.
        formant_ratio: Mel-axis formant warping ratio.
        breathiness_b: Aperiodicity ("breathiness") control.

    Returns:
        A 4-tuple ``(wav_path, info_markdown, spec_before_png, spec_after_png)``.
        On invalid input or on any error the audio and image entries are
        ``None`` and the text entry carries the message (for unexpected
        errors, the formatted traceback). The callback itself never raises.
    """
    try:
        src_path = _to_path(source_audio)
        if not (src_path and os.path.exists(src_path)):
            return None, "入力音声を読み込めませんでした。録音またはファイルを指定してください。", None, None

        x, sr = load_wav(src_path, SR)
        # Reject clips shorter than 0.3 s.
        if len(x) < int(0.3 * SR):
            return None, "音声が短すぎます（0.5秒以上を推奨）。", None, None

        # WORLD analysis
        f0, sp, ap, _ = world_analyze(x, sr)

        # Conversion (pitch / formant / aperiodicity from the single slider)
        f0_m = shift_pitch(f0, pitch_semitone)
        sp_m = warp_spectral_envelope_mel(sp, sr, formant_ratio, tilt_db_per_oct=0.0)  # tilt fixed at 0
        ap_m = adjust_aperiodicity_unified(ap, b=float(breathiness_b), k=2.0)

        y = world_synthesize(f0_m, sp_m, ap_m, sr)

        # Save the converted audio as WAV. Reserve a unique name and close the
        # handle first so no descriptor stays open while soundfile writes to
        # the same path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            wav_path = tmp.name
        sf.write(wav_path, y, sr)

        info = (
            f"ピッチ: {pitch_semitone:+.1f} semitone｜"
            f"フォルマント比(Mel): ×{formant_ratio:.2f}｜"
            f"非周期性（息っぽさ）b: {breathiness_b:+.2f}"
        )

        # Spectrograms (before / after)
        spec_src  = _spec_image_path_from_audio(x, sr, title="スペクトログラム（変換前）")
        spec_conv = _spec_image_path_from_audio(y, sr, title="スペクトログラム（変換後）")

        return wav_path, info, spec_src, spec_conv

    except Exception:
        # Top-level UI boundary: show the traceback in the text output
        # instead of letting the callback fail.
        return None, f"**エラー**\n\n```traceback\n{traceback.format_exc()}\n```", None, None

# Helper that saves the original audio (the recording as-is) to a WAV file
def _save_original(source_audio):
    """
    Copy the input audio to a fresh temporary WAV file for downloading.

    Multi-channel audio is reduced to its first channel; the sample rate read
    from the file is kept.

    Returns:
        Path of the temporary WAV file, or ``None`` when no readable input is
        available or anything goes wrong (best-effort: the UI treats ``None``
        as "nothing to download").
    """
    try:
        src_path = _to_path(source_audio)
        if not (src_path and os.path.exists(src_path)):
            return None
        y, sr = sf.read(src_path)
        if y.ndim > 1:
            y = y[:, 0]  # keep the first channel only
        # Reserve a unique name and close the handle before soundfile writes
        # to that path, so no file descriptor is left open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            wav_path = tmp.name
        sf.write(wav_path, y, sr)
        return wav_path
    except Exception:
        return None

# Build the UI: input audio -> download button -> three parameter sliders ->
# convert button -> converted audio, info text and before/after spectrograms.
with gr.Blocks() as demo:
    gr.Markdown("## WORLDベース音声変換（Melスケール・フォルマントワープ）")
    gr.Markdown("短い**話し声**を録音／アップロードし、スライダを調整して**変換**を押してください。最下部に**変換前／変換後**のスペクトログラムが**横並び**で表示されます。")

    # UI tweak: hide the Audio clear (x) button and define the CSS classes
    # used by the parameter titles/hints below.
    gr.HTML("""
    <style>
      button[aria-label="Clear"] { display: none !important; }  /* Audioの×ボタン非表示 */
      .param-title { font-weight:600; margin:8px 0 2px; }
      .param-hint  { font-size:0.85rem; color:#6b7280; margin:0 0 6px; }
    </style>
    """)

    # Input audio
    src = gr.Audio(sources=["microphone","upload"], type="filepath",
                   label="入力音声（自分の声）",
                   waveform_options={"show_recording_waveform": True})

    # One-click download of the original recording
    save_dl = gr.DownloadButton("原音WAVをダウンロード", variant="primary")

    # Reuse the module-level `_save_original` helper defined above instead of
    # redefining an identical copy here. The button itself is the output, so
    # the returned path becomes its value (intended to start the download);
    # a None return (nothing selected) leaves nothing to download.
    save_dl.click(_save_original, inputs=[src], outputs=[save_dl])

    # Sliders: each one is preceded by its own HTML title + hint, so the
    # built-in slider label is hidden.
    gr.HTML("<div class='param-title'>ピッチ（semitone）</div>"
            "<div class='param-hint'>上げる→高く若め／下げる→低く落ち着き</div>")
    pitch = gr.Slider(-12, 12, value=0.0, step=0.1, label="", show_label=False)

    gr.HTML("<div class='param-title'>フォルマント比（Mel）</div>"
            "<div class='param-hint'>1.0より大→明るめ・子ども声寄り／1.0より小→暗め・大人声寄り</div>")
    formant = gr.Slider(0.6, 1.8, value=1.00, step=0.01, label="", show_label=False)

    gr.HTML("<div class='param-title'>非周期性（息っぽさ） b</div>"
            "<div class='param-hint'>-1でクリア／+1で息多め（推奨 ±0.6）</div>")
    breath = gr.Slider(-1.0, 1.0, value=0.0, step=0.05, label="", show_label=False)

    # Run button and outputs
    btn = gr.Button("変換")
    out_audio = gr.Audio(label="変換後音声")  # converted audio
    out_text  = gr.Markdown()                # short parameter summary, or the error text from process()
    with gr.Row():
        out_spec_src  = gr.Image(label="スペクトログラム（変換前）", type="filepath")
        out_spec_conv = gr.Image(label="スペクトログラム（変換後）", type="filepath")

    btn.click(process, inputs=[src, pitch, formant, breath],
              outputs=[out_audio, out_text, out_spec_src, out_spec_conv])

# Launch without inline iframe; show only the public URL (+ QR code)
demo.queue()
res = demo.launch(share=True, inline=False, debug=False, show_error=True)

# --- Show only URL + QR in Colab output ---
from IPython.display import clear_output, display, Markdown, Image
clear_output(wait=True)

# Get the public URL safely across gradio versions: try attributes first,
# then fall back to the (app, local_url, share_url) tuple return.
url = getattr(res, "share_url", None) or getattr(res, "public_url", None)
if url is None:
    try:
        _, _, url = res
    except (TypeError, ValueError):
        # Not a 3-element iterable: no share URL can be extracted.
        url = None

if not url:
    print("No share URL.")
else:
    # Show the URL first so it is visible even if the QR code cannot be made.
    display(Markdown(f"### Public URL\n**{url}**"))

    # The QR code is a convenience only: install the small pure-Python
    # `segno` package on demand and skip the QR if that does not work.
    try:
        import segno
    except ImportError:
        import sys, subprocess
        subprocess.run([sys.executable, "-m", "pip", "-q", "install", "segno"], check=False)
        try:
            import segno
        except ImportError:
            segno = None

    if segno is None:
        print("QR code skipped: the 'segno' package could not be installed.")
    else:
        qr_path = "/content/gradio_url_qr.png"
        segno.make(url).save(qr_path, scale=8, border=2)  # scale controls the image size
        display(Image(qr_path))
