## VC-SD: Demonstration

This script demonstrates the voice conversion, voice design, and controllability features of the VC-SD framework. Please note that this script is for demonstration purposes only; the final models, training schemes, practical implementation, etc. are not provided.

For each example you can choose the audio files from the current repository or upload from your own PC.

## Install dependencies

**DO NOT RUN THIS SCRIPT INSIDE THE REPOSITORY AS IT WILL OVERRIDE DEPENDENCIES**.

The Google Colab version needs a different ipywidgets version than a regular local machine.



In [1]:
#@title Setup and Imports
!pip install ipywidgets==7.7.1

from google.colab import output
output.enable_custom_widget_manager()

!pip install torch librosa numpy descript-audiotools torchfcpe ipyfilechooser
print("All libraries installed successfully!")

All libraries installed successfully!


## Import dependencies

In [3]:
#@title Import

!git clone https://github.com/abargum/vc-sd-reproduction.git

import os
os.chdir('vc-sd-reproduction')

import io
import json
import torch
import librosa
import numpy as np
from utils.demo_utils import *
from audiotools import transforms as tfm
import ipywidgets as widgets
from IPython.display import display
from ipyfilechooser import FileChooser
from ipywidgets import widgets

def load_audio_from_state(state, start_idx=0):
    """Load audio from a picker state, optionally skipping leading samples.

    An in-memory upload (``state.uploaded_audio``) takes precedence. Otherwise
    the file at ``state.value`` is read with librosa as 16 kHz mono.

    Args:
        state: Object exposing ``uploaded_audio`` (``None`` or a
            ``(samples, sample_rate)`` tuple) and ``value`` (a file path).
        start_idx: Number of leading samples to drop. Must be non-negative
            and, when positive, smaller than the number of samples.

    Returns:
        Tuple ``(samples[start_idx:], sample_rate)``.

    Raises:
        FileNotFoundError: No upload is present and ``state.value`` is not an
            existing file.
        ValueError: ``start_idx`` is negative or lies at/after the end of the
            audio.
    """
    # Reject a negative offset up front: slicing would silently return the
    # *tail* of the clip instead of skipping its beginning.
    if start_idx < 0:
        raise ValueError(f"start_idx must be non-negative, got {start_idx}")

    if state.uploaded_audio is not None:
        samples, sr = state.uploaded_audio
    else:
        if not os.path.isfile(state.value):
            raise FileNotFoundError(f"File not found: '{state.value}'")
        samples, sr = librosa.load(state.value, sr=16000, mono=True)

    # An offset past the end would yield empty audio and only fail later with
    # a far less helpful error, so report it here.
    n_samples = len(samples)
    if start_idx > 0 and start_idx >= n_samples:
        raise ValueError(
            f"start_idx ({start_idx}) is beyond the end of the audio "
            f"({n_samples} samples)"
        )
    return samples[start_idx:], sr

def linear_map(x, src_min, src_max, dst_min, dst_max):
    """Linearly rescale ``x`` from the source range onto the destination range.

    ``x`` is clamped to ``[src_min, src_max]`` before it is mapped.
    """
    clamped = np.clip(x, src_min, src_max)
    offset = clamped - src_min
    return dst_min + offset * (dst_max - dst_min) / (src_max - src_min)

def years_to_age_param(years):
    """Map an age in years onto the age parameter.

    Ages are clamped to 15-90 years and mapped linearly onto -0.75 to 3.5.
    """
    return linear_map(years, 15, 90, -0.75, 3.5)

def semitones_to_pitch(semitones):
    """Return the pitch multiplier for a shift of ``semitones``.

    The result is a frequency ratio: +12 semitones gives 2.0, 0 gives 1.0
    and -12 gives 0.5.
    """
    octaves = semitones / 12.0
    return 2 ** octaves

def gender_param_to_label(gender_param):
    """Return "Male" for negative gender parameters and "Female" otherwise."""
    return "Male" if gender_param < 0 else "Female"

def make_audio_picker(label, default_value):
    """Build a widget for choosing an audio file by browsing or uploading.

    Args:
        label: Title shown in the picker header and in the file chooser.
        default_value: Path of the audio file that is selected initially; its
            directory is the start directory of the file chooser.

    Returns:
        Tuple ``(container, state)``. ``container`` is the ``VBox`` to
        display. ``state`` exposes ``value`` (a file path, or the name of an
        uploaded file) and ``uploaded_audio`` (``None`` or a
        ``(samples, sample_rate)`` tuple decoded at 16 kHz mono).
    """
    # Single source of truth for the mode labels, so the toggle options and
    # the comparison in on_toggle cannot drift apart.
    browse_mode = '📁 Browse Here'
    upload_mode = '💻 Upload'

    class PickerState:
        value = default_value
        uploaded_audio = None

    state = PickerState()

    # Most recent successfully decoded upload. Kept so it can be restored
    # when the user toggles back to upload mode.
    last_upload = {'name': None, 'audio': None}

    selected_label = widgets.HTML(
        value=f'<span style="color:#555">📄 Current file: <b>{os.path.basename(default_value)}</b></span>'
    )

    def update_label(fname):
        selected_label.value = f'<span style="color:#1a7a1a">📄 Current file: <b>{os.path.basename(fname)}</b></span>'

    mode_toggle = widgets.ToggleButtons(
        options=[browse_mode, upload_mode],
        description='',
        button_style='',
        layout=widgets.Layout(margin='0 0 6px 0')
    )

    fc = FileChooser(
        path=os.path.dirname(default_value) or '.',
        filter_pattern=['*.wav', '*.mp3', '*.flac'],
        title=f'<b>{label}</b>'
    )

    def on_fc_change(chooser):
        if chooser.selected:
            state.value = chooser.selected
            state.uploaded_audio = None
            update_label(chooser.selected)

    fc.register_callback(on_fc_change)
    fc_box = widgets.VBox([fc])

    upload_widget = widgets.FileUpload(
        accept='.wav,.mp3,.flac',
        multiple=False,
        description='Upload Audio',
        layout=widgets.Layout(width='250px')
    )
    upload_status = widgets.Label(value='No file uploaded yet.')

    def on_upload(change):
        if not upload_widget.value:
            return

        # The upload value is either a dict keyed by file name (ipywidgets 7)
        # or a sequence of per-file records (ipywidgets 8); accept both.
        uploaded = list(upload_widget.value.values())[0] \
            if isinstance(upload_widget.value, dict) \
            else upload_widget.value[0]

        fname = uploaded['metadata']['name'] \
            if 'metadata' in uploaded else uploaded['name']
        content = uploaded['content'] \
            if 'content' in uploaded else uploaded['data']

        try:
            samples, sr = librosa.load(io.BytesIO(bytes(content)), sr=16000, mono=True)
        except Exception as exc:
            # Show decode failures in the UI; otherwise the status line would
            # keep its previous text and the failure would go unnoticed.
            upload_status.value = f'Could not load {fname}: {exc}'
            return

        last_upload['name'] = fname
        last_upload['audio'] = (samples, sr)
        state.value = fname
        state.uploaded_audio = (samples, sr)
        upload_status.value = f'✅ Loaded: {fname}  ({len(samples)/sr:.2f}s)'
        update_label(fname)

    upload_widget.observe(on_upload, names='value')

    pc_box = widgets.VBox([upload_widget, upload_status])
    pc_box.layout.display = 'none'

    def on_toggle(change):
        if change['new'] == browse_mode:
            fc_box.layout.display = ''
            pc_box.layout.display = 'none'
            state.uploaded_audio = None
            # Without a chooser selection, fall back to the default file.
            # Otherwise state.value would still hold the bare name of an
            # uploaded file, which is not a loadable path.
            state.value = fc.selected or default_value
            update_label(state.value)
        else:
            fc_box.layout.display = 'none'
            pc_box.layout.display = ''
            # Re-activate the previous upload so the state matches the
            # "Loaded" status that is still shown.
            if last_upload['audio'] is not None:
                state.value = last_upload['name']
                state.uploaded_audio = last_upload['audio']
                update_label(state.value)

    mode_toggle.observe(on_toggle, names='value')

    header = widgets.HTML(f'<b style="font-size:14px">{label}</b>')
    container = widgets.VBox([
        widgets.HBox(
            [header, selected_label],
            layout=widgets.Layout(
                justify_content='space-between',
                align_items='center',
                width='100%'
            )
        ),
        mode_toggle,
        fc_box,
        pc_box,
    ], layout=widgets.Layout(
        border='1px solid #ccc',
        padding='8px',
        margin='4px 0',
        border_radius='6px'
    ))

    return container, state

# Audio transform pipeline handed to `normalize` before audio reaches the model.
transform = tfm.Compose(tfm.VolumeNorm(), tfm.RescaleAudio())

# TorchScript voice-conversion model, loaded once and switched to eval mode.
vc_model = torch.jit.load("pretrained/model-nc.ts").eval()

Cloning into 'vc-sd-reproduction'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 92 (delta 15), reused 24 (delta 9), pack-reused 59 (from 2)[K
Receiving objects: 100% (92/92), 65.02 MiB | 18.53 MiB/s, done.
Resolving deltas: 100% (24/24), done.


## Voice Design

In the next cell, you’ll be able to transform a voice using simple, interactive controls.

Run the cell, then use the sliders to design a new voice profile for your input audio. Feel free to experiment — even small adjustments can noticeably change the result.

---

### Controls

- **Audio File**: Select the audio file you want to convert. To use a different file, simply change the file path.

- **Gender**: Adjusts the perceived timbre of the voice: **-1.72** → more typically masculine, **1.94** → more typically feminine.
      
- **Age**: Changes the perceived age of the output. Because age cues are subtle, this can be thought of as a timbre variation control.

- **Tremble**: Adds a tremble (vibrato-like effect) to the voice: **0** → no tremble, **12** → strong tremble.

- **Ambitus**: Controls how expressive the voice sounds: **0.5** → flatter, more robotic, **1.5** → wider pitch range, more emotional.

- **Pitch**: Shifts the overall pitch of the converted voice up or down.

---

💡 **Tip:** Try adjusting one slider at a time to clearly hear what each parameter changes — then combine them to craft unique voice styles.

In [4]:
#@title Create Audio

# Picker for the audio that will be converted.
input_container, input_state = make_audio_picker('Input Audio', 'audio/librispeech2.wav')

# Widen the description column: at the default width, long descriptions such
# as 'Pitch (semitones):' are cut off. One shared width keeps the sliders
# aligned with each other.
description_style = {'description_width': '130px'}

gender_slider = widgets.FloatSlider(
    value=-0.1, min=-1.72, max=1.94, step=0.01,
    description='Gender:', continuous_update=False, style=description_style,
)
age_slider = widgets.IntSlider(
    value=35, min=15, max=90, step=1,
    description='Age (years):', continuous_update=False, style=description_style,
)
tremble_slider = widgets.FloatSlider(
    value=1.0, min=0.0, max=12.0, step=0.1,
    description='Tremble:', continuous_update=False, style=description_style,
)
ambitus_slider = widgets.FloatSlider(
    value=1.0, min=0.5, max=1.5, step=0.01,
    description='Ambitus:', continuous_update=False, style=description_style,
)
pitch_slider = widgets.IntSlider(
    value=0, min=-12, max=12, step=1,
    description='Pitch (semitones):', continuous_update=False, style=description_style,
)

# Short hints displayed next to each slider.
gender_label  = widgets.Label(value='Male (-1.72) → Female (1.94)')
age_label     = widgets.Label(value='Age in years')
tremble_label = widgets.Label(value='Tremble Amount')
ambitus_label = widgets.Label(value='Pitch Variance')
pitch_label   = widgets.Label(value='-12 to +12 semitones')

process_button = widgets.Button(description='Process Audio', button_style='success')
# Receives the printed settings, the audio players and any error message.
output_age_gender = widgets.Output()

def process_audio(b):
    """Convert the selected input audio using the current slider settings.

    Click handler for the "Process Audio" button. Reads the slider values,
    configures ``vc_model`` with them, runs the conversion and shows the
    input and converted audio inside ``output_age_gender``. Errors are
    printed into the same output area instead of being raised.

    Args:
        b: The clicked button, as passed by ``Button.on_click`` (unused).
    """
    with output_age_gender:
        output_age_gender.clear_output()

        gender = gender_slider.value
        age_years = age_slider.value
        age = years_to_age_param(age_years)
        tremble = tremble_slider.value
        ambitus = ambitus_slider.value
        semitones = pitch_slider.value
        pitch = semitones_to_pitch(semitones)

        print(f"Audio file: {input_state.value}")
        print(f"Gender:     {gender_param_to_label(gender)}")
        print(f"Age:        {age_years} years")
        print(f"Tremble:    {tremble}")
        print(f"Ambitus:    {ambitus}")
        print(f"Pitch:      {semitones:+d} semitones")
        print()

        try:
            x_np, sr = load_audio_from_state(input_state)
            # Add two leading singleton dimensions in front of the samples.
            x = torch.tensor(x_np, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

            speaker_gender  = torch.tensor([gender],  dtype=torch.float32)
            speaker_age     = torch.tensor([age],     dtype=torch.float32)
            speaker_tremble = torch.tensor([tremble], dtype=torch.float32)
            speaker_ambitus = torch.tensor([ambitus], dtype=torch.float32)
            speaker_pitch   = torch.tensor([pitch],   dtype=torch.float32)

            # Pure inference: the forward pass belongs inside no_grad as
            # well, so no autograd graph is recorded for the output.
            with torch.no_grad():
                vc_model.reset_pitch()
                vc_model.set_new_speaker(speaker_gender, speaker_age)
                vc_model.set_tremble_depth(speaker_tremble)
                vc_model.set_ambitus_scaler(speaker_ambitus)
                vc_model.set_pitch_mult(speaker_pitch)

                out = vc_model(normalize(x, transform))

            display_audios([("INPUT", x, sr), ("CONVERTED", out, sr)])

        except Exception as e:
            # UI boundary: report the problem in the output area rather than
            # letting the callback fail silently.
            print(f"Error: {e}")

# Run the conversion whenever the button is clicked.
process_button.on_click(process_audio)

# Layout: file picker, one row per slider with its hint, button, output area.
display(widgets.VBox(children=[
    input_container,
    *(
        widgets.HBox(children=[slider, hint])
        for slider, hint in (
            (gender_slider, gender_label),
            (age_slider, age_label),
            (tremble_slider, tremble_label),
            (ambitus_slider, ambitus_label),
            (pitch_slider, pitch_label),
        )
    ),
    process_button,
    output_age_gender,
]))

VBox(children=(VBox(children=(HBox(children=(HTML(value='<b style="font-size:14px">Input Audio</b>'), HTML(val‚Ä¶

## 🎧 Convert by Audio Reference

Instead of designing a voice with sliders, you can also **convert your input to match a reference recording**.

Simply provide a **target audio file**, and the system will analyze its vocal characteristics, such as timbre and tone, and apply them to your input audio.

**In short**: Input content + reference voice = your message, delivered in a new vocal style.

In [5]:
#@title Create Audio
# Pickers for the audio to convert and for the reference (target) voice.
input_container,  input_state  = make_audio_picker('Input Audio',  'audio/librispeech2.wav')
target_container, target_state = make_audio_picker('Target Audio', 'targets/p228_004.wav')

# Number of leading target samples to skip before the target is used.
# 'initial' lets the description take its natural width; at the default
# width this long description is cut off.
target_start_sample = widgets.IntText(
    value=8000,
    description='Target Start (in samples):',
    continuous_update=False,
    style={'description_width': 'initial'},
)

process_button = widgets.Button(description='Process Audio', button_style='success')
# Receives the printed settings, the audio players and any error message.
output_reference = widgets.Output()

def process_audio(b):
    """Convert the input audio towards the voice of the target recording.

    Click handler for the "Process Audio" button. Loads the input and the
    target (skipping ``target_start_sample`` leading samples), sets the model
    embedding from the target, resets tremble/ambitus/pitch to neutral values
    and shows input, target and converted audio inside ``output_reference``.
    Errors are printed into the same output area instead of being raised.

    Args:
        b: The clicked button, as passed by ``Button.on_click`` (unused).
    """
    with output_reference:
        output_reference.clear_output()
        try:
            start_idx = target_start_sample.value

            x_np, sr = load_audio_from_state(input_state)
            x = torch.tensor(x_np, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

            # Keep the target's sample rate separate so the input's rate is
            # not overwritten.
            t_np, target_sr = load_audio_from_state(target_state, start_idx=start_idx)
            t = torch.tensor(t_np, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

            print(f"Input audio:  {input_state.value}")
            print(f"Target audio: {target_state.value}")
            print(f"Start sample: {start_idx}")
            print()

            # Pure inference: the forward pass belongs inside no_grad as
            # well, so no autograd graph is recorded for the output.
            with torch.no_grad():
                vc_model.reset_pitch()
                vc_model.set_embedding_from_audio(t)
                vc_model.set_tremble_depth(torch.zeros(1, dtype=torch.float32))
                vc_model.set_ambitus_scaler(torch.ones(1, dtype=torch.float32))
                vc_model.set_pitch_mult(torch.ones(1, dtype=torch.float32))

                out = vc_model(normalize(x, transform))

            display_audios([("INPUT", x, sr), ("TARGET", t, target_sr), ("CONVERTED", out, sr)])

        except Exception as e:
            # UI boundary: report the problem in the output area rather than
            # letting the callback fail silently.
            print(f"Error: {e}")

# Run the conversion whenever the button is clicked.
process_button.on_click(process_audio)

# Layout: both pickers, the target offset field, the button and the output area.
display(widgets.VBox(children=[
    input_container,
    target_container,
    target_start_sample,
    process_button,
    output_reference,
]))

VBox(children=(VBox(children=(HBox(children=(HTML(value='<b style="font-size:14px">Input Audio</b>'), HTML(val‚Ä¶

## 🎧 Convert by Predefined Library

You can also **convert your input to match a predefined speaker ID**, in this case from the VCTK dataset.

Simply provide a **speaker ID** (p225 - p360), and the system will add the vocal characteristics to your input.

In [6]:
#@title Create Audio
# Speaker library: the keys of the JSON file are the selectable speaker IDs.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('utils/speaker_dict.json', 'r', encoding='utf-8') as f:
    speaker_dict = json.load(f)
speaker_ids = sorted(speaker_dict.keys())

input_container, input_state = make_audio_picker('Input Audio', 'audio/librispeech2.wav')

# Placeholder text; replaced with the actual default selection further down.
speaker_selected_label = widgets.HTML(
    value='<span style="color:#555">🎙️ Selected: <b>p231</b></span>'
)


def update_speaker_label(sid):
    """Show ``sid`` as the currently selected speaker."""
    speaker_selected_label.value = f'<span style="color:#1a7a1a">🎙️ Selected: <b>{sid}</b></span>'


speaker_select = widgets.Select(
    options=speaker_ids,
    # Prefer p231 as the default; fall back to the first ID if it is absent.
    value='p231' if 'p231' in speaker_ids else speaker_ids[0],
    rows=8,
    layout=widgets.Layout(width='100%')
)

# Sync the label with whichever speaker ended up as the default.
update_speaker_label(speaker_select.value)


def on_speaker_select(change):
    """Keep the label in sync when the selection changes."""
    update_speaker_label(change['new'])


speaker_select.observe(on_speaker_select, names='value')

speaker_header = widgets.HTML('<b style="font-size:14px">Speaker ID</b>')
speaker_container = widgets.VBox([
    widgets.HBox(
        [speaker_header, speaker_selected_label],
        layout=widgets.Layout(justify_content='space-between', align_items='center', width='100%')
    ),
    speaker_select,
], layout=widgets.Layout(
    border='1px solid #ccc',
    padding='8px',
    margin='4px 0',
    border_radius='6px'
))

process_button = widgets.Button(description='Process Audio', button_style='success')
# Receives the printed settings, the audio players and any error message.
output_embedding = widgets.Output()

def process_audio(b):
    """Convert the input audio towards the selected library speaker.

    Click handler for the "Process Audio" button. Looks up the embeddings and
    F0 mean of the selected speaker ID, configures ``vc_model`` with the
    averaged embedding, resets tremble/ambitus/pitch to neutral values and
    shows input and converted audio inside ``output_embedding``. Errors are
    printed into the same output area instead of being raised.

    Args:
        b: The clicked button, as passed by ``Button.on_click`` (unused).
    """
    with output_embedding:
        output_embedding.clear_output()
        try:
            x_np, sr = load_audio_from_state(input_state)
            x = torch.tensor(x_np, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

            target = [speaker_select.value]
            # The second return value (a single-utterance embedding, judging
            # by the original variable name) is not used here.
            speaker_embedding_avg, _, speaker_mean = get_speaker_embeddings_json(
                target, 'utils/speaker_dict.json'
            )

            print(f"Input audio: {input_state.value}")
            print(f"Speaker ID:  {target[0]}")
            print(f"F0 Mean:     {speaker_mean[0]:.2f}")
            print()

            speaker_mean_t        = torch.tensor([speaker_mean[0]], dtype=torch.float32)
            speaker_embedding_avg = speaker_embedding_avg[0]

            # Pure inference: the forward pass belongs inside no_grad as
            # well, so no autograd graph is recorded for the output.
            with torch.no_grad():
                vc_model.reset_pitch()
                vc_model.set_new_speaker_from_embedding(speaker_mean_t, speaker_embedding_avg)
                vc_model.set_tremble_depth(torch.zeros(1, dtype=torch.float32))
                vc_model.set_ambitus_scaler(torch.ones(1, dtype=torch.float32))
                vc_model.set_pitch_mult(torch.ones(1, dtype=torch.float32))

                out = vc_model(normalize(x, transform))

            display_audios([("INPUT", x, sr), ("CONVERTED", out, sr)])

        except Exception as e:
            # UI boundary: report the problem in the output area rather than
            # letting the callback fail silently.
            print(f"Error: {e}")

# Run the conversion whenever the button is clicked.
process_button.on_click(process_audio)

# Layout: input picker, speaker list, button and output area.
display(widgets.VBox(children=[
    input_container,
    speaker_container,
    process_button,
    output_embedding,
]))

VBox(children=(VBox(children=(HBox(children=(HTML(value='<b style="font-size:14px">Input Audio</b>'), HTML(val‚Ä¶