In [2]:
import argparse
import IPython.display as ipd
import json
import librosa
import os

# To prevent the path from becoming corrupted when this cell is executed more than once.
try:
    path
except:
    path = "../"
    os.chdir(path)
    
import phonemizer
import random
from scipy.io.wavfile import write
import torch
import torchaudio
from tqdm import tqdm
from transformers import HubertModel

from unitspeech.unitspeech import UnitSpeech
from unitspeech.duration_predictor import DurationPredictor
from unitspeech.encoder import Encoder
from unitspeech.speaker_encoder.ecapa_tdnn import ECAPA_TDNN_SMALL
from unitspeech.text import cleaned_text_to_sequence, phonemize, symbols
from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder
from unitspeech.util import HParams, fix_len_compatibility, intersperse, process_unit, generate_path, sequence_mask
from unitspeech.vocoder.env import AttrDict
from unitspeech.vocoder.meldataset import mel_spectrogram
from unitspeech.vocoder.models import BigVGAN

In [3]:
# PREPARE ARGUMENTS, CHECKPOINT PATH

# reference audio path for finetuning
reference_path = "reference-speech.wav"

# pretrained model path (Follow README)
encoder_path = "unitspeech/checkpoints/unit_encoder.pt"
decoder_path = "unitspeech/checkpoints/pretrained_decoder.pt"
speaker_encoder_path = "unitspeech/speaker_encoder/checkpts/speaker_encoder.pt"
finetune_config_path = "unitspeech/checkpoints/finetune.json"

# Arguments for finetuning

# If the voice is highly unique, increasing the number of iterations can be helpful. 
# However, excessively large iteration numbers can lead to a degradation in pronunciation. 
# We recommend starting with 500 iterations and, if the results are unsatisfactory, gradually increasing the number of iterations.
n_iters = 500
learning_rate = 2e-5
fp16_run = False

with open(finetune_config_path, "r") as f:
    data = f.read()
finetune_config = json.loads(data)

hps_finetune = HParams(**finetune_config)

segment_size = fix_len_compatibility(
    hps_finetune.train.out_size_second * hps_finetune.data.sampling_rate // hps_finetune.data.hop_length,
    len(hps_finetune.decoder.dim_mults) - 1
)

num_units = hps_finetune.data.n_units

In [4]:
# load pretrained checkpoint

# Vocoder
print('Initializing Vocoder...')
with open(hps_finetune.train.vocoder_config_path) as f:
    h = AttrDict(json.load(f))
vocoder = BigVGAN(h)
vocoder.load_state_dict(torch.load(hps_finetune.train.vocoder_ckpt_path, map_location=lambda loc, storage: loc)['generator'])
_ = vocoder.cuda().eval()
vocoder.remove_weight_norm()

# Speaker Encoder for extracting speaker embedding
print('Initializing Speaker Encoder...')
spk_embedder = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
state_dict = torch.load(speaker_encoder_path, map_location=lambda storage, loc: storage)
spk_embedder.load_state_dict(state_dict['model'], strict=False)
_ = spk_embedder.cuda().eval()

# Unit Extractor for extraction unit and duration, which are used for finetuning
print('Initializing Unit Extracter...')
dense_model_name = "mhubert-base-vp_en_es_fr"
quantizer_name, vocab_size = "kmeans", 1000

unit_extractor = SpeechEncoder.by_name(
    dense_model_name=dense_model_name,
    quantizer_model_name=quantizer_name,
    vocab_size=vocab_size,
    deduplicate=True,
    need_f0=False
)
_ = unit_extractor.cuda().eval()

Initializing Vocoder...
Removing weight norm...
Initializing Speaker Encoder...


Using cache found in /home/astanea/.cache/torch/hub/s3prl_s3prl_main
2024-05-09 22:06:44 | INFO | s3prl.util.download | Requesting URL: https://huggingface.co/s3prl/converted_ckpts/resolve/main/wavlm_large.pt
2024-05-09 22:06:44 | INFO | s3prl.util.download | Using URL's local file: /home/astanea/.cache/s3prl/download/f2d5200177fd6a33b278b7b76b454f25cd8ee866d55c122e69fccf6c7467d37d.wavlm_large.pt
2024-05-09 22:06:50 | INFO | s3prl.upstream.wavlm.WavLM | WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selectio

Initializing Unit Extracter...


2024-05-09 22:07:06 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/astanea/dev/UnitSpeech
2024-05-09 22:07:06 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/annl/s2st/data/voxpopuli/mHuBERT/en_es_fr', 'fine_tuning': False, 'labels': ['km'], 'label_dir': '/checkpoint/wnhsu/experiments/hubert/kmeans/mhubert_vp_en_es_fr_it2_400k/en_es_fr.layer9.km500', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-05-09 22:07:06 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout':

In [23]:
from unitspeech.util import (
    duration_loss,
    fix_len_compatibility,
    load_speaker_embs,
    plot_tensor,
    save_plot,
    sequence_mask,
)

In [24]:
# Preprocess the reference audio in a format suitable for fine-tuning
wav, sr = librosa.load(reference_path)
wav = torch.FloatTensor(wav).unsqueeze(0)
mel = mel_spectrogram(wav, hps_finetune.data.n_fft, hps_finetune.data.n_feats, hps_finetune.data.sampling_rate, hps_finetune.data.hop_length,
                      hps_finetune.data.win_length, hps_finetune.data.mel_fmin, hps_finetune.data.mel_fmax, center=False)

# Load the normalization parameters for mel-spectrogram normalization.
mel_min = mel.min(-1, keepdim=True)[0]
mel_max = mel.max(-1, keepdim=True)[0]

with torch.no_grad():
    reference_audio = vocoder.forward(mel.cuda()).cpu().squeeze().clamp(-1, 1).numpy()

save_plot(mel.squeeze().cpu(), f'notebooks/logdir/original_mel-UNSCALED.png') 
mel = (mel - mel_min) / (mel_max - mel_min) * 2 - 1 # normalize mel-spectrogram in range [-1, 1]
save_plot(mel.squeeze().cpu(), f'notebooks/logdir/original_mel-SCALEDS.png') 
mel = mel.cuda()
resample_fn = torchaudio.transforms.Resample(sr, 16000).cuda()
wav = resample_fn(wav.cuda())
spk_emb = spk_embedder(wav)
# Compute L2 norm (Euclidean norm) of the tensor (sqrt of the sum of the abs squares of its elements)
# Why? To normalize the speaker embedding vector: can compare tensors of different sizes
spk_emb = spk_emb / spk_emb.norm()

# Extract the units and unit durations to be used for fine-tuning.
encoded = unit_extractor(wav.to("cuda")) # => units with f_unit freq: 16Khz 


unit, duration = process_unit(encoded, hps_finetune.data.sampling_rate, hps_finetune.data.hop_length)



In [25]:
# Original processing
print(wav.shape[-1] / 16000)
print(wav.shape)
print(mel.shape)
print(spk_emb.shape)
print(unit.shape)
print(duration.shape)

4.9806875
torch.Size([1, 79691])
torch.Size([1, 80, 429])
torch.Size([1, 256])
torch.Size([189])
torch.Size([189])


In [26]:
# Initialize model and optimizer
unit_encoder = Encoder(
    n_vocab=num_units,
    n_feats=hps_finetune.data.n_feats,
    **hps_finetune.encoder
)

unit_encoder_dict = torch.load(encoder_path, map_location=lambda loc, storage: loc)
unit_encoder.load_state_dict(unit_encoder_dict['model'])
_ = unit_encoder.cuda().eval()

unitspeech = UnitSpeech(
    n_feats=hps_finetune.data.n_feats,
    **hps_finetune.decoder
)

decoder_dict = torch.load(decoder_path, map_location=lambda loc, storage: loc)
unitspeech.load_state_dict(decoder_dict['model'])
_ = unitspeech.cuda().train()

# NOTE: During fine-tunning we say the rest of params are frozen because we only update the decoder params in the optimizer
optimizer = torch.optim.Adam(params=unitspeech.parameters(), lr=learning_rate)

if fp16_run:
    scaler = torch.cuda.amp.GradScaler()

In [27]:
# Reshape the input to match the dimensions and convert it to a PyTorch tensor.
unit = unit.unsqueeze(0).cuda()
print(f"Unit shape: {unit.shape}")
duration = duration.unsqueeze(0).cuda()
print(f"Duration shape: {duration.shape}")
mel = mel.cuda()
print(f"Mel shape: {mel.shape}")

unit_lengths = torch.LongTensor([unit.shape[-1]]).cuda()
print(f"Unit lengths: {unit_lengths}")
mel_lengths = torch.LongTensor([mel.shape[-1]]).cuda()
print(f"Mel lengths: {mel_lengths}")
spk_emb = spk_emb.cuda().unsqueeze(1)
print(f"Speaker embedding shape: {spk_emb.shape}")

# Prepare unit encoder output for finetuning
with torch.no_grad():
    cond_x, x, x_mask = unit_encoder(unit, unit_lengths)
    print(f"\ncond_x shape: {cond_x.shape}")
    print(f"x shape: {x.shape}")
    print(f"x_mask shape: {x_mask.shape}")

mel_max_length = mel.shape[-1]
print(f"\nMel max length: {mel_max_length}")
mel_mask = sequence_mask(mel_lengths, mel_max_length).unsqueeze(1).to(x_mask)
print(f"mel_mask shape: {mel_mask.shape}")
attn_mask = x_mask.unsqueeze(-1) * mel_mask.unsqueeze(2)
print(f"attn_mask shape: {attn_mask.shape}")

attn = generate_path(duration, attn_mask.squeeze(1))
print(f"attn shape: {attn.shape}")

Unit shape: torch.Size([1, 189])
Duration shape: torch.Size([1, 189])
Mel shape: torch.Size([1, 80, 429])
Unit lengths: tensor([189], device='cuda:0')
Mel lengths: tensor([429], device='cuda:0')
Speaker embedding shape: torch.Size([1, 1, 256])

cond_x shape: torch.Size([1, 80, 189])
x shape: torch.Size([1, 192, 189])
x_mask shape: torch.Size([1, 1, 189])

Mel max length: 429
mel_mask shape: torch.Size([1, 1, 429])
attn_mask shape: torch.Size([1, 1, 189, 429])
attn shape: torch.Size([1, 189, 429])


In [28]:
def fine_tune(cond_x, y, y_mask, y_lengths, y_max_length, attn, spk_emb, segment_size, n_feats, decoder):
    if y_max_length < segment_size:
        pad_size = segment_size - y_max_length
        y = torch.cat([y, torch.zeros_like(y)[:, :, :pad_size]], dim=-1)
        y_mask = torch.cat([y_mask, torch.zeros_like(y_mask)[:, :, :pad_size]], dim=-1)

    max_offset = (y_lengths - segment_size).clamp(0)
    offset_ranges = list(zip([0] * max_offset.shape[0], max_offset.cpu().numpy()))
    out_offset = torch.LongTensor([
        torch.tensor(random.choice(range(start, end)) if end > start else 0)
        for start, end in offset_ranges
    ]).to(y_lengths)

    attn_cut = torch.zeros(attn.shape[0], attn.shape[1], segment_size, dtype=attn.dtype, device=attn.device)
    y_cut = torch.zeros(y.shape[0], n_feats, segment_size, dtype=y.dtype, device=y.device)
    y_cut_lengths = []
    for i, (y_, out_offset_) in enumerate(zip(y, out_offset)):
        y_cut_length = segment_size + (y_lengths[i] - segment_size).clamp(None, 0)
        y_cut_lengths.append(y_cut_length)
        cut_lower, cut_upper = out_offset_, out_offset_ + y_cut_length
        y_cut[i, :, :y_cut_length] = y_[:, cut_lower:cut_upper]
        attn_cut[i, :, :y_cut_length] = attn[i, :, cut_lower:cut_upper]
    y_cut_lengths = torch.LongTensor(y_cut_lengths)
    y_cut_mask = sequence_mask(y_cut_lengths).unsqueeze(1).to(y_mask)

    if y_cut_mask.shape[-1] < segment_size:
        y_cut_mask = torch.nn.functional.pad(y_cut_mask, (0, segment_size - y_cut_mask.shape[-1]))

    attn = attn_cut
    y = y_cut
    y_mask = y_cut_mask

    # Align encoded text with mel-spectrogram and get cond_y segment
    cond_y = torch.matmul(attn.squeeze(1).transpose(1, 2).contiguous(), cond_x.transpose(1, 2).contiguous())
    cond_y = cond_y.transpose(1, 2).contiguous()
    cond_y = cond_y * y_mask

    # Compute loss of score-based decoder
    diff_loss, xt = decoder.compute_loss(y, y_mask, cond_y, spk_emb=spk_emb)

    return diff_loss

In [29]:
# Fine-tuning.
# for _ in tqdm(range(n_iters)):
for _ in tqdm(range(100)):
    cond_x = cond_x.detach()
    mel = mel.detach()
    mel_mask = mel_mask.detach()
    mel_lengths = mel_lengths.detach()
    spk_emb = spk_emb.detach()
    attn = attn.detach()

    unitspeech.zero_grad()
      
    with torch.cuda.amp.autocast(enabled=fp16_run):
        diff_loss = fine_tune(cond_x, mel, mel_mask, mel_lengths, mel_max_length, attn, spk_emb, segment_size, hps_finetune.data.n_feats, unitspeech)

    loss = sum([diff_loss])

    if fp16_run:
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        _ = torch.nn.utils.clip_grad_norm_(unitspeech.parameters(), max_norm=1)
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        _ = torch.nn.utils.clip_grad_norm_(unitspeech.parameters(), max_norm=1)
        optimizer.step()

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:22<00:00,  4.49it/s]


# Text-to-Speech
**If you are interested in the voice conversion task, please skip this part and proceed to the voice conversion section below.**

In [30]:
# Please provide the transcript you would like to synthesize with the desired voice. 
# We recommend entering the transcript in normalized text format.
# text = "If we are lucky, once in a lifetime we might achieve greatness"
text = "Does the quick brown fox jump over the lazy dog?"


# The text gradient scale is responsible for pronunciation and audio quality. 
# The default value is 1, and increasing the value improves pronunciation accuracy but may reduce speaker similarity. 
# We recommend starting with 0 and gradually increasing it if the pronunciation is not satisfactory.
text_gradient_scale = 1.0

# The speaker gradient scale is responsible for speaker similarity. 
# Increasing the value enhances speaker similarity but may slightly degrade pronunciation and audio quality. 
# For unique voices, we recommend using a larger value for the speaker gradient scale.
spk_gradient_scale = 1.0

# We have confirmed that our duration predictor is not accurately following the duration of the reference audio as expected.
# As a result, while the reference audio's tone and speaking style are well adapted, there are differences in speech rate. 
# To address this issue, we use the "length_scale" argument as in Grad-TTS to mitigate the discrepancy.
# If the value of "length_scale" is greater than 1, the speech rate will be slower. 
# Conversely, if the value is less than 1, the speech rate will be faster.
length_scale = 1.0

# The number of diffusion steps during sampling refers to the number of iterations performed to improve audio quality.
# Generally, larger values lead to better audio quality but slower sampling speeds. 
# Conversely, smaller values allow for faster sampling but may result in lower audio quality.
# We recommend using a value of 50 for this parameter.
diffusion_step = 100 # 100

In [31]:
# Load modules for one-shot text-to-speech
text_encoder_path = "unitspeech/checkpoints/text_encoder.pt"
duration_predictor_path = "unitspeech/checkpoints/duration_predictor.pt"
tts_config_path = "unitspeech/checkpoints/text-to-speech.json"

with open(tts_config_path, "r") as f:
    data = f.read()
tts_config = json.loads(data)

hps_tts = HParams(**tts_config)

global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us', preserve_punctuation=True, with_stress=True, language_switch="remove-flags"
)

In [32]:
# Initialize & load model
text_encoder = Encoder(
    n_vocab=len(symbols) + 1,
    n_feats=hps_tts.data.n_feats,
    **hps_tts.encoder
)

text_encoder_dict = torch.load(text_encoder_path, map_location=lambda loc, storage: loc)
text_encoder.load_state_dict(text_encoder_dict['model'])
_ = text_encoder.cuda().eval()

duration_predictor = DurationPredictor(
    **hps_tts.duration_predictor
)

duration_predictor_dict = torch.load(duration_predictor_path, map_location=lambda loc, storage: loc)
duration_predictor.load_state_dict(duration_predictor_dict['model'])
_ = duration_predictor.cuda().eval()

_ = unitspeech.cuda().eval()

In [33]:
# text-to-speech function
@torch.no_grad()
def text_to_speech(
    text_encoder, duration_predictor, decoder, phoneme, phoneme_lengths, spk_emb, num_downsamplings_in_unet,
    diffusion_step, text_gradient_scale, spk_gradient_scale, length_scale
):
    # 85 is the number of phonemes i got after phonemizing the text
    # phoneme = (1, 85)
    # phoneme_lengths = (85)
    # cond_x = (1, 80, 85)
    # x = (1, 192, 85)
    # x_mask = (1, 1, 85)
    cond_x, x, x_mask = text_encoder(phoneme, phoneme_lengths)
    # logw = (1, 1, 85)
    logw = duration_predictor(x, x_mask, w=None, g=spk_emb, reverse=True)
    # w = (1, 1, 85)
    # w_ceil = (1, 1, 85)
    w = torch.exp(logw) * x_mask
    w_ceil = torch.ceil(w) * length_scale

    y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() # (235)
    y_max_length = int(y_lengths.max())
    y_max_length_ = fix_len_compatibility(y_max_length, num_downsamplings_in_unet) # (240)

    # Using obtained durations `w` construct alignment map `attn`
    y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype) # (1 , 1, 240)
    attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) # (1, 1, 85, 240)
    attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) # (1, 1, 85, 240)

    # Align encoded text and get mu_y
    cond_y = torch.matmul(attn.squeeze(1).transpose(1, 2).contiguous(), cond_x.transpose(1, 2).contiguous()) # (1, 80, 240)
    cond_y = cond_y.transpose(1, 2).contiguous()

    z = torch.randn_like(cond_y, device=cond_y.device)

    # Generate sample by performing reverse dynamics
    decoder_outputs = decoder(
        z, y_mask, cond_y, spk_emb, diffusion_step,
        text_gradient_scale=text_gradient_scale, spk_gradient_scale=spk_gradient_scale
    )
    decoder_outputs = decoder_outputs[:, :, :y_max_length]
    return decoder_outputs

In [34]:
# Prepare input
# phoneme = phonemize(text, global_phonemizer)
# phoneme = cleaned_text_to_sequence(phoneme)
# phoneme = intersperse(phoneme, len(symbols))  # add a blank token, whose id number is len(symbols)
# phoneme = torch.LongTensor([phoneme]).cuda().unsqueeze(0) # (batch , phonemes)
# phoneme_lengths = torch.LongTensor([phoneme.shape[-1]]).cuda() # (lengths)

# # Prepare input => ORIGINAL
phoneme = phonemize(text, global_phonemizer)
phoneme = cleaned_text_to_sequence(phoneme)
phoneme = intersperse(phoneme, len(symbols))  # add a blank token, whose id number is len(symbols)
phoneme = torch.LongTensor(phoneme).cuda().unsqueeze(0)
phoneme_lengths = torch.LongTensor([phoneme.shape[-1]]).cuda()

phoneme.shape, phoneme_lengths

(torch.Size([1, 115]), tensor([115], device='cuda:0'))

In [18]:
with torch.no_grad():
    y_enc, y_dec, attn = unitspeech.execute_text_to_speech(phoneme = phoneme,
                                                        phoneme_lengths=phoneme_lengths,
                                                        spk_emb=spk_emb,
                                                        text_encoder=text_encoder,
                                                        duration_predictor=duration_predictor,
                                                        num_downsamplings_in_unet=len(hps_tts.decoder.dim_mults) - 1,
                                                        diffusion_steps=diffusion_step)
    save_plot(y_enc.squeeze().cpu(),
                f'notebooks/logdir/y_enc.png')
    save_plot(y_dec.squeeze().cpu(),
                f'notebooks/logdir/y_dec_NORM.png')
    save_plot(attn.squeeze().cpu(),
                f'notebooks/logdir/attn.png')

    # y_dec = ((y_dec + 1) / 2 * (mel_max.to(y_dec.device) - mel_min.to(y_dec.device))
    #                     + mel_min.to(y_dec.device)) # (1, 80, 235)
    save_plot(y_dec.squeeze().cpu(),
                f'notebooks/logdir/y_dec_UNNORMALIZED.png')
    synthesized_audio = vocoder.forward(y_dec).cpu().squeeze().clamp(-1, 1).numpy() # (60160)

print('Reference voice to adapt to')
ipd.display(ipd.Audio(reference_audio, rate=sr))
print('Generated audio')
ipd.display(ipd.Audio(synthesized_audio, rate=sr)) # Decoder generates spectrograms which were samples with sr=22050

Reference voice to adapt to


Generated audio


  return scaled.astype("<h").tobytes(), nchan


In [17]:
assert 1 == 2

AssertionError: 

In [25]:
# mel_max = torch.load("unitspeech/checkpoints/mel_max.pt").unsqueeze(-1)
# mel_min = torch.load("unitspeech/checkpoints/mel_min.pt").unsqueeze(-1)

# mel_max = torch.load("unitspeech/checkpoints/mel_normalization/LibriTTS/mel_max.pt").unsqueeze(-1)
# mel_min = torch.load("unitspeech/checkpoints/mel_normalization/LibriTTS/mel_min.pt").unsqueeze(-1)

mel_max = torch.load("unitspeech/checkpoints/mel_normalization/LJSpeech/mel_max.pt").unsqueeze(-1)
mel_min = torch.load("unitspeech/checkpoints/mel_normalization/LJSpeech/mel_min.pt").unsqueeze(-1)

mel_max.shape, mel_min.shape

(torch.Size([80, 1]), torch.Size([80, 1]))

In [None]:
assert 1 == 2

In [35]:
# phoneme = phonemize(text, global_phonemizer)
# phoneme = cleaned_text_to_sequence(phoneme)
# phoneme = intersperse(phoneme, len(symbols))  # add a blank token, whose id number is len(symbols)
# phoneme = torch.LongTensor([phoneme]).cuda().unsqueeze(0) # (batch , phonemes)
# phoneme_lengths = torch.LongTensor([phoneme.shape[-1]]).cuda() # (lengths)
with torch.no_grad():
    mel_generated = text_to_speech(
        text_encoder, duration_predictor, unitspeech,
        phoneme, phoneme_lengths, spk_emb, len(hps_tts.decoder.dim_mults) - 1,
        diffusion_step, text_gradient_scale, spk_gradient_scale, length_scale
    )
    mel_generated = ((mel_generated + 1) / 2 * (mel_max.to(mel_generated.device) - mel_min.to(mel_generated.device))
                     + mel_min.to(mel_generated.device)) # (1, 80, 235)
    synthesized_audio = vocoder.forward(mel_generated).cpu().squeeze().clamp(-1, 1).numpy() # (60160)

print('Reference voice to adapt to')
ipd.display(ipd.Audio(reference_audio, rate=sr))
print('Generated audio')
ipd.display(ipd.Audio(synthesized_audio, rate=sr)) # Decoder generates spectrograms which were samples with sr=22050

Reference voice to adapt to


Generated audio


  return scaled.astype("<h").tobytes(), nchan


# Voice Conversion

In [None]:
# Please specify the path of the source voice you want to change.
# source_path = "PATH OF SOURCE SPEECH"
source_path = "voice_to_change.wav"

# The text gradient scale is responsible for pronunciation and audio quality. 
# The default value is 1, and increasing the value improves pronunciation accuracy but may reduce speaker similarity. 
# We recommend starting with 0 and gradually increasing it if the pronunciation is not satisfactory.
text_gradient_scale = 1.0

# The speaker gradient scale is responsible for speaker similarity. 
# Increasing the value enhances speaker similarity but may slightly degrade pronunciation and audio quality. 
# For unique voices, we recommend using a larger value for the speaker gradient scale.
spk_gradient_scale = 1.0

# The number of diffusion steps during sampling refers to the number of iterations performed to improve audio quality.
# Generally, larger values lead to better audio quality but slower sampling speeds. 
# Conversely, smaller values allow for faster sampling but may result in lower audio quality.
# We recommend using a value of 50 for this parameter.
diffusion_step = 50

In [None]:
contentvec_encoder_path = "unitspeech/checkpoints/contentvec_encoder.pt"
vc_config_path = "unitspeech/checkpoints/voice-conversion.json"

with open(vc_config_path, "r") as f:
    data = f.read()
vc_config = json.loads(data)

hps_vc = HParams(**vc_config)

In [None]:
class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)

        # The final projection layer is only used for backward compatibility.
        # Following https://github.com/auspicious3000/contentvec/issues/6
        # Remove this layer is necessary to achieve the desired outcome.
        self.final_proj = torch.nn.Linear(config.hidden_size, config.classifier_proj_size)

In [None]:
# Initialize & load model
contentvec_extractor = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best")
_ = contentvec_extractor.cuda().eval()
        
contentvec_encoder = Encoder(
    n_vocab=len(symbols) + 1,
    n_feats=hps_vc.data.n_feats,
    **hps_vc.encoder
)

contentvec_encoder_dict = torch.load(contentvec_encoder_path, map_location=lambda loc, storage: loc)
contentvec_encoder.load_state_dict(contentvec_encoder_dict['model'])
_ = contentvec_encoder.cuda().eval()

In [None]:
@torch.no_grad()
def voice_conversion(
    contentvec_encoder, decoder, contentvec, contentvec_length, mel_length, spk_emb, num_downsamplings_in_unet,
    diffusion_step, text_gradient_scale, spk_gradient_scale
):
    cond_x, x, x_mask = contentvec_encoder(contentvec, contentvec_length)
    cond_y = cond_x
    y_lengths = torch.LongTensor([contentvec_length]).to(contentvec.device)

    encoder_outputs = torch.nn.functional.interpolate(
        cond_y, size=mel_length, mode='linear'
    )
    y_max_length = mel_length
    y_max_length_ = fix_len_compatibility(mel_length, num_downsamplings_in_unet)
    cond_y = torch.cat([encoder_outputs, torch.zeros_like(encoder_outputs)[:, :, :y_max_length_ - mel_length]], dim=-1)
    y_mask = sequence_mask(torch.LongTensor([mel_length]).to(y_lengths.device), y_max_length_)\
        .unsqueeze(1).to(x_mask.dtype)

    z = torch.randn_like(cond_y, device=cond_y.device)

    # Generate sample by performing reverse dynamics
    decoder_outputs = decoder(
        z, y_mask, cond_y, spk_emb, diffusion_step,
        text_gradient_scale=text_gradient_scale, spk_gradient_scale=spk_gradient_scale
    )
    decoder_outputs = decoder_outputs[:, :, :y_max_length]
    return decoder_outputs

In [None]:
# Extract the contentvec and prepare source input
wav, sr = librosa.load(source_path)
print('Source speech')
ipd.display(ipd.Audio(wav, rate=sr))
wav = torch.FloatTensor(wav).unsqueeze(0)

resample_fn = torchaudio.transforms.Resample(sr, 16000).to("cuda")
wav = wav.cuda()
mel_length = wav.shape[-1] // hps_vc.data.hop_length

wav = resample_fn(wav)

with torch.no_grad():
    contentvec = contentvec_extractor(wav)["last_hidden_state"]
    
contentvec = contentvec.cuda()
contentvec_length = torch.LongTensor([contentvec.shape[1]]).cuda()

# Voice conversion
with torch.no_grad():
    mel_generated = voice_conversion(
        contentvec_encoder, unitspeech,
        contentvec, contentvec_length, mel_length, spk_emb, len(hps_vc.decoder.dim_mults) - 1,
        diffusion_step, text_gradient_scale, 2.0
    )    
    mel_generated = ((mel_generated + 1) / 2 * (mel_max.to(mel_generated.device) - mel_min.to(mel_generated.device))
                     + mel_min.to(mel_generated.device))
    
    synthesized_audio = vocoder.forward(mel_generated).cpu().squeeze().clamp(-1, 1).numpy()

print('Reference voice to adapt to')
ipd.display(ipd.Audio(reference_audio, rate=sr))
print('Generated audio')
ipd.display(ipd.Audio(synthesized_audio, rate=sr))

Source speech


Reference voice to adapt to


Generated audio
