In [1]:
import IPython.display as ipd
import torch
from torch.utils.data import DataLoader
import sys
sys.path.append("/home/jupyter-ambeshs/DOT6/vits2")
from utils.task import load_checkpoint
from utils.hparams import get_hparams_from_file
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from model.models import SynthesizerTrn
from text import symbols
from utils.task import load_vocab
from text import tokenizer

def get_text(text: str, hps, vocab) -> torch.LongTensor:
    """Tokenize ``text`` and return the result as a ``torch.LongTensor``.

    Args:
        text: Raw input sentence to synthesize.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.language`` are forwarded to the tokenizer.
        vocab: Vocabulary object handed to the tokenizer as its second argument.

    Returns:
        The tokenizer output wrapped in a ``torch.LongTensor``.
    """
    token_ids = tokenizer(
        text,
        vocab,
        hps.data.text_cleaners,
        language=hps.data.language,
    )
    return torch.LongTensor(token_ids)

## YT Influencers


In [2]:
# Experiment directory (under ./datasets) and generator checkpoint to load.
model = "yt_influencers"
checkpoint = "G_286000.pth"

# Prefer the second GPU as before, but fall back to the first GPU or the CPU
# so the notebook still runs on hosts that do not have a "cuda:1" device.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

In [3]:
# Read the hyper-parameters of the selected experiment.
hps = get_hparams_from_file(f"./datasets/{model}/config.yaml")

# Second constructor argument: n_mels when the config enables mel features,
# otherwise n_fft // 2 + 1.
if hps.data.use_mel:
    filter_length = hps.data.n_mels
else:
    filter_length = hps.data.n_fft // 2 + 1

# Training segment size divided by the hop length.
segment_size = hps.train.segment_size // hps.data.hop_length

# The vocabulary size is passed as the first constructor argument.
vocab = load_vocab(hps.data.vocab_file)

# Build the generator, move it to the target device and switch to eval mode
# before restoring the weights (no optimizer is restored, hence None).
net_g = SynthesizerTrn(
    len(vocab),
    filter_length,
    segment_size,
    **hps.model,
)
net_g = net_g.to(device)
net_g.eval()
_ = load_checkpoint(f"./datasets/{model}/logs/{checkpoint}", net_g, None)



INFO:root:Loaded checkpoint './datasets/yt_influencers/logs/G_286000.pth' (iteration 1145)


In [4]:
# Sentence to synthesize. Alternative prompt kept for quick switching:
# "मेरा नाम नताशा है, आपकी आज कैसी सहायता कर सकती है"
prompt = "आप अपने जीते हुए अमाउंट को रश वॉलेट में देख सकते हो ये आपके अकाउंट में रिजल्ट आने के पंद्रह मिनट के अंदर क्रेडिट हो जाता है अगर आपको और कुछ हेल्प चाहिए तो कृपया बताओ"
stn_tst = get_text(prompt, hps, vocab)

with torch.no_grad():
    # Add a leading batch dimension and pass the token count as the length.
    x_tst = stn_tst.unsqueeze(0).to(device)
    x_tst_lengths = torch.LongTensor([stn_tst.shape[0]]).to(device)

    out = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=0,
        noise_scale=0.667,
        noise_scale_w=0.333,
        length_scale=1,
    )
    # First element of the result, first batch item, first channel.
    generated = out[0]
    audio = generated[0, 0].data.cpu().float().numpy()

# normalize=False plays the samples as produced, without rescaling.
player = ipd.Audio(audio, rate=hps.data.sample_rate, normalize=False)
ipd.display(player)



## MADASR23


In [2]:
from IPython.display import Audio

In [24]:
# Play one reference clip from the dataset (rendered as the cell output).
ipd.Audio("datasets/yt_influencers/spk1/_syfOzSVsr0/0003.wav")

In [18]:
import pandas as pd

# Tab-separated metadata file for this recording.
df = pd.read_csv(
    "datasets/yt_influencers/spk1/_syfOzSVsr0.tsv",
    sep='\t',
)

In [26]:
# Show the metadata row(s) whose `path` matches the clip played above.
clip_rows = df.query("path=='_syfOzSVsr0/0003.wav'")
clip_rows.values

array([['_syfOzSVsr0/0003.wav',
        'वैल्यू फंड का रिव्यू करने वाले हैं जो कि जे एम फाइनेैंशियल म्यूचुअल']],
      dtype=object)

In [2]:
# Experiment directory (under ./datasets) and generator checkpoint to load.
model, checkpoint = "madasr23_base", "G_1000.pth"

In [56]:
# Read the hyper-parameters of the selected experiment.
hps = get_hparams_from_file(f"./datasets/{model}/config.yaml")

# Second constructor argument: n_mels when the config enables mel features,
# otherwise n_fft // 2 + 1.
if hps.data.use_mel:
    filter_length = hps.data.n_mels
else:
    filter_length = hps.data.n_fft // 2 + 1

# Training segment size divided by the hop length.
segment_size = hps.train.segment_size // hps.data.hop_length

# Build the generator with the speaker count from the data config, move it to
# the target device and switch to eval mode before restoring the weights
# (no optimizer is restored, hence None).
net_g = SynthesizerTrn(
    len(symbols),
    filter_length,
    segment_size,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
net_g.eval()
_ = load_checkpoint(f"./datasets/{model}/logs/{checkpoint}", net_g, None)

INFO:root:Loaded checkpoint './logs/madasr23_base/G_55000.pth' (iteration 31)


In [64]:
# get_text requires the vocabulary as its third argument; calling it with only
# (text, hps) raises TypeError. Load the vocabulary named by the active config
# so the token ids are not produced with a vocab left over from another model.
# NOTE(review): assumes this config defines hps.data.vocab_file and that the
# vocabulary matches the symbol table the checkpoint was trained with - confirm.
vocab = load_vocab(hps.data.vocab_file)
stn_tst = get_text(
    "রোপক বা প্ল্যান্টার মেশিন দুই ধরনের হয় বৈদ্যুতিন এবং হাইড্রলিক যান্ত্রিক",
    hps,
    vocab,
)
with torch.no_grad():
    # Add a leading batch dimension and pass the token count as the length.
    x_tst = stn_tst.to(device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    # Speaker id to synthesize with, as a one-element tensor on the device.
    sid = torch.LongTensor([78]).to(device)

    out = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)
    # First element of the result, first batch item, first channel.
    audio = out[0][0, 0].data.cpu().float().numpy()
# normalize=False plays the samples as produced, without rescaling.
ipd.display(ipd.Audio(audio, rate=hps.data.sample_rate, normalize=False))

## VCTK


In [None]:
# Experiment directory (under ./datasets) and generator checkpoint to load.
model, checkpoint = "vctk_base", "G_1000.pth"

In [None]:
# Read the hyper-parameters of the selected experiment.
hps = get_hparams_from_file(f"./datasets/{model}/config.yaml")

# Second constructor argument: n_mels when the config enables mel features,
# otherwise n_fft // 2 + 1.
if hps.data.use_mel:
    filter_length = hps.data.n_mels
else:
    filter_length = hps.data.n_fft // 2 + 1

# Training segment size divided by the hop length.
segment_size = hps.train.segment_size // hps.data.hop_length

# Build the generator with the speaker count from the data config, move it to
# the target device and switch to eval mode before restoring the weights
# (no optimizer is restored, hence None).
net_g = SynthesizerTrn(
    len(symbols),
    filter_length,
    segment_size,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
net_g.eval()
_ = load_checkpoint(f"./datasets/{model}/logs/{checkpoint}", net_g, None)

In [None]:
# get_text requires the vocabulary as its third argument; calling it with only
# (text, hps) raises TypeError. Load the vocabulary named by the active config
# so the token ids are not produced with a vocab left over from another model.
# NOTE(review): assumes this config defines hps.data.vocab_file and that the
# vocabulary matches the symbol table the checkpoint was trained with - confirm.
vocab = load_vocab(hps.data.vocab_file)
stn_tst = get_text("VITS is Awesome!", hps, vocab)
with torch.no_grad():
    # Add a leading batch dimension and pass the token count as the length.
    x_tst = stn_tst.to(device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    # Speaker id to synthesize with, as a one-element tensor on the device.
    sid = torch.LongTensor([4]).to(device)

    out = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)
    # First element of the result, first batch item, first channel.
    audio = out[0][0, 0].data.cpu().float().numpy()
# normalize=False plays the samples as produced, without rescaling.
ipd.display(ipd.Audio(audio, rate=hps.data.sample_rate, normalize=False))

### Voice Conversion


In [None]:
# Build the validation dataset and an unshuffled, batch-size-1 loader over it,
# then materialize every batch into a list so items can be indexed directly.
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
    drop_last=True,
    collate_fn=collate_fn,
)
data_list = [batch for batch in loader]

In [None]:
with torch.no_grad():
    # Move every tensor of the first validation batch to the target device.
    batch = [t.to(device) for t in data_list[0]]
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = batch

    # Target speaker ids, each as a one-element tensor on the device.
    sid_tgt1, sid_tgt2, sid_tgt3 = (
        torch.LongTensor([speaker]).to(device) for speaker in (1, 2, 4)
    )

    # Convert the source spectrogram to each target speaker, in order, and
    # pull the first batch item / first channel back to the CPU as numpy.
    audio1, audio2, audio3 = (
        net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=tgt)[0][0, 0]
        .data.cpu()
        .float()
        .numpy()
        for tgt in (sid_tgt1, sid_tgt2, sid_tgt3)
    )

# Play the original recording first, then each converted version.
print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sample_rate, normalize=False))
for sid_tgt, converted in ((sid_tgt1, audio1), (sid_tgt2, audio2), (sid_tgt3, audio3)):
    print("Converted SID: %d" % sid_tgt.item())
    ipd.display(ipd.Audio(converted, rate=hps.data.sample_rate, normalize=False))