## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS

# Requirements

In [None]:
# One-time environment setup. These are shell commands: run them in a terminal
# BEFORE launching Jupyter, then start the notebook from the `openvoice` env.
# They are kept as comments because a code cell would parse them as Python
# (SyntaxError on "Run All"), and `conda activate` cannot switch the
# environment of an already-running kernel.
#
#   conda create -n openvoice python=3.9
#   conda activate openvoice
#   git clone git@github.com:myshell-ai/OpenVoice.git
#   cd OpenVoice
#   pip install -e .

# Imports

In [None]:
# NOTE(review): presumably a kernel smoke test — it only prints a fixed string.
print("test")

In [None]:
!pip3 install git+https://github.com/myshell-ai/MeloTTS.git


In [None]:
# List the current working directory.
# NOTE(review): presumably a check that the notebook runs from the repository
# root before the editable install in the next cell — confirm, or drop the cell.
!ls

In [None]:
!pip install -e .

In [None]:
import glob
import os

import IPython.display
import torch
from melo.api import TTS
from scipy.io import wavfile

from openvoice import se_extractor
from openvoice.api import ToneColorConverter

# Initialization

In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrates better robustness in some cases.

In [None]:
# Show whether PyTorch can see a CUDA GPU; the next cell falls back to the CPU otherwise.
torch.cuda.is_available()

In [None]:
# Run-wide settings shared by the cells below.
ckpt_converter = "checkpoints_v2/converter"  # folder holding config.json and checkpoint.pth
output_dir = "outputs_v2"                    # folder that receives the generated audio
speed = 1.0                                  # speech speed setting

# Use the first CUDA GPU when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Display the selected device as the cell output.
device

## Tone converter

In [None]:
# Build the tone color converter from the config file in the checkpoint folder, on the selected device.
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)

In [None]:
# Load the converter checkpoint that sits next to the config file.
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

In [None]:
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

### Obtain Tone Color Embedding
We only extract the tone color embedding for the target speaker. The source tone color embeddings can be directly loaded from the `checkpoints_v2/base_speakers/ses` folder.

In [None]:

# Reference recording whose tone color embedding becomes the conversion target.
reference_speaker = 'resources/major/major_2_02.wav' # This is the voice you want to clone
# get_se returns the target embedding plus a second value kept as `audio_name`.
# NOTE(review): vad=False presumably turns off voice-activity-detection based
# splitting of the reference audio — confirm against se_extractor.get_se.
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)

In [None]:
# Pre-computed embedding of the base (source) speaker; the file name is keyed
# by speaker_key. map_location places the loaded data on `device`.
speaker_key = "en-newest"
source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)

# Define input

In [None]:
# Input: the sentence to synthesise and convert (passed to run_tts further down).
text = "Let's make that our password, for when we meet again. And where shall I go now? The net is vast and limitless."

# Init model

In [None]:
def init_model(device: str = "cpu", language: str = "EN_NEWEST"):
    """Create the TTS model used for base speech synthesis.

    Args:
        device: Device string handed to ``TTS`` (defaults to ``"cpu"``).
        language: Language key handed to ``TTS``.

    Returns:
        The constructed ``TTS`` instance.
    """
    return TTS(language=language, device=device)
    

In [None]:
# Build the base TTS model through init_model (defined in the previous cell)
# instead of repeating the constructor call inline.
language = "EN_NEWEST"
model = init_model(device=device, language=language)

# NOTE(review): presumably a speaker-name -> id mapping from the model's hyper-parameters — confirm.
speaker_ids = model.hps.data.spk2id
# NOTE(review): id 0 is hard-coded rather than looked up in speaker_ids —
# confirm it is the intended speaker for this language.
speaker_id = 0

# Run model
Run the base TTS, save its output to a temporary file, then convert it to the target speaker's tone color and save the result.

In [None]:
def run_tts(input_text:str, model, output_dir, source_se, target_se, speaker_id=0, speed=1.0, converter=None):
    """Synthesise ``input_text`` with the base model, then convert it to the target voice.

    Args:
        input_text: Text to read aloud.
        model: Object exposing ``tts_to_file(text, speaker_id, path, speed=...)``.
        output_dir: Folder for the intermediate and final wav files; created if missing.
        source_se: Tone color embedding of the base speaker.
        target_se: Tone color embedding of the voice to clone.
        speaker_id: Speaker id forwarded to ``model.tts_to_file``.
        speed: Speed value forwarded to ``model.tts_to_file``.
        converter: Object exposing
            ``convert(audio_src_path, src_se, tgt_se, output_path, message)``.
            When omitted, the notebook-level ``tone_color_converter`` is used.

    Returns:
        ``(wav, samplerate)``: the sample data and sample rate of the converted
        file, as read back by ``scipy.io.wavfile.read``.
    """
    if converter is None:
        # Fall back to the converter created earlier in the notebook.
        converter = tone_color_converter

    os.makedirs(output_dir, exist_ok=True)
    # Intermediate file holding the base TTS output; overwritten on every call.
    src_path = os.path.join(output_dir, 'tmp.wav')

    print(f"Reading: {input_text}")
    model.tts_to_file(input_text, speaker_id, src_path, speed=speed)

    # Name the result after the first 10 characters of *this call's* text (not
    # a notebook global), replacing characters that are unsafe in file names.
    stem = "".join(
        "_" if (ch in '<>:"/\\|?*' or not ch.isprintable()) else ch
        for ch in input_text[:10]
    )
    save_path = os.path.join(output_dir, f'output_v2_{stem}.wav')

    # Run the tone color converter.
    # NOTE(review): `message` is presumably a watermark string embedded by the
    # converter — confirm against ToneColorConverter.convert.
    converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message="@MyShell",
    )

    # Read the wav file back in
    samplerate, wav = wavfile.read(save_path)
    return wav, samplerate
    


In [None]:
# Synthesise `text` in the cloned voice, reusing the configured output_dir,
# speaker_id and speed instead of repeating their values inline.
wav, samplerate = run_tts(text, model, output_dir, source_se, target_se, speaker_id=speaker_id, speed=speed)

In [None]:
# Play the converted audio inline (IPython.display is imported in the import cell at the top).
IPython.display.Audio(wav, rate=samplerate)

In [None]:
# Second example: a new sentence in the same cloned voice, using the configured
# output_dir, speaker_id and speed rather than a repeated literal path.
text = "Sorry for the delay. I was using an old version of ipython. Everything works great now. Having player directly in notebook is very neat to play around with sounds "
wav, samplerate = run_tts(text, model, output_dir, source_se, target_se, speaker_id=speaker_id, speed=speed)
IPython.display.Audio(wav, rate=samplerate)