In [None]:
import os
import torch
import transformers
from TTS.api import TTS # only in v0.22
from TTS.tts.models.vits import Vits
from TTS.tts.models.xtts import Xtts

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.configs.vits_config import VitsConfig

from TTS.tts.utils.synthesis import synthesis
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer

from neon_tts_plugin_coqui import CoquiTTS as neonTTS
from IPython.display import Audio
from scipy.io import wavfile
import numpy as np

import matplotlib.pyplot as plt
import random

import librosa
import soundfile as sf

from pydub import AudioSegment

In [None]:
def generate_and_save_melspec(mel_spectrogram, sampling_rate, output_path="../artifacts/melgram.png"):
    """Save a mel spectrogram as a ``.npy`` array and as a ``.png`` image.

    Despite the name, nothing is computed here: the spectrogram passed in is
    written to disk unchanged and rendered with ``plt.imshow``.

    Args:
        mel_spectrogram: Array with a ``.shape`` attribute that ``np.save`` and
            ``plt.imshow`` accept (assumed ``(n_mels, frames)`` -- TODO confirm
            against callers).
        sampling_rate: Only printed for reference; it does not affect the files.
        output_path: Base path of the two output files. A trailing ``.png`` or
            ``.npy`` extension is stripped before the real extensions are
            appended, so the default produces ``melgram.npy`` and
            ``melgram.png`` instead of ``melgram.png.npy`` / ``melgram.png.png``.
            Any other suffix is kept as part of the base name.
    """
    print(f"Mel spectrogram shape: {mel_spectrogram.shape}")
    print(f"Sampling rate: {sampling_rate}")

    # Only strip the two extensions this function writes itself, so that
    # names such as "melgram_v1.2" are not truncated.
    base_path, extension = os.path.splitext(output_path)
    if extension.lower() not in (".png", ".npy"):
        base_path = output_path
    array_path = base_path + ".npy"
    image_path = base_path + ".png"

    # Raw values, for later reloading with np.load.
    np.save(array_path, mel_spectrogram)
    print(f"Mel spectrogram saved to {array_path}")

    # Visual check; origin='lower' puts the first row at the bottom of the image.
    plt.figure(figsize=(10, 4))
    plt.imshow(mel_spectrogram, aspect='auto', origin='lower')
    plt.colorbar()
    plt.title('Mel Spectrogram')
    plt.tight_layout()
    plt.savefig(image_path)
    print(f"Mel spectrogram visualization saved to {image_path}")


# Vocoder test

In [81]:
def _find_vocoder_sample_rate(synthesizer):
    """Return the vocoder sample rate reachable from ``synthesizer``, or ``None``."""
    candidates = (
        # Location the original code read from.
        getattr(getattr(synthesizer, "vocoder_model", None), "config", None),
        # NOTE(review): presumably populated by Synthesizer._load_vocoder -- confirm.
        getattr(synthesizer, "vocoder_config", None),
    )
    for config in candidates:
        audio = getattr(config, "audio", None)
        if audio is None:
            continue
        # The audio section may be a plain dict or an attribute-style object.
        if isinstance(audio, dict):
            sample_rate = audio.get("sample_rate")
        else:
            sample_rate = getattr(audio, "sample_rate", None)
        if sample_rate is not None:
            return sample_rate
    return None


def apply_vocoder(synthesizer, mel_spectrogram, output_path="../artifacts/vocoder.wav"):
    """Run a mel spectrogram through ``synthesizer.vocoder_model`` and save the audio.

    Args:
        synthesizer: Object exposing a callable ``vocoder_model``.
        mel_spectrogram: ``torch.Tensor`` or anything ``torch.FloatTensor``
            accepts. A 2-D input gets a leading batch dimension (assumed
            ``(n_mels, frames)`` -- TODO confirm against the vocoder).
        output_path: Destination passed to ``sf.write``.

    Returns:
        Tuple ``(waveform, sample_rate)``: the peak-normalised NumPy waveform
        and the sample rate it was written with.
    """
    vocoder = synthesizer.vocoder_model

    # Convert to tensor if it's not already
    if not isinstance(mel_spectrogram, torch.Tensor):
        mel_spectrogram = torch.FloatTensor(mel_spectrogram)

    # Add the batch dimension the vocoder is called with
    if mel_spectrogram.dim() == 2:
        mel_spectrogram = mel_spectrogram.unsqueeze(0)

    # Follow the vocoder's own device instead of assuming CUDA, so a vocoder
    # loaded on CPU works as well. Without parameters to inspect, keep the
    # previous preference for CUDA whenever it is available.
    parameters = getattr(vocoder, "parameters", None)
    first_parameter = next(parameters(), None) if callable(parameters) else None
    if first_parameter is not None:
        target_device = first_parameter.device
    else:
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mel_spectrogram = mel_spectrogram.to(target_device)

    # Generate waveform
    with torch.no_grad():
        waveform = vocoder(mel_spectrogram)

    # Convert to numpy array
    waveform = waveform.detach().cpu().numpy().squeeze()

    # Normalize audio to [-1, 1]; silent or empty output is left untouched
    # rather than divided by zero (which would yield NaNs).
    peak = np.max(np.abs(waveform)) if waveform.size else 0.0
    if peak > 0:
        waveform = waveform / peak

    # A missing attribute is treated like a missing value so the fallback
    # below is actually reachable.
    sample_rate = _find_vocoder_sample_rate(synthesizer)
    if sample_rate is None:
        # Fallback to a common sample rate if not found in config
        sample_rate = 22050
        print(f"Warning: Sample rate not found in vocoder config. Using default: {sample_rate}")

    # Save as wav file
    sf.write(output_path, waveform, sample_rate)
    print(f"Audio saved to {output_path}")

    return waveform, sample_rate

In [None]:
from TTS.utils.synthesizer import Synthesizer

In [None]:
# Directory the vocoder checkpoint ("model_file.pth.tar") and "config.json" are read from.
# NOTE(review): machine-specific absolute path; judging by its name this is a HiFi-GAN vocoder -- confirm.
vocoder_path = "/media/bramiozo/DATA-FAST/TTS/tts_models/gle/hifigan_vocoder_seanos"

In [None]:
# Start from an empty Synthesizer and attach only the vocoder to it.
# NOTE(review): `_load_vocoder` is a private method -- confirm it stays available across TTS versions.
synthesizer = Synthesizer()
synthesizer._load_vocoder(
    model_file=os.path.join(vocoder_path, "model_file.pth.tar"),
    model_config=os.path.join(vocoder_path, "config.json"),
    use_cuda=True,
)

In [99]:
# Fine-tuned model, loaded from a local checkpoint directory (machine-specific path).
tts_path = "/media/bramiozo/DATA-FAST/TTS/tts_models/gle/tts-vits-cv-ga_seanos"
tts_model = TTS(
    progress_bar=True,
    model_path=os.path.join(tts_path, "model_file.pth.tar"),
    config_path=os.path.join(tts_path, "config.json"),
)

# Neon Coqui plugin instance for Irish ("ga"), used by later cells for comparison output.
_neonTTS = neonTTS(lang="ga", config={})

 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:44100
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers.


  return torch.load(f, map_location="cpu")
  return torch.load(f, map_location=map_location, **kwargs)


2024-07-25 21:16:05.334 - OVOS - ovos_plugin_manager.g2p:create:142 - ERROR - The selected G2P plugin could not be loaded.
Traceback (most recent call last):
  File "/media/bramiozo/Storage1/bramiozo/VIRTUALENVS/Python/seanos-bFLQpzeS-py3.10/lib/python3.10/site-packages/ovos_plugin_manager/g2p.py", line 139, in create
    g2p = clazz(g2p_config)
TypeError: 'NoneType' object is not callable
2024-07-25 21:16:05.354 - OVOS - ovos_plugin_manager.templates.tts:__init__:205 - ERROR - G2P plugin not loaded, there will be no mouth movements
Traceback (most recent call last):
  File "/media/bramiozo/Storage1/bramiozo/VIRTUALENVS/Python/seanos-bFLQpzeS-py3.10/lib/python3.10/site-packages/ovos_plugin_manager/templates/tts.py", line 203, in __init__
    self.g2p = OVOSG2PFactory.create(cfg)
  File "/media/bramiozo/Storage1/bramiozo/VIRTUALENVS/Python/seanos-bFLQpzeS-py3.10/lib/python3.10/site-packages/ovos_plugin_manager/g2p.py", line 139, in create
    g2p = clazz(g2p_config)
TypeError: 'NoneType

In [None]:
# Irish test text fed to the synthesis cells; verses and a repeated chorus,
# separated by blank lines.
irish_lyrics = """
Bhí loch ag mo sheanmháthair,
Áit ina raibh na lachain ag snámh,
Le héadain bhána geal,
Is cluimhreach chomh bog le scamall.

Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth.

Sa mhaidin chiúin go moch,
Bhí an lacha ag éirí as a suan,
Le heireabaill ag crith,
Is a sciatháin ag sracadh an uisce.

Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth.

Nuair a tháinig an tráthnóna,
Bhí na lachain fós ann,
Le spraoi is súgradh leo,
Agus an ghrian ag dul faoi chiúin.

Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth.

Anois tá cuimhne agam ort,
A lachain mo sheanmháthar,
Áit álainn ar domhan,
Nach n-imeoidh uaim go bráth.

Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth
"""

# Dutch test text, passed to the XTTS cell with language="nl".
dutch_lyrics = """
Zooals ik eenmaal beminde,
Zoo minde er op aarde nooit een,
Maar 'k vond, tot wien ik mij wendde,
Slechts harten van ijs en van steen.

Toen stierf mijn geloof aan de vriendschap,
Mijn hoop en mijn liefde verdween,
En zooals mijn hart toen haatte,
Zoo haatte er op aarde nooit een.

En sombere, bittere liederen
Zijn aan mijn lippen ontgleên;
Zoo somber en bitter als ik zong,
Zoo zong er op aarde nooit een.

Verveeld heeft mij eindlijk dat haten,
Dat eeuwig gezang en geween,
Ik zweeg, en zooals ik nu zwijg,
Zoo zweeg er op aarde nooit een.
"""

In [125]:
# Draw one entry from the model's speaker list; it is passed on as `speaker_name`.
# NOTE(review): no random seed is set in the visible cells, so the choice changes per run.
speaker_id = random.choice(tts_model.speakers)
print(speaker_id)

# Reuse the synthesizer owned by the TTS wrapper and take its output rate for saving.
synth = tts_model.synthesizer
sampling_rate = synth.output_sample_rate

# Synthesize, convert to a NumPy array and drop singleton dimensions in one step.
irish_waveform = np.squeeze(np.array(synth.tts(irish_lyrics, speaker_name=speaker_id)))

MCV_d2514d88494807647e1a8b612bcfb207
 > Text splitted to sentences.
['Bhí loch ag mo sheanmháthair,', 'Áit ina raibh na lachain ag snámh,', 'Le héadain bhána geal,', 'Is cluimhreach chomh bog le scamall.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Sa mhaidin chiúin go moch,', 'Bhí an lacha ag éirí as a suan,', 'Le heireabaill ag crith,', 'Is a sciatháin ag sracadh an uisce.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Nuair a tháinig an tráthnóna,', 'Bhí na lachain fós ann,', 'Le spraoi is súgradh leo,', 'Agus an ghrian ag dul faoi chiúin.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Anois tá cuimhne agam ort,', 'A lachain mo sheanmháthar,', 'Áit álainn ar domhan,', 'Nach n-imeoidh uaim go bráth.', 'Ó, a lachain álainn, a sheod,

In [126]:
# Mel power spectrogram of the synthesized audio, converted to dB relative to its own peak.
# n_mels / n_fft / hop_length are hand-picked here and may need adjusting to the model's config.
mel_spectrogram = librosa.power_to_db(
    librosa.feature.melspectrogram(
        y=irish_waveform,
        sr=sampling_rate,
        n_mels=80,
        n_fft=1024,
        hop_length=256,
    ),
    ref=np.max,
)
print(f"Mel spectrogram shape: {mel_spectrogram.shape}")

Mel spectrogram shape: (80, 16851)


In [127]:
#generate_and_save_melspec(mel_spectrogram, sampling_rate)

In [128]:
# Keep the directly synthesized waveform as the reference; the speaker is part of the file name.
wavfile.write(f"../artifacts/test_irish_finetuned_speaker{speaker_id}.wav", sampling_rate, irish_waveform)

# Vocoder round-trip of the mel spectrogram (currently disabled).
#new_wav, sample_rate = apply_vocoder(synthesizer, mel_spectrogram, output_path=f"../artifacts/vocoder_speaker{speaker_id}.wav")

In [None]:
# Synthesize the Irish text with the Neon plugin and write it to the given wav path.
result = _neonTTS.get_tts(
    irish_lyrics,
    "../artifacts/test_irish_original.wav",
    speaker={"language": "ga"},
)

# XTTS API v0.22.0

In [None]:
# XTTS-v2 checkpoint directory and its config file.
# NOTE(review): this cell points at /media/koekiemonster while the other cells use
# /media/bramiozo -- confirm which mount is correct on the machine running the notebook.
MODEL_PATH = "/media/koekiemonster/DATA-FAST/TTS/tts_models/multilingual/multi-dataset/XTTS-v2"
CONFIG_PATH = "/media/koekiemonster/DATA-FAST/TTS/tts_models/multilingual/multi-dataset/XTTS-v2/config.json"

_tts = TTS(progress_bar=True,
            model_path=MODEL_PATH,
            config_path=CONFIG_PATH)
# "gpu" is not a device type torch accepts; the CUDA device is called "cuda".
# Fall back to CPU so the cell still runs on a machine without a GPU.
_tts.to("cuda" if torch.cuda.is_available() else "cpu")
# Synthesize the Dutch text in the voice of the reference recording and write it to disk.
_tts.tts_to_file(text=dutch_lyrics,
                 language="nl",
                 speaker_wav="../assets/english_bram.wav",
                 file_path="../artifacts/test_dutch.wav")

## Direct VITS

In [None]:
# Checkpoint directory and config of the VITS model loaded without the TTS wrapper.
# NOTE: MODEL_PATH / CONFIG_PATH are rebound here; the XTTS cell assigns the same names.
MODEL_PATH = "/media/bramiozo/DATA-FAST/TTS/tts_models/multilingual/multi-dataset/tts-vits-cv-ga"
CONFIG_PATH = "/media/bramiozo/DATA-FAST/TTS/tts_models/multilingual/multi-dataset/tts-vits-cv-ga/config.json"

# Build config, audio processor and tokenizer; the tokenizer factory also returns
# a config object, which replaces the previous one.
config = VitsConfig()
config.load_json(CONFIG_PATH)
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Instantiate the model and load the weights in eval mode (non-strict, uncached).
model = Vits.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(MODEL_PATH, "model_file.pth.tar"),
    eval=True,
    strict=False,
    cache=False,
)

# Attach the helpers built above, then move the model to the GPU.
model.ap = ap
model.tokenizer = tokenizer
model.cuda()


In [None]:
# Run the low-level synthesis helper on the directly loaded model.
synthesis_outputs = synthesis(
    model,
    irish_lyrics,
    config,
    style_wav="../assets/english_bram.wav",
    use_cuda=True
)

# Read the results by key rather than unpacking `.values()` positionally: that
# depends on the number and order of entries in the returned dict and rebinds
# `_`, which IPython uses for the last cell output.
# NOTE(review): key names taken from Coqui TTS's `synthesis` return dict -- confirm for the installed version.
wav = synthesis_outputs["wav"]
alignment = synthesis_outputs["alignments"]

# Save the output waveform
ap.save_wav(wav, "../artifacts/test_irish.wav")

In [93]:
# Same Neon plugin call as in the earlier cell: synthesize the Irish text to the given wav path.
result = _neonTTS.get_tts(
    irish_lyrics,
    "../artifacts/test_irish_original.wav",
    speaker={"language": "ga"},
)

 > Text splitted to sentences.
['Bhí loch ag mo sheanmháthair,', 'Áit ina raibh na lachain ag snámh,', 'Le héadain bhána geal,', 'Is cluimhreach chomh bog le scamall.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Sa mhaidin chiúin go moch,', 'Bhí an lacha ag éirí as a suan,', 'Le heireabaill ag crith,', 'Is a sciatháin ag sracadh an uisce.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Nuair a tháinig an tráthnóna,', 'Bhí na lachain fós ann,', 'Le spraoi is súgradh leo,', 'Agus an ghrian ag dul faoi chiúin.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Anois tá cuimhne agam ort,', 'A lachain mo sheanmháthar,', 'Áit álainn ar domhan,', 'Nach n-imeoidh uaim go bráth.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glao

In [94]:
# Request the audio in the plugin's "ipython" format; the following cells read
# wavresult['data'] and wavresult['rate'] from the returned object.
wavresult = _neonTTS.get_audio(irish_lyrics,  audio_format="ipython")

 > Text splitted to sentences.
['Bhí loch ag mo sheanmháthair,', 'Áit ina raibh na lachain ag snámh,', 'Le héadain bhána geal,', 'Is cluimhreach chomh bog le scamall.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Sa mhaidin chiúin go moch,', 'Bhí an lacha ag éirí as a suan,', 'Le heireabaill ag crith,', 'Is a sciatháin ag sracadh an uisce.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Nuair a tháinig an tráthnóna,', 'Bhí na lachain fós ann,', 'Le spraoi is súgradh leo,', 'Agus an ghrian ag dul faoi chiúin.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glaoch,', 'I do loch ghlé geal,', 'Agus tú chomh saor le gaoth.', 'Anois tá cuimhne agam ort,', 'A lachain mo sheanmháthar,', 'Áit álainn ar domhan,', 'Nach n-imeoidh uaim go bráth.', 'Ó, a lachain álainn, a sheod,', 'Le do ghlór binn i gconaí ag glao

In [95]:
# Persist the plugin output at its reported rate.
# NOTE: the direct-VITS cell saves to the same path, so whichever cell runs last wins.
wavfile.write("../artifacts/test_irish.wav", wavresult['rate'], np.array(wavresult['data']))

In [None]:
 # useful to know if we want to apply our custom finetuned model 
 #model_path = _neonTTS._download_huggingface('neongeckocom/tts-vits-cv-ga')

In [None]:
# Inline player for the plugin output; normalize=False keeps the samples as they are
# instead of letting IPython rescale them, and autoplay starts playback on display.
Audio(np.array(wavresult['data']), rate=wavresult['rate'], normalize=False, autoplay=True)

In [None]:
# Load the clip that the reverb cells below operate on.
# NOTE(review): no visible cell writes test_irish_reencoded.wav -- presumably it is
# re-encoded outside the notebook; confirm before relying on Run All.
song_to_edit = AudioSegment.from_file('../artifacts/test_irish_reencoded.wav', format='wav')


In [None]:
def add_reverb(sound, decay_factor=0.1, delays=(50, 100, 150, 200, 400, 800, 1600, 3200)):
    """Overlay progressively quieter, delayed copies of ``sound`` onto itself.

    Previously ``decay_factor`` never influenced the result, and every echo was
    reduced by ``3 * delay`` dB (150 dB for the first one), which is far below
    audibility. Now the k-th echo is scaled to ``decay_factor ** k`` of the dry
    amplitude, i.e. reduced by ``k * -20 * log10(decay_factor)`` dB.

    Args:
        sound: Object supporting ``sound - dB`` (gain reduction) and
            ``.overlay(other, position=...)``, e.g. a pydub ``AudioSegment``.
        decay_factor: Linear amplitude ratio between successive echoes. Values
            ``<= 0`` mean "no echo" and return ``sound`` unchanged.
        delays: Echo offsets handed to ``overlay`` as ``position`` (milliseconds
            for pydub). Any iterable works; the default is a tuple so it cannot
            be mutated between calls.

    Returns:
        The result of chaining ``overlay`` over all echoes; ``sound`` itself is
        not modified by this function.
    """
    import math  # local import: `math` is not among the notebook's top-level imports

    # An amplitude ratio of zero (or less) has no finite dB value and no audible echo.
    if decay_factor <= 0:
        return sound

    # dB reduction applied once per echo generation (positive when decay_factor < 1).
    attenuation_db = -20.0 * math.log10(decay_factor)

    output = sound
    for echo_index, delay in enumerate(delays, start=1):
        echo = sound - (attenuation_db * echo_index)
        output = output.overlay(echo, position=delay)
    return output


In [None]:
# Apply the reverb with its default settings and write the result as a wav file.
song_edited = add_reverb(song_to_edit)
song_edited.export("../artifacts/test_reverb.wav", format="wav")