In [124]:
# add autoreload
%load_ext autoreload
%autoreload 2

import os
import torch
import transformers
from TTS.api import TTS # only in v0.22
from TTS.tts.models.vits import Vits
from TTS.tts.models.xtts import Xtts

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.configs.vits_config import VitsConfig

from TTS.tts.utils.synthesis import synthesis
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer


from neon_tts_plugin_coqui import CoquiTTS as neonTTS
from IPython.display import Audio
from neon_tts_plugin_coqui.configs import tts_config
from scipy.io import wavfile
from scipy.signal import welch
from scipy.signal import iirnotch, filtfilt, sosfiltfilt, butter, sosfilt, sosfreqz

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from tensorflow.python.ops.gen_logging_ops import audio_summary

matplotlib.use('Qt5Agg')

import noisereduce as nr
from src import preprocessing

import random

import librosa
import soundfile as sf

from pydub import AudioSegment

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def generate_and_save_melspec(mel_spectrogram, sampling_rate, output_path="../artifacts/melgram.png"):
    # Generate mel spectrogram
    print(f"Mel spectrogram shape: {mel_spectrogram.shape}")
    print(f"Sampling rate: {sampling_rate}")

    # Save mel spectrogram as numpy array
    np.save(output_path + ".npy", mel_spectrogram)
    print(f"Mel spectrogram saved to {output_path}.npy")

    # Optionally, visualize and save the mel spectrogram as an image
    plt.figure(figsize=(10, 4))
    plt.imshow(mel_spectrogram, aspect='auto', origin='lower')
    plt.colorbar()
    plt.title('Mel Spectrogram')
    plt.tight_layout()
    plt.savefig(output_path + ".png")
    print(f"Mel spectrogram visualization saved to {output_path}.png")


# Vocoder test

In [None]:
def apply_vocoder(synthesizer, mel_spectrogram, output_path="../artifacts/vocoder.wav"):
    # Convert to tensor if it's not already
    if not isinstance(mel_spectrogram, torch.Tensor):
        mel_spectrogram = torch.FloatTensor(mel_spectrogram)

    # Ensure mel_spectrogram is the right shape (add batch dimension if needed)
    if mel_spectrogram.dim() == 2:
        mel_spectrogram = mel_spectrogram.unsqueeze(0)

    # Move to the same device as the vocoder (CUDA in this case)
    mel_spectrogram = mel_spectrogram.cuda()

    # Generate waveform
    with torch.no_grad():
        waveform = synthesizer.vocoder_model(mel_spectrogram)

    # Convert to numpy array
    waveform = waveform.cpu().numpy().squeeze()

    # Normalize audio to [-1, 1] range
    waveform = waveform / np.max(np.abs(waveform))

    # Get the sampling rate from the synthesizer
    sample_rate = synthesizer.vocoder_model.config.audio.sample_rate
    if sample_rate is None:
        # Fallback to a common sample rate if not found in config
        sample_rate = 22050
        print(f"Warning: Sample rate not found in vocoder config. Using default: {sample_rate}")


    # Save as wav file
    sf.write(output_path, waveform, sample_rate)
    print(f"Audio saved to {output_path}")

    return waveform, sample_rate

In [None]:
from TTS.utils.synthesizer import Synthesizer

In [None]:
# try https://huggingface.co/nvidia/tts_hifigan

In [None]:
vocoder_path = "/media/bramiozo/DATA-FAST/TTS/tts_models/gle/hifigan_vocoder_seanos"

In [None]:
synthesizer = Synthesizer()
synthesizer._load_vocoder(model_file=os.path.join(vocoder_path, "model_file.pth.tar"), 
                          model_config=os.path.join(vocoder_path, "config.json"), 
                          use_cuda=True)

In [None]:
tts_path = "/media/bramiozo/DATA-FAST/TTS/tts_models/gle/tts-vits-cv-ga_seanos"
tts_model = TTS(progress_bar=True,
                model_path=os.path.join(tts_path, "withPhonemes_withSpeakerEncoder_fft4098_allSingers_seperatedByPrep/model_file.pth.tar"),
                config_path=os.path.join(tts_path, "withPhonemes_withSpeakerEncoder_fft4098_allSingers_seperatedByPrep/config.json"))

In [None]:
tts_synthesizer = Synthesizer(tts_checkpoint=os.path.join(tts_path, "withPhonemes_withSpeakerEncoder_fft4098_allSingers_seperatedByPrep/model.pth"),
                              tts_config_path=os.path.join(tts_path, "withPhonemes_withSpeakerEncoder_fft4098_allSingers_seperatedByPrep/config.json"),
                              tts_speakers_file=os.path.join(tts_path, "withPhonemes_withSpeakerEncoder_fft4098_allSingers_seperatedByPrep/speakers.pth"),
                              tts_languages_file=os.path.join(tts_path, "language_ids.pth"),
                              vocoder_config=os.path.join(vocoder_path, "config.json"),
                              vocoder_checkpoint=os.path.join(vocoder_path, "best_model.pth")
                              )

In [None]:
_neonTTS = neonTTS(lang="ga", config={})

In [114]:
irish_lyrics = """
Bhí loch ag mo sheanmháthair,
Áit ina raibh na lachain ag snámh,
Le héadain bhána geal,
Is cluimhreach chomh bog le scamall.
"""

"""
Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth.

Sa mhaidin chiúin go moch,
Bhí an lacha ag éirí as a suan,from scipy.signal import welch

Le heireabaill ag crith,
Is a sciatháin ag sracadh an uisce.

Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth.

Nuair a tháinig an tráthnóna,
Bhí na lachain fós ann,
Le spraoi is súgradh leo,
Agus an ghrian ag dul faoi chiúin.

Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth.

Anois tá cuimhne agam ort,
A lachain mo sheanmháthar,
Áit álainn ar domhan,
Nach n-imeoidh uaim go bráth.

Ó, a lachain álainn, a sheod,
Le do ghlór binn i gconaí ag glaoch,
I do loch ghlé geal,
Agus tú chomh saor le gaoth
"""

dutch_lyrics = """
Zooals ik eenmaal beminde,
Zoo minde er op aarde nooit een,
Maar 'k vond, tot wien ik mij wendde,
Slechts harten van ijs en van steen.

Toen stierf mijn geloof aan de vriendschap,
Mijn hoop en mijn liefde verdween,
En zooals mijn hart toen haatte,
Zoo haatte er op aarde nooit een.

En sombere, bittere liederen
Zijn aan mijn lippen ontgleên;
Zoo somber en bitter als ik zong,
Zoo zong er op aarde nooit een.

Verveeld heeft mij eindlijk dat haten,
Dat eeuwig gezang en geween,
Ik zweeg, en zooals ik nu zwijg,
Zoo zweeg er op aarde nooit een.
"""

In [77]:
tts_model.speakers

['MCV_02e74f10e0327ad868d138f2b4fdd6f0',
 'MCV_035a6d8d8721f075d3eeeb25e4413502',
 'MCV_04e0d96daa7a8d1dc9948d13bd14c81d',
 'MCV_0528c8bed09c276fb6ab058820dbf9e0',
 'MCV_07974b511961d09cb0da27c0c959ab8d',
 'MCV_08a595a894bb72b93411e8af59144f6a',
 'MCV_0923e0456b7e0aaafb778ecb93a6352e',
 'MCV_0a319772898ef8163359af630a288669',
 'MCV_0beba018a63e28a8583543c6f39f10a6',
 'MCV_0cd4065ee07ccc1e3805d3455e077764',
 'MCV_0e0de9a39de96770d50e6fedbcaf0f69',
 'MCV_13ffa4195f5362d35b67afd6e82b5726',
 'MCV_1435de22b9df4a83b5e1898c05995708',
 'MCV_15926dcc42125114405323fb1f4cb6ae',
 'MCV_1679091c5a880faf6fb5e6087eb1b2dc',
 'MCV_17113e351f6032420215195a3525fde3',
 'MCV_1b9d967c542355502c235c49000312c1',
 'MCV_1be1458e6efbe7005b46b09db27d70db',
 'MCV_1ef4c8bd82871dec9d22ed3e1ba1d154',
 'MCV_1f0e3dad99908345f7439f8ffabdffc4',
 'MCV_1f4b737c6f401b82c2fe066989a534f6',
 'MCV_1fdf088d8283fdec89062842d6015afb',
 'MCV_1ff1de774005f8da13f42943881c655f',
 'MCV_21b30f1a25a5841e6cae632652267c86',
 'MCV_22d5ad8e3e

In [145]:
good_speakers = [ 'MCV_e4da3b7fbbce2345d7772b0674a318d5',
                  'MCV_d3d9446802a44259755d38e6d163e820',
                  'MCV_3c59dc048e8850243be8079a5c74d079',
                  'MCV_8e296a067a37563370ded05f5a3bf3ec',
                  'MCV_4e732ced3463d06de0ca9a15b6153677', 
                  'MCV_fbb1a0cf39f11efcc031d950dabb052e', 
                  'MCV_f289a00eda27794bd63b377387375254',
                  'MCV_e44c6be17899cf79fdc1997268b5366d',
                  'MCV_fbb1a0cf39f11efcc031d950dabb052e', 
                  'MCV_8f14e45fceea167a5a36dedd4bea2543'
                  ]


In [146]:
len(tts_model.speakers), [s in tts_model.speakers for s in good_speakers]

(178, [True, True, True, True, True, True, True, True, True, True])

In [147]:
# randomly select from speaker list 
synth = tts_model.synthesizer
sampling_rate = synth.output_sample_rate
files_written = []
waveforms = []
for speaker_id in good_speakers:    
    irish_waveform = synth.tts(irish_lyrics, speaker_name=speaker_id)
    irish_waveform = np.array(irish_waveform)
    irish_waveform = np.squeeze(irish_waveform)
    
    # irish_waveform as the original
    fwrite = f"../artifacts/RAW_irish_finetuned_speaker{speaker_id}.wav"
    wavfile.write(fwrite, rate=sampling_rate, data=irish_waveform)
    files_written.append(fwrite)
    waveforms.append(irish_waveform)

 > Text splitted to sentences.
['Bhí loch ag mo sheanmháthair,', 'Áit ina raibh na lachain ag snámh,', 'Le héadain bhána geal,', 'Is cluimhreach chomh bog le scamall.']
 > Processing time: 2.5453386306762695
 > Real-time factor: 0.20660824598897007
 > Text splitted to sentences.
['Bhí loch ag mo sheanmháthair,', 'Áit ina raibh na lachain ag snámh,', 'Le héadain bhána geal,', 'Is cluimhreach chomh bog le scamall.']
 > Processing time: 2.957944869995117
 > Real-time factor: 0.1830123809806061
 > Text splitted to sentences.
['Bhí loch ag mo sheanmháthair,', 'Áit ina raibh na lachain ag snámh,', 'Le héadain bhána geal,', 'Is cluimhreach chomh bog le scamall.']
 > Processing time: 3.0347492694854736
 > Real-time factor: 0.18175853268186304
 > Text splitted to sentences.
['Bhí loch ag mo sheanmháthair,', 'Áit ina raibh na lachain ag snámh,', 'Le héadain bhána geal,', 'Is cluimhreach chomh bog le scamall.']
 > Processing time: 2.9352593421936035
 > Real-time factor: 0.18520072708362603
 > Tex

In [148]:
# load in 10 random reference waveforms
import random
base_dir = '/media/bramiozo/DATA-FAST/TTS/tts_models/gle/seannos_datasource/clips'
ref_wav_files = os.listdir(base_dir)
reference_waves = []
for ref_wav_file in random.choices(ref_wav_files, k=20):
    rate, data = wavfile.read(os.path.join(base_dir, ref_wav_file))
    reference_waves.append((rate, data))

In [149]:
int_files = []
for raw_audio_file in files_written:
    output_path = raw_audio_file.replace('.wav', '_int16.wav')
    sample_rate, data = wavfile.read(raw_audio_file)
    
    if data.dtype == np.float32 or data.dtype == np.float64:
        # Normalize and convert to 16-bit PCM
        data_int16 = np.int16(data * 32767)  # Scale floats to 16-bit integer range        
        # Save the converted data as a new .wav file
        wavfile.write(output_path, sample_rate, data_int16)
        int_files.append(output_path)

In [82]:
for raw_audio_file in int_files:
    rate, data = wavfile.read(raw_audio_file)
    reduced_noise = nr.reduce_noise(y=data, sr=rate, n_fft=4096, win_length=4096, hop_length=512)
    raw_audio_file_denoised = raw_audio_file.replace('.wav', '_denoised.wav')    
    wavfile.write(raw_audio_file_denoised, rate, reduced_noise)

for raw_audio_file in int_files:
    rate, data = wavfile.read(raw_audio_file)
    reduced_noise = nr.reduce_noise(y=data, sr=rate, n_fft=2048, win_length=2048, hop_length=256)
    raw_audio_file_denoised = raw_audio_file.replace('.wav', '_denoised2.wav')    
    wavfile.write(raw_audio_file_denoised, rate, reduced_noise)
    
for raw_audio_file in int_files:
    rate, data = wavfile.read(raw_audio_file)
    reduced_noise = nr.reduce_noise(y=data, sr=rate, n_fft=2048, win_length=1028, hop_length=256)
    raw_audio_file_denoised = raw_audio_file.replace('.wav', '_denoised3.wav')    
    wavfile.write(raw_audio_file_denoised, rate, reduced_noise)
    
for raw_audio_file in int_files:
    rate, data = wavfile.read(raw_audio_file)
    reduced_noise = nr.reduce_noise(y=data, sr=rate, n_fft=2048, win_length=1028, hop_length=128)
    raw_audio_file_denoised = raw_audio_file.replace('.wav', '_denoised4.wav')    
    wavfile.write(raw_audio_file_denoised, rate, reduced_noise)

In [83]:
use_mono = False
use_noise_reduction = True
use_high_pass = True
use_low_pass = True
use_autotrim = False
use_pitcher = False
use_resampler = False
use_noise_adder = False
use_kernel_smoother = True
use_special_smoother = True
kwargs = {
    'noise_freq': 128,
    'octave_change': 0.25,
    'target_sample_rate': 44_000,
    'noise_level': 1e-4,
    'smoothing_kernel': (1,1,1),
    'window_length': 33,
    'poly_order': 3,
    'kernel_type': 'savgol_filter' # 
}

for raw_audio_file in int_files:
    output_path = raw_audio_file.replace('.wav', '_postprocessed.wav')
    audio_segment = AudioSegment.from_wav(raw_audio_file)
    processed_audio = preprocessing.process_audio(audio_segment,
                                    use_noise_reduction,
                                    use_high_pass,
                                    use_low_pass,
                                    use_autotrim,
                                    use_pitcher,
                                    use_mono,
                                    use_resampler,
                                    use_noise_adder,
                                    use_kernel_smoother,
                                    use_special_smoother,
                                    **kwargs
                                    )
    processed_audio.export(output_path, format="wav")
    print(f"Saved processed audio to {output_path}")

Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_e4da3b7fbbce2345d7772b0674a318d5_int16_postprocessed.wav
Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_d3d9446802a44259755d38e6d163e820_int16_postprocessed.wav
Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_3c59dc048e8850243be8079a5c74d079_int16_postprocessed.wav
Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_8e296a067a37563370ded05f5a3bf3ec_int16_postprocessed.wav
Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_4e732ced3463d06de0ca9a15b6153677_int16_postprocessed.wav
Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_fbb1a0cf39f11efcc031d950dabb052e_int16_postprocessed.wav
Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_f289a00eda27794bd63b377387375254_int16_postprocessed.wav
Saved processed audio to ../artifacts/RAW_irish_finetuned_speakerMCV_e44c6be17899cf79fdc1997268b5366d_int16_postproces

In [157]:
plt.figure(figsize=(10, 6))
for raw_audio_file in int_files:
    rate, data = wavfile.read(raw_audio_file)
    frequencies, psd = welch(data, fs=rate, nperseg=1024)   
    plt.semilogy(frequencies, psd)
    plt.title('Power Spectral Density (PSD), synthetic material')
    plt.xlabel('Frequency [Hz]')
    plt.ylabel('Power/Frequency [dB/Hz]')
    plt.grid()
    plt.xlim(0,20_000)
plt.show()

In [154]:
plt.figure(figsize=(10, 6))
for sr, waveform in reference_waves:
    frequencies, psd = welch(waveform, fs=44_000, nperseg=1024)   
    plt.semilogy(frequencies, psd)
    plt.title('Power Spectral Density (PSD), source material')
    plt.xlabel('Frequency [Hz]')
    plt.ylabel('Power/Frequency [dB/Hz]')
    plt.grid()
    plt.xlim(0,20_000)
plt.show()

In [122]:
def repeated_filtering(data, 
                       filter_freqs=[2330, 2750, 3350, 4350], 
                       Qs=[50,50,70,70],
                       fs=44_000):
    assert(len(filter_freqs) == len(Qs)), "You need as many Qs as Freqs"
    
    filtered_wav = data.copy()
    for freq,Q in zip(filter_freqs,Qs):
        b,a = iirnotch(freq, Q, fs)
        filtered_wav = filtfilt(b, a, filtered_wav)
    return filtered_wav

def repeated_filtering_sos(data, 
                       filter_freqs=[2330, 2750, 3350, 4350], 
                       Rs=[50, 50, 50, 50],
                       order=4,
                       fs=44_000):
    assert(len(filter_freqs) == len(Rs)), "You need as many Qs as Freqs"
    
    filtered_wav = data.copy()
    for freq,R in zip(filter_freqs,Rs):
        f_low, f_high = freq-R, freq+R 
        sos = butter(order, [f_low, f_high], btype='bandstop', fs=fs, output='sos')
        filtered_wav = sosfilt(sos, filtered_wav)
    return filtered_wav

In [169]:
kwargs = {
    'filter_freqs' :[2350],# , 2750, 3120, 4340],
    'Qs' : [120], #,80,100,100],
    'fs' : 44_000
}
plt.figure(figsize=(10, 6))
for raw_audio_file in int_files:
    rate, data = wavfile.read(raw_audio_file)
    filtered_waveform = repeated_filtering(data, **kwargs)
    
    raw_audio_file_denoised = raw_audio_file.replace('.wav', '_denoised_notch.wav')    
    wavfile.write(raw_audio_file_denoised, rate, filtered_waveform)

    frequencies, psd = welch(filtered_waveform, fs=rate, nperseg=1024)  
    plt.semilogy(frequencies, psd)
    plt.title('Power Spectral Density (PSD)')
    plt.xlabel('Frequency [Hz]')
    plt.ylabel('Power/Frequency [dB/Hz]')
    plt.grid()
    plt.xlim(0,20_000)
plt.axhline(y=480, color='k')
plt.show()

In [164]:
kwargs = {
    'filter_freqs' :[2350, 2750, 3120, 4340],
    'Rs' : [50,50,50,50],
    'fs' : 44_000,
    'order': 4
}
plt.figure(figsize=(10, 6))
for raw_audio_file in int_files:
    rate, data = wavfile.read(raw_audio_file)
    filtered_waveform = repeated_filtering_sos(data, **kwargs)
    frequencies, psd = welch(filtered_waveform, fs=rate, nperseg=1024)   
    
    raw_audio_file_denoised = raw_audio_file.replace('.wav', '_denoised_butter.wav')    
    wavfile.write(raw_audio_file_denoised, rate, filtered_waveform)
    
    plt.semilogy(frequencies, psd)
    plt.title('Power Spectral Density (PSD)')
    plt.xlabel('Frequency [Hz]')
    plt.ylabel('Power/Frequency [dB/Hz]')
    plt.grid()
    plt.xlim(0,20_000)
    
plt.axhline(y=480, color='k')
plt.show()