In [1]:
import argparse
import IPython.display as ipd
import json
import librosa
import os

# Change the working directory to the parent directory exactly once.
# `path` doubles as the "already done" marker: on the first run the name is
# undefined, so the lookup raises NameError and we chdir; on later runs of this
# cell the name exists and the chdir is skipped, so the working directory does
# not keep climbing.
try:
    path
except NameError:
    # Only the "name not defined yet" case is expected here; a bare `except:`
    # would also swallow KeyboardInterrupt/SystemExit.
    path = "../"
    os.chdir(path)
    
import phonemizer
import random
from scipy.io.wavfile import write
import torch
import torchaudio
from tqdm import tqdm
from transformers import HubertModel

from unitspeech.unitspeech import UnitSpeech
from unitspeech.duration_predictor import DurationPredictor
from unitspeech.encoder import Encoder
from unitspeech.speaker_encoder.ecapa_tdnn import ECAPA_TDNN_SMALL
from unitspeech.text import cleaned_text_to_sequence, phonemize, symbols
from unitspeech.textlesslib.textless.data.speech_encoder import SpeechEncoder
from unitspeech.util import HParams, fix_len_compatibility, intersperse, process_unit, generate_path, sequence_mask
from unitspeech.vocoder.env import AttrDict
from unitspeech.vocoder.meldataset import mel_spectrogram
from unitspeech.vocoder.models import BigVGAN

from conf.hydra_config import (
    MainConfig,
)
import pandas as pd
import numpy as np

import soundfile as sf

from unitspeech.util import (
    fix_len_compatibility,
    save_plot,
    sequence_mask,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration object used by the rest of the notebook.
cfg = MainConfig

# Use the GPU only when CUDA is available *and* the config asks for it.
use_cuda = torch.cuda.is_available() and cfg.train.on_GPU
device = torch.device("cuda" if use_cuda else "cpu")

# Report where we are running from and on which device.
print(f"Running from {os.getcwd()}")
print(f"Device: {device}")

Running from /workspace/local
Device: cuda


In [3]:
# The CSV has no header row; fields are pipe-separated in this order.
column_names = ['path', 'transcript', 'speaker_id']
reference_speech_samples = pd.read_csv(
    'evaluation/synthesized_audio_AWGN.csv',
    sep="|",
    header=None,
    names=column_names,
)

# Show how often each distinct (path, transcript, speaker_id) row occurs.
reference_speech_samples.value_counts()

path                                                     transcript                                                                           speaker_id
/outputs/evaluation/with-finetune_AWGN/bal_ivan_026.wav  Mulțumesc dumneavoastră, zise Ivan, tresărind.                                       0             1
/outputs/evaluation/with-finetune_AWGN/mrl_rnd1_221.wav  La fiecare zgomot tresăream, de câte ori auzeam sirene mă ascundeam.                 24            1
/outputs/evaluation/with-finetune_AWGN/mrl_rnd2_044.wav  În acest caz, nu se poate spune de o schimbare.                                      24            1
/outputs/evaluation/with-finetune_AWGN/mrl_rnd2_031.wav  Apă otrăvită, după cum veți vedea în cele ce urmează.                                24            1
/outputs/evaluation/with-finetune_AWGN/mrl_rnd1_493.wav  Nu a murit, dar a rămas mutilată și asta m-a marcat.                                 24            1
                                                         

In [7]:
def sv56_normalize(audio, sr, target_level=-26.0):
    """Scale ``audio`` so that its overall RMS level equals ``target_level`` dB.

    This is a plain RMS-based gain adjustment standing in for the actual SV56
    algorithm: the level is measured as ``20 * log10(rms)`` over the whole
    signal and a single gain factor is applied to every sample.

    Args:
        audio: Array-like of audio samples.
        sr: Sampling rate. Not used by the computation; kept so existing
            callers that pass it keep working.
        target_level: Desired RMS level in dB. Defaults to -26.0.

    Returns:
        A new numpy array with the same shape as ``audio``, scaled to the
        target level. Empty input, all-zero (silent) input, and input whose
        RMS is not finite are returned unscaled, because no finite gain can
        bring them to the target level.
    """
    audio = np.asarray(audio)

    # Nothing to measure: np.mean of an empty array would yield NaN.
    if audio.size == 0:
        return audio * 1.0

    # Measure the level in float64 so integer PCM input cannot overflow when
    # squared and float32 input does not lose precision in the accumulation.
    samples = audio.astype(np.float64)
    rms = float(np.sqrt(np.mean(samples ** 2)))

    # Silence has rms == 0 -> log10(0) == -inf -> gain == inf, and 0 * inf
    # would fill the output with NaN. Return the signal unscaled instead.
    if rms == 0.0 or not np.isfinite(rms):
        return audio * 1.0

    current_level = 20.0 * np.log10(rms)

    # Gain (in dB, then linear) needed to move the current level to the target.
    gain_db = target_level - current_level
    # A Python float keeps float32 input as float32 after the multiplication.
    gain = float(10.0 ** (gain_db / 20.0))

    return audio * gain

# For every listed file: load it at 16 kHz, level-normalize it, and save the
# result under the same file name in a sibling directory whose name is the
# source directory with "_16K" appended.
for index, row in reference_speech_samples.iterrows():
    # Load the audio file at the requested 16 kHz sampling rate.
    path = row['path']
    audio, sr = librosa.load(path, sr=16_000)

    # Output directory: "<source directory>_16K", created on demand.
    parent_dir = os.path.dirname(path)
    parent_dir = parent_dir + "_16K"
    os.makedirs(parent_dir, exist_ok=True)

    # Keep the original file name.
    file_name = os.path.basename(path)

    # Normalize the level of the loaded signal.
    normalized_audio = sv56_normalize(audio, sr)

    # Save the *normalized* signal. Previously the un-normalized `audio` was
    # written here, so the normalization above had no effect on the output.
    new_path = os.path.join(parent_dir, file_name)
    write(new_path, sr, normalized_audio)