In [34]:
import librosa
import os
import torch
import torchaudio
from unitspeech.speaker_encoder.ecapa_tdnn import ECAPA_TDNN_SMALL
from conf.hydra_config import MainConfig
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Move to the project root exactly once per kernel session.
# `path` is undefined only on the first run of this cell, so re-running the
# cell does not climb one more directory up. Only NameError is caught: any
# other failure (e.g. a KeyboardInterrupt) must not be silently swallowed.
try:
    path
except NameError:
    path = "../"
    os.chdir(path)

In [35]:
# Project configuration; the GPU is selected only when CUDA is available
# and the configuration enables it.
cfg = MainConfig
device = (
    torch.device("cuda")
    if torch.cuda.is_available() and cfg.train.on_GPU
    else torch.device("cpu")
)

print(f"Running from {os.getcwd()}")
print(f"Device: {device}")

Running from /workspace/local
Device: cuda


In [71]:
# The evaluation manifests are header-less, pipe-separated files with these fields.
column_names = ['path', 'transcript', 'speaker_id']

# Reference manifest: keep exactly one of the alternatives below active.
reference_speech_samples = pd.read_csv(
    "evaluation/evaluation.csv",
    sep="|",
    header=None,
    names=column_names,
)
# reference_speech_samples = pd.read_csv('evaluation/synthesized_audio_CONDY.csv', delimiter="|", header=None, names=column_names)
# reference_speech_samples = pd.read_csv('evaluation/synthesized_audio_CONDY_sv56.csv', delimiter="|", header=None, names=column_names)

# Synthesized manifest: keep exactly one of the alternatives below active.
# synthesized_speech_samples = pd.read_csv('evaluation/synthesized_audio_AWGN.csv', delimiter="|", header=None, names=column_names)
# synthesized_speech_samples = pd.read_csv('evaluation/synthesized_audio_AWGN_sv56.csv', delimiter="|", header=None, names=column_names)
synthesized_speech_samples = pd.read_csv(
    "evaluation/synthesized_audio_AWGN_500_sv56.csv",
    sep="|",
    header=None,
    names=column_names,
)

synthesized_speech_samples.value_counts()

path                                                              transcript                                                                           speaker_id
/outputs/evaluation/with-finetune_AWGN_500_sv56/bal_ivan_026.wav  Mulțumesc dumneavoastră, zise Ivan, tresărind.                                       0             1
/outputs/evaluation/with-finetune_AWGN_500_sv56/mrl_rnd1_221.wav  La fiecare zgomot tresăream, de câte ori auzeam sirene mă ascundeam.                 24            1
/outputs/evaluation/with-finetune_AWGN_500_sv56/mrl_rnd2_044.wav  În acest caz, nu se poate spune de o schimbare.                                      24            1
/outputs/evaluation/with-finetune_AWGN_500_sv56/mrl_rnd2_031.wav  Apă otrăvită, după cum veți vedea în cele ce urmează.                                24            1
/outputs/evaluation/with-finetune_AWGN_500_sv56/mrl_rnd1_493.wav  Nu a murit, dar a rămas mutilată și asta m-a marcat.                                 24            1
   

# Get a list of the unique speakers

In [53]:
# Distinct speaker ids, in order of first appearance in the synthesized manifest.
speakers = synthesized_speech_samples["speaker_id"].unique()
speakers

array([ 0,  6,  8, 24, 37])

# Real Time Factor (RTF)

In [54]:
def report_rtf(processing_time, speech_duration, label="Average Real time factor"):
    """Print timing statistics for one run and return its real time factor.

    Args:
        processing_time: NumPy array of processing times
            (presumably one entry per synthesized sample — TODO confirm).
        speech_duration: NumPy array of speech durations, aligned with
            `processing_time`.
        label: Text printed in front of the real time factor.

    Returns:
        Mean processing time divided by mean speech duration.
    """
    print(f"Processing time mean: {processing_time.mean():.2f}")
    print(f"Processing time max: {processing_time.max():.2f}")
    print(f"Processing time min: {processing_time.min():.2f}")
    print(f"Speech duration mean: {speech_duration.mean():.2f}")
    print(f"Speech duration max: {speech_duration.max():.2f}")
    print(f"Speech duration min: {speech_duration.min():.2f}")
    # Compute real time factor
    rtf = processing_time.mean() / speech_duration.mean()
    print(f"{label}: {rtf:.4f}")
    return rtf


# AWGN run
processing_time_50 = np.load("evaluation/processing_time_with-finetune_AWGN.npy")
speech_duration_50 = np.load("evaluation/speech_duration_with-finetune_AWGN.npy")
RTF = report_rtf(processing_time_50, speech_duration_50, label="Average Real time factor AWGN")

# CONDY run (disabled)
# processing_time = np.load("evaluation/processing_time_with-finetune_CONDY.npy")
# speech_duration = np.load("evaluation/speech_duration_with-finetune_CONDY.npy")
# RTF = report_rtf(processing_time, speech_duration, label="Average Real time factor CONDY")

# AWGN_500 run
processing_time_500 = np.load("evaluation/processing_time_with-finetune_AWGN_500.npy")
speech_duration_500 = np.load("evaluation/speech_duration_with-finetune_AWGN_500.npy")
RTF = report_rtf(processing_time_500, speech_duration_500)

Processing time mean: 3.67
Processing time max: 8.80
Processig time min: 1.01
Speech duration mean: 3.98
Speech duration max: 9.67
Speech duration min: 0.67
Average Real time factor AWGN: 0.9224
Processing time mean: 33.81
Processing time max: 81.37
Processig time min: 9.12
Speech duration mean: 3.98
Speech duration max: 9.67
Speech duration min: 0.67
Average Real time factor: 8.4981


- Value < 1: the system runs faster than real time, so it can be used to generate speech in real-time applications.

In [30]:
# Scatter plot of processing time against speech duration for both runs.
speaker_id = [0, 6, 8, 24, 37]
num_speakers = len(speaker_id)
samples_per_speaker = 100
colors = plt.cm.rainbow(np.linspace(0, 1, num_speakers))

# (legend tag, marker, durations, processing times) for every run on the figure.
# Color encodes the speaker, marker encodes the run, so the two overlaid
# point clouds stay distinguishable.
runs = [
    ("AWGN", "o", speech_duration_50, processing_time_50),
    ("AWGN_500", "^", speech_duration_500, processing_time_500),
]

plt.figure(figsize=(10, 6))
for run_tag, marker, durations, times in runs:
    for i, spk in enumerate(speaker_id):
        # assumes the arrays are stored speaker by speaker, in `speaker_id`
        # order, with `samples_per_speaker` entries each — TODO confirm
        block = slice(i * samples_per_speaker, (i + 1) * samples_per_speaker)
        plt.scatter(
            durations[block],
            times[block],
            color=colors[i],
            marker=marker,
            label=f"Speaker {spk} ({run_tag})",
        )
# Add labels and legend
plt.xlabel("Speech duration [s]")
plt.ylabel("Processing time [s]")
plt.legend()
plt.savefig("RTF_all.png")

# Mean Opinion Score (MOS)

In [46]:
import torch
import torchaudio as ta
from s3prl.hub import mos_wav2vec2

# MOS predictor used by the MOS experiments.
# It is placed on `device` because the evaluation loops move the audio there
# with `.to(device)`; a hard-coded `.cuda()` breaks as soon as `device` is the CPU.
# `.eval()` switches off training-only behavior (e.g. dropout) for inference.
mos_predictor = mos_wav2vec2().to(device).eval()

2024-06-30 11:51:47 | INFO | s3prl.util.download | Requesting URL: https://www.dropbox.com/s/s9zpouk5svu1a4l/wav2vec2-dev-SRCC-best.ckpt?dl=1
2024-06-30 11:51:48 | INFO | s3prl.util.download | Using URL's local file: /root/.cache/s3prl/download/0d88c598bf659c310090af843bce281b4a3e558ddf80f59b7e1bb71b5bece17e.wav2vec2-dev-SRCC-best.ckpt?dl=1
2024-06-30 11:51:49 | INFO | s3prl.util.download | Requesting URL: https://huggingface.co/s3prl/converted_ckpts/resolve/main/wav2vec_small.pt
2024-06-30 11:51:49 | INFO | s3prl.util.download | Using URL's local file: /root/.cache/s3prl/download/aa064e275fe0123a0e1b515f2341bbe4408368510d91d0a6816f2822a6e5acdd.wav2vec_small.pt
[Featurizer] - Take a list of 13 features and weighted sum them.
[Featurizer] - The selected feature hidden_states's downsample rate is 320


# Experiment - MOS of original dataset

In [160]:
# Predicted MOS over the original dataset manifest, visited in shuffled order.
SAMPLE_RATE = 16_000
df = (
    pd.read_csv(
        "evaluation/data_split_TEMP/metadata_SWARA1.0_text.csv",
        delimiter="|",
        header=None,
        names=column_names,
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

crnt_mos_scores = np.array([])
for wav_path in df["path"]:
    waveform, native_sr = ta.load(wav_path)
    # Bring the clip to the predictor's working sample rate, then move it to `device`.
    to_target_rate = ta.transforms.Resample(orig_freq=native_sr, new_freq=SAMPLE_RATE)
    waveform = to_target_rate(waveform).to(device)
    with torch.no_grad():
        predicted = mos_predictor(waveform)["scores"]
    crnt_mos_scores = np.append(crnt_mos_scores, predicted.cpu().numpy())
crnt_mos_scores.mean()

      att_w = softmax(self.W(batch_rep).squeeze(-1)).unsqueeze(-1)
    


3.2696068091786317

- OBS mean MOS over train dataset was: 3.2696068091786317

- From 2K random samples we get around 3.3 MOS score

# Experiment - MOS of synthesized samples

In [47]:
# Predicted MOS of the synthesized samples, grouped by speaker id.
SAMPLE_RATE = 16_000
mos_scores = {}
for speaker in speakers:
    is_current_speaker = synthesized_speech_samples["speaker_id"] == speaker
    speaker_samples = synthesized_speech_samples[is_current_speaker]
    crnt_mos_scores = np.array([])
    for wav_path in speaker_samples["path"]:
        waveform, native_sr = ta.load(wav_path)
        # Bring the clip to the predictor's working sample rate, then move it to `device`.
        to_target_rate = ta.transforms.Resample(orig_freq=native_sr, new_freq=SAMPLE_RATE)
        waveform = to_target_rate(waveform).to(device)
        with torch.no_grad():
            predicted = mos_predictor(waveform)['scores']
        crnt_mos_scores = np.append(crnt_mos_scores, predicted.cpu().numpy())
    mos_scores[speaker] = crnt_mos_scores

  att_w = softmax(self.W(batch_rep).squeeze(-1)).unsqueeze(-1)


Index: 53, MOS: tensor([3.5653])
Index: 205, MOS: tensor([3.5326])
Index: 230, MOS: tensor([3.5359])
Index: 233, MOS: tensor([3.5380])
Index: 357, MOS: tensor([3.5451])
Index: 389, MOS: tensor([3.6227])


In [50]:
# Per-speaker summary (mean / max / min) of the predicted MOS values.
for speaker in mos_scores:
    scores = mos_scores[speaker]
    print(f"Speaker: {speaker}")
    print(f"\tMean MOS: {scores.mean():.2f}")
    print(f"\tMax MOS: {scores.max():.2f}")
    print(f"\tMin MOS: {scores.min():.2f}")

Speaker: 0
	Mean MOS: 3.23
	Max MOS: 3.57
	Min MOS: 2.80
Speaker: 6
	Mean MOS: 3.24
	Max MOS: 3.42
	Min MOS: 3.01
Speaker: 8
	Mean MOS: 3.36
	Max MOS: 3.54
	Min MOS: 3.18
Speaker: 24
	Mean MOS: 3.40
	Max MOS: 3.62
	Min MOS: 3.20
Speaker: 37
	Mean MOS: 3.31
	Max MOS: 3.49
	Min MOS: 3.12


In [51]:
# Mean over every individual score, so each sample (not each speaker) has equal weight.
global_mos = np.mean(
    [score for scores in mos_scores.values() for score in scores]
)
print(f"MOS for all speakers: {global_mos:.2f}")

MOS for all speakers: 3.31


- 3.28 - without sv56 (same MOS with and without CONDY)
- 3.31 - with sv56 in both cases
- 3.33 - with sv56, AWGN and 500 generation iterations

# Speaker Cosine Similarity

In [55]:
speaker_encoder_path = "/checkpoints/EVALUATION/speaker_encoder/checkpts/speaker_encoder.pt"

# Build the speaker encoder and restore its weights from the checkpoint.
print("Initializing Speaker Encoder...")
spk_embedder = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
# Deserialize onto the CPU first; the module is moved to the GPU afterwards.
state_dict = torch.load(speaker_encoder_path, map_location="cpu")
# strict=False: keys that do not match between checkpoint and module are tolerated.
spk_embedder.load_state_dict(state_dict["model"], strict=False)
spk_embedder.cuda()
_ = spk_embedder.eval()

Initializing Speaker Encoder...


Using cache found in /root/.cache/torch/hub/s3prl_s3prl_main
2024-06-30 11:58:05 | INFO | s3prl.util.download | Requesting URL: https://huggingface.co/s3prl/converted_ckpts/resolve/main/wavlm_large.pt
2024-06-30 11:58:05 | INFO | s3prl.util.download | Using URL's local file: /root/.cache/s3prl/download/f2d5200177fd6a33b278b7b76b454f25cd8ee866d55c122e69fccf6c7467d37d.wavlm_large.pt
2024-06-30 11:58:07 | INFO | s3prl.upstream.wavlm.WavLM | WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'm

In [56]:
from torch import cosine_similarity

if len(synthesized_speech_samples) != len(reference_speech_samples):
    raise ValueError("The number of samples in the reference and synthesized datasets do not match.")


def _speaker_embedding(wav_path):
    """Load the audio file at `wav_path` and return its speaker embedding.

    The waveform is loaded with librosa, resampled to `cfg.spkr_embedder.sr`
    and passed through `spk_embedder` on whatever device the encoder lives on.
    The forward pass runs under `torch.no_grad()`, so no autograd graph is kept.
    """
    embedder_device = next(spk_embedder.parameters()).device
    wav, sr = librosa.load(wav_path)
    wav = torch.FloatTensor(wav).unsqueeze(0).to(embedder_device)
    resample_fn = torchaudio.transforms.Resample(sr, cfg.spkr_embedder.sr).to(embedder_device)
    with torch.no_grad():
        return spk_embedder(resample_fn(wav))


# Speaker cosine similarity scores, one array per speaker id.
SCS = {speaker: np.array([]) for speaker in speakers}

# The two manifests are compared row by row, so they must list the same
# utterances in the same order; the assertions below enforce this.
for (idx_ref, row_ref), (idx_synth, row_synth) in zip(
    reference_speech_samples.iterrows(), synthesized_speech_samples.iterrows()
):
    assert row_ref.speaker_id == row_synth.speaker_id, f"Speaker ID mismatch at index {idx_ref}."
    assert row_ref.transcript == row_synth.transcript, "Transcripts dont match "

    spk_emb_ref = _speaker_embedding(row_ref.path)
    spk_emb_synth = _speaker_embedding(row_synth.path)

    output = cosine_similarity(spk_emb_ref, spk_emb_synth, dim=1)
    spk_id = row_ref.speaker_id
    SCS[spk_id] = np.append(SCS[spk_id], output.detach().cpu().numpy())



In [58]:
# Per-speaker summary (mean / max / min) of the speaker cosine similarity.
for speaker in SCS:
    scores = SCS[speaker]
    print(f"Speaker: {speaker}")
    print(f"\tMean Speaker Cosine Similarity: {scores.mean():.2f}")
    print(f"\tMax Speaker Cosine Similarity: {scores.max():.2f}")
    print(f"\tMin Speaker Cosine Similarity: {scores.min():.2f}")

Speaker: 0
	Mean Speaker Cosine Similarity: 0.52
	Max Speaker Cosine Similarity: 0.72
	Min Speaker Cosine Similarity: 0.23
Speaker: 6
	Mean Speaker Cosine Similarity: 0.47
	Max Speaker Cosine Similarity: 0.70
	Min Speaker Cosine Similarity: -0.09
Speaker: 8
	Mean Speaker Cosine Similarity: 0.47
	Max Speaker Cosine Similarity: 0.65
	Min Speaker Cosine Similarity: 0.05
Speaker: 24
	Mean Speaker Cosine Similarity: 0.57
	Max Speaker Cosine Similarity: 0.70
	Min Speaker Cosine Similarity: 0.31
Speaker: 37
	Mean Speaker Cosine Similarity: 0.43
	Max Speaker Cosine Similarity: 0.62
	Min Speaker Cosine Similarity: 0.21


In [59]:
# Mean over every individual score, so each sample (not each speaker) has equal weight.
global_scs = np.mean(
    [score for scores in SCS.values() for score in scores]
)
print(f"SCS for all speakers: {global_scs:.4f}")

SCS for all speakers: 0.4923


## AWGN + 50 iterations
- Speaker: 0
	- Mean Speaker Cosine Similarity: 0.52
	- Max Speaker Cosine Similarity: 0.72
	- Min Speaker Cosine Similarity: 0.23
- Speaker: 6
	- Mean Speaker Cosine Similarity: 0.47
	- Max Speaker Cosine Similarity: 0.70
	- Min Speaker Cosine Similarity: -0.09
- Speaker: 8
	- Mean Speaker Cosine Similarity: 0.47
	- Max Speaker Cosine Similarity: 0.65
	- Min Speaker Cosine Similarity: 0.05
- Speaker: 24
	- Mean Speaker Cosine Similarity: 0.57
	- Max Speaker Cosine Similarity: 0.70
	- Min Speaker Cosine Similarity: 0.31
- Speaker: 37
	- Mean Speaker Cosine Similarity: 0.43
	- Max Speaker Cosine Similarity: 0.62
	- Min Speaker Cosine Similarity: 0.21

- SCS for all speakers: 0.4923

## AWGN + SV56 + 50 iterations

- Speaker: 0
	- Mean Speaker Cosine Similarity: 0.52
	- Max Speaker Cosine Similarity: 0.72
	- Min Speaker Cosine Similarity: 0.23
- Speaker: 6
	- Mean Speaker Cosine Similarity: 0.47
	- Max Speaker Cosine Similarity: 0.70
	- Min Speaker Cosine Similarity: -0.08
- Speaker: 8
	- Mean Speaker Cosine Similarity: 0.47
	- Max Speaker Cosine Similarity: 0.65
	- Min Speaker Cosine Similarity: 0.05
- Speaker: 24
	- Mean Speaker Cosine Similarity: 0.57
	- Max Speaker Cosine Similarity: 0.70
	- Min Speaker Cosine Similarity: 0.31
- Speaker: 37
	- Mean Speaker Cosine Similarity: 0.43
	- Max Speaker Cosine Similarity: 0.62
	- Min Speaker Cosine Similarity: 0.21

- SCS for all speakers: 0.4924

## 500 iterations + sv56

- Speaker: 0
	- Mean Speaker Cosine Similarity: 0.51
	- Max Speaker Cosine Similarity: 0.70
	- Min Speaker Cosine Similarity: 0.20
- Speaker: 6
	- Mean Speaker Cosine Similarity: 0.49
	- Max Speaker Cosine Similarity: 0.70
	- Min Speaker Cosine Similarity: -0.08
- Speaker: 8
	- Mean Speaker Cosine Similarity: 0.48
	- Max Speaker Cosine Similarity: 0.64
	- Min Speaker Cosine Similarity: 0.09
- Speaker: 24
	- Mean Speaker Cosine Similarity: 0.57
	- Max Speaker Cosine Similarity: 0.70
	- Min Speaker Cosine Similarity: 0.35
- Speaker: 37
	- Mean Speaker Cosine Similarity: 0.44
	- Max Speaker Cosine Similarity: 0.63
	- Min Speaker Cosine Similarity: 0.26
- SCS for all speakers: 0.4993

# WER and CER

In [63]:
# Whisper ASR model, see https://github.com/openai/whisper/tree/main?tab=readme-ov-file
import whisper

lang = "ro_ro"

# Load the "medium" checkpoint onto the GPU and report its type and size.
model = whisper.load_model("medium").cuda()
model_kind = "multilingual" if model.is_multilingual else "English-only"
param_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {model_kind} and has {param_count:,} parameters.")

Model is multilingual and has 762,321,920 parameters.


In [64]:
import re

def normalize_text(text: str) -> str:
    """Normalize a transcript before WER/CER scoring.

    The text is lower-cased, the punctuation characters listed in
    `chars_to_ignore_regex` are removed, and every run of whitespace
    (spaces, tabs, new lines) is collapsed into a single space.
    Leading and trailing whitespace is collapsed too, but not stripped.

    Args:
        text: Raw transcript or ASR hypothesis.

    Returns:
        The normalized text.
    """
    # Raw string: "\-", "\;" and "\:" are regex escapes, not Python string
    # escapes (a non-raw literal triggers "invalid escape sequence" warnings).
    chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–„]'

    text = re.sub(chars_to_ignore_regex, "", text.lower())

    # A single pass over whitespace runs; sequentially replacing fixed tokens
    # ("\n\n", "\n", "   ", "  ") left double spaces behind for longer runs.
    return re.sub(r"\s+", " ", text)

In [72]:
import IPython.display as ipd
from jiwer import wer, cer

# Pin decoding to Romanian. With the default (language=None) Whisper detects
# the language separately for every clip, and a wrong guess inflates WER/CER
# for reasons unrelated to the quality of the synthesized speech.
options = whisper.DecodingOptions(language="ro")

# Per-utterance error rates, in manifest order.
wer_all = np.array([])
cer_all = np.array([])

for idx, row in synthesized_speech_samples.iterrows():
    # Named `wav_path`, not `path`: the global `path` is the "already changed
    # directory" guard of the first cell and must not be overwritten here.
    wav_path = row.path
    transcript = normalize_text(row.transcript)
    audio = whisper.load_audio(wav_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Reference and hypothesis go through the same normalization before scoring.
    hypothesis = normalize_text(whisper.decode(model, mel, options).text)

    print(f"Transcript: {transcript}")
    print(f"Decoded: {hypothesis}")

    wer_all = np.append(wer_all, wer(transcript, hypothesis))
    cer_all = np.append(cer_all, cer(transcript, hypothesis))

Transcript: nu în fața prefecturii
Decoded: nu în fața prefecturii
Transcript: o lună este enorm sunt ca terminat spune micul muzician
Decoded: o lună este enorm sunt ca terminați spune micul muzician
Transcript: în consecință pot acționa mai rapid și mai eficient
Decoded: în consecință pot acționa mai rapid și mai eficient
Transcript: mie mi sa reproșat adesea că distrug prestigiul universității atacând găștile universitare
Decoded: mie mi sa reproșat adesea că distrug prestigiul universității atacând găștile universitare
Transcript: toate acestea au fost confiscate și distruse
Decoded: toate acestea au fost confiscate și distruse
Transcript: sportiv poate pentru că și fratele meu a făcut box
Decoded: sportiv poate pentru ca si fratele meu au facut boc
Transcript: alte cincizeci de posturi dedicate muncitorilor sunt disponibile în hunedoara
Decoded: alte 50 de posturi dedicate muncitorilor sunt disponibile în hunedoara
Transcript: unii patroni închid unitățile alții caută soluții de r

In [73]:
# 500 iterations + sv56
# Prints the mean of the per-utterance scores currently held in `wer_all` and
# `cer_all`, as percentages.
# NOTE(review): the label above is only correct if the WER/CER loop was last
# run on the matching manifest — confirm before quoting these numbers.
print(f"WER: {wer_all.mean() * 100:.2f}[%]")
print(f"CER: {cer_all.mean() * 100:.2f}[%]")

WER: 15.51[%]
CER: 5.41[%]


In [70]:
# 50 iterations + sv56
# Prints the mean of the per-utterance scores currently held in `wer_all` and
# `cer_all`, as percentages.
# NOTE(review): the label above is only correct if the WER/CER loop was last
# run on the matching manifest — confirm before quoting these numbers.
print(f"WER: {wer_all.mean() * 100:.2f}[%]")
print(f"CER: {cer_all.mean() * 100:.2f}[%]")

WER: 15.18[%]
CER: 5.39[%]


In [66]:
# 50 iterations
# Prints the mean of the per-utterance scores currently held in `wer_all` and
# `cer_all`, as percentages.
# NOTE(review): the label above is only correct if the WER/CER loop was last
# run on the matching manifest — confirm before quoting these numbers.
print(f"WER: {wer_all.mean() * 100:.2f}[%]")
print(f"CER: {cer_all.mean() * 100:.2f}[%]")

WER: 15.24[%]
CER: 5.46[%]


In [149]:
# print(f"WER: {wer_all.mean() * 100:.2f}[%]")
# print(f"CER: {cer_all.mean() * 100:.2f}[%]")

WER: 15.18[%]
CER: 5.39[%]


- 50 iterations
    - WER: 15.24[%]
    - CER: 5.46[%]

- 50 iterations + sv56
    - WER: 15.18[%]
    - CER: 5.39[%]

- 500 iterations + sv56