In [117]:
from datasets import load_dataset, concatenate_datasets

ds = load_dataset("UmarRamzan/common-voice-urdu-processed")
train_data = ds['train']
test_data = ds['test']
dataset = concatenate_datasets([train_data, test_data])



In [118]:
import pandas as pd

In [119]:
dataset = dataset.select(range(10))

In [120]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
from pesq import pesq
from scipy.io import wavfile
import numpy as np



In [121]:
processor = SpeechT5Processor.from_pretrained("aarishshahmohsin/urdu_processor_t5")
model = SpeechT5ForTextToSpeech.from_pretrained("aarishshahmohsin/final_urdu_t5_finetuned")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) 

In [122]:
import numpy as np
import librosa
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
from scipy.io import wavfile

# Initialize scores lists
lsd_scores = []
mos_scores = []

# Define LSD thresholds for normalization to MOS
LSD_MIN = 5 
LSD_MAX = 20  

# Function to compute Log Spectral Distance (LSD)
def log_spectral_distance(ref_audio, gen_audio, sr=16000):
    min_length = min(len(ref_audio), len(gen_audio))
    ref_audio = ref_audio[:min_length]
    gen_audio = gen_audio[:min_length]

    ref_spectrum = np.abs(librosa.stft(ref_audio))
    gen_spectrum = np.abs(librosa.stft(gen_audio))
    ref_log = np.log(ref_spectrum + 1e-10)  
    gen_log = np.log(gen_spectrum + 1e-10)
    lsd = np.mean(np.sqrt(np.mean((ref_log - gen_log) ** 2, axis=0)))
    return lsd

# Function to normalize LSD scores to MOS scale
def normalize_lsd_to_mos(lsd_score, lsd_min, lsd_max):
    if lsd_score < lsd_min:
        return 5.0  # Best quality
    elif lsd_score > lsd_max:
        return 1.0  # Worst quality
    else:
        return 5 - ((lsd_score - lsd_min) / (lsd_max - lsd_min) * 4)


# Sample processing loop
x = 0
for sample in dataset:
    text = sample['sentence']  # Assuming this is the text to synthesize
    original_audio_array = sample['audio']['array']  # Use the audio array from the dataset

    # Generate speech from text
    inputs = processor(text=text, return_tensors="pt")
    generated_speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the generated speech to a WAV file
    generated_audio_path = "generated_speech.wav"
    sf.write(generated_audio_path, generated_speech.numpy(), samplerate=16000)
    print(f"Processing sample {x}")
    x += 1

    # Convert the original audio array to float format if necessary
    if original_audio_array.dtype == np.int16:
        original_audio_array = original_audio_array.astype(np.float32) / np.iinfo(np.int16).max  # Scale to [-1.0, 1.0]

    # Save the original audio array to a WAV file
    original_audio_path = f"original_audio_{x}.wav"
    sf.write(original_audio_path, original_audio_array, samplerate=16000)

    # Compute the Log Spectral Distance (LSD)
    fs_ref, ref_audio = wavfile.read(original_audio_path)
    fs_deg, deg_audio = wavfile.read(generated_audio_path)

    # Ensure both audios have the same sample rate
    if fs_ref == fs_deg == 16000:
        
        lsd_score = log_spectral_distance(original_audio_array, generated_speech.numpy(), sr=16000)
        lsd_scores.append(lsd_score)
        print(lsd_score)

        # Normalize LSD score to MOS range
        mos_score = normalize_lsd_to_mos(lsd_score, LSD_MIN, LSD_MAX)
        mos_scores.append(mos_score)
        print(mos_score)
    else:
        print(f"Sample rate mismatch")

# Calculate the average scores
average_lsd = np.mean(lsd_scores)
average_mos = np.mean(mos_scores)

print(f"Average Log Spectral Distance (LSD) score: {average_lsd}")
print(f"Average MOS score (from LSD): {average_mos}")


Processing sample 0
8.592618581798998
4.041968378186934
Processing sample 1
9.35345217453587
3.839079420123768
Processing sample 2
10.543334831137045
3.5217773783634545
Processing sample 3
11.684107309375568
3.217571384166515
Processing sample 4
10.71150095313978
3.4769330791627255
Processing sample 5
10.930715643798537
3.41847582832039
Processing sample 6
8.296943636303336
4.12081503031911
Processing sample 7
8.692879204145214
4.015232212227943
Processing sample 8
8.779797434336793
3.9920540175101884
Processing sample 9
9.233099076095282
3.871173579707925
Average Log Spectral Distance (LSD) score: 9.681844884466644
Average MOS score (from LSD): 3.7515080308088953


In [123]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import torch.nn.utils.prune as prune
import soundfile as sf
from datasets import load_dataset

def prune_model(model, amount=0.15):
    """
    Apply L1 unstructured pruning only to specific layers
    amount: percentage of weights to prune (0.15 = 15%)
    """
    for name, module in model.named_modules():
        # Only prune specific layers to maintain quality
        if isinstance(module, torch.nn.Linear) and ('encoder.feed_forward' in name):
            prune.l1_unstructured(module, name='weight', amount=amount)
            prune.remove(module, 'weight')

def apply_selective_quantization(model):
    """
    Apply very light quantization only to specific layers
    that don't heavily impact audio quality
    """
    quantized_model = torch.quantization.quantize_dynamic(
        model,
        {torch.nn.Linear}, 
        dtype=torch.float16,  
        inplace=True
    )
    
    return quantized_model

# model = SpeechT5ForTextToSpeech.from_pretrained("aarishshahmohsin/final_technical_terms_t5_finetuned")

print("Applying light pruning...")
prune_model(model)

print("Applying minimal quantization...")
model = apply_selective_quantization(model)

Applying light pruning...
Applying minimal quantization...


In [124]:
dataset[0]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/98bf2841c44feb87b01d6be50c6f7419f1ac73fbf85cbe37236d236156bc5b3f/ur_train_0/common_voice_ur_31857979.mp3',
 'audio': {'path': 'common_voice_ur_31857979.mp3',
  'array': array([ 1.13686838e-13, -9.66338121e-13, -2.33058017e-12, ...,
         -4.77751655e-07, -4.93692141e-07, -3.46486331e-07]),
  'sampling_rate': 48000},
 'sentence': 'معاشی مواقع پیدا ہوں',
 'variant': ''}

In [125]:
import numpy as np
import librosa
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
from scipy.io import wavfile

# Initialize scores lists
lsd_scores = []
mos_scores = []

# Define LSD thresholds for normalization to MOS
LSD_MIN = 5 
LSD_MAX = 20  

# Function to compute Log Spectral Distance (LSD)
def log_spectral_distance(ref_audio, gen_audio, sr=16000):
    min_length = min(len(ref_audio), len(gen_audio))
    ref_audio = ref_audio[:min_length]
    gen_audio = gen_audio[:min_length]

    ref_spectrum = np.abs(librosa.stft(ref_audio))
    gen_spectrum = np.abs(librosa.stft(gen_audio))
    ref_log = np.log(ref_spectrum + 1e-10)  
    gen_log = np.log(gen_spectrum + 1e-10)
    lsd = np.mean(np.sqrt(np.mean((ref_log - gen_log) ** 2, axis=0)))
    return lsd

# Function to normalize LSD scores to MOS scale
def normalize_lsd_to_mos(lsd_score, lsd_min, lsd_max):
    if lsd_score < lsd_min:
        return 5.0  # Best quality
    elif lsd_score > lsd_max:
        return 1.0  # Worst quality
    else:
        return 5 - ((lsd_score - lsd_min) / (lsd_max - lsd_min) * 4)


# Sample processing loop
x = 0
for sample in dataset:
    text = sample['sentence']  # Assuming this is the text to synthesize
    original_audio_array = sample['audio']['array']  # Use the audio array from the dataset

    # Generate speech from text
    inputs = processor(text=text, return_tensors="pt")
    generated_speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the generated speech to a WAV file
    generated_audio_path = "generated_speech.wav"
    sf.write(generated_audio_path, generated_speech.numpy(), samplerate=16000)
    print(f"Processing sample {x}")
    x += 1

    # Convert the original audio array to float format if necessary
    if original_audio_array.dtype == np.int16:
        original_audio_array = original_audio_array.astype(np.float32) / np.iinfo(np.int16).max  # Scale to [-1.0, 1.0]

    # Save the original audio array to a WAV file
    original_audio_path = f"original_audio_{x}.wav"
    sf.write(original_audio_path, original_audio_array, samplerate=16000)

    # Compute the Log Spectral Distance (LSD)
    fs_ref, ref_audio = wavfile.read(original_audio_path)
    fs_deg, deg_audio = wavfile.read(generated_audio_path)

    # Ensure both audios have the same sample rate
    if fs_ref == fs_deg == 16000:
        
        lsd_score = log_spectral_distance(original_audio_array, generated_speech.numpy(), sr=16000)
        lsd_scores.append(lsd_score)
        print(lsd_score)

        # Normalize LSD score to MOS range
        mos_score = normalize_lsd_to_mos(lsd_score, LSD_MIN, LSD_MAX)
        mos_scores.append(mos_score)
        print(mos_score)
    else:
        print(f"Sample rate mismatch")

# Calculate the average scores
average_lsd = np.mean(lsd_scores)
average_mos = np.mean(mos_scores)

print(f"Average Log Spectral Distance (LSD) score: {average_lsd}")
print(f"Average MOS score (from LSD): {average_mos}")


Processing sample 0
8.914122600703172
3.956233973145821
Processing sample 1
9.086281062756157
3.9103250499316915
Processing sample 2
10.75851421884005
3.46439620830932
Processing sample 3
11.22987997338771
3.338698673763277
Processing sample 4
10.470390275456088
3.5412292598783766
Processing sample 5
10.979913771023833
3.405356327726978
Processing sample 6
8.218208450309385
4.141811079917497
Processing sample 7
8.470232891559833
4.074604562250711
Processing sample 8
9.085405482121146
3.9105585381010277
Processing sample 9
9.197052637406227
3.8807859633583393
Average Log Spectral Distance (LSD) score: 9.641000136356359
Average MOS score (from LSD): 3.762399963638304
