## Automatic Text to Speech (ATS)

In [14]:
from huggingface_hub import login
token = "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"
print("Hugging Face logging")
login(token)

Hugging Face logging


In [15]:
import torch
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

Usando dispositivo: mps


In [16]:
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# DONE: Creación del modelo y processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# DONE: En este caso indicamos el idioma del audio añadiendo el parámetro language
forced_decoder_ids = processor.get_decoder_prompt_ids(language="spanish", task="transcribe")

### Ajuste fino ASR

Vamos entrenar el modelo con voces que posean acento argentino

In [17]:
from datasets import load_dataset

# Cargar el dataset personalizado
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", cache_dir="./data/common_voice_11_0_test")
dataset = dataset.filter(lambda example: "Argentina" in example["accent"])
print(dataset)

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 201
})


In [18]:
from datasets import Audio

# Preprocesar el dataset
def preprocess_data(batch):
    # Extraer el audio y procesarlo
    audio = batch["audio"]
    inputs = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding="longest"
    )
    batch["input_features"] = inputs.input_features[0]

    # Tokenizar el texto para las etiquetas
    labels = processor.tokenizer(batch["sentence"], return_tensors="pt", padding="longest").input_ids[0]
    batch["labels"] = labels
    return batch


dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names)


Map:   0%|          | 0/201 [00:00<?, ? examples/s]

In [19]:
import torch

# Crear un DataCollator personalizado con padding a 3000
class WhisperDataCollator:
    def __init__(self, processor, max_length=3000):
        self.processor = processor
        self.max_length = max_length

    def __call__(self, features):
        # Obtener input_features y labels
        input_features = [torch.tensor(feature["input_features"]) for feature in features]
        labels = [torch.tensor(feature["labels"]) for feature in features]

        # Aplicar padding a input_features a la longitud especificada
        padded_input_features = [
            torch.nn.functional.pad(
                feat, (0, self.max_length - feat.shape[1]), mode="constant", value=0
            )
            for feat in input_features
        ]

        # Crear el batch con input_features y labels
        batch = {
            "input_features": torch.stack(padded_input_features),
            "labels": torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100),
        }
        return batch

# Instanciar el DataCollator personalizado
data_collator = WhisperDataCollator(processor, max_length=3000)


In [21]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

# Configuración del entrenamiento
training_args = Seq2SeqTrainingArguments(
    output_dir="./models/whisper-fine-tuned-argentine",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    remove_unused_columns=False,
)

# Crear el Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,  # Usar el DataCollator personalizado
)


# Iniciar el entrenamiento
trainer.train()
# Guardar el modelo ajustado
trainer.save_model("./models/whisper-fine-tuned-argentine")


RuntimeError: MPS backend out of memory (MPS allocated: 15.59 GB, other allocations: 2.49 GB, max allowed: 18.13 GB). Tried to allocate 70.31 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

## Evaluating models

In [None]:
processor_whisper = WhisperProcessor.from_pretrained("openai/whisper-small")
model_wisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
def asr_whisper(sample):
    global processor_whisper, model_wisper
    forced_decoder_ids = processor_whisper.get_decoder_prompt_ids(language="spanish", task="translate")
    input_features = processor_whisper(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
    predicted_ids = model_wisper.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    return processor_whisper.batch_decode(predicted_ids, skip_special_tokens=True)[0]


In [None]:
processor_whisper_ar = WhisperProcessor.from_pretrained("openai/whisper-small")
model_wisper_ar = WhisperForConditionalGeneration.from_pretrained("./models/whisper-fine-tuned-argentine")
def asr_whisper_ar(sample):
    global processor_whisper_ar, model_wisper_ar
    forced_decoder_ids = processor_whisper_ar.get_decoder_prompt_ids(language="spanish", task="translate")
    input_features = processor_whisper_ar(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
    predicted_ids = model_wisper_ar.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    return processor_whisper_ar.batch_decode(predicted_ids, skip_special_tokens=True)[0]


In [None]:
from datasets import load_dataset, Audio

# Cargar el dataset
data = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", cache_dir="./data/common_voice_11_0_test", trust_remote_code=True)
data = data.cast_column("audio", Audio(sampling_rate=16000))
print("Dataset cargado correctamente")

# Preprocesamiento: Normalización del texto
def normalize_text(batch):
    text = batch["sentence"].lower().strip()
    batch["sentence"] = text
    return batch

data = data.map(normalize_text)
print("Texto normalizado")

# DONE: Tarea ASRB4
data = data.select(range(30))
print(data)


¿Qué ocurre con los resultados? ¿Se podría mejorar subiendo los epoch? ¿Puede ser que se haya realizado un overfitting de los datos?¿Puede ser que la calidad de los datos de entrenamiento fueran nefastos?

In [None]:
import torch
from evaluate import load
import matplotlib.pyplot as plt


# Métrica WER (Word Error Rate)
wer_metric = load("wer")

# Creamos los arrays que contendrán las predicciones de los modelos y las referencias (gold std.)
predictions_whisper = []
predictions_whisper_ar = []
references = []

# Recorremos el dataset y generamos las predicciones
for sample in data:
    references.append(sample['sentence'])

    whisper_transcription = asr_whisper(sample['audio'])
    predictions_whisper.append(whisper_transcription)

    whisper_transcription_ar = asr_whisper_ar(sample['audio'])
    predictions_whisper_ar.append(whisper_transcription_ar)


# Evaluar Whisper
print("Evaluando Whisper baseline...")
whisper_wer = wer_metric.compute(predictions=predictions_whisper, references=references)
print(f"WER de Whisper(B): {whisper_wer:.4f}")

# Evaluar Wav2Vec2
print("Evaluando Whisper fine tuned...")
wav2vec_wer = wer_metric.compute(predictions=predictions_whisper_ar, references=references)
print(f"WER de Whisper(F): {wav2vec_wer:.4f}")

# Crear gráfica de comparación
models = ["WhisperB", "WhisperF"]
wer_scores = [whisper_wer, wav2vec_wer]

plt.figure(figsize=(8, 6))
plt.bar(models, wer_scores)
plt.title("Comparación de WER entre modelos ASR")
plt.ylabel("WER (Word Error Rate)")
plt.xlabel("Modelo")
plt.ylim(0, max(wer_scores) + 0.1)
plt.show()
