In [None]:
import os

from huggingface_hub import login

# Never commit access tokens to a notebook: the token is read from the HF_TOKEN
# environment variable instead. When the variable is unset, `token` is None and
# `login` is left to ask for the credential itself.
# NOTE(review): a literal token used to live in this cell; it should be revoked
# in the Hugging Face account settings because it has been exposed in source.
token = os.environ.get("HF_TOKEN")
print("Hugging Face logging")
login(token)

In [None]:
import torch
# Pick the compute backend in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Usando dispositivo: {device}")

In [None]:
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Initial configuration shared by the cells below.
SAMPLE_RATE = 16_000  # Sampling rate (Hz) that audio is resampled to before transcription
MODEL_NAME = "openai/whisper-tiny"  # Pretrained checkpoint on the Hugging Face Hub
cache_dir = "./models/whisper-tiny"  # Local directory where the downloaded checkpoint is cached

## Automatic Speech Recognition (ASR)

### Uso de distintos modelos

#### Tarea ASRC1

Escribir el código necesario para transcribir el audio `patria_invento.wav` procesándolo por ventanas (fragmentos de duración fija).

In [None]:
# DONE: load the Whisper processor and model
# Both come from the same checkpoint and share the same local cache directory.
processor = WhisperProcessor.from_pretrained(MODEL_NAME, cache_dir=cache_dir)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, cache_dir=cache_dir)
# Move the weights to the device selected earlier in the notebook.
model = model.to(device)
print("Modelo Whisper cargado correctamente")


In [None]:
# DONE: escribir las funciones necesarias para realizar un ASR en ventana
def chunk_audio(waveform, chunk_size_s, sampling_rate):
    """Cut a waveform into consecutive windows of a fixed duration.

    Args:
        waveform: Tensor that is split along dimension 1.
        chunk_size_s: Window length in seconds.
        sampling_rate: Number of samples per second of ``waveform``.

    Returns:
        A tuple of tensors; every window holds ``int(chunk_size_s * sampling_rate)``
        samples along dimension 1, except possibly the last one, which holds
        the remainder.
    """
    samples_per_chunk = int(sampling_rate * chunk_size_s)  # window length in samples
    return torch.split(waveform, samples_per_chunk, dim=1)

def transcribe_audio_file_chunked(model_asr, processor_asr, audio_path, chunk_size_s=30,
                                  language="spanish", task="transcribe"):
    """Transcribe an audio file by running the ASR model over fixed-length windows.

    The file is loaded, downmixed to mono, resampled to ``SAMPLE_RATE``,
    peak-normalised and cut into windows of ``chunk_size_s`` seconds. Each
    window is transcribed independently and the texts are joined with spaces.

    Args:
        model_asr: Speech-to-text model exposing ``generate`` and ``parameters``.
        processor_asr: Matching processor (feature extraction + decoding).
        audio_path: Path to the audio file to transcribe.
        chunk_size_s: Window length in seconds.
        language: Language passed to ``get_decoder_prompt_ids``.
        task: Task passed to ``get_decoder_prompt_ids``.

    Returns:
        The concatenated transcription, or an empty string when the file
        contains no samples.
    """
    # Load the audio file.
    waveform, sampling_rate = torchaudio.load(audio_path)

    # Downmix multi-channel audio; otherwise each channel would reach the
    # processor as a separate row and only the first decoded row would be kept.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if necessary.
    if sampling_rate != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=SAMPLE_RATE)
        waveform = resampler(waveform)

    # Nothing to transcribe.
    if waveform.numel() == 0:
        return ""

    # Peak-normalise, skipping all-zero audio to avoid a division by zero (NaNs).
    peak = waveform.abs().max()
    if peak > 0:
        waveform = waveform / peak

    # Feed the inputs to whatever device the model actually lives on rather
    # than relying on a notebook-level global.
    target_device = next(model_asr.parameters()).device

    # The decoder prompt does not depend on the chunk, so build it once.
    forced_decoder_ids = processor_asr.get_decoder_prompt_ids(language=language, task=task)

    transcriptions = []
    with torch.no_grad():
        for chunk in chunk_audio(waveform, chunk_size_s, SAMPLE_RATE):
            # squeeze(0) drops only the channel axis, so a one-sample chunk stays 1-D.
            inputs = processor_asr(chunk.squeeze(0).numpy(), sampling_rate=SAMPLE_RATE, return_tensors="pt")
            input_features = inputs["input_features"].to(target_device)

            predicted_ids = model_asr.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids,
                max_length=500,
                num_beams=5,
            )
            text = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            # Strip so that joining the chunks does not produce doubled spaces.
            transcriptions.append(text.strip())

    # Combine the per-chunk transcriptions, ignoring chunks that decoded to nothing.
    return " ".join(text for text in transcriptions if text)


In [None]:
# DONE: transcribe the patria_invento.wav recording in 30-second windows
audio_path = "./provided/patria_invento.wav"  # Replace with the path to the audio file
transcription = transcribe_audio_file_chunked(
    model_asr=model,
    processor_asr=processor,
    audio_path=audio_path,
    chunk_size_s=30,
)
print(f"Transcripción: {transcription}")

#### Tarea ASRC2

Comprobar la transcripción usando el modelo small `openai/whisper-small`, ¿hay diferencias significativas en los resultados?

### Ajuste fino ASR

Vamos a entrenar el modelo con voces que tengan acento argentino.

In [None]:
from datasets import load_dataset

# Load the Spanish subset of Common Voice 11 that will be used for fine-tuning.
# NOTE(review): this reads the "test" split even though the data is used for
# training — confirm that this is intentional.
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", cache_dir="./data/common_voice_11_0_test")
# Keep only the clips whose accent metadata mentions Argentina.
# `input_columns` hands the predicate just the "accent" value, so the rest of
# the row (including the audio) is not needed to evaluate it; the None check
# keeps the filter from crashing on rows without accent information.
dataset = dataset.filter(
    lambda accent: accent is not None and "Argentina" in accent,
    input_columns=["accent"],
)
print(dataset)

In [None]:
from datasets import Audio

# Preprocess the dataset
def preprocess_data(batch):
    """Convert one dataset example into Whisper training inputs.

    Reads ``batch["audio"]`` (a dict with ``"array"`` and ``"sampling_rate"``)
    and ``batch["sentence"]``; writes ``batch["input_features"]`` and
    ``batch["labels"]`` and returns the same ``batch`` object.
    Relies on the notebook-level ``processor``.
    """
    # Extract the audio and compute its features.
    audio = batch["audio"]
    # Use the feature extractor's default padding instead of padding="longest".
    # With a single clip, "longest" adds no padding at all, so the spectrogram
    # came out shorter than the fixed length the model is fed and had to be
    # filled with zeros afterwards in feature space. That differs from what the
    # evaluation cell produces with the default settings.
    inputs = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
    )
    batch["input_features"] = inputs.input_features[0]

    # Tokenize the transcript to obtain the label ids.
    labels = processor.tokenizer(batch["sentence"], return_tensors="pt", padding="longest").input_ids[0]
    batch["labels"] = labels
    return batch


# Request 16 kHz audio from the "audio" column, then replace every original
# column with the model inputs produced by preprocess_data.
resampled_dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset = resampled_dataset.map(
    preprocess_data,
    remove_columns=resampled_dataset.column_names,
)


In [None]:
import torch

# Custom data collator that pads input features to a fixed number of frames
class WhisperDataCollator:
    """Batch Whisper examples: fixed-length input features and padded labels.

    Args:
        processor: Processor whose ``tokenizer`` is used to look up the
            start-of-transcript token id. May be ``None`` or lack a tokenizer,
            in which case no token is stripped from the labels.
        max_length: Number of frames (last dimension) every feature tensor is
            padded or cropped to.
        decoder_start_token_id: Optional explicit id of the token to strip from
            the beginning of the labels; looked up from ``processor`` when omitted.
    """

    def __init__(self, processor, max_length=3000, decoder_start_token_id=None):
        self.processor = processor
        self.max_length = max_length
        if decoder_start_token_id is None:
            tokenizer = getattr(processor, "tokenizer", None)
            if tokenizer is not None:
                decoder_start_token_id = tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
        self.decoder_start_token_id = decoder_start_token_id

    def __call__(self, features):
        """Collate a list of examples into a batch dict.

        Args:
            features: Sequence of mappings with ``"input_features"`` and
                ``"labels"`` entries (tensors or nested lists).

        Returns:
            Dict with ``"input_features"`` (stacked, ``max_length`` frames each)
            and ``"labels"`` (right-padded with -100).
        """
        # as_tensor avoids copying (and the accompanying warning) when the
        # values are already tensors.
        input_features = [torch.as_tensor(feature["input_features"]) for feature in features]
        labels = [torch.as_tensor(feature["labels"], dtype=torch.long) for feature in features]

        # Bring every feature tensor to exactly max_length frames: crop the
        # excess explicitly, then right-pad what is missing with zeros.
        padded_input_features = []
        for feat in input_features:
            feat = feat[..., : self.max_length]
            missing = self.max_length - feat.shape[-1]
            if missing > 0:
                feat = torch.nn.functional.pad(feat, (0, missing), mode="constant", value=0)
            padded_input_features.append(feat)

        # -100 marks label positions to be ignored.
        label_batch = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

        # The tokenizer already put the start-of-transcript token in front of
        # every label sequence, and the model prepends its decoder start token
        # again when it derives decoder inputs from the labels. Drop the
        # leading copy so the token is not duplicated.
        start_id = self.decoder_start_token_id
        if (
            start_id is not None
            and label_batch.shape[1] > 0
            and bool((label_batch[:, 0] == start_id).all())
        ):
            label_batch = label_batch[:, 1:]

        # Build the batch with input_features and labels.
        batch = {
            "input_features": torch.stack(padded_input_features),
            "labels": label_batch,
        }
        return batch

# Build the collator used for training; every feature tensor is brought to 3000 frames.
data_collator = WhisperDataCollator(processor=processor, max_length=3000)


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import os
import warnings
# Hide UserWarning messages that originate from the transformers package.
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

# Training configuration, collected in one mapping so it is easy to inspect.
training_config = {
    # Where checkpoints are written.
    "output_dir": "./models/whisper-fine-tuned-argentine",
    # Optimisation schedule.
    "learning_rate": 2e-5,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    # Evaluation and checkpointing.
    "eval_strategy": "epoch",
    "predict_with_generate": True,
    "save_steps": 500,
    "save_total_limit": 2,
    # Half precision is requested only when CUDA is available.
    "fp16": torch.cuda.is_available(),
    "push_to_hub": False,
    # Keep every dataset column so the custom collator receives them unchanged.
    "remove_unused_columns": False,
}
training_args = Seq2SeqTrainingArguments(**training_config)

# Hold out 10% of the examples for evaluation. Evaluating on the very same
# examples the model is trained on would hide overfitting, because the
# reported evaluation loss would just track the training loss.
# The fixed seed keeps the split reproducible across runs.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Create the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,  # Use the custom data collator
)


# Run the fine-tuning loop, then persist the resulting weights to disk.
fine_tuned_output_dir = "./models/whisper-fine-tuned-argentine"
trainer.train()
trainer.save_model(fine_tuned_output_dir)


In [None]:
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Locations of the two checkpoints being compared: the Hub baseline and the
# locally saved fine-tuned model.
base_model_name = "openai/whisper-tiny"
fine_tuned_model_path = "./models/whisper-fine-tuned-argentine"

# Load both models.
fine_tuned_model = WhisperForConditionalGeneration.from_pretrained(
    fine_tuned_model_path,
    cache_dir="./models/whisper-fine-tuned-argentine",
)
base_model = WhisperForConditionalGeneration.from_pretrained(
    base_model_name,
    cache_dir="./models/whisper-tiny",
)

# Load the processor; the base checkpoint's processor is used for both models.
processor = WhisperProcessor.from_pretrained(
    base_model_name,
    cache_dir="./models/whisper-tiny",
)


In [None]:
# Load the Spanish validation split of Common Voice 11 for evaluation.
eval_dataset = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="validation", cache_dir="./data/common_voice_11_0_validation")
# Keep only the clips whose accent metadata mentions Argentina.
# `input_columns` hands the predicate just the "accent" value, so the rest of
# the row (including the audio) is not needed to evaluate it; the None check
# keeps the filter from crashing on rows without accent information.
eval_dataset = eval_dataset.filter(
    lambda accent: accent is not None and "Argentina" in accent,
    input_columns=["accent"],
)
# Request 16 kHz audio from the "audio" column of the remaining rows.
eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))

# Preprocessing: text normalisation
def normalize_text(batch):
    """Lower-case the example's sentence and trim surrounding whitespace.

    The ``"sentence"`` entry of ``batch`` is overwritten in place and the same
    ``batch`` object is returned.
    """
    lowered = batch["sentence"].lower()
    batch["sentence"] = lowered.strip()
    return batch

# Evaluate on at most the first 50 examples. min() keeps the selection valid
# when fewer than 50 rows survive the accent filter, and selecting before
# mapping means only the rows that are actually kept get normalised.
eval_dataset = eval_dataset.select(range(min(50, len(eval_dataset))))
eval_dataset = eval_dataset.map(normalize_text)
print(eval_dataset)

In [None]:
from evaluate import load
from torch.utils.data import DataLoader

# Adjusted preprocessing function for evaluation.
# NOTE: this definition replaces the earlier function of the same name in the
# notebook namespace.
def preprocess_data(batch):
    """Attach model inputs and label ids to one evaluation example.

    Reads ``batch["audio"]`` (``"array"`` and ``"sampling_rate"``) and
    ``batch["sentence"]``; stores ``"input_features"`` and ``"labels"`` with the
    leading batch axis removed, and returns the same ``batch`` object.
    Relies on the notebook-level ``processor``.
    """
    audio = batch["audio"]
    features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    token_ids = processor.tokenizer(batch["sentence"], return_tensors="pt").input_ids

    # squeeze(0) removes the batch axis added by return_tensors="pt".
    batch["input_features"] = features.input_features.squeeze(0)
    batch["labels"] = token_ids.squeeze(0)
    return batch

# Word error rate (WER) metric used to score the transcriptions.
wer_metric = load("wer")

# Add the model inputs and label ids to every evaluation example.
eval_dataset = eval_dataset.map(preprocess_data)

# Evaluate a model
def evaluate_model(model, dataset, processor_asr, batch_size=8, language="spanish", task="transcribe"):
    """Compute the word error rate of ``model`` over ``dataset``.

    Args:
        model: Speech-to-text model exposing ``generate`` and ``parameters``.
        dataset: Iterable of examples with ``"input_features"`` and ``"labels"``.
        processor_asr: Processor used to build the decoder prompt and to decode
            both predictions and label ids back to text.
        batch_size: Number of examples transcribed per ``generate`` call.
        language: Language passed to ``get_decoder_prompt_ids``.
        task: Task passed to ``get_decoder_prompt_ids``.

    Returns:
        The value returned by the notebook-level ``wer_metric.compute``.
    """
    # Identity collate: each batch stays a plain list of example dicts.
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=lambda samples: samples)

    # Inputs must live on the same device as the model's weights.
    target_device = next(model.parameters()).device

    # The decoder prompt is the same for every batch, so build it once.
    forced_decoder_ids = processor_asr.get_decoder_prompt_ids(language=language, task=task)

    predictions = []
    references = []

    for batch in dataloader:
        # as_tensor accepts both tensors and nested lists, so no type check is needed.
        input_features = torch.stack(
            [torch.as_tensor(sample["input_features"]) for sample in batch]
        ).to(target_device)
        labels = [processor_asr.decode(sample["labels"], skip_special_tokens=True) for sample in batch]

        # Generate predictions.
        with torch.no_grad():
            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
        predicted_texts = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)

        # The reference sentences were lower-cased and stripped earlier in the
        # notebook; apply the same normalisation to both sides so that casing
        # alone is never counted as a word error.
        predictions.extend(text.lower().strip() for text in predicted_texts)
        references.extend(text.lower().strip() for text in labels)

    # Compute the WER metric.
    wer = wer_metric.compute(predictions=predictions, references=references)
    return wer

# Score both checkpoints on the same evaluation set with the same processor.
print("Evaluando modelo base...")
base_wer = evaluate_model(model=base_model, dataset=eval_dataset, processor_asr=processor)
print(f"WER modelo base: {base_wer}")

print("Evaluando modelo fine-tuned...")
fine_tuned_wer = evaluate_model(model=fine_tuned_model, dataset=eval_dataset, processor_asr=processor)
print(f"WER modelo fine-tuned: {fine_tuned_wer}")


¿Qué ocurre con los resultados? ¿Se podrían mejorar aumentando el número de épocas? ¿Puede ser que se haya producido un sobreajuste (overfitting) a los datos? ¿Puede ser que la calidad de los datos de entrenamiento fuera nefasta?