In [1]:
import os
import sys
import torch
# Switch the working directory to the project root before the project-local
# imports below (`scripts.utils`, `finetune_hubert`) — presumably so that they
# resolve relative to the working directory; TODO confirm.
# NOTE(review): this path starts with '/shared/home/...' while the data and
# checkpoint paths used in later cells start with '/home/...' — verify both
# point at the same project tree.
os.chdir('/shared/home/andoni.sudupe/mHubert_finetune')

from scripts.utils import load_data, setup_processor
from finetune_hubert import train_model
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    HubertForCTC,
)
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sanity check: display the current working directory after the os.chdir call.
os.getcwd()

'/shared/home/andoni.sudupe/mHubert_finetune'

In [3]:
# Load the preprocessed dataset through the project helper `load_data`.
# Per the saved cell output, the result is a DatasetDict with train/test/dev
# splits whose features are 'input_values' and 'labels'.
data = load_data('/home/andoni.sudupe/mHubert_finetune/data/preprocessed_data')

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 387426
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 16359
    })
    test_cv: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 14312
    })
    test_parl: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1521
    })
    test_oslr: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 526
    })
    dev: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1691
    })
    dev_cv: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 620
    })
    dev_parl: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 550
    })
    dev_oslr: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 521
    })
})


In [4]:
# Local fine-tuning checkpoint that provides both the feature-extractor config
# and the model weights loaded below.
model_name = '/home/andoni.sudupe/mHubert_finetune/checkpoints/mHubert-ASR-eu/checkpoint-43000'

# Build the CTC tokenizer from the project vocabulary file.
# NOTE(review): the vocabulary is read from the data directory rather than from
# the checkpoint — confirm it is the same vocab the checkpoint was trained with.
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file='/home/andoni.sudupe/mHubert_finetune/data/vocab.json',
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Initialize feature extractor. `local_files_only=True` matches the model load
# below so both artefacts are resolved strictly from the local checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name, local_files_only=True)

# Combine into processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

model = HubertForCTC.from_pretrained(model_name, local_files_only=True)


In [5]:
def map_to_result(batch, model, processor):
    """Map model predictions to text for evaluation.

    Args:
        batch: A single example (mapping) with an ``input_values`` entry that
            is converted with ``torch.tensor`` and given a leading batch
            dimension, and a ``labels`` entry that is decoded with
            ``processor.decode``.
        model: Callable returning an object with a ``logits`` attribute.
        processor: Object providing ``batch_decode`` and ``decode``.

    Returns:
        The same ``batch`` object, with ``pred_str`` (decoded arg-max
        prediction) and ``text`` (decoded reference labels) added.
    """
    # Place the input on the device the model actually lives on. Deciding from
    # global CUDA availability alone breaks when the model is still on the CPU
    # of a CUDA-capable machine, or sits on a non-default GPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        device = first_param.device
    else:
        # No parameters to inspect: keep the original heuristic.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    with torch.no_grad():
        input_values = torch.tensor(batch["input_values"], device=device).unsqueeze(0)
        logits = model(input_values).logits

    # Greedy decoding: take the highest-scoring class along the last axis.
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    # group_tokens=False: the labels are decoded without merging repeated ids.
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

def evaluate_model(data, model, processor):
    """Evaluate the trained model on test data and report WER / CER.

    Args:
        data: Dataset exposing ``map`` and ``column_names``; each row is
            passed to ``map_to_result``.
        model: Model to evaluate. ``.to(device)`` and ``.eval()`` are called
            on it, so the caller's model is moved to the GPU (when available)
            and left in evaluation mode.
        processor: Processor forwarded to ``map_to_result`` for decoding.

    Returns:
        The mapped dataset: the original columns are removed and the columns
        produced by ``map_to_result`` (``pred_str``, ``text``) remain.
    """
    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    # Inference only: make sure dropout and other train-time behaviour is off,
    # e.g. when the model comes straight out of a training run.
    model.eval()

    def predict(batch):
        """Bind the model and processor for use with ``data.map``."""
        return map_to_result(batch, model, processor)

    # Apply mapping function to test data
    results = data.map(predict, remove_columns=data.column_names)

    # Read each column once and reuse it for both metrics and the samples,
    # instead of re-reading the full column on every access.
    predictions = list(results["pred_str"])
    references = list(results["text"])

    # Calculate WER and CER
    wer_metric = load("wer", trust_remote_code=True)
    cer_metric = load("cer", trust_remote_code=True)

    test_wer = wer_metric.compute(predictions=predictions, references=references)
    test_cer = cer_metric.compute(predictions=predictions, references=references)

    print(f"\nTest WER: {test_wer:.3f}")
    print(f"\nTest CER: {test_cer:.3f}")

    # Display sample predictions (at most the first five)
    print("\nSample predictions:")
    for reference, prediction in zip(references[:5], predictions[:5]):
        print(f"Reference: {reference}")
        print(f"Prediction: {prediction}")
        print("---")

    return results

In [6]:
# Quick smoke test: evaluate only the first 10 examples of the `test_cv` split.
results = evaluate_model(data['test_cv'].select(range(10)), model, processor)

Map: 100%|██████████| 10/10 [00:18<00:00,  1.88s/ examples]



Test WER: 1.000

Test CER: 0.962

Sample predictions:
Reference: honek garrantzi handia zuen ehun urteko gerran
Prediction: 
---
Reference: osasuna aurkari zuzena da eta beraz puntuek balio bikoitza dute
Prediction: iiiiaaa
---
Reference: irungo familia boteretsu bat da olazabal familia
Prediction: i
---
Reference: hezkuntzak prestatu zituen probak pisa eta antzekoak eredu
Prediction: iiiiiiiii i i  ai
---
Reference: bestalde botilek abangoardiako diseinu orijinalak dituzte
Prediction: 
---


In [25]:
# Print every reference/prediction pair currently held in `results`.
# NOTE(review): the saved output of this cell lists sentences that differ from
# the 10-example run shown earlier, and the execution counts are out of order —
# `results` was presumably reassigned in between; confirm with Restart & Run All.
for reference, prediction in zip(results['text'], results['pred_str']):
    print(f"Reference: {reference}")
    print(f"Prediction: {prediction}")
    print("---")

Reference: new yorkeko aireportuan eskala egin genuen kaliforniara bidean
Prediction: niu jorkeko aire portuan eskal aegin genuen kaliforniarabidean
---
Reference: janet jackson michael jackson abeslari ospetsuaren arreba da
Prediction: jane jacxon maycel jacxon abeslari ospesuaren arrebada
---
Reference: londreseko heathrow aireportua munduko handienetarikoena da
Prediction: londrexeko itroua ireportua munduko handienetarikoa da
---
Reference: hamabietan izango da txupinazoa eta udaletxeko balkoitik botako dute urtero bezala
Prediction: hamabietan izango da txupinasoa eta udaletzeko palkoitik botako dute urtero bezala
---
Reference: motorolaren telefono berria erostekotan nabil
Prediction: motrolaren telefono berria erostekotan nabil
---
Reference: ekuadorretik igaro ginen bidaia hartan
Prediction: ekuadorretik igaro ginen bidai hartan
---
Reference: lau bat bat bi bat bi zazpi zortzi bi hiru hiru hiru zero
Prediction: lau bat bat bi bat bi zazpi zortzi bi hiru hiru hiru zero
---
Refe

In [24]:
# Recompute the character error rate for the `results` currently in the kernel.
cer_metric = load("cer", trust_remote_code=True)
test_cer = cer_metric.compute(
    references=results["text"],
    predictions=results["pred_str"],
)
print(f"\nTest CER: {test_cer:.3f}")


Downloading builder script: 100%|██████████| 5.60k/5.60k [00:00<00:00, 976kB/s]



Test CER: 0.075


In [None]:
# Publish the fine-tuned model together with its processor (tokenizer vocab and
# feature-extractor config). Pushing the model alone leaves the Hub repo
# without the files needed to turn audio into inputs and logits into text.
hub_repo_id = 'Ansu/hubert_for_basque'
model.push_to_hub(repo_id=hub_repo_id)
processor.push_to_hub(repo_id=hub_repo_id)