Prepare colab environment:

In [None]:
!pip install datasets>=1.18.3
!pip install transformers==4.11.3
!pip install librosa
!pip install jiwer
!pip install evaluate
!pip install rouge_score

In [1]:
import evaluate

In [3]:
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import torch
from jiwer import wer


librispeech_eval = load_dataset('librispeech_test_clean', split="test-clean")

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")



In [6]:
def map_to_pred(batch):
    input_values = processor(batch["audio"]["array"], sampling_rate=batch["audio"]["sampling_rate"], return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    batch["transcription"] = transcription[0].lower()
    batch["sentence"] = batch["sentence"].lower()
    return batch

result = librispeech_eval.map(map_to_pred)




In [None]:
print("WER:", wer(result["text"], result["transcription"]))

WER: 0.02765520389531345


In [None]:
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')

bleu_res = bleu.compute(predictions=result["text"], references=result["transcription"])
rouge_res = rouge.compute(predictions=result["text"], references=result["transcription"])

print(f"BLEU: {bleu_res}\nROUGE: {rouge_res}")

INFO:absl:Using default tokenizer.


BLEU: {'bleu': 0.9435228921373371, 'precisions': [0.9746081862446744, 0.9531988149571623, 0.9333093920317715, 0.9140513233190272], 'brevity_penalty': 1.0, 'length_ratio': 1.0000570635116885, 'translation_length': 52576, 'reference_length': 52573}
ROUGE: {'rouge1': 0.9712394139771212, 'rouge2': 0.9466495598030624, 'rougeL': 0.9711493254688222, 'rougeLsum': 0.9711588997891094}
