In [None]:
# install required python packages
!pip install torch numpy transformers datasets g2p_en editdistance tqdm

In [None]:
# import required libraries
import os
import torch
import editdistance
from datasets import load_dataset
from transformers import AutoModel, AutoProcessor, AutoModelForPreTraining
from tqdm import tqdm

In [None]:
# Hugging Face Hub identifiers for the models and the dataset
upstream_model_card = "facebook/wav2vec2-large-lv60"
reborn_model_card = "andybi7676/reborn-uasr_ls100h_iter5-stage1"
dataset_card = "andybi7676/reborn-uasr_librispeech-no-silence-100hr"
dataset_name = None
split = "test.clean"

# load models, processor and dataset from Hugging Face Hub
# The processor is loaded from the same card as the upstream model so the two
# cannot drift apart when `upstream_model_card` is changed.
processor = AutoProcessor.from_pretrained(upstream_model_card)
upstream_model = AutoModelForPreTraining.from_pretrained(upstream_model_card)
# load the reborn uasr model from the hub, which is composed of the segmenter and the generator
# NOTE(security): trust_remote_code=True runs Python code shipped in the Hub repository.
# "main" is a moving branch; consider pinning `revision` to a reviewed commit hash.
reborn_model = AutoModel.from_pretrained(reborn_model_card, trust_remote_code=True, revision="main")
# load dataset from the hub (streaming mode supported!)
# NOTE(security): trust_remote_code=True also lets the dataset repository run its loading script.
dataset = load_dataset(dataset_card, dataset_name, split=split, streaming=True, trust_remote_code=True)

# directory that receives the hypothesis / reference transcripts
output_dir = "./output/librispeech"
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(output_dir, exist_ok=True)

In [None]:
def evaluate(dataset, upstream_model, reborn_model, processor, output_dir, split="test", feature_layer=15):
    """Transcribe every sample of ``dataset`` and print the phoneme error rate (PER).

    For each sample, the audio is run through ``processor`` and ``upstream_model``;
    one hidden state of the upstream model is handed to ``reborn_model.generate``,
    and the first returned hypothesis is compared with the sample's reference
    using a token-level edit distance.

    Args:
        dataset: Iterable of samples. Each sample must provide
            ``sample["audio"]["array"]`` and ``sample["phoneme"]`` (a
            whitespace-separated string).
        upstream_model: Model called as ``upstream_model(feats, output_hidden_states=True)``
            whose output exposes ``hidden_states``.
        reborn_model: Model exposing ``generate(feats, padding_mask)``; the first
            element of its return value is used as the hypothesis string.
        processor: Callable turning a raw audio array into an object with
            ``input_values``; it is called with ``sampling_rate=16_000``.
        output_dir: Directory for the transcript files (created if missing).
        split: Basename of the written files, ``{split}.hyp`` and ``{split}.ref``.
        feature_layer: Index into ``upstream_output.hidden_states`` selecting the
            features fed to ``reborn_model``. Defaults to 15.

    Returns:
        None. The PER is printed; if no reference tokens were scored (empty
        dataset or only empty references) a notice is printed instead of
        raising ``ZeroDivisionError``.
    """
    # model eval mode and to device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    upstream_model = upstream_model.to(device)
    reborn_model = reborn_model.to(device)
    upstream_model.eval()
    reborn_model.eval()
    # perform evaluation and dump the results to the output directory
    total_errs = 0
    total_len = 0
    os.makedirs(output_dir, exist_ok=True)
    with torch.no_grad(), open(f"{output_dir}/{split}.hyp", "w") as hyp_fw, open(f"{output_dir}/{split}.ref", "w") as ref_fw:
        # Iterate the dataset directly: the sample index was never used, and
        # passing the dataset itself lets tqdm show a total when it has a length.
        for sample in tqdm(dataset, desc="Generating results...", dynamic_ncols=True):
            audio_feats = processor(sample["audio"]["array"], return_tensors="pt", padding="longest", sampling_rate=16_000).input_values
            audio_feats = audio_feats.to(device)

            upstream_output = upstream_model(audio_feats, output_hidden_states=True)
            wav2vecu_feats = upstream_output.hidden_states[feature_layer]  # (B, T, C)
            # All-False mask over every dimension but the last: no frame is marked as padding.
            feats_padding_mask = torch.zeros(wav2vecu_feats.shape[:-1], dtype=torch.bool, device=device)

            hypothesis = reborn_model.generate(wav2vecu_feats, feats_padding_mask)[0]
            reference = sample["phoneme"]
            # flush after every line so partial results survive an interrupted run
            print(hypothesis, file=hyp_fw, flush=True)
            print(reference, file=ref_fw, flush=True)

            ref_tokens = reference.split()
            total_errs += editdistance.eval(hypothesis.split(), ref_tokens)
            total_len += len(ref_tokens)

    # Guard the division: with nothing scored the rate is undefined.
    if total_len == 0:
        print("\nPER: n/a (no reference phonemes were scored)")
        return

    print(f"\nPER: {total_errs / total_len * 100:.3f}%")

In [None]:
# Run the evaluation with the models, processor and dataset loaded above;
# transcripts are written under `output_dir` and named after `split`.
evaluate(
    dataset=dataset,
    upstream_model=upstream_model,
    reborn_model=reborn_model,
    processor=processor,
    output_dir=output_dir,
    split=split,
)