In [1]:
# install required python packages
!pip install torch numpy transformers datasets editdistance tqdm

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m62.0 MB/s[0m eta [36m0:

In [2]:
# import required libraries
import os
import torch
import editdistance
from datasets import load_dataset
from transformers import AutoModel, AutoProcessor, AutoModelForPreTraining
from tqdm import tqdm

In [3]:
# Hugging Face Hub identifiers of the models and the evaluation dataset
upstream_model_card = "facebook/wav2vec2-large-lv60"
reborn_model_card = "andybi7676/reborn-uasr_ls100h_iter5-stage1"
dataset_card = "andybi7676/reborn-uasr_librispeech-no-silence-100hr"
dataset_name = None
split = "test.clean"

# load models, processor and dataset from Hugging Face Hub
# The processor is loaded from the same card as the upstream model, so changing
# `upstream_model_card` cannot leave the two out of sync.
processor = AutoProcessor.from_pretrained(upstream_model_card)
upstream_model = AutoModelForPreTraining.from_pretrained(upstream_model_card)
# load the reborn uasr model from the hub, which is composed of the segmenter and the generator
# NOTE(review): trust_remote_code=True runs Python code fetched from the Hub repository;
# consider pinning `revision` to a specific commit instead of the moving "main" branch.
reborn_model = AutoModel.from_pretrained(reborn_model_card, trust_remote_code=True, revision="main")
# load dataset from the hub (streaming mode supported!)
dataset = load_dataset(dataset_card, dataset_name, split=split, streaming=True, trust_remote_code=True)

# directory that receives the hypothesis/reference dumps; created if missing
output_dir = "./output/librispeech"
os.makedirs(output_dir, exist_ok=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-lv60 were not used when initializing Wav2Vec2ForPreTraining: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForPreTraining from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForPreTraining from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at facebook/wav2vec2-large-lv60 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

configuration_reborn.py:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

modeling_reborn.py:   0%|          | 0.00/13.0k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/89.0 [00:00<?, ?B/s]

In [4]:
def evaluate(dataset, upstream_model, reborn_model, processor, output_dir, split="test",
             feature_layer=15, sampling_rate=16_000):
    """Decode every sample of ``dataset`` and print the phoneme error rate (PER).

    For each sample the raw audio is run through ``processor`` and
    ``upstream_model``; the hidden states at index ``feature_layer`` are passed
    to ``reborn_model.generate`` and the first returned item is used as the
    hypothesis. Hypotheses and references are written line by line to
    ``{output_dir}/{split}.hyp`` and ``{output_dir}/{split}.ref``.

    PER is the summed edit distance between the whitespace-split hypothesis and
    reference tokens, divided by the total number of reference tokens, in percent.

    Args:
        dataset: Iterable of samples; each sample provides
            ``sample["audio"]["array"]`` and ``sample["phoneme"]`` (a string).
        upstream_model: Model called as
            ``upstream_model(input_values, output_hidden_states=True)`` whose
            output exposes ``hidden_states``.
        reborn_model: Model exposing ``generate(features, padding_mask)``.
        processor: Callable that turns the audio array into ``input_values``.
        output_dir: Directory for the result files; created if missing.
        split: Base name of the two result files.
        feature_layer: Index into ``hidden_states`` used as the input features.
        sampling_rate: Sampling rate (Hz) reported to ``processor``.

    Returns:
        None. The PER is printed to stdout; if no reference tokens were seen
        (e.g. an empty dataset) a notice is printed instead of dividing by zero.
    """
    # model eval mode and to device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    upstream_model = upstream_model.to(device)
    reborn_model = reborn_model.to(device)
    upstream_model.eval()
    reborn_model.eval()

    # perform evaluation and dump the results to the output directory
    total_errs = 0
    total_len = 0
    os.makedirs(output_dir, exist_ok=True)
    hyp_path = os.path.join(output_dir, f"{split}.hyp")
    ref_path = os.path.join(output_dir, f"{split}.ref")
    with torch.no_grad():
        # explicit UTF-8 so the dumps do not depend on the platform default encoding
        with open(hyp_path, "w", encoding="utf-8") as hyp_fw, \
             open(ref_path, "w", encoding="utf-8") as ref_fw:
            # iterate the dataset directly so tqdm can show a total when the dataset has a length
            for sample in tqdm(dataset, desc="Generating results...", dynamic_ncols=True):
                audio_feats = processor(
                    sample["audio"]["array"],
                    return_tensors="pt",
                    padding="longest",
                    sampling_rate=sampling_rate,
                ).input_values
                audio_feats = audio_feats.to(device)

                upstream_output = upstream_model(audio_feats, output_hidden_states=True)
                wav2vecu_feats = upstream_output.hidden_states[feature_layer]  # (B, T, C)
                # all-False mask: samples are processed one at a time, so presumably no frame is padding
                feats_padding_mask = torch.zeros(wav2vecu_feats.shape[:-1], dtype=torch.bool, device=device)

                hypothesis = reborn_model.generate(wav2vecu_feats, feats_padding_mask)[0]
                reference = sample["phoneme"]
                # flush per line so partial results survive an interrupted run
                print(hypothesis, file=hyp_fw, flush=True)
                print(reference, file=ref_fw, flush=True)

                ref_tokens = reference.split()
                total_errs += editdistance.eval(hypothesis.split(), ref_tokens)
                total_len += len(ref_tokens)

    if total_len == 0:
        # nothing to normalise by; report it instead of raising ZeroDivisionError
        print("\nPER: n/a (no reference tokens were evaluated)")
        return

    print(f"\nPER: {total_errs / total_len * 100:.3f}%")

In [5]:
# Run the evaluation with the components loaded in the setup cell;
# results are written to `output_dir` and the PER is printed below.
evaluate(
    dataset=dataset,
    upstream_model=upstream_model,
    reborn_model=reborn_model,
    processor=processor,
    output_dir=output_dir,
    split=split,
)

Generating results...: 2620it [06:57,  6.27it/s]


PER: 6.941%



