In [1]:
# ! pip install --quiet torch pytorch-lightning transformers datasets librosa soundfile pyctcdecode https://github.com/kpu/kenlm/archive/master.zip

import os
import json
import kenlm
import torch
import datasets
import transformers
import numpy as np

from typing import *
from tqdm import tqdm
from pprint import pprint
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForCTC

# from google.colab import drive
# drive.mount("/content/drive", force_remount=True)

# Root folder under which the models, datasets and cached predictions used by this
# notebook are read and written. The default is the original local path; set the
# TESI_ROOT_DIR environment variable to run the notebook on another machine
# without editing this cell.
ROOT_DIR = os.environ.get("TESI_ROOT_DIR", os.path.join("/media", "andrea", "512Gb", "tesi"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained CTC model and its processor.
# The first run fetches them by hub id and dumps them under ROOT_DIR/models;
# later runs read that local dump instead.

NGRAM_MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
NGRAM_MODEL_NAME = NGRAM_MODEL.split("/")[1]

NGRAM_LOCAL_MODEL_DUMP = os.path.join(ROOT_DIR, "models", NGRAM_MODEL_NAME)

# Decide once where to load from and which device to use.
_ngram_is_cached = os.path.isdir(NGRAM_LOCAL_MODEL_DUMP)
_ngram_source = NGRAM_LOCAL_MODEL_DUMP if _ngram_is_cached else NGRAM_MODEL
_ngram_device = "cuda" if torch.cuda.is_available() else "cpu"

ngram_model = AutoModelForCTC.from_pretrained(_ngram_source).to(_ngram_device)
ngram_processor = AutoProcessor.from_pretrained(_ngram_source)

if not _ngram_is_cached:
    # First run: persist both artifacts so the next run loads them locally.
    ngram_model.save_pretrained(NGRAM_LOCAL_MODEL_DUMP)
    ngram_processor.save_pretrained(NGRAM_LOCAL_MODEL_DUMP)

In [3]:
def map_to_pred(batch: Dict):
    """Run the n-gram CTC model on one example and attach the decoded beams.

    Args:
        batch: a single dataset example; ``batch["audio"]["array"]`` is fed to
            ``ngram_processor`` with a sampling rate of 16 kHz.

    Returns:
        The same ``batch`` dict, extended with:
          - ``transcription``: text of every output beam,
          - ``lm_score`` / ``logit_score``: the last and second-to-last entry
            of every output beam,
          - ``lm_probability`` / ``logit_probability``: softmax of those scores
            taken across the beams of this example.
    """
    # Follow whatever device the model was placed on instead of hard-coding
    # "cuda": the model is loaded on CPU when CUDA is unavailable, and moving
    # the inputs to "cuda" would then fail.
    device = ngram_model.device

    inputs = ngram_processor(batch["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = ngram_model(**inputs).logits

    # for output beams we return the text, the scores, the lm state and the word frame indices
    # text, last_lm_state, text_frames, logit_score, lm_score
    # OutputBeam = Tuple[str, LMState, List[WordFrames], float, float]
    # No multiprocessing pool is passed (None); only one example is decoded, hence [0].
    output_beams = ngram_processor.decoder.decode_beams_batch(None, logits.cpu().numpy())[0]

    batch["transcription"] = [x[0] for x in output_beams]
    batch["lm_score"] = [x[-1] for x in output_beams]
    batch["logit_score"] = [x[-2] for x in output_beams]

    # Normalise the raw beam scores into a distribution over the returned beams.
    batch["lm_probability"] = torch.softmax(torch.tensor(batch["lm_score"]), dim=-1)
    batch["logit_probability"] = torch.softmax(torch.tensor(batch["logit_score"]), dim=-1)

    return batch

## Test set

In [None]:
# load librispeech dataset ("clean" and "other" test splits), caching each on disk

LOCAL_DATASET_DUMP = os.path.join(ROOT_DIR, "datasets", "librispeech_test_")


def _load_librispeech_test(config_name: str):
    """Return one LibriSpeech test split, using its local dump when present.

    Each split is cached independently: if only one of the two dumps exists,
    that one is loaded from disk and only the missing one is downloaded and
    saved (previously a single missing dump made the cell try to load a
    directory that was not there).

    Args:
        config_name: dataset configuration, ``"clean"`` or ``"other"``.
    """
    local_dump = f"{LOCAL_DATASET_DUMP}{config_name}"

    if os.path.isdir(local_dump):
        return datasets.Dataset.load_from_disk(local_dump)

    split = load_dataset("andreagasparini/librispeech_test_only", config_name, split="test")
    split.save_to_disk(local_dump)
    return split


librispeech_test_clean = _load_librispeech_test("clean")
librispeech_test_other = _load_librispeech_test("other")

In [None]:
# Folder for the JSON prediction dumps of the test splits.
# The path is built with os.path.join: the previous f"{ROOT_DIR}speech2text/..."
# had no separator after ROOT_DIR and therefore pointed outside ROOT_DIR.
# NOTE(review): the train-set cells use ROOT_DIR/predictions (no "speech2text"
# level) - confirm which layout is intended.
_TEST_PREDICTIONS_DIR = os.path.join(ROOT_DIR, "speech2text", "predictions")


def _load_or_predict_test_split(dataset, split_name: str):
    """Return the predictions for one test split, computing them only if needed.

    If the JSON dump for ``split_name`` exists it is loaded; otherwise
    ``map_to_pred`` is applied to ``dataset`` (dropping the "audio" and "file"
    columns) and the result is written to that JSON file.

    Args:
        dataset: the split to transcribe when no dump is available.
        split_name: label used in the file name, e.g. ``"test_clean"``.
    """
    dump_path = os.path.join(_TEST_PREDICTIONS_DIR, f"{NGRAM_MODEL_NAME}-{split_name}_predictions.json")

    if os.path.isfile(dump_path):
        with open(dump_path, "r") as f:
            return datasets.Dataset.from_dict(json.load(f))

    result = dataset.map(map_to_pred, remove_columns=["audio", "file"])

    # Make sure the target folder exists so the run is not lost when the
    # dump is finally written.
    os.makedirs(_TEST_PREDICTIONS_DIR, exist_ok=True)
    with open(dump_path, "w") as f:
        json.dump(result.to_dict(), f)

    return result


result_test_clean = _load_or_predict_test_split(librispeech_test_clean, "test_clean")
result_test_other = _load_or_predict_test_split(librispeech_test_other, "test_other")

## Train set

In [None]:
# Read the train.360 split from its on-disk dump under ROOT_DIR.
_train_clean_360_dir = f"{ROOT_DIR}/datasets/librispeech_asr/train.360"
librispeech_train_clean_360 = datasets.load_from_disk(_train_clean_360_dir)

In [None]:
# Predictions for train.360, cached with `save_to_disk`.
# `save_to_disk` writes a directory, so the cache is detected with isdir
# (isfile never matched and forced a full recomputation). Only the 360
# predictions are checked: the train.100 ones are not produced by this cell.
_train_clean_360_predictions_dir = f"{ROOT_DIR}/predictions/{NGRAM_MODEL_NAME}-train_clean_360_predictions"

if os.path.isdir(_train_clean_360_predictions_dir):

    result_train_360_clean = datasets.load_from_disk(_train_clean_360_predictions_dir)

else:

    # Reload the split here so the cell does not depend on an earlier cell's variable.
    librispeech_train_clean_360 = datasets.load_from_disk(f"{ROOT_DIR}/datasets/librispeech_asr/train.360")
    # Map over the split loaded just above (the former name
    # `librispeech_train_clean` was never defined).
    result_train_360_clean = librispeech_train_clean_360.map(map_to_pred, remove_columns=["audio", "file"])
    result_train_360_clean.save_to_disk(_train_clean_360_predictions_dir)

# train.100 is currently disabled; to enable it, mirror the logic above:
# librispeech_train_clean_100 = datasets.load_from_disk(f"{ROOT_DIR}/datasets/librispeech_asr/train.100")
# result_train_100_clean = librispeech_train_clean_100.map(map_to_pred, remove_columns=["audio", "file"])
# result_train_100_clean.save_to_disk(f"{ROOT_DIR}/predictions/{NGRAM_MODEL_NAME}-train_clean_100_predictions")

In [4]:
# Read the train.500 split from its on-disk dump under ROOT_DIR.
_train_other_500_dir = f"{ROOT_DIR}/datasets/librispeech_asr/train.500"
librispeech_train_other = datasets.load_from_disk(_train_other_500_dir)

In [None]:
# Predictions for train.500, cached with `save_to_disk`.
# `save_to_disk` writes a directory, so the cache is detected with isdir
# (isfile never matched and forced a full recomputation).
_train_other_500_predictions_dir = f"{ROOT_DIR}/predictions/{NGRAM_MODEL_NAME}-train_other_500_predictions"

if os.path.isdir(_train_other_500_predictions_dir):

    result_train_other = datasets.load_from_disk(_train_other_500_predictions_dir)

else:

    result_train_other = librispeech_train_other.map(map_to_pred, remove_columns=["audio", "file"])
    # `save_to_disk` (previously misspelled `save_to_diks`): the typo raised
    # AttributeError only after the whole map had finished, losing the run.
    result_train_other.save_to_disk(_train_other_500_predictions_dir)

 35%|██████████▋                   | 52710/148688 [8:41:50<11:57:34,  2.23ex/s]

## Then: load the base model and inspect its vocabulary

In [None]:
# Load the wav2vec2-large-960h-lv60-self model and processor from the local dump,
# then list the tokenizer vocabulary.
# The path is built with os.path.join: the previous f"{ROOT_DIR}speech2text/..."
# had no separator after ROOT_DIR and therefore pointed outside ROOT_DIR.
# NOTE(review): the n-gram model is dumped under ROOT_DIR/models (no
# "speech2text" level) - confirm which layout is intended.
LOCAL_MODEL_DUMP = os.path.join(ROOT_DIR, "speech2text", "models", "wav2vec2-large-960h-lv60-self")

model = AutoModelForCTC.from_pretrained(LOCAL_MODEL_DUMP).to("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained(LOCAL_MODEL_DUMP)

# Lower-case every token and order the entries by token id.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
sorted_vocab_dict

Downloading preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

{"'": 27,
 '</s>': 2,
 '<pad>': 0,
 '<s>': 1,
 '<unk>': 3,
 'a': 7,
 'b': 24,
 'c': 19,
 'd': 14,
 'e': 5,
 'f': 20,
 'g': 21,
 'h': 11,
 'i': 10,
 'j': 29,
 'k': 26,
 'l': 15,
 'm': 17,
 'n': 9,
 'o': 8,
 'p': 23,
 'q': 30,
 'r': 13,
 's': 12,
 't': 6,
 'u': 16,
 'v': 25,
 'w': 18,
 'x': 28,
 'y': 22,
 'z': 31,
 '|': 4}

In [4]:
# Display whether torch can see a CUDA device; the model-loading cells use the same check to pick "cuda" or "cpu".
torch.cuda.is_available()

False