In [None]:
from IPython.display import HTML, display

# Let the notebook body span the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
%load_ext autoreload
%autoreload 2

<h1 style="background-color:LightGreen;"> <center> Links </center></h1>

HF With KenLM:
https://huggingface.co/blog/wav2vec2-with-ngram

Hebrew Dataset:
https://huggingface.co/datasets/HeNLP/HeDC4/blob/main/HeDC4.csv

<h1 style="background-color:LightGreen;"> <center> Imports </center></h1>

In [46]:
from datasets        import load_dataset

from transformers    import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers    import Wav2Vec2ProcessorWithLM
from transformers    import AutoProcessor

from huggingface_hub import Repository
from huggingface_hub import notebook_login
from pyctcdecode     import build_ctcdecoder

import torch
import re


<h1 style="background-color:LightGreen;"> <center> 1. Decoding audio data with Wav2Vec2 and a language model </center></h1>

<h2 style="background-color:#33DAC8;"> <left> NO LM: </left></h2>

In [7]:
# Small LibriSpeech demo split, used here only to illustrate decoding.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
)
dataset

Found cached dataset librispeech_asr_demo (/home/amitli/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [8]:
# Pick one utterance and show its reference transcript in lower case.
audio_sample   = dataset[2]
reference_text = audio_sample["text"]
reference_text.lower()

'he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind'

In [40]:
# Processor and CTC model are loaded from the same checkpoint.
checkpoint = "facebook/wav2vec2-base-100h"
processor  = Wav2Vec2Processor.from_pretrained(checkpoint)
model      = Wav2Vec2ForCTC.from_pretrained(checkpoint)

Some weights of the model checkpoint at facebook/wav2vec2-base-100h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.mask_time_emb_vector']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-100h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Turn the raw waveform into PyTorch tensors for the model.
audio  = audio_sample["audio"]
inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")

In [14]:
# Forward pass without gradient tracking; only the logits are kept.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

In [15]:
# Decoding without a language model: keep the highest-scoring token id per frame.
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)

greedy_text = transcription[0]
greedy_text.lower()

'he tells us that at this festive season of the year with christmaus and rose beef looming before us simalyis drawn from eating and its results occur most readily to the mind'

<h2 style="background-color:#33DAC8;"> <left> With LM: </left></h2>

In [41]:
# Rebinds `processor` to the LM-backed processor; later cells that call
# `processor.batch_decode` go through this one, not the plain Wav2Vec2Processor.
processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [22]:
# Sanity check on the model output shape.
# NOTE(review): presumably (batch, frames, vocab size) — confirm the last
# dimension against the tokenizer vocabulary shown in the next cell.
logits.shape

torch.Size([1, 624, 32])

In [23]:
" ".join(sorted(processor.tokenizer.get_vocab()))

"' </s> <pad> <s> <unk> A B C D E F G H I J K L M N O P Q R S T U V W X Y Z |"

In [24]:
# The LM processor decodes from NumPy logits; the decoded strings are on `.text`.
lm_output     = processor.batch_decode(logits.numpy())
transcription = lm_output.text
transcription[0].lower()

'he tells us that at this festive season of the year with christmas and rose beef looming before us similes drawn from eating and its results occur most readily to the mind'

<h1 style="background-color:LightGreen;"> <center> 2. Getting data for your language model </center></h1>


In [25]:
# The `target_lang` side of the en/<target_lang> Europarl pairs is the LM text source.
target_lang = "sv"
dataset = load_dataset(
    "europarl_bilingual",
    lang1="en",
    lang2=target_lang,
    split="train",
)

Downloading builder script:   0%|          | 0.00/6.92k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/59.3k [00:00<?, ?B/s]

Downloading and preparing dataset europarl_bilingual/en-sv to /home/amitli/.cache/huggingface/datasets/europarl_bilingual/en-sv-lang1=en,lang2=sv/8.0.0/2ab0200e7729616bfd4a4df6bfb29b31746ceb5a59f8c75c02ca35e1ebead950...


Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.90M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1892723 [00:00<?, ? examples/s]

Dataset europarl_bilingual downloaded and prepared to /home/amitli/.cache/huggingface/datasets/europarl_bilingual/en-sv-lang1=en,lang2=sv/8.0.0/2ab0200e7729616bfd4a4df6bfb29b31746ceb5a59f8c75c02ca35e1ebead950. Subsequent calls will reuse this data.


In [26]:
 # change to the ignored characters of your fine-tuned model
# Raw string: `\-`, `\;` and `\:` are regex escapes, not Python string escapes.
# A non-raw literal has the same value but triggers an "invalid escape sequence"
# warning on current Python versions.
chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–]'


def extract_text(batch):
    """Add a cleaned, lower-cased ``text`` field to one dataset example.

    Reads ``batch["translation"][target_lang]`` (``target_lang`` is a notebook
    global), lower-cases it and removes every character matched by
    ``chars_to_ignore_regex``.

    Args:
        batch: Example dict with a ``"translation"`` mapping keyed by language code.

    Returns:
        The same dict, mutated in place, with the cleaned string under ``"text"``.

    Raises:
        KeyError: If ``"translation"`` or the ``target_lang`` entry is missing.
    """
    text          = batch["translation"][target_lang]
    batch["text"] = re.sub(chars_to_ignore_regex, "", text.lower())
    return batch

In [27]:
# Columns present before cleaning; the map call in the next cell passes this
# same list as `remove_columns`.
dataset.column_names

['translation']

In [28]:
# Apply extract_text to every example and drop the original columns.
# `dataset` is rebound, so this cell expects the un-mapped dataset as input.
raw_columns = dataset.column_names
dataset     = dataset.map(extract_text, remove_columns=raw_columns)

Map:   0%|          | 0/1892723 [00:00<?, ? examples/s]

In [45]:
# Inspect the mapped dataset (features and row count).
dataset

Dataset({
    features: ['text'],
    num_rows: 1892723
})

<h2 style="background-color:#33DAC8;"> <left> Push Dataset to HuggingFace: </left></h2>

In [34]:
# Interactive Hugging Face Hub login; run this before the push_to_hub cell.
notebook_login()

Token is valid.
Your token has been saved to /home/amitli/.cache/huggingface/token
Login successful


In [36]:
# Upload the cleaned corpus as the "train" split of a dataset repo whose name
# is derived from `target_lang`.
dataset.push_to_hub(f"{target_lang}_corpora_parliament_processed", split="train")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1893 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

<h1 style="background-color:LightGreen;"> <center> 3. Build an n-gram with KenLM </center></h1>

In [37]:
# Reload the processed corpus from the Hub (rebinds `target_lang` and `dataset`).
username    = "laro1"
target_lang = "sv"
dataset     = load_dataset(f"{username}/{target_lang}_corpora_parliament_processed", split="train")

# Dump the whole corpus, space-separated, into the text file used as LM training input.
# The encoding is set explicitly: the corpus contains non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8.
with open("text.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(dataset["text"]))


Downloading readme:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /home/amitli/.cache/huggingface/datasets/laro1___parquet/laro1--sv_corpora_parliament_processed-09c2e735160f274d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/162M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1892723 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/amitli/.cache/huggingface/datasets/laro1___parquet/laro1--sv_corpora_parliament_processed-09c2e735160f274d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


<h2 style="background-color:#33DAC8;"> <left> <code>kenlm/build/bin/lmplz -o 5 &lt; "text.txt" &gt; "5gram.arpa"</code> </left></h2>

In [None]:
# Copy "5gram.arpa" to "5gram_correct.arpa", adding a "</s>" unigram entry:
#   * the unigram count in the header ("ngram 1=N") is incremented by one;
#   * the first line containing "<s>" is written twice, the second time with
#     "<s>" replaced by "</s>".
# All other lines are copied unchanged. Both files are opened as UTF-8 so the
# result does not depend on the platform default encoding.
with open("5gram.arpa", "r", encoding="utf-8") as read_file, open("5gram_correct.arpa", "w", encoding="utf-8") as write_file:

    has_added_eos = False

    for line in read_file:

        if not has_added_eos and "ngram 1=" in line:
            # Rewrite only the number after the last "=". Replacing the count
            # anywhere in the line would also hit the "1" in "ngram 1=" when
            # the count itself is "1".
            head, sep, tail = line.rpartition("=")
            count           = tail.strip()
            write_file.write(head + sep + tail.replace(count, str(int(count) + 1), 1))

        elif not has_added_eos and "<s>" in line:
            # Duplicate the "<s>" entry as "</s>"; do this only once.
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True

        else:
            write_file.write(line)

<h1 style="background-color:LightGreen;"> <center> 4. Combine an n-gram with Wav2Vec2 </center></h1>

In [None]:
# Rebinds `processor` again: its tokenizer and feature extractor are the ones
# combined with the n-gram decoder in the cells below.
processor = AutoProcessor.from_pretrained("hf-test/xls-r-300m-sv")

In [None]:
vocab_dict = processor.tokenizer.get_vocab()

# Order the tokens by id and lower-case them; the key order of the resulting
# dict is what the decoder cell uses as its label list.
tokens_by_id      = sorted(vocab_dict.items(), key=lambda pair: pair[1])
sorted_vocab_dict = {token.lower(): token_id for token, token_id in tokens_by_id}


In [None]:
# Labels are the vocabulary tokens in the insertion order of sorted_vocab_dict.
labels  = list(sorted_vocab_dict)
decoder = build_ctcdecoder(labels=labels, kenlm_model_path="5gram_correct.arpa")

In [None]:
# Bundle the loaded processor's feature extractor and tokenizer with the
# n-gram decoder into a single LM-aware processor.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor = processor.feature_extractor,
    tokenizer         = processor.tokenizer,
    decoder           = decoder
)

In [None]:
# Clone the model repo locally, save the LM-aware processor into the clone,
# then push the result back to the Hub.
local_dir = "xls-r-300m-sv"
repo      = Repository(local_dir=local_dir, clone_from="hf-test/xls-r-300m-sv")

processor_with_lm.save_pretrained(local_dir)
repo.push_to_hub(commit_message="Upload lm-boosted decoder")