# Experiment 3 – Fine-Tuned WavLM-CTC with KenLM Decoder

## Install Dependencies

In [14]:
!pip install pyctcdecode jiwer torchcodec

Collecting torchcodec
  Downloading torchcodec-0.7.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.4 kB)
Downloading torchcodec-0.7.0-cp312-cp312-manylinux_2_28_x86_64.whl (1.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.7.0


In [2]:
!pip install https://github.com/kpu/kenlm/archive/master.zip

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip
[2K     [32m-[0m [32m553.6 kB[0m [31m9.4 MB/s[0m [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.2.0-cp312-cp312-linux_x86_64.whl size=3188042 sha256=ee4be9dddde09775d86a5e11bdff79529c3e6f4da2524ff319adb440e9a6e236
  Stored in directory: /tmp/pip-ephem-wheel-cache-jv9x6ybu/wheels/92/c8/12/56d187154e078f0eaa74d059017fc1afe1c4d91fbce02ce8d9
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.2.0


## Setup Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

device: cuda


Load Saved Models in my drive

In [2]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model = Wav2Vec2ForCTC.from_pretrained("/content/drive/MyDrive/wavlm-ctc-ex-2").to(device)
processor = Wav2Vec2Processor.from_pretrained("/content/drive/MyDrive/wavlm-ctc-ex-2")

---

## Install a KenLM Model(4-gram)

In [7]:
!wget -O 4-gram.arpa.gz https://openslr.elda.org/resources/11/4-gram.arpa.gz

--2025-10-03 15:17:15--  https://openslr.elda.org/resources/11/4-gram.arpa.gz
Resolving openslr.elda.org (openslr.elda.org)... 141.94.109.138, 2001:41d0:203:ad8a::
Connecting to openslr.elda.org (openslr.elda.org)|141.94.109.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1355172078 (1.3G) [application/x-gzip]
Saving to: ‘4-gram.arpa.gz’


2025-10-03 15:18:48 (14.2 MB/s) - ‘4-gram.arpa.gz’ saved [1355172078/1355172078]



In [8]:
!gunzip 4-gram.arpa.gz

Build KenLM Decoder

In [3]:
import pyctcdecode

# Load vocabulary
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab = sorted(vocab_dict.items(), key=lambda item: item[1])
vocab_list = [x[0] for x in sorted_vocab]

# Load 4-gram LM
decoder = pyctcdecode.build_ctcdecoder(
    labels=vocab_list,
    kenlm_model_path="4-gram.arpa"  # or "4-gram.bin"
)



---

In [4]:
import torch

def transcribe_with_lm(logits):
    # logits: [batch_size, time, vocab_size]
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    log_probs = log_probs.cpu().detach().numpy()
    return decoder.decode(log_probs[0])  # single example

## Load Test Dataset

In [5]:
from datasets import load_dataset

librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

## Define Evaluation Metrics

In [6]:
from jiwer import wer, cer

def get_wer_cer(result):
  refs = result["text"]
  hyps = result["transcription"]

  # Convert to plain Python lists
  refs = list(refs)
  hyps = list(hyps)

  print("WER:", wer(refs, hyps))
  print("CER:", cer(refs, hyps))

---

### For Test Dataset

In [8]:
def map_to_pred_model(batch):
    # extract all audio arrays
    audio_arrays = [x["array"] for x in batch["audio"]]

    # process batch
    inputs = processor(audio_arrays, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to(model.device)).logits.cpu().numpy()

    # decode predictions
    beam_lm = [decoder.decode(logit) for logit in logits]

    batch["transcription"] = beam_lm
    return batch

# run evaluation
resultModel = librispeech_eval.map(map_to_pred_model, batched=True, batch_size=8, remove_columns=["audio"])

Map:   0%|          | 0/2620 [00:00<?, ? examples/s]

In [9]:
get_wer_cer(resultModel)

WER: 0.04161594643944005
CER: 0.01348701736937449


### For Validation Dataset

In [10]:
val_dataset = load_dataset("librispeech_asr", "clean", split="validation")

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

In [11]:
resultEval = val_dataset.map(map_to_pred_model, batched=True, batch_size=8, remove_columns=["audio"])

Map:   0%|          | 0/2703 [00:00<?, ? examples/s]

In [12]:
get_wer_cer(resultEval)

WER: 0.03941031579721334
CER: 0.013371190060182489


---