In [1]:
# !pip install datasets==3.6.0
# !pip install transformers
# !pip install tf-keras
# !pip install torch
# !pip install transformers[torch]
# !pip install wandb
# !pip install evaluate
# !pip install librosa
# !pip install jiwer
# !pip install numpy==1.26.4
# !pip install pyctcdecode
# !pip install kenlm

In [2]:
import logging
import torch
import warnings
import gc
import os
import evaluate
import numpy as np
import librosa
from pyctcdecode import build_ctcdecoder
import kenlm
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union, Optional
from tqdm import tqdm
from huggingface_hub import login
from datasets import load_dataset, Dataset, Audio, disable_caching
from transformers import (
    AutoProcessor,
    AutoModelForCTC,
    Wav2Vec2Processor,
    TrainingArguments,
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm
2025-08-06 08:47:00.540883: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754470020.560326   16671 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754470020.566514   16671 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754470020.585322   16671 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754470020.585341   16671 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754470020.585343   16671

In [3]:
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [4]:
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("Elormiden/1.05-0.9-0.55-full")
model = AutoModelForCTC.from_pretrained("Elormiden/1.05-0.9-0.55-full").eval();

In [62]:
from datasets import load_dataset, Dataset, Audio

# ====================================================================================
# Data Loading and Resampling
# ====================================================================================

print("Loading dataset...")
ds = load_dataset("Elormiden/RIK_Cypriot_News_Dataset")
print("Dataset loaded successfully.")
print(ds)

# We will use all splits for a complete workflow
train_ds = ds['train']
eval_ds = ds['validation']
test_ds = ds['test']

debug_ds = ds['test'].select(range(200))

Loading dataset...
Dataset loaded successfully.
DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 34065
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 4255
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 4279
    })
})


In [110]:
def evaluate_model(model, processor, test_dataset, batch_size=30):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    all_predictions = []
    all_references = []
    
    with torch.no_grad():
        for i in range(0, len(test_dataset), batch_size):
            batch = test_dataset[i:i+batch_size]
            
            audio_arrays = [sample["array"] for sample in batch["audio"]]
            inputs = processor(
                audio_arrays, 
                sampling_rate=16000, 
                return_tensors="pt", 
                padding=True
            )
            
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits

            # logits[:, :, 53] = -float('inf') # only for my trained models
            
            predicted_ids = torch.argmax(logits, dim=-1)
            predictions = processor.batch_decode(predicted_ids)
            
            all_predictions.extend(predictions)
            all_references.extend(batch["text"])
            
            if i % (batch_size * 10) == 0:
                print(f"Processed {i}/{len(test_dataset)} samples")
    
    wer = wer_metric.compute(predictions=all_predictions, references=all_references)
    
    return wer, all_predictions, all_references

In [111]:
from transformers import AutoProcessor, AutoModelForCTC

base_processor = AutoProcessor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-greek")
base_model = AutoModelForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-greek")

In [112]:
print('Launching tests...')
wer_score, predictions, references = evaluate_model(base_model, base_processor, eval_ds)

print(f"\nTest WER: {wer_score:.4f}")
print(f"Test samples: {len(test_ds)}")

print("\nSample predictions:")
for i in range(10):
    print(f"Reference: {references[i]}")
    print(f"Predicted: {predictions[i]}")
    print("-" * 50)

Launching tests...
Processed 0/4255 samples
Processed 300/4255 samples
Processed 600/4255 samples
Processed 900/4255 samples
Processed 1200/4255 samples
Processed 1500/4255 samples
Processed 1800/4255 samples
Processed 2100/4255 samples
Processed 2400/4255 samples
Processed 2700/4255 samples
Processed 3000/4255 samples
Processed 3300/4255 samples
Processed 3600/4255 samples
Processed 3900/4255 samples
Processed 4200/4255 samples

Test WER: 1.0309
Test samples: 4279

Sample predictions:
Reference: Και το Τμήμα Ενόπλων και τους βουλευτές και τις οργανώσεις να έχουν μια συνάντηση την Τετάρτη τη μία ώρα εξέβησης στην αίθουσα του Δήμου Π.
Predicted: ΡΕ ΤΟ ΤΣΊΜΠΑΛΙΆ ΤΟ ΚΑΙ ΤΟΥΝ ΔΟΥΛΕΥΤΑΊΣ ΚΑΙ ΤΙΣ ΣΤΟΡΓΑΛΏΣ Ή ΝΑ ΈΧΟΥ ΠΙΑΝ ΣΥΝΆΝΤΙΣΗ ΤΗ ΝΤΕΤΆΡΤΗΡΗ ΏΡΑ ΕΞΈΠΗΣΗ ΣΤΗΝ ΑΊΘΟΥ ΣΑΝ ΤΟΥ ΙΜΟΥ Έ
--------------------------------------------------
Reference: Τα μετάλλια. Εξαιρετική ήταν η χθεσινή μέρα στην ενόργανη γυμναστική, εκεί όπου ουσιαστικά σαρώσαμε δύο.
Predicted: ΝΤΑ ΜΕΤΆΛΕΙΑ ΉΞΕΡΕΤΙ ΚΑΊ ΑΝΗΧΘΕΣΙΝΉ

In [97]:
vocab = processor.tokenizer.get_vocab()

kenlm_vocab = []
for token, idx in sorted(vocab.items(), key=lambda x: x[1]):
    if idx not in [55, 56]:  # тупые служебные токены <s> и </s>!!!
        kenlm_vocab.append(token)

print(len(kenlm_vocab))
print(len(vocab))
print(vocab)

55
57
{"'": 49, '[PAD]': 54, '[UNK]': 53, 'a': 16, 'e': 12, 'g': 10, 'h': 46, 'm': 4, 'n': 29, 'o': 40, 'r': 20, 't': 6, 'v': 50, '|': 30, '«': 25, '´': 11, '·': 23, '»': 24, '́': 52, 'ΐ': 14, 'ά': 34, 'έ': 36, 'ή': 42, 'ί': 48, 'α': 47, 'β': 26, 'γ': 33, 'δ': 18, 'ε': 44, 'ζ': 19, 'η': 32, 'θ': 22, 'ι': 45, 'κ': 27, 'λ': 21, 'μ': 38, 'ν': 28, 'ξ': 41, 'ο': 5, 'π': 3, 'ρ': 1, 'ς': 39, 'σ': 8, 'τ': 17, 'υ': 51, 'φ': 2, 'χ': 31, 'ψ': 0, 'ω': 35, 'ϊ': 9, 'ϋ': 7, 'ό': 37, 'ύ': 13, 'ώ': 43, '’': 15, '<s>': 55, '</s>': 56}


In [103]:
def evaluate_model_kenlm(model, processor, test_dataset, lm_path, batch_size=20, alpha=0.5, beta=1.0):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Настройка декодера с KenLM
    # vocab_dict = kenlm_vocab
    # sorted_vocab_list = [
    #     key for key, value in sorted(vocab_dict.items(), key=lambda item: item[1])
    # ]
    
    decoder = build_ctcdecoder(
        labels=kenlm_vocab,
        kenlm_model_path=lm_path,
        alpha=alpha,
        beta=beta
    )
    
    all_predictions = []
    all_references = []
    
    with torch.no_grad():
        for i in range(0, len(test_dataset), batch_size):
            batch = test_dataset[i:i+batch_size]
            
            audio_arrays = [sample["array"] for sample in batch["audio"]]
            inputs = processor(
                audio_arrays, 
                sampling_rate=16000, 
                return_tensors="pt", 
                padding=True
            )
            
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            
            # ВАЖНО: тот же фильтр что и в оригинале
            logits[:, :, 53] = -float('inf')
            
            # Для KenLM нужны log probabilities
            logits = torch.nn.functional.log_softmax(logits, dim=-1)
            logits_numpy = logits.cpu().numpy()
            
            # Декодируем с KenLM
            batch_predictions = []
            for j in range(logits_numpy.shape[0]):
                single_logits = logits_numpy[j]
                prediction = decoder.decode(single_logits)
                batch_predictions.append(prediction)
            
            all_predictions.extend(batch_predictions)
            all_references.extend(batch["text"])  # Как в оригинале
            
            if i % (batch_size * 10) == 0:
                print(f"Processed {i}/{len(test_dataset)} samples")
    
    wer = wer_metric.compute(predictions=all_predictions, references=all_references)
    
    return wer, all_predictions, all_references

# Запуск
lm_path = './cypriot.klm'  # Путь к твоей KenLM модели

In [105]:
print('Launching tests with KenLM support...')
wer_score, predictions, references = evaluate_model_kenlm(model, processor, eval_ds, lm_path)

print(f"\nTest WER: {wer_score:.4f}")
print(f"Test samples: {len(test_ds)}")

print("\nSample predictions:")
for i in range(10):
    print(f"Reference: {references[i]}")
    print(f"Predicted: {predictions[i]}")
    print("-" * 50)

Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.
No known unigrams provided, decoding results might be a lot worse.


Launching tests with KenLM support...
Processed 0/4255 samples
Processed 200/4255 samples
Processed 400/4255 samples
Processed 600/4255 samples
Processed 800/4255 samples
Processed 1000/4255 samples
Processed 1200/4255 samples
Processed 1400/4255 samples
Processed 1600/4255 samples
Processed 1800/4255 samples
Processed 2000/4255 samples
Processed 2200/4255 samples
Processed 2400/4255 samples
Processed 2600/4255 samples
Processed 2800/4255 samples
Processed 3000/4255 samples
Processed 3200/4255 samples
Processed 3400/4255 samples
Processed 3600/4255 samples
Processed 3800/4255 samples
Processed 4000/4255 samples
Processed 4200/4255 samples

Test WER: 0.5469
Test samples: 4279

Sample predictions:
Reference: Και το Τμήμα Ενόπλων και τους βουλευτές και τις οργανώσεις να έχουν μια συνάντηση την Τετάρτη τη μία ώρα εξέβησης στην αίθουσα του Δήμου Π.
Predicted: και το τπαδάτωκαιτους δουλευτές και τις δοογανώσεις να έχουν μια συνάντηση την τετάρτη ώρα εξέπισηςστην αίθουσα του δήμου
-----------