In [None]:
from transformers import AutoProcessor, AutoModelForCTC
from datasets import Dataset, load_dataset, Audio, DatasetDict
import torch
from tqdm.auto import tqdm
import random

In [None]:
# One checkpoint id shared by the processor and the CTC model, so the two cannot drift apart.
milamou_checkpoint = "Elormiden/wav2vec2-cypriot-greek-milamou"
milamou_processor = AutoProcessor.from_pretrained(milamou_checkpoint)
milamou_model = AutoModelForCTC.from_pretrained(milamou_checkpoint)
# Put the module in evaluation mode; kept as the last expression so the cell output is unchanged.
milamou_model.eval()

In [None]:
# One checkpoint id shared by the processor and the CTC model, so the two cannot drift apart.
hellenic_checkpoint = "Elormiden/wav2vec2-greek-hellenic-parlament"
hellenic_processor = AutoProcessor.from_pretrained(hellenic_checkpoint)
hellenic_model = AutoModelForCTC.from_pretrained(hellenic_checkpoint)
# Put the module in evaluation mode; kept as the last expression so the cell output is unchanged.
hellenic_model.eval()

In [None]:
# One checkpoint id shared by the processor and the CTC model, so the two cannot drift apart.
base_checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-greek"
base_processor = AutoProcessor.from_pretrained(base_checkpoint)
base_model = AutoModelForCTC.from_pretrained(base_checkpoint)
# Put the module in evaluation mode; kept as the last expression so the cell output is unchanged.
base_model.eval()

In [None]:
################ Datasets
# Test split; the rest of the notebook reads the "audio" and "text" columns.
# transcribe_audio feeds the waveform to the processors as 16 kHz audio, so the
# audio column is cast explicitly: decoding then resamples to 16 kHz whenever the
# stored files use a different rate (and is a no-op when they already match).
ds = load_dataset("Elormiden/RIK_Cypriot_News_Dataset", split="test").cast_column(
    "audio", Audio(sampling_rate=16000)
)

In [None]:
def transcribe_audio(model, processor, audio_array, sampling_rate=16000):
    """
    Transcribe one audio array with the given CTC ASR model and its processor.

    Args:
        model: Callable ASR model. It is called with the processed input values
            (plus ``attention_mask`` when the processor returns one) and must
            return an object exposing ``.logits``.
        processor: Callable that turns the raw audio into model inputs
            (``.input_values``) and provides ``batch_decode`` for token ids.
        audio_array: Raw waveform samples for a single utterance.
        sampling_rate: Sampling rate of ``audio_array`` in Hz, forwarded to the
            processor. Defaults to 16000, the value previously hard-coded.

    Returns:
        The decoded transcription of the first (and only) item in the batch.
    """
    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    input_values = inputs.input_values
    # Not every processor returns a mask; forward it only when it is present.
    attention_mask = getattr(inputs, "attention_mask", None)

    # Run on whatever device the model lives on, so a model moved to GPU works too.
    device = getattr(model, "device", None)
    if device is not None:
        input_values = input_values.to(device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

    with torch.no_grad():
        if attention_mask is None:
            logits = model(input_values).logits
        else:
            logits = model(input_values, attention_mask=attention_mask).logits

    # Greedy decoding: take the highest-scoring token id along the last dimension.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

In [None]:
num_samples_to_test = 5
SAMPLE_SEED = 42  # fixed seed so a fresh "Restart & Run All" shows the same examples

# A dedicated RNG keeps the selection reproducible without touching the global
# `random` state. The count is clamped so a dataset with fewer rows than
# num_samples_to_test does not make random.sample raise ValueError.
sample_rng = random.Random(SAMPLE_SEED)
random_indices = sample_rng.sample(range(len(ds)), min(num_samples_to_test, len(ds)))

# (printed label, model, processor) for every system under comparison.
# The labels keep their original padding so the printed output is unchanged.
asr_systems = [
    ("Milamou:    ", milamou_model, milamou_processor),
    ("Hellenic:  ", hellenic_model, hellenic_processor),
    ("Jonatasgrosman: ", base_model, base_processor),
]

for i, idx in enumerate(random_indices):
    sample = ds[idx]
    # Raw waveform; transcribe_audio hands it to the processor as 16 kHz audio.
    audio_data = sample['audio']['array']
    true_text = sample['text']

    print(f"\n=== Пример {i+1} (Индекс в датасете: {idx}) ===")
    print(f"Оригинальный текст (Истина): \"{true_text}\"")
    # Same clip through every model, printed in a fixed order for side-by-side reading.
    for asr_label, asr_model, asr_processor in asr_systems:
        hypothesis = transcribe_audio(asr_model, asr_processor, audio_data)
        print(f"{asr_label}\"{hypothesis}\"")