In [1]:
# !pip install datasets==3.6.0
# !pip install transformers
# !pip install tf-keras
# !pip install torch
# !pip install transformers[torch]
# !pip install wandb
# !pip install evaluate
# !pip install librosa
# !pip install jiwer
# !pip install numpy==1.26.4

from huggingface_hub import login
login("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")

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import logging
import torch
import warnings

logging.basicConfig(level=logging.INFO)
warnings.filterwarnings('ignore')
logging.getLogger("pyngrok").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

In [3]:
import evaluate


wer_metric = evaluate.load("wer")


def compute_metrics(preds):
    pred_logits = preds.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Pred ids get padded by -100
    pred_ids[(pred_logits == -100).all(axis=-1)] = processor.tokenizer.pad_token_id
    preds.label_ids[preds.label_ids == -100] = processor.tokenizer.pad_token_id
    
    # Group repeating tokens to get the final transcription
    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(preds.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

INFO:datasets:PyTorch version 2.7.1 available.
INFO:datasets:TensorFlow version 2.19.0 available.
2025-08-04 09:54:46.408996: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-04 09:54:46.421821: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754301286.437317   71423 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754301286.442009   71423 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754301286.453685   71423 computa

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

Using device: cuda
PyTorch version: 2.7.1+cu126
CUDA device: NVIDIA A100 80GB PCIe


In [5]:
from transformers import AutoProcessor, AutoModelForCTC, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa
import numpy as np
import re
import gc
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from tqdm import tqdm
import copy

In [6]:
from datasets import load_dataset, Dataset, Audio

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("Elormiden/RIK_Cypriot_Collection_Dataset")

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 15756
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 1770
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 2068
    })
})

In [8]:
train_ds = ds['train']
eval_ds = ds['validation']

In [9]:
def sampling_map(array):
    sr = array['audio']['sampling_rate']
    tr = 16000
    if sr != tr:  
        resample_array = librosa.resample(array['audio']['array'], orig_sr=sr, target_sr=tr)
        array['audio'] = {
            'path': array['audio']['path'],
            'array': resample_array,
            'sampling_rate': tr
        }
    return array

In [10]:
reforged_train = [sampling_map(sample) for sample in tqdm(train_ds, desc="Resampling")]
reforged_eval = [sampling_map(sample) for sample in tqdm(eval_ds, desc="Resampling")]

Resampling: 100%|██████████| 15756/15756 [00:15<00:00, 1047.71it/s]
Resampling: 100%|██████████| 1770/1770 [00:01<00:00, 1136.78it/s]


In [11]:
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("lighteternal/wav2vec2-large-xlsr-53-greek")
model = AutoModelForCTC.from_pretrained("lighteternal/wav2vec2-large-xlsr-53-greek")

In [44]:
# model.train()

In [48]:
def tokens_working_processes_wac2vec2(rf_ds): # Wac2vec2 architecture
    audio_arrays = [sample["audio"]["array"] for sample in tqdm(rf_ds)]
    sentences = [sample["text"] for sample in tqdm(rf_ds)]

    inputs = processor(
        audio_arrays,
        sampling_rate=16000,
        padding=True,
        max_length=16000,
        truncation=True
    )

    # Word registry fix, in case some models don't support lower registry letters (common issue with Jonathas greek model)
    sentences_upper = [sentence.upper() for sentence in sentences]
    sentences_lower = [sentence.lower() for sentence in sentences]
    labels = processor.tokenizer(
        sentences_lower, 
        padding='max_length',
        max_length=512,
        truncation=True
    )

    labels_ids = labels["input_ids"]
    labels_ids = torch.tensor(labels_ids)  
    labels_ids[labels_ids == 54] = -100
    
    return {
        **inputs,
        "labels": labels_ids,
        # "pronounce_labels": pronounce_labels_ids # Greek model does not recognize english tokens -> therefore we will skip pronunciation for now
    }

In [49]:
processed_data_train = tokens_working_processes_wac2vec2(reforged_train)
processed_data_eval = tokens_working_processes_wac2vec2(reforged_eval)

100%|██████████| 15756/15756 [00:00<00:00, 2811189.97it/s]
100%|██████████| 15756/15756 [00:00<00:00, 2806057.23it/s]
100%|██████████| 1770/1770 [00:00<00:00, 2584029.96it/s]
100%|██████████| 1770/1770 [00:00<00:00, 1719295.53it/s]


In [15]:
processed_data_train.keys()

dict_keys(['input_values', 'attention_mask', 'labels'])

In [50]:
# print(processed_data_train['input_values'][:5])
# print(processed_data_train['attention_mask'][0])
print(processed_data_train['labels'][4])

tensor([  47,   51,   17,   37,   30,   47,   27,    5,   13,   35,   30,   47,
          51,   17,   37,   30,   17,    5,   30,    3,    1,   34,   33,   38,
          47,   30,   33,   44,   28,   45,   27,   37,   17,   44,    1,   47,
          53,   30,   44,   28,   17,   34,   41,   44,   45,   53,   30,   22,
          44,   35,    1,   43,   30,   37,   17,   45,   30,    3,    1,    5,
          31,   35,    1,   42,    8,   47,   38,   44,   30,   21,   48,   33,
           5,   30,   28,   47,   45,   30,    8,   17,   47,   30,   38,   51,
          47,   21,   34,   30,   38,   47,   39,   53, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -1

In [51]:
train_hf = Dataset.from_dict(processed_data_train)
eval_hf = Dataset.from_dict(processed_data_eval)

In [52]:
model.freeze_feature_extractor()

In [61]:
# Проверь несколько примеров manually
sample = eval_hf[1]
true_text = processor.tokenizer.decode(sample["labels"])
print(f"True text: {true_text}")

True text: μαρία[UNK] καλό μεσημέρι[UNK] καλό μεσημέρι παναγιώτη[UNK] καλό μεσημέρι κυρίες και κύριοι[UNK] όπως είπες κι εσύ χθες βράδυ ξεκίνησε η σταδιακή παραμονή[UNK]


In [58]:
print(processor.tokenizer.get_vocab())

{'ψ': 0, 'ρ': 1, 'φ': 2, 'π': 3, 'm': 4, 'ο': 5, 't': 6, 'ϋ': 7, 'σ': 8, 'ϊ': 9, 'g': 10, '´': 11, 'e': 12, 'ύ': 13, 'ΐ': 14, '’': 15, 'a': 16, 'τ': 17, 'δ': 18, 'ζ': 19, 'r': 20, 'λ': 21, 'θ': 22, '·': 23, '»': 24, '«': 25, 'β': 26, 'κ': 27, 'ν': 28, 'n': 29, 'χ': 31, 'η': 32, 'γ': 33, 'ά': 34, 'ω': 35, 'έ': 36, 'ό': 37, 'μ': 38, 'ς': 39, 'o': 40, 'ξ': 41, 'ή': 42, 'ώ': 43, 'ε': 44, 'ι': 45, 'h': 46, 'α': 47, 'ί': 48, "'": 49, 'v': 50, 'υ': 51, '́': 52, '|': 30, '[UNK]': 53, '[PAD]': 54, '<s>': 55, '</s>': 56}


In [64]:
# Data collator автоматически обработает типы
batch = data_collator([sample])
model.eval()

with torch.no_grad():
    outputs = model(**batch)
    predicted_text = processor.batch_decode(torch.argmax(outputs.logits, dim=-1))[0]
    
print(f"Predicted: '{predicted_text}'")

RuntimeError: Input type (float) and bias type (c10::BFloat16) should be the same

In [54]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor ([`WhisperProcessor`])
            The processor used for processing the data.
        decoder_start_token_id (`int`)
            The begin-of-sentence of the decoder.
        forward_attention_mask (`bool`)
            Whether to return attention_mask.
    """

    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        model_input_name = self.processor.model_input_names[0]
        input_features = [
            {model_input_name: feature[model_input_name]} for feature in features
        ]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            input_features, return_tensors="pt"
        )

        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        
        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        
        # replace padding with -100 to ignore loss correctly
#         labels = labels_batch["input_ids"].masked_fill(
#             labels_batch.attention_mask.ne(1), -100
#         )

#         # if bos token is appended in previous tokenization step,
#         # cut bos token here as it's append later anyways
#         if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
#             labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
#     decoder_start_token_id=model.config.decoder_start_token_id,
#     forward_attention_mask=forward_attention_mask,
)

In [55]:
print("Train dataset length:", len(train_hf))
print("Eval dataset length:", len(eval_hf))
print("First sample keys:", train_hf[0].keys())

Train dataset length: 15756
Eval dataset length: 1770
First sample keys: dict_keys(['input_values', 'attention_mask', 'labels'])


In [56]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=f"./wav2vec2-metrics-final-batch32/64",
    num_train_epochs=5,
    
    ################# 
    per_device_train_batch_size=4,        
    per_device_eval_batch_size=8,         
    gradient_accumulation_steps=12,       
    ################
    
    learning_rate=1e-5,
    warmup_steps=1500,
    
    #################### A100 
    gradient_checkpointing=True,        
    bf16=True,                           
    dataloader_pin_memory=True,        
    dataloader_num_workers=8,            
    #################
    
    save_steps=100,
    eval_steps=50,                      
    weight_decay=0.01,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to='wandb',
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_steps=50,                    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=eval_hf,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
)

In [57]:
trainer.train()

Step,Training Loss,Validation Loss,Wer
50,0.6063,0.9874,0.97737
100,0.5536,0.949812,0.978121


KeyboardInterrupt: 