In [2]:
# !pip install datasets==3.6.0
# !pip install transformers
# !pip install tf-keras
# !pip install torch
# !pip install transformers[torch]
# !pip install wandb
# !pip install evaluate
# !pip install librosa
# !pip install jiwer
# !pip install numpy==1.26.4

In [2]:
import logging
import torch
import warnings
import gc
import os
import evaluate
import numpy as np
import librosa
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union, Optional
from tqdm import tqdm
from huggingface_hub import login
from datasets import load_dataset, Dataset, Audio, disable_caching
from transformers import (
    AutoProcessor,
    AutoModelForCTC,
    Wav2Vec2Processor,
    TrainingArguments,
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm
2025-08-05 13:57:28.137646: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754402248.167548   72301 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754402248.177192   72301 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754402248.204357   72301 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754402248.204395   72301 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754402248.204398   72301

In [23]:
login("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")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(preds):
    pred_logits = preds.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred_ids[(pred_logits == -100).all(axis=-1)] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(preds.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

Downloading builder script: 6.61kB [00:00, 8.33MB/s]


In [4]:
logging.basicConfig(level=logging.INFO)
warnings.filterwarnings('ignore')
logging.getLogger("pyngrok").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

Using device: cuda
PyTorch version: 2.7.0
CUDA device: NVIDIA A100-SXM4-40GB


In [9]:
from datasets import load_dataset, Dataset, Audio

# ====================================================================================
# Data Loading and Resampling
# ====================================================================================

print("Loading dataset...")
ds = load_dataset("Elormiden/RIK_Cypriot_News_Dataset")
print("Dataset loaded successfully.")
print(ds)

# We will use all splits for a complete workflow
train_ds = ds['train']
eval_ds = ds['validation']
test_ds = ds['test']

Loading dataset...


Downloading data: 100%|██████████| 16/16 [02:59<00:00, 11.21s/files]
Generating train split: 100%|██████████| 34065/34065 [00:09<00:00, 3541.01 examples/s]
Generating validation split: 100%|██████████| 4255/4255 [00:01<00:00, 3536.61 examples/s]
Generating test split: 100%|██████████| 4279/4279 [00:01<00:00, 3535.63 examples/s]


Dataset loaded successfully.
DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 34065
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 4255
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 4279
    })
})


In [6]:
# ====================================================================================
# Model and Processor Loading
# ====================================================================================
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("Elormiden/1.67-1.53-0.71")
model = AutoModelForCTC.from_pretrained("Elormiden/1.67-1.53-0.71")

# It's good practice to freeze the feature extractor to save memory and
# focus on training the CTC head.
model.freeze_feature_extractor()

In [7]:
model.train()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [8]:
import torch #### WAC2VEC2

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [10]:
max_text_len_train = max(len(s.lower()) for s in train_ds["text"])
max_text_len_eval = max(len(s.lower()) for s in eval_ds["text"])
print(f"Max text length in train: {max_text_len_train}")
print(f"Max text length in eval: {max_text_len_eval}")

Max text length in train: 399
Max text length in eval: 231


In [18]:
from torch.utils.data import Dataset as TorchDataset
import librosa
import torch

class StreamingASRDataset(TorchDataset):
    def __init__(self, ds, processor, max_label_length=410):
        self.ds = ds
        self.processor = processor
        self.max_label_length = max_label_length
    
    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, idx):
        sample = self.ds[idx]
        audio = sample["audio"]["array"]
        sentence = sample["text"].lower()
        
        sr = sample["audio"]["sampling_rate"]
        target_sr = 16000
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        
        inputs = self.processor(
            audio,
            sampling_rate=target_sr,
            padding=True,
            max_length=112000,  # 7 секунд
            truncation=True,
            return_tensors='pt'
        )
        
        labels = self.processor.tokenizer(
            sentence,
            padding='max_length',
            max_length=self.max_label_length,
            truncation=True,
            return_tensors='pt'
        )["input_ids"]
        
        labels[labels == 54] = -100 # empty space
        labels[labels == 53] = -100 # dots
        
        return {
            "input_values": inputs["input_values"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }

In [30]:
train_dataset = StreamingASRDataset(train_ds, processor, max_label_length=400)
eval_dataset = StreamingASRDataset(eval_ds, processor, max_label_length=240)

In [31]:
eval_dataset[0]

{'input_values': tensor([ 0.0318,  0.0737,  0.0757,  ..., -0.0925,  0.0152,  0.0092]),
 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1], dtype=torch.int32),
 'labels': tensor([  27,   47,   45,   30,   17,    5,   30,   17,   38,   42,   38,   47,
           30,   44,   28,   37,    3,   21,   35,   28,   30,   27,   47,   45,
           30,   17,    5,   51,   39,   30,   26,    5,   51,   21,   44,   51,
           17,   36,   39,   30,   27,   47,   45,   30,   17,   45,   39,   30,
            5,    1,   33,   47,   28,   43,    8,   44,   45,   39,   30,   28,
           47,   30,   36,   31,    5,   51,   28,   30,   38,   45,   47,   30,
            8,   51,   28,   34,   28,   17,   32,    8,   32,   30,   17,   32,
           28,   30,   17,   44,   17,   34,    1,   17,   32,   30,   17,   32,
           30,   38,   48,   47,   30,   43,    1,   47,   30,   44,   41,   36,
           26,   32,    8,   32,   39,   30,    8,   17,   32,   28,   30,   47,
           48,   22, 

In [21]:
gc.collect()

3342

In [22]:
#############################
# Unfreezing feature extractor during training
#############################

from transformers import TrainerCallback

class UnfreezeCallback(TrainerCallback):
    def __init__(self, unfreeze_epoch=4):
        self.unfreeze_epoch = unfreeze_epoch
        self.unfrozen = False
    
    def on_epoch_begin(self, args, state, control, model=None, **kwargs):
        if state.epoch >= self.unfreeze_epoch and not self.unfrozen:
            print(f"Unfreezing feature extractor at epoch {state.epoch}")
            for param in model.wav2vec2.feature_extractor.parameters():
                param.requires_grad = True
                
            optimizer = kwargs.get('optimizer')
            if optimizer:
                feature_params = list(model.wav2vec2.feature_extractor.parameters())
                other_params = [p for p in model.parameters() if p not in feature_params]
                
                optimizer.param_groups = [
                    {'params': feature_params, 'lr': 1e-6},
                    {'params': other_params, 'lr': optimizer.param_groups[0]['lr']}
                ]
            
            self.unfrozen = True

In [34]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir=f"./honey-im-back",
    num_train_epochs=4,
    
    ################# 
    per_device_train_batch_size=4,        
    per_device_eval_batch_size=8,         
    gradient_accumulation_steps=12,       
    ################
    
    learning_rate=1e-5,
    warmup_steps=1500,
    
    #################### A100 
    gradient_checkpointing=True,        
    bf16=True, # but DataLoader issues                 
    dataloader_pin_memory=True,        
    dataloader_num_workers=8,            
    #################
    
    save_steps=100,
    eval_steps=50,                      
    weight_decay=0.01,
    eval_strategy="steps",
    eval_on_start = True,
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to='wandb',
    metric_for_best_model="wer",
    greater_is_better=False,
    logging_steps=50,                    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
)

In [35]:
trainer.train()

{'eval_loss': 0.9840735197067261, 'eval_wer': 0.5807119443326241, 'eval_cer': 0.9029969073711952, 'eval_runtime': 42.4225, 'eval_samples_per_second': 100.301, 'eval_steps_per_second': 12.541, 'epoch': 0}
{'loss': 1.1133, 'grad_norm': 0.6126646995544434, 'learning_rate': 3.266666666666667e-07, 'epoch': 0.07044734061289186}
{'eval_loss': 0.984184980392456, 'eval_wer': 0.5804962907585255, 'eval_cer': 0.9029854770714048, 'eval_runtime': 44.8658, 'eval_samples_per_second': 94.838, 'eval_steps_per_second': 11.858, 'epoch': 0.07044734061289186}
{'loss': 1.1264, 'grad_norm': 0.5624136924743652, 'learning_rate': 6.6e-07, 'epoch': 0.14089468122578372}
{'eval_loss': 0.983736515045166, 'eval_wer': 0.5811144976709414, 'eval_cer': 0.9029860632406248, 'eval_runtime': 44.5761, 'eval_samples_per_second': 95.455, 'eval_steps_per_second': 11.935, 'epoch': 0.14089468122578372}
{'loss': 1.1126, 'grad_norm': 0.6393551230430603, 'learning_rate': 9.933333333333333e-07, 'epoch': 0.2113420218386756}
{'eval_loss

TrainOutput(global_step=2840, training_loss=1.0909068322517503, metrics={'train_runtime': 7704.3486, 'train_samples_per_second': 17.686, 'train_steps_per_second': 0.369, 'train_loss': 1.0909068322517503, 'epoch': 4.0})

In [36]:
trainer.save_model("./baby-we-are-so-back")
processor.save_pretrained("./baby-we-are-so-back")

[]

In [37]:
from transformers import AutoProcessor, AutoModelForCTC

trained_processor = AutoProcessor.from_pretrained("./baby-we-are-so-back")
trained_model = AutoModelForCTC.from_pretrained("./baby-we-are-so-back")

In [38]:
trained_model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [41]:
trained_model.push_to_hub("Elormiden/1.05-0.9-0.55-full")
trained_processor.push_to_hub("Elormiden/1.05-0.9-0.55-full")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
New Data Upload                         : |          |  0.00B /  0.00B            [A

  /tmp/tmpeh8wbato/model.safetensors    :   0%|          |  549kB / 1.26GB            [A[A

Processing Files (0 / 1)                :   0%|          |  549kB / 1.26GB,  274kB/s  [A[A
New Data Upload                         :   0%|          |  549kB /  134MB,  274kB/s  [A

  /tmp/tmpeh8wbato/model.safetensors    :   0%|          |  549kB / 1.26GB            [A[A

  /tmp/tmpeh8wbato/model.safetensors    :   0%|          |  549kB / 1.26GB            [A[A

Processing Files (0 / 1)                :   0%|          | 1.10MB / 1.26GB,  422kB/s  [A[A
New Data Upload                         :   1%|          | 1.10MB /  201MB,  422kB/s  [A

Processing Files (0 / 1)                :   0%|          | 3.29MB / 1.26GB, 1.18MB/s  [A[A
New Data Upload                         :   2%|▏         | 3.29MB /  201MB, 1.18MB/s  

CommitInfo(commit_url='https://huggingface.co/Elormiden/1.05-0.9-0.55-full/commit/5747d284ed7d3459d45a205aa2f4e87642767745', commit_message='Upload processor', commit_description='', oid='5747d284ed7d3459d45a205aa2f4e87642767745', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Elormiden/1.05-0.9-0.55-full', endpoint='https://huggingface.co', repo_type='model', repo_id='Elormiden/1.05-0.9-0.55-full'), pr_revision=None, pr_num=None)

In [39]:
import shutil

shutil.make_archive('dataset_one_and_second_wav2vec2', 'zip', './baby-we-are-so-back/')

'/home/ubuntu/dataset_one_and_second_wav2vec2.zip'

In [42]:
trained_model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [43]:
def evaluate_model(model, processor, test_dataset, batch_size=8):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    all_predictions = []
    all_references = []
    
    with torch.no_grad():
        for i in range(0, len(test_dataset), batch_size):
            batch = test_dataset[i:i+batch_size]
            
            audio_arrays = [sample["array"] for sample in batch["audio"]]
            inputs = processor(
                audio_arrays, 
                sampling_rate=16000, 
                return_tensors="pt", 
                padding=True
            )
            
            # Инференс
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits

            logits[:, :, 53] = -float('inf')
            
            # Декодируем предсказания
            predicted_ids = torch.argmax(logits, dim=-1)
            predictions = processor.batch_decode(predicted_ids)
            
            # Собираем результаты
            all_predictions.extend(predictions)
            all_references.extend(batch["text"])
            
            if i % (batch_size * 10) == 0:
                print(f"Processed {i}/{len(test_dataset)} samples")
    
    # Вычисляем WER
    wer = wer_metric.compute(predictions=all_predictions, references=all_references)
    
    return wer, all_predictions, all_references

# Запускаем тестирование
print("Starting evaluation on test set...")
wer_score, predictions, references = evaluate_model(trained_model, trained_processor, test_ds)

print(f"\nTest WER: {wer_score:.4f}")
print(f"Test samples: {len(test_ds)}")

# Покажем несколько примеров
print("\nSample predictions:")
for i in range(5):
    print(f"Reference: {references[i]}")
    print(f"Predicted: {predictions[i]}")
    print("-" * 50)

Starting evaluation on test set...
Processed 0/4279 samples
Processed 80/4279 samples
Processed 160/4279 samples
Processed 240/4279 samples
Processed 320/4279 samples
Processed 400/4279 samples
Processed 480/4279 samples
Processed 560/4279 samples
Processed 640/4279 samples
Processed 720/4279 samples
Processed 800/4279 samples
Processed 880/4279 samples
Processed 960/4279 samples
Processed 1040/4279 samples
Processed 1120/4279 samples
Processed 1200/4279 samples
Processed 1280/4279 samples
Processed 1360/4279 samples
Processed 1440/4279 samples
Processed 1520/4279 samples
Processed 1600/4279 samples
Processed 1680/4279 samples
Processed 1760/4279 samples
Processed 1840/4279 samples
Processed 1920/4279 samples
Processed 2000/4279 samples
Processed 2080/4279 samples
Processed 2160/4279 samples
Processed 2240/4279 samples
Processed 2320/4279 samples
Processed 2400/4279 samples
Processed 2480/4279 samples
Processed 2560/4279 samples
Processed 2640/4279 samples
Processed 2720/4279 samples
P