In [None]:
import logging
import torch
import warnings
import pandas as pd

# Global logging / warning configuration for the notebook.
# INFO is the default level; the chatty third-party loggers below are raised
# to ERROR so that their progress messages do not flood the cell output.
logging.basicConfig(level=logging.INFO)
# Blanket suppression of Python warnings (this also hides deprecation notices).
warnings.filterwarnings('ignore')
logging.getLogger("pyngrok").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Seed PyTorch's default RNG so that runs are repeatable.
torch.manual_seed(42)

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the environment so the run can be reproduced / debugged later.
print(f'Using device: {device}')
print(f"PyTorch version: {torch.__version__}")
if device.type == "cuda":
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
from datasets import load_dataset

# Load the "Elormiden/Thesaurus-Cypriot-Greek-Dialect" dataset.
# prepare_cyprus_words() below reads its 'train' and 'validation' splits.
dictionary_cyprus = load_dataset("Elormiden/Thesaurus-Cypriot-Greek-Dialect")

In [None]:
def prepare_cyprus_words(dataset_dict):
    """Turn the thesaurus splits into (input_text, target_text) pairs.

    Args:
        dataset_dict: Mapping with 'train' and 'validation' entries; each
            entry must provide ``map`` and ``remove_columns`` and have the
            columns 'word', 'description', 'greek_word' and
            'greek_description'.

    Returns:
        Tuple ``(train_dataset, val_dataset)`` in which 'word' has become
        'input_text', 'greek_word' has become 'target_text', and the four
        original columns have been removed.
    """
    source_columns = ('word', 'description', 'greek_word', 'greek_description')

    def format_example(example):
        # Keep only the word pair; the descriptions are not used.
        return {
            'input_text': example['word'],
            'target_text': example['greek_word'],
        }

    # Map both splits first, then strip the source columns from each result.
    mapped_splits = [
        dataset_dict[split_name].map(format_example)
        for split_name in ('train', 'validation')
    ]
    train_dataset, val_dataset = (
        mapped.remove_columns(list(source_columns)) for mapped in mapped_splits
    )

    return train_dataset, val_dataset

In [None]:
# Build the (input_text, target_text) train / validation splits from the thesaurus.
train_text, val_text = prepare_cyprus_words(dictionary_cyprus)

In [None]:
# def tokenize_text_pairs(batch):
#    input_texts = [text.upper() for text in tqdm(batch['input_text'], desc="Collecting input")]
#    target_texts = [text.upper() for text in tqdm(batch['target_text'], desc="Collecting target")]

#    def tokenize_by_char(text):
#       chars = list(text)
#       ids = [processor.tokenizer(c)['input_ids'][0] for c in chars]
#       return ids

#    tokenized_input_texts = [tokenize_by_char(text) for text in tqdm(input_texts, desc="Tokenized input")]
#    tokenized_target_texts = [tokenize_by_char(text) for text in tqdm(target_texts, desc="Tokenized target")]

#   #  inputs = processor.tokenizer(
#   #       tokenized_input_texts,
#   #       padding=True,
#   #       truncation=True,
#   #       max_length=128,
#   #         # return_tensors="pt" # -> produces nested lists
#   #   )

#   #  labels = processor.tokenizer(
#   #       tokenized_target_texts,
#   #       padding=True,
#   #       truncation=True,
#   #        max_length=128,
#   #        # return_tensors="pt" # -> produces nested lists
#   #   )

#   #  labels_ids = labels["input_ids"]
#   #  labels_ids = torch.tensor(labels_ids)  # convert to a tensor
#   #  labels_ids[labels_ids == 0] = -100     # THIS is the way that works!

#    return {
#       **inputs,
#       "labels": labels_ids
#    }

# TODO: how should the middle part be laid out now?

In [None]:
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

def tokenize_by_char(text):
    """Encode ``text`` one character at a time.

    The text is upper-cased, each resulting character is passed on its own to
    the global ``processor.tokenizer``, and the first id of every encoding is
    kept.

    Args:
        text: String to encode.

    Returns:
        List with one token id per character of ``text.upper()``.
    """
    token_ids = []
    for symbol in text.upper():
        encoded = processor.tokenizer(symbol)
        # Only the first id of the per-character encoding is used.
        token_ids.append(encoded['input_ids'][0])
    return token_ids

def tokenize_text_pairs(batch):
    """Tokenize paired texts character by character and pad them into tensors.

    Padding positions are derived from the true sequence lengths rather than
    from the sentinel value 0, so a genuine token id 0 is neither dropped from
    the attention mask nor overwritten in the labels.

    Args:
        batch: Mapping with equally long 'input_text' and 'target_text'
            sequences of strings.

    Returns:
        Dict of ``torch.long`` tensors:
            * "input_ids": (n, max_input_len), padded with 0.
            * "attention_mask": (n, max_input_len), 1 on real tokens and 0 on
              padding.
            * "labels": (n, max_target_len), padded with -100.
        For an empty batch all three tensors have shape (0, 0).
    """
    input_texts = batch['input_text']
    target_texts = batch['target_text']

    input_ids_tensors = []
    labels_tensors = []
    for input_text, target_text in tqdm(zip(input_texts, target_texts),
                                        total=len(input_texts), desc="Tokenizing batch"):
        input_ids_tensors.append(torch.tensor(tokenize_by_char(input_text), dtype=torch.long))
        labels_tensors.append(torch.tensor(tokenize_by_char(target_text), dtype=torch.long))

    # pad_sequence cannot handle an empty list; return empty tensors instead.
    if not input_ids_tensors:
        return {
            "input_ids": torch.zeros((0, 0), dtype=torch.long),
            "attention_mask": torch.zeros((0, 0), dtype=torch.long),
            "labels": torch.zeros((0, 0), dtype=torch.long),
        }

    input_ids_padded = pad_sequence(input_ids_tensors, batch_first=True, padding_value=0)
    # Pad the labels with the ignore index directly instead of rewriting zeros afterwards.
    labels_padded = pad_sequence(labels_tensors, batch_first=True, padding_value=-100)

    # Position j of row i is a real token iff j < len(sequence i).
    lengths = torch.tensor([ids.numel() for ids in input_ids_tensors], dtype=torch.long)
    positions = torch.arange(input_ids_padded.size(1), dtype=torch.long)
    attention_mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

    return {
        "input_ids": input_ids_padded,
        "attention_mask": attention_mask,
        "labels": labels_padded
    }

In [None]:
# Tokenize both splits into dicts of padded tensors
# ("input_ids", "attention_mask", "labels").
# NOTE(review): tokenize_by_char reads the global `processor`, which this
# notebook only creates in a later cell; that cell has to be run first.
train_cyprus_tokenized = tokenize_text_pairs(train_text)
val_cyprus_tokenized = tokenize_text_pairs(val_text)

In [None]:
# Spot-check: show the padded label row of training example 43.
train_cyprus_tokenized['labels'][43]

In [None]:
# Freeze the wav2vec2 backbone ...
for param in model.wav2vec2.parameters():
    param.requires_grad_(False)

# ... and keep the lm_head trainable.
for param in model.lm_head.parameters():
    param.requires_grad_(True)

# Tally how many parameter elements will actually receive gradients.
trainable_params = 0
total_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
print(f"Trainable: {trainable_params:,} / Total: {total_params:,}")

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for the text-mapping run: 1000 optimizer steps,
# evaluation every 100 steps, a checkpoint every 200 steps (at most 2 kept),
# no external experiment tracker.
text_training_args = TrainingArguments(
    output_dir='./wav2vec2-cypriot-text-pretrain',
    run_name="cypriot-greek-text-mapping",
    overwrite_output_dir=True,
    max_steps=1000,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    save_steps=200,
    eval_steps=100,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    eval_strategy="steps",
    save_total_limit=2,
    # Mixed precision needs a GPU; follow the notebook's CPU fallback instead
    # of failing on machines without CUDA.
    fp16=torch.cuda.is_available(),
    report_to=[],
)

In [None]:
# `train_cyprus_tokenized` is a dict of batched tensors (see tokenize_text_pairs),
# so it cannot be indexed with 0; take row 0 of every field instead.
sample = {key: value[0] for key, value in train_cyprus_tokenized.items()}
print(f"Sample keys: {sample.keys()}")
for key, value in sample.items():
    print(f"{key}: type={type(value)}, shape={value.shape if hasattr(value, 'shape') else 'no shape'}")

In [None]:
# `Dataset` is only imported in a later cell of this notebook; import it here
# so that this cell also works on a fresh kernel.
from datasets import Dataset

# Column dicts for the tokenized text pairs (train split).
train_data_dict = {
    'input_ids': train_cyprus_tokenized['input_ids'],
    'attention_mask': train_cyprus_tokenized['attention_mask'],
    'labels': train_cyprus_tokenized['labels']
}

# Column dicts for the tokenized text pairs (validation split).
val_data_dict = {
    'input_ids': val_cyprus_tokenized['input_ids'],
    'attention_mask': val_cyprus_tokenized['attention_mask'],
    'labels': val_cyprus_tokenized['labels']
}

# Wrap the columns in Dataset objects for use with the Trainer.
text_train_hf = Dataset.from_dict(train_data_dict)
text_val_hf = Dataset.from_dict(val_data_dict)

In [None]:
# Display the summary repr of the text training dataset.
text_train_hf

In [None]:
# Trainer for the text-mapping run.
# The Dataset objects built from the tokenized tensors are passed here; the
# raw `train_cyprus_tokenized` / `val_cyprus_tokenized` dicts only contain
# three string keys and cannot be indexed per example.
# NOTE(review): `data_collator` is a DataCollatorCTCWithPadding, whose __call__
# reads feature["input_values"], while these datasets only carry
# input_ids / attention_mask / labels — confirm that this collator (and the
# audio model) is really what should be used for text-only batches.
text_trainer = Trainer(
    model=model,
    args=text_training_args,
    train_dataset=text_train_hf,
    eval_dataset=text_val_hf,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
)

In [None]:
# Run the text-mapping training loop configured by text_training_args.
text_trainer.train()

In [None]:
from transformers import AutoProcessor, AutoModelForCTC, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa
import numpy as np
from tqdm import tqdm
import copy
import re
from datasets import load_dataset, Audio

In [None]:
# Load the "el" configuration of mozilla-foundation/common_voice_17_0.
# The 'train' and 'validation' splits are used by the resampling cells below.
ds = load_dataset("mozilla-foundation/common_voice_17_0", "el")


In [None]:
# Checkpoint from which both the processor and the CTC model are loaded.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-greek"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

In [None]:
# Sanity check: object types, parameter dtype, and whether the model's
# vocabulary size matches the tokenizer's vocabulary.
print(
    f"Модель: {type(model)}",
    f"Процессор: {type(processor)}",
    f"Параметры модели: {next(model.parameters()).dtype}",
    f"Model vocab size: {model.config.vocab_size}",
    f"Processor vocab: {len(processor.tokenizer.get_vocab())}",
    sep="\n",
)

In [None]:
# Peek at the first training example: available fields, audio container type
# and the transcription.
sample = ds['train'][0]
print(
    f"Sample keys: {sample.keys()}",
    f"Audio type: {type(sample['audio'])}",
    f"Sentence: {sample['sentence']}",
    sep="\n",
)

In [None]:
def sampling_map(array, target_sr=16000):
    """Return a copy of one dataset example with its audio resampled.

    The caller's example is left untouched: the result is a shallow copy
    whose 'audio' entry is replaced by a new dict.

    Args:
        array: Example mapping with an 'audio' entry that provides 'path',
            'array' and 'sampling_rate'. Meant to be applied manually to each
            item of ``ds['train']`` / ``ds['validation']``.
        target_sr: Sampling rate to resample to (default 16000).

    Returns:
        New dict with the same keys as ``array``; 'audio' holds the original
        'path', the resampled 'array' and ``target_sr`` as 'sampling_rate'.
    """
    audio = array['audio']
    resampled = librosa.resample(
        audio['array'], orig_sr=audio['sampling_rate'], target_sr=target_sr
    )

    # Shallow copy so that the input example is not mutated.
    result = dict(array)
    result['audio'] = {
        'path': audio['path'],
        'array': resampled,
        'sampling_rate': target_sr,
    }
    return result

In [None]:
# Resample every train / validation example with sampling_map and collect the
# results in plain Python lists (each split is fully materialised in memory).
reforged_train = [sampling_map(sample) for sample in tqdm(ds['train'], desc="Resampling")]
reforged_eval = [sampling_map(sample) for sample in tqdm(ds['validation'], desc="Resampling")]

In [None]:
# Show the first ten resampled training examples.
reforged_train[:10]

In [None]:
def process_reforged_list_wac2vec2(rf_ds, max_audio_samples=16000, max_label_length=512):
    """Convert resampled examples into model inputs and CTC labels.

    Uses the global ``processor`` for the audio and ``processor.tokenizer``
    for the transcriptions. Label padding is located through the tokenizer's
    attention mask (the same rule the data collator in this notebook uses)
    instead of by comparing ids with 0.

    Args:
        rf_ds: Iterable of examples exposing ``sample["audio"]["array"]`` and
            ``sample["sentence"]``.
        max_audio_samples: ``max_length`` passed to the processor; longer
            audio is truncated. Defaults to 16000.
            NOTE(review): with sampling_rate=16000 this presumably keeps only
            the first second of audio while the full sentence stays in the
            labels — confirm that this is intended.
        max_label_length: Length every label row is padded / truncated to.
            Defaults to 512.

    Returns:
        Dict with everything the processor returned plus "labels", a tensor
        of token ids in which padding positions are set to -100.
    """
    audio_arrays = []
    sentences = []
    for sample in rf_ds:
        audio_arrays.append(sample["audio"]["array"])
        sentences.append(sample["sentence"])

    inputs = processor(
        audio_arrays,
        sampling_rate=16000,
        padding=True,
        max_length=max_audio_samples,
        truncation=True
    )

    labels = processor.tokenizer(
        sentences,
        padding='max_length',
        max_length=max_label_length,
        truncation=True,
        return_attention_mask=True
    )

    # Convert to tensors, then replace padding with -100 so that it is ignored
    # by the loss.
    labels_ids = torch.tensor(labels["input_ids"])
    labels_mask = torch.tensor(labels["attention_mask"])
    labels_ids = labels_ids.masked_fill(labels_mask.ne(1), -100)

    return {
        **inputs,
        "labels": labels_ids
    }

In [None]:
# Turn the resampled example lists into model inputs plus -100-masked labels.
processed_data_train = process_reforged_list_wac2vec2(reforged_train)
processed_data_eval = process_reforged_list_wac2vec2(reforged_eval)

In [None]:
# Show only the field names; printing the dicts themselves would dump every
# processed audio array and label row into the cell output.
print(processed_data_train.keys())
print(processed_data_eval.keys())

In [None]:
# Spot-check: label row of the first evaluation example (padding shows as -100).
processed_data_eval['labels'][0]

In [None]:
from datasets import Dataset
# Wrap the processed dicts in Dataset objects for use with the Trainer.
train_hf = Dataset.from_dict(processed_data_train)
eval_hf = Dataset.from_dict(processed_data_eval)

In [None]:
# Print size, column names and feature schema of both datasets.
for banner, dataset in (
    ("=== TRAIN DATASET ===", train_hf),
    ("\n=== EVAL DATASET ===", eval_hf),
):
    print(banner)
    print(f"Размер: {len(dataset)}")
    print(f"Колонки: {dataset.column_names}")
    print(f"Features: {dataset.features}")

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and the labels of a batch.

    Inputs and labels are padded separately because they have different
    lengths and use different padding methods; label padding is then replaced
    with -100 so that it is ignored by the loss.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """

    # Quoted annotation: documents the expected type without requiring the
    # name to be resolvable when the class is defined.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad one batch.

        Args:
            features: One dict per example with the keys "input_values" and
                "labels".

        Returns:
            The padded input batch returned by ``processor.pad`` with an
            extra "labels" entry whose padding positions are -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly; this is what the
        # deprecated `as_target_processor()` context manager delegated to.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch


In [None]:
# Collator instance passed to the Trainer; padding=True is forwarded to the pad calls.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Sanity check of the first evaluation label row.
print("Проверка labels:")
sample_labels = processed_data_eval['labels'][0]
print(f"Labels: {sample_labels}")
# Convert to plain ints first: iterating a tensor yields 0-d tensors that hash
# by identity, so set(sample_labels) would keep every element as "unique".
print(f"Уникальные: {set(sample_labels.tolist())}")
print(f"Есть ли -100: {-100 in sample_labels}")
print(f"Все ли -100: {all(x == -100 for x in sample_labels)}")

In [None]:
from transformers import TrainingArguments, Trainer

# Short audio fine-tuning run: 100 steps, loss logged every 10 steps and
# evaluation every 10 steps, a checkpoint every 50 steps (only the last kept).
training_args = TrainingArguments(
      output_dir="./new",
      overwrite_output_dir=True,
      max_steps=100,
      per_device_train_batch_size=10, # number of examples processed at once -> averaging -> better accuracy
      save_steps=50,
      save_total_limit=1,
      prediction_loss_only=True,
      # Mixed precision needs a GPU; follow the notebook's CPU fallback instead
      # of failing on machines without CUDA.
      fp16=torch.cuda.is_available(),
      learning_rate=5e-6,
      ######################
      logging_steps=10,    # <- training losses
      ######################
      eval_strategy="steps",
      eval_steps=10, # <- validation losses
      ######################
    )

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=train_hf,        # the whole dataset
   eval_dataset=eval_hf,
   data_collator=data_collator,
   tokenizer=processor.feature_extractor,
)

In [None]:
# Run the audio fine-tuning loop configured by training_args.
trainer.train()