In [None]:
# !pip install ipywidgets # Hugging Face widgets
# !pip install --upgrade timm # timm error gpu gemma 3n
# !pip install torchcodec
# !pip install librosa soundfile

# # audio errors
# !sudo apt update
# !sudo apt install -y ffmpeg
# !pip install --upgrade huggingface_hub

# HF errors fix
# !pip install datasets==3.6.0
# !pip index versions datasets
# !pip index versions numpy
# !pip install huggingface-hub==0.20.0

In [None]:
import os

from huggingface_hub import login

# Never hardcode the access token in the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset or empty, pass None so that
# `login` prompts for the token instead of submitting an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
import numpy
# Echo the installed NumPy version as the cell output.
numpy.__version__

In [None]:
# !pip install mlflow
# !pip install pyngrok

In [None]:
import logging
import torch
import warnings

# Root logger reports INFO and above; Python warnings are suppressed once
# (a single filter registration is enough).
logging.basicConfig(level=logging.INFO)
warnings.filterwarnings('ignore')

# Only let errors through from these third-party loggers.
logging.getLogger("pyngrok").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)

# Logger used by the notebook itself.
logger = logging.getLogger(__name__)

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using device: {device}')
print(f"PyTorch version: {torch.__version__}")
# The device type is "cuda" exactly when CUDA was reported as available above.
if device.type == "cuda":
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
# !pip install --upgrade datasets
from transformers import AutoProcessor, AutoModelForCTC, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa
import numpy as np
import re

In [None]:
from datasets import load_dataset

# Load the "el" configuration of Common Voice 17.0; no split is requested,
# so the result is indexed by split name further down (ds['train'], ds['validation']).
ds = load_dataset(path="mozilla-foundation/common_voice_17_0", name="el")

In [None]:
# Echo the loaded dataset object as the cell output.
ds

In [None]:
def audio_type_tester(dataset):
    """Print diagnostic information about the audio object of the first training sample.

    Reads ``dataset['train'][0]['audio']`` and prints:
      * the list of its public attribute names (those not starting with ``_``),
      * the type and value of its ``path`` attribute, when present,
      * the type and value of each of ``file``, ``filename``, ``source`` and
        ``metadata`` that exists on the object.

    Returns nothing; the function is used purely for inspection.
    """
    audio_obj = dataset['train'][0]['audio']

    print("Attributes of AudioDecoder:")
    public_names = list(filter(lambda name: not name.startswith('_'), dir(audio_obj)))
    print(public_names)

    if hasattr(audio_obj, 'path'):
        print(f"Path type: {type(audio_obj.path)}")
        print(f"Path content: {audio_obj.path}")

    # Probe a few attribute names that may or may not exist on the object.
    for field in ('file', 'filename', 'source', 'metadata'):
        if not hasattr(audio_obj, field):
            continue
        field_value = getattr(audio_obj, field)
        print(f"{field}: {type(field_value)} = {field_value}")

In [None]:
# Inspect the audio object of the first training sample.
audio_type_tester(ds)

In [None]:
# Load the pretrained CTC checkpoint together with its matching processor
from transformers import AutoProcessor, AutoModelForCTC

checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-greek"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForCTC.from_pretrained(checkpoint)
# `.eval()` switches the module to evaluation mode and returns the module itself.
model = model.eval()

In [None]:
# Compare the vocabulary size in the model config with the tokenizer's vocabulary size.
print(f"Model vocab size: {model.config.vocab_size}")
print(f"Processor vocab: {len(processor.tokenizer.get_vocab())}")

In [None]:
# Peek at the first training example: its keys, the type of its audio entry and its sentence.
sample = ds['train'][0]
print(f"Sample keys: {sample.keys()}")
print(f"Audio type: {type(sample['audio'])}")
print(f"Sentence: {sample['sentence']}")

In [None]:
# sample = ds['train'][3]
# sr = sample['audio']['sampling_rate'] # -> sampling rate
# tr = 16000

# saved_example = sample
# print(f"Before: {sample['audio']}")
# resample_sample = librosa.resample(sample['audio']['array'], orig_sr=sr, target_sr=tr)
# saved_example['audio'] = {
#     'path': sample['audio']['path'],
#     'array': resample_sample,
#     'sampling_rate': tr
# }
# print(f"After: {saved_example['audio']}")

# Now we will resample the whole dataset to 16k sampling rate
def sampling_map(array, target_sr=16000):  # <- one sample of ds['train'] goes here
    """Return a copy of a dataset sample whose audio is resampled to ``target_sr``.

    Args:
        array: A single sample mapping. Its ``'audio'`` entry is read as a
            mapping with ``'path'``, ``'array'`` and ``'sampling_rate'`` keys.
        target_sr: Sampling rate, in Hz, of the returned audio (default 16000).

    Returns:
        A new dict with the same entries as ``array`` except ``'audio'``, which
        is replaced by a fresh dict holding the original path, the (possibly
        resampled) waveform and ``target_sr``. The input sample is not modified.
    """
    audio = array['audio']
    source_sr = audio['sampling_rate']
    waveform = audio['array']

    # Resample only when the rates actually differ; a clip already at the
    # target rate is passed through unchanged.
    if source_sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=source_sr, target_sr=target_sr)

    # Shallow-copy the sample so the caller's object keeps its original audio entry.
    resampled_sample = dict(array)
    resampled_sample['audio'] = {
        'path': audio['path'],
        'array': waveform,
        'sampling_rate': target_sr
    }
    return resampled_sample

In [None]:
from tqdm import tqdm
import copy

In [None]:
# Resample every sample of the train and validation splits, materialising each split as a list.
reforged_train = list(map(sampling_map, tqdm(ds['train'], desc="Resampling")))
reforged_eval = list(map(sampling_map, tqdm(ds['validation'], desc="Resampling")))

In [None]:
# Move the model to the selected device.
model = model.to(device)

In [None]:
# The data collator is needed to solve the list/numpy problem: it is built on
# the processor's feature extractor and asked to return PyTorch tensors ("pt").
data_collator = DataCollatorWithPadding(
    tokenizer=processor.feature_extractor,
    return_tensors="pt"
)

In [None]:
# Echo the collator object as the cell output.
data_collator

In [None]:
def process_reforged_list(sample_list, sampling_rate=16000, max_audio_samples=16000, max_label_length=512):
    """Convert resampled samples into model inputs and CTC labels.

    Args:
        sample_list: Sequence of samples; ``sample["audio"]["array"]`` and
            ``sample["sentence"]`` are read from each one.
        sampling_rate: Sampling rate, in Hz, announced to the processor.
        max_audio_samples: Maximum number of audio samples kept per clip
            (longer clips are truncated).
        max_label_length: Fixed length every label sequence is padded or
            truncated to.

    Returns:
        A dict with:
          * ``"input_values"``: the processor's audio features, and
          * ``"labels"``: one list of token ids per sentence, where every
            padding position holds ``-100`` instead of the pad token id.
    """
    audio_arrays = [sample["audio"]["array"] for sample in sample_list]
    sentences = [sample["sentence"] for sample in sample_list]

    # NOTE(review): with the defaults (16000 samples at 16000 Hz) only the first
    # second of every clip is kept while the labels still cover the whole
    # sentence — confirm that this truncation is intended.
    inputs = processor(
        audio_arrays,
        sampling_rate=sampling_rate,
        padding=True,
        max_length=max_audio_samples,
        truncation=True
    )

    encoded = processor.tokenizer(
        sentences,
        padding='max_length',
        max_length=max_label_length,
        truncation=True
    )

    # Replace padding ids token by token. The comparison has to happen on the
    # individual ids: comparing the tokenizer's output object as a whole with
    # the pad id does not touch any label.
    pad_id = processor.tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded["input_ids"]
    ]

    return {
        "input_values": inputs["input_values"],
        "labels": labels
    }

In [None]:
processed_data_train = process_reforged_list(reforged_train)
# Evaluation features are built from the resampled validation samples, not the training ones.
processed_data_eval = process_reforged_list(reforged_eval)

In [None]:
# Show which feature keys each processed dict contains.
print(processed_data_train.keys())
print(processed_data_eval.keys())

In [None]:
from datasets import Dataset
# Wrap the processed feature dicts in `datasets.Dataset` objects.
train_hf = Dataset.from_dict(processed_data_train)
eval_hf = Dataset.from_dict(processed_data_eval)

In [None]:
# Summarise both processed datasets: number of rows, column names and feature schema.
for header, split_ds in (
    ("=== TRAIN DATASET ===", train_hf),
    ("\n=== EVAL DATASET ===", eval_hf),
):
    print(header)
    print(f"Размер: {len(split_ds)}")
    print(f"Колонки: {split_ds.column_names}")
    print(f"Features: {split_ds.features}")

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: a short run of 500 optimisation steps, a checkpoint
# every 50 steps (only the latest one is kept), loss logged every 10 steps and
# no external experiment tracker (`report_to=[]`).
training_args = TrainingArguments(
    output_dir='/kaggle/working/wav2vec2-finetuned_mozilla',
    run_name="wav2vec2-greek-asr",  # ← unique run name
    overwrite_output_dir=True,
    max_steps=500,
    per_device_train_batch_size=4,
    save_steps=50,
    save_total_limit=1,
    prediction_loss_only=True,
    # Mixed precision is requested only when CUDA is available, so the
    # notebook's CPU fallback does not fail while building the arguments.
    fp16=torch.cuda.is_available(),
    learning_rate=5e-6,
    logging_steps=10,
    # eval_strategy="steps",
    # eval_steps=50,
    disable_tqdm=False,
    report_to=[],
)

# NOTE(review): newer transformers releases rename Trainer's `tokenizer`
# argument to `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    # eval_dataset=eval_hf,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)

In [None]:
# Take a single example and convert it to the format the model expects
sample = train_hf[0]
print(f"Sample keys: {sample.keys()}")

# Convert to tensors, add a batch dimension and place them on the model's
# device: the model was moved to `device` earlier, so inputs left on the CPU
# would not be on the same device as the weights.
import torch

inputs = {
    "input_values": torch.tensor(sample["input_values"]).unsqueeze(0).to(model.device),  # add batch dim
    "labels": torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)
}

print(f"Input shapes: {inputs['input_values'].shape}")
print(f"Label shapes: {inputs['labels'].shape}")

# Now check the loss; this is only a sanity check, so no gradients are tracked
with torch.no_grad():
    outputs = model(**inputs)
print(f"Loss из модели: {outputs.loss}")

In [None]:
# Run the training loop configured in `training_args`.
trainer.train()