# Лабораторна робота 4.1
## Машинний переклад EN → UK на базі Transformer

## 1) Імпорт та перевірка середовища

In [1]:

import platform
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate

# Environment report: interpreter version, PyTorch build, and CUDA availability.
print("Python:", platform.python_version())
print("PyTorch:", torch.__version__)
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    # Only query the device name when a CUDA device is actually usable.
    print("GPU:", torch.cuda.get_device_name(0))


  from .autonotebook import tqdm as notebook_tqdm


Python: 3.13.9
PyTorch: 2.9.1+cu128
CUDA available: False


## 2) Завантаження локального `ukr.txt`

In [2]:

# The corpus file is looked up in the current working directory.
BASE_DIR = Path.cwd()
DATA_FILE = BASE_DIR / "ukr.txt"

print("Working dir:", BASE_DIR.resolve())
print("Looking for:", DATA_FILE.resolve())
# Raise explicitly instead of using `assert`: assertions are stripped under
# `python -O`, which would let a missing file surface later as a less clear error.
if not DATA_FILE.exists():
    raise FileNotFoundError("❌ ukr.txt не знайдено")

def load_pairs_txt(path: Path, max_lines: int | None = None):
    pairs = []
    with path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_lines is not None and i >= max_lines:
                break
            line = line.strip()
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) < 2:
                continue
            en, uk = parts[0].strip(), parts[1].strip()
            if en and uk:
                pairs.append((en, uk))
    return pairs

# Peek at the first three raw lines to check the tab-separated layout.
with DATA_FILE.open("r", encoding="utf-8") as f:
    preview = [f.readline() for _ in range(3)]
print("\n".join(row.rstrip() for row in preview))


Working dir: /home/kali/Desktop/KPI/DATA_ANALYS/lab4
Looking for: /home/kali/Desktop/KPI/DATA_ANALYS/lab4/ukr.txt
Go.	Йди.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #6584257 (deniko)
Hi.	Вітаю!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #414700 (deniko)
Hi.	Привіт.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #3841503 (rmdas)


## 3) Підготовка даних (швидкий режим)
Якщо навчання все ще триває надто довго, зменшіть `MAX_SAMPLES`: для CPU зазвичай достатньо 5k–10k пар.
Якщо доступний GPU, `MAX_STEPS` та `MAX_SAMPLES` можна збільшити.

In [3]:
# Size knobs for the "fast" run: number of sentence pairs kept and optimizer steps.
MAX_SAMPLES = 8000
MAX_STEPS = 1200
# Seed for the subsample; the same value the train/validation split uses as random_state.
SAMPLE_SEED = 42

pairs = load_pairs_txt(DATA_FILE)
print("All pairs:", len(pairs))

# A dedicated seeded generator makes the chosen subsample identical across
# re-runs and leaves the global `random` state untouched.
random.Random(SAMPLE_SEED).shuffle(pairs)
pairs = pairs[:MAX_SAMPLES]
print("Using pairs:", len(pairs))

df = pd.DataFrame(pairs, columns=["en", "uk"]).drop_duplicates()
# Whitespace-separated word counts, used for the length filter below.
df["en_len"] = df["en"].str.split().str.len()
df["uk_len"] = df["uk"].str.split().str.len()

# Keep only pairs where both sides have at most 35 words.
df = df[(df["en_len"] <= 35) & (df["uk_len"] <= 35)].reset_index(drop=True)
print("After filtering:", len(df))
df.head()


All pairs: 160049
Using pairs: 8000
After filtering: 7999


Unnamed: 0,en,uk,en_len,uk_len
0,Stand still!,Стій спокійно!,2,2
1,Where's my coat?,Де моє пальто?,3,3
2,I'm not done.,Я не скінчив.,3,3
3,Tom insisted on going.,"Том наполіг на тому, щоб піти.",4,6
4,"In accepting the money, he lost the respect of...","Взявши гроші, він втратив повагу людей.",11,6


## 4) Train/Validation split

In [4]:

# Hold out a small fixed-seed fraction of the rows for validation (at least one row).
test_size = 0.02
n = len(df)
n_val = max(1, int(n * test_size))

holdout = df.sample(n=n_val, random_state=42)
# Everything not sampled into the holdout becomes training data.
train_df = df.drop(index=holdout.index).reset_index(drop=True)
val_df = holdout.reset_index(drop=True)

print("Train:", len(train_df), "Validation:", len(val_df))

# Only the two text columns go into the datasets; the length columns are dropped.
text_columns = ["en", "uk"]
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df[text_columns]),
    "validation": Dataset.from_pandas(val_df[text_columns]),
})
ds


Train: 7840 Validation: 159


DatasetDict({
    train: Dataset({
        features: ['en', 'uk'],
        num_rows: 7840
    })
    validation: Dataset({
        features: ['en', 'uk'],
        num_rows: 159
    })
})

## 5) Модель та токенізація (коротші послідовності = швидше)

In [5]:

# Pretrained checkpoint identifier used for both the tokenizer and the model.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-uk"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Token-length caps applied with truncation to source and target texts.
max_source_len = 64
max_target_len = 64

def preprocess(batch):
    """Tokenize a batch of EN/UK pairs into encoder inputs and decoder labels.

    Args:
        batch: Mapping with parallel lists of strings under ``"en"`` (source)
            and ``"uk"`` (target).

    Returns:
        The source encoding with an added ``"labels"`` entry holding the
        target token ids.
    """
    model_inputs = tokenizer(batch["en"], max_length=max_source_len, truncation=True)
    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager; a separate call is kept so the
    # target side is truncated with its own length cap.
    labels = tokenizer(text_target=batch["uk"], max_length=max_target_len, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run `preprocess` over both splits in batches and drop the raw text columns,
# leaving only the fields that `preprocess` returns.
tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
tokenized


Map: 100%|█████████████████████████████████████████████████████████████████████| 7840/7840 [00:00<00:00, 11648.55 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████| 159/159 [00:00<00:00, 7803.77 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7840
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 159
    })
})

## 6) Тренування (без eval/save кожні N кроків)

In [6]:

# Collator that batches the variable-length tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Query CUDA once; both mixed precision and the batch size depend on it.
has_cuda = torch.cuda.is_available()
use_fp16 = has_cuda

training_args = Seq2SeqTrainingArguments(
    output_dir="mt_en_uk_fast20",

    # Fixed step budget; no periodic evaluation or checkpointing to save time.
    max_steps=MAX_STEPS,
    eval_strategy="no",
    save_strategy="no",
    predict_with_generate=False,

    learning_rate=5e-5,
    per_device_train_batch_size=8 if has_cuda else 4,
    gradient_accumulation_steps=2,
    logging_steps=50,
    fp16=use_fp16,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    # `tokenizer=` is deprecated on Trainer classes; `processing_class=` is
    # the replacement argument.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
50,0.6423
100,0.6259
150,0.6629
200,0.6501
250,0.625
300,0.5652
350,0.6292
400,0.695
450,0.6329
500,0.6184


TrainOutput(global_step=1200, training_loss=0.566940336227417, metrics={'train_runtime': 902.5551, 'train_samples_per_second': 10.636, 'train_steps_per_second': 1.33, 'total_flos': 27753226960896.0, 'train_loss': 0.566940336227417, 'epoch': 1.2244897959183674})

## 7) Швидка оцінка BLEU на піднаборі validation (наприклад, до 200 прикладів)

In [7]:

# Load the sacreBLEU metric through the `evaluate` library.
bleu = evaluate.load("sacrebleu")

def fast_bleu_eval(n_samples=200, num_beams=4, max_new_tokens=64, seed=None):
    """Compute sacreBLEU on a random subset of the validation pairs.

    Args:
        n_samples: Upper bound on the number of validation pairs to score;
            capped at the size of the validation split.
        num_beams: Beam width passed to ``model.generate``.
        max_new_tokens: Generation length cap passed to ``model.generate``.
        seed: Optional seed for choosing the subset. ``None`` (the default)
            draws from the global ``random`` state, exactly as before.

    Returns:
        The dict returned by ``bleu.compute``.
    """
    val_raw = ds["validation"]
    n = min(n_samples, len(val_raw))

    # With a seed, use a private generator so the subset is reproducible;
    # without one, fall back to the module-level generator.
    rng = random if seed is None else random.Random(seed)
    idx = list(range(len(val_raw)))
    rng.shuffle(idx)
    idx = idx[:n]

    # Fetch each selected row once and read both languages from it.
    rows = [val_raw[i] for i in idx]
    sources = [row["en"] for row in rows]
    # One single-element reference list per prediction.
    refs = [[row["uk"].strip()] for row in rows]

    model.eval()
    batch_size = 16 if torch.cuda.is_available() else 8
    preds = []

    for start in range(0, n, batch_size):
        batch = sources[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_source_len,
        ).to(model.device)
        with torch.no_grad():
            out = model.generate(**inputs, num_beams=num_beams, max_new_tokens=max_new_tokens)
        preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))

    preds = [p.strip() for p in preds]

    return bleu.compute(predictions=preds, references=refs)

# Request up to 200 pairs; the function caps this at the validation split size.
res = fast_bleu_eval(n_samples=200)
res


{'score': 56.27799372538043,
 'counts': [836, 540, 366, 246],
 'totals': [1054, 895, 736, 577],
 'precisions': [79.3168880455408,
  60.33519553072626,
  49.72826086956522,
  42.63431542461005],
 'bp': 0.9971577470606117,
 'sys_len': 1054,
 'ref_len': 1057}

## 8) Приклади перекладу

In [8]:

def translate(sentences, num_beams=4, max_new_tokens=80):
    """Translate a batch of sentences with the current model.

    Args:
        sentences: Text(s) handed to the tokenizer as one padded batch.
        num_beams: Beam width forwarded to ``model.generate``.
        max_new_tokens: Generation length cap forwarded to ``model.generate``.

    Returns:
        The strings produced by ``tokenizer.batch_decode`` with special
        tokens skipped.
    """
    model.eval()

    encoded = tokenizer(
        sentences,
        max_length=max_source_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Move the input tensors to the device the model lives on.
    encoded = encoded.to(model.device)

    generation_kwargs = {"num_beams": num_beams, "max_new_tokens": max_new_tokens}
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# A few hand-picked sentences for a qualitative look at the translations.
test_sents = [
    "I love Ukraine and I want to learn the language.",
    "Cybersecurity is important for modern organizations.",
    "Where is the nearest train station?",
    "Please open the window.",
]

translations = translate(test_sents)
separator = "-" * 60
for source, target in zip(test_sents, translations):
    print("EN:", source)
    print("UK:", target)
    print(separator)


EN: I love Ukraine and I want to learn the language.
UK: Я люблю Україну і хочу вивчити мову.
------------------------------------------------------------
EN: Cybersecurity is important for modern organizations.
UK: Кібербезпеки важливі для сучасних організацій.
------------------------------------------------------------
EN: Where is the nearest train station?
UK: Де найближча залізнична станція?
------------------------------------------------------------
EN: Please open the window.
UK: Будь ласка, відчиніть вікно.
------------------------------------------------------------


## 9) (Опціонально) Збереження моделі

In [9]:

# Output directory for the fine-tuned model, given relative to the working directory.
SAVE_DIR = Path("mt_en_uk_fast20_saved")
SAVE_DIR.mkdir(exist_ok=True)

# Write the model and the tokenizer into the same directory.
trainer.save_model(str(SAVE_DIR))
tokenizer.save_pretrained(str(SAVE_DIR))

print("✅ Saved to:", SAVE_DIR.resolve())




✅ Saved to: /home/kali/Desktop/KPI/DATA_ANALYS/lab4/mt_en_uk_fast20_saved


## Висновки
- Виконано машинний переклад EN→UK на основі трансформера.
- Застосовано fine-tuning у «швидкому» режимі (~20 хв на CPU; саме навчання — 1200 кроків, ≈15 хв, `train_runtime` ≈ 903 с).
- BLEU пораховано на піднаборі validation (159 прикладів): sacreBLEU ≈ 56.3.
