# Install required packages

In [2]:
# Language pair: translate from `source_lang` into `target_lang`.
source_lang, target_lang = "pl", "szl"

# Token limits applied when tokenizing source and target text (truncation length).
max_input_length = max_target_length = 512

# Pretrained checkpoint that the tokenizer and model are loaded from.
model_checkpoint = "Helsinki-NLP/opus-mt-sla-sla"

# Creating a dataset

In [None]:
import pandas as pd

# Load `test_df.csv` into `test_df`.
# NOTE(review): the next cell rebuilds `test_df` from `test.szl` / `test.pl`, which
# replaces this frame when both cells are run — confirm which source is intended.
test_df = pd.read_csv("test_df.csv")

In [None]:
# Read the parallel test corpus: line i of `test.szl` is paired with line i of `test.pl`.
with open('test.szl', 'r', encoding='utf-8') as szl_file:
    lines_szl = szl_file.readlines()
with open('test.pl', 'r', encoding='utf-8') as pl_file:
    lines_pl = pl_file.readlines()

# Pairing is purely positional, so files of different length would otherwise be
# truncated to the shorter one without any warning and hide a misaligned corpus.
if len(lines_szl) != len(lines_pl):
    raise ValueError(
        f"test.szl has {len(lines_szl)} lines but test.pl has {len(lines_pl)}; "
        "the two files must be line-aligned"
    )

# Strip surrounding whitespace (including the trailing newline) from every sentence.
data = {
    'szl': [line.strip() for line in lines_szl],
    'pl': [line.strip() for line in lines_pl],
}

test_df = pd.DataFrame(data)

In [None]:
from datasets import load_dataset, load_metric, DatasetDict, Dataset
from sklearn.model_selection import train_test_split

def _as_translation_records(frame):
    """Wrap each row's `pl` / `szl` text in a ``{"translation": {...}}`` record."""
    return [
        {"translation": {"pl": record["pl"], "szl": record["szl"]}}
        for record in frame.to_dict('records')
    ]


# Load the training pairs and discard rows where either side is missing.
combined_df = pd.read_csv("training_df.csv", index_col=0).dropna(subset=['pl', 'szl'])

# Hold out 5% of the pairs for validation; the fixed seed keeps the split reproducible.
train_df, eval_df = train_test_split(combined_df, test_size=0.05, random_state=42)

# Direct pandas -> Dataset conversions (not referenced by the other cells visible here).
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Same nested record layout for all three splits.
train_dict = _as_translation_records(train_df)
eval_dict = _as_translation_records(eval_df)
test_dict = _as_translation_records(test_df)

formatted_train_dataset = Dataset.from_list(train_dict)
formatted_eval_dataset = Dataset.from_list(eval_dict)
formatted_test_dataset = Dataset.from_list(test_dict)

dataset_dict = DatasetDict(
    {
        "train": formatted_train_dataset,
        "validation": formatted_eval_dataset,
        "test": formatted_test_dataset,
    }
)

# Metric object used later by compute_metrics.
metric = load_metric("sacrebleu")

In [10]:
# Display the DatasetDict summary (split names, features and row counts).
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 60241
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3171
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 100
    })
})

# Preprocessing the data

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
# Text prepended to every source sentence before tokenization (empty: no prefix).
prefix = ""


def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs with labels.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of dicts keyed
            by language code; ``source_lang`` and ``target_lang`` are read from
            each dict.

    Returns:
        The tokenizer output for the source sentences, with an added
        ``"labels"`` entry holding the token ids of the target sentences.
    """
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # `text_target=` tokenizes in target mode; it replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
# Smoke-test the preprocessing on the first two training examples.
preprocess_function(dataset_dict['train'][:2])



{'input_ids': [[7, 25274, 1760, 1367, 234, 1083, 63, 53, 512, 63, 2920, 314, 909, 59, 278, 2098, 779, 6396, 1005, 2574, 1558, 25092, 20, 3874, 318, 4461, 7, 25274, 1760, 0], [357, 584, 14815, 10191, 1154, 63, 119, 51, 454, 3409, 57, 273, 1820, 245, 6860, 45, 676, 770, 2369, 1850, 1025, 2278, 119, 12625, 10464, 3730, 39, 315, 1275, 13475, 3886, 1579, 42, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[12286, 30393, 1367, 234, 1083, 2728, 53, 512, 2728, 2920, 314, 909, 59, 278, 2098, 779, 6396, 1005, 2574, 1558, 25092, 20, 3874, 318, 4461, 12286, 30393, 0], [2755, 584, 14815, 10191, 1154, 63, 119, 51, 454, 3409, 57, 273, 33, 245, 6860, 466, 770, 2369, 1850, 1025, 2278, 119, 20891, 395, 3730, 39, 172, 1275, 13475, 5689, 1504, 42, 0]]}

In [14]:
# Tokenize every split; batched=True hands preprocess_function batches of examples.
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/60241 [00:00<?, ? examples/s]

Map:   0%|          | 0/3171 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Fine-tuning the model

In [15]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq weights that are fine-tuned below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

pytorch_model.bin:   0%|          | 0.00/257M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [16]:
batch_size = 16
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Output directory name: <checkpoint name>-finetuned-<source>-to-<target>.
output_dir = f"{model_name}-finetuned-{source_lang}-to-{target_lang}"

args = Seq2SeqTrainingArguments(
    output_dir,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Same batch size for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch, generating text so text-level metrics can be computed.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Checkpoint saving is switched off for this run.
    save_strategy="no",
)

In [17]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and wrap each reference in a list.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(predictions, references)`` tuple: predictions is a list of stripped
        strings, references is a list of single-element lists of stripped strings.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    # One reference per prediction, each wrapped in its own list.
    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. When
            predictions is a tuple, only its first element is used.

    Returns:
        A dict with ``"bleu"`` (the metric's ``score``) and ``"gen_len"`` (mean
        number of non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore marker, not a real token id, so it cannot be decoded.
    # Swap it for the pad token in predictions as well as labels: generated
    # sequences may also be padded with -100 when batches are gathered.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [19]:
# Wire the model, arguments, tokenized splits, collator and metric into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
# Run fine-tuning; validation metrics are reported once per epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.2844,0.200254,80.0333,26.9395
2,0.1752,0.138887,85.3732,26.9584
3,0.1308,0.116357,86.7489,27.0142
4,0.103,0.099759,88.4323,26.9798
5,0.0896,0.09034,89.6398,26.9543
6,0.0661,0.087347,90.0686,26.9599
7,0.0635,0.081644,90.7096,26.9871
8,0.0511,0.078851,91.0197,26.9707
9,0.0459,0.077849,91.1126,26.9666
10,0.0419,0.076597,91.0178,26.9779


TrainOutput(global_step=37660, training_loss=0.13012184364898244, metrics={'train_runtime': 9535.8104, 'train_samples_per_second': 63.173, 'train_steps_per_second': 3.949, 'total_flos': 1.0956768018628608e+16, 'train_loss': 0.13012184364898244, 'epoch': 10.0})

In [None]:
from transformers import MarianMTModel, MarianTokenizer
# Sample sentence for a manual translation check.
# NOTE(review): `src_text` is not referenced by the later cells visible here; they
# repeat the same literal inline.
src_text = ['feliks steuer narodził sobie w 1889 roku w sulkowie sztudiowoł słowiańską filologię w innsbrucku i na śląskim uniwersytecie w wrocławiu po sztudyji wypracował dyplomów rektora i w katowicach pracował za rektora i dyrektora państwowej gimnazyje klasycznej i miastowy gimnazyje matematycznoprzyrodniczej']

# Disabled alternative: load the tokenizer from a saved checkpoint directory
# instead of using the in-memory one.
# model_name = f'opus-mt-sla-sla-finetuned-{source_lang}-to-{target_lang}/checkpoint-3500'
# tokenizer = MarianTokenizer.from_pretrained(model_name)
# Show the language codes the tokenizer reports as supported.
print(tokenizer.supported_language_codes)


In [None]:
# model = MarianMTModel.from_pretrained(model_name)
import torch

# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def translate(model, src_text):
    """Translate source text with ``model`` and return the decoded strings.

    Args:
        model: A model exposing ``generate`` and a ``device`` attribute.
        src_text: The text (or list of texts) passed to the tokenizer.

    Returns:
        A list with one decoded string per generated sequence, special tokens
        removed.
    """
    # Move the inputs to wherever the model's weights live, not to a notebook-level
    # device, so a model loaded on a different device does not hit a mismatch.
    encoded = tokenizer(src_text, return_tensors="pt", padding=True).to(model.device)
    generated = model.generate(**encoded)
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]

# Translate one sample sentence with the in-memory `model`.
print(translate(model, ['feliks steuer narodził sobie w 1889 roku w sulkowie sztudiowoł słowiańską filologię w innsbrucku i na śląskim uniwersytecie w wrocławiu po sztudyji wypracował dyplomów rektora i w katowicach pracował za rektora i dyrektora państwowej gimnazyje klasycznej i miastowy gimnazyje matematycznoprzyrodniczej']))
# Disabled variant of the same call with a differently spelled sentence
# (presumably the Silesian rendering — confirm):
# print(translate(model, ['feliks steuer narodziōł sie we 1889 roku we sulkowie sztudiowoł słowiańskõ filologijõ we innsbrucku i na ślōnskim uniwersytecie we wrocławiu po sztudyji wyrobiōł dyplōm rechtora i we katowicach robiōł za rechtora i dyrechtora państwowyj gimnazyje klasycznyj i miastowyj gimnazyje matymatycznoprzirodniczyj']))
# The input sentence used in the active call above, kept for reference:
# feliks steuer narodził sobie w 1889 roku w sulkowie sztudiowoł słowiańską filologię w innsbrucku i na śląskim uniwersytecie w wrocławiu po sztudyji wypracował dyplomów rektora i w katowicach pracował za rektora i dyrektora państwowej gimnazyje klasycznej i miastowy gimnazyje matematycznoprzyrodniczej

In [None]:
# Load the original checkpoint again (no fine-tuning) and move it to `device`.
model_baseline = MarianMTModel.from_pretrained(model_checkpoint).to(device)

# NOTE(review): this input is not the sentence passed to the fine-tuned model in the
# previous cell; it matches the disabled variant there. Confirm the comparison is
# meant to use different inputs.
print(translate(model_baseline, ['feliks steuer narodziōł sie we 1889 roku we sulkowie sztudiowoł słowiańskõ filologijõ we innsbrucku i na ślōnskim uniwersytecie we wrocławiu po sztudyji wyrobiōł dyplōm rechtora i we katowicach robiōł za rechtora i dyrechtora państwowyj gimnazyje klasycznyj i miastowyj gimnazyje matymatycznoprzirodniczyj']))



["Felix steuer porodrodrodrodrod felix se narodrodrodrodrodrodrodrodrodrodrodrodrodrodrod v 1889 w sulllix stud studoval Slovian sty Slovian ́ filologologicie v innsbruck in innsbruck i na Sll Sllix steuer Universitetete v rocclave Universitetete v roccclavic po Stud stud stud stud stud stud stud stud stud stud studiiiiii felix narodrodrodrodrodrodrodrodrodrodrodrodil se v 1889 n' 1889"]


In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub before uploading.
login()

In [None]:
# Repository name on the Hub: opus-mt-<source>-<target>.
hub_repo_id = f"opus-mt-{source_lang}-{target_lang}"

# Upload the fine-tuned weights and configuration.
model.push_to_hub(hub_repo_id)

# Upload the tokenizer files to the same repository so the published model can be
# loaded on its own, without fetching the tokenizer from the base checkpoint.
tokenizer.push_to_hub(hub_repo_id)