In [None]:
# !python -m wikiextractor.WikiExtractor /kaggle/input/olo-wiki/olowiki-20250201-pages-articles-multistream.xml

In [None]:
# import os
# import subprocess
# from IPython.display import FileLink, display

# def download_file(path, download_file_name):
#     os.chdir('/kaggle/working/')
#     zip_name = f"/kaggle/working/{download_file_name}.zip"
#     command = f"zip {zip_name} {path} -r"
#     result = subprocess.run(command, shell=True, capture_output=True, text=True)
#     if result.returncode != 0:
#         print("Unable to run zip command!")
#         print(result.stderr)
#         return
#     display(FileLink(f'{download_file_name}.zip'))

In [None]:
# download_file('/kaggle/working/nllb-rus-kar/model.safetensors', 'weights')

In [None]:
%pip install sacremoses sacrebleu --quiet

In [None]:
import gc
import torch
import re
import random
import sys
from collections import Counter
import unicodedata
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange
from sklearn.model_selection import train_test_split

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, NllbTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import get_constant_schedule_with_warmup
from transformers.optimization import Adafactor

import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

from torch.optim.lr_scheduler import SequentialLR, ExponentialLR, ConstantLR, LinearLR
from sacremoses import MosesPunctNormalizer

import json
import os
import shutil
from typing import List, Tuple

from transformers.models.nllb.tokenization_nllb import FAIRSEQ_LANGUAGE_CODES

def cleanup():
    """Try to free GPU memory.

    Runs the Python garbage collector first and then asks PyTorch to empty
    its CUDA cache. Returns nothing.
    """
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Common prefix of the input paths used in this notebook.
KAGGLE_INPUT = '/kaggle/input/'

# CSV with the labelled parallel corpus (read with pd.read_csv further down).
DF_PATH = os.path.join(KAGGLE_INPUT, 'karelian-data/parallel_df_merged4.csv')

# Directory name the training loop passes to save_pretrained() for both the
# model and the tokenizer.
MODEL_PATH = MODEL_SAVE_PATH = 'mbart-rus-kar'
# MODEL_LOAD_PATH = KAGGLE_INPUT + '/m/taciturno/nllb-rus-kar/pytorch/ft-messy/1'

In [None]:
# Filesystem locations of previously saved checkpoints.
# NOTE(review): none of these four names is referenced in the visible part of
# the notebook - presumably they are used by loading/evaluation cells
# elsewhere; confirm before removing.
MBART_PATH = '/kaggle/input/mbart-rus-kar/pytorch/15k/1'
NLLB_OLD_PATH = '/kaggle/input/nllb-rus-kar/pytorch/old-data/1'
NLLB_NEW_PATH = '/kaggle/input/nllb-rus-kar/pytorch/new-data/3'
NLLB_LAST_PATH = '/kaggle/working/nllb-rus-kar'

In [None]:
# Load the labelled parallel corpus and cut it into train/dev/test frames
# according to the value of its `split` column. Each frame is an independent
# copy, so later edits cannot leak back into `df_corpus_labeled`.
df_corpus_labeled = pd.read_csv(DF_PATH)
df_train = df_corpus_labeled[df_corpus_labeled['split'] == 'train'].copy()  # 22692 items
df_dev = df_corpus_labeled[df_corpus_labeled['split'] == 'dev'].copy()      # 500 items
df_test = df_corpus_labeled[df_corpus_labeled['split'] == 'test'].copy()

# Shuffle the dev rows once. The seed is fixed so that the dev batches (and
# therefore the batch-averaged validation loss) are identical on every run;
# an unseeded shuffle made validation numbers incomparable between runs.
df_dev = df_dev.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# ONLY OLD DATA (content-krl-20)

# train_path = KAGGLE_INPUT + "content-krl-20/train/"
# dev_path = KAGGLE_INPUT + "content-krl-20/test/"
# with open(train_path + 'train.krl.txt', "r", encoding="utf-8") as file:
#     kar_corpus = file.readlines()
# kar_corpus = [l.replace('\n', '').strip() for l in kar_corpus]
# with open(train_path + 'train.rus.txt', "r", encoding="utf-8") as file:
#     rus_corpus = file.readlines()
# rus_corpus = [l.replace('\n', '').strip() for l in rus_corpus]
# df_train = pd.DataFrame([kar_corpus, rus_corpus]).T
# df_train.columns = ['kar', 'rus']

# with open(dev_path + 'dev.krl.txt', "r", encoding="utf-8") as file:
#     kar_corpus = file.readlines()
# kar_corpus = [l.replace('\n', '').strip() for l in kar_corpus]
# with open(dev_path + 'dev.rus.txt', "r", encoding="utf-8") as file:
#     rus_corpus = file.readlines()
# rus_corpus = [l.replace('\n', '').strip() for l in rus_corpus]
# df_dev = pd.DataFrame([kar_corpus, rus_corpus]).T
# df_dev.columns = ['kar', 'rus']

Preproc

In [None]:
# Moses punctuation normalizer used by preproc().
mpn = MosesPunctNormalizer(lang="en")

# Swap every pattern in its substitution table for a compiled regex object,
# keeping each replacement string as it was.
mpn.substitutions = list(
    (re.compile(pattern), replacement)
    for pattern, replacement in mpn.substitutions
)

def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that replaces non-printing characters in a string.

    The translation table is computed once, here, by scanning every Unicode
    code point and keeping those whose general category is one of the "Other"
    categories (the equivalent of \\p{C} in perl, see
    https://www.unicode.org/reports/tr44/#General_Category_Values).

    Args:
        replace_by: text substituted for each matching character.

    Returns:
        A one-argument callable; it returns its input with every matching
        character replaced by `replace_by`.
    """
    other_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}

    translation_table = {}
    for code_point in range(sys.maxunicode + 1):
        if unicodedata.category(chr(code_point)) in other_categories:
            translation_table[code_point] = replace_by

    def replace_non_printing_char(line) -> str:
        return line.translate(translation_table)

    return replace_non_printing_char

# Shared replacer instance: each non-printing character becomes one space.
replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    """Normalize one text before it is tokenized.

    Applies, in this order: Moses punctuation normalization, replacement of
    non-printing characters by spaces, and Unicode NFKC normalization
    (which, for example, turns 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 into Francesca).

    Returns the normalized string.
    """
    without_nonprint = replace_nonprint(mpn.normalize(text))
    return unicodedata.normalize("NFKC", without_nonprint)

Tokenizer

In [None]:
# additional_special_tokens=sorted(FAIRSEQ_LANGUAGE_CODES + ['ol_KA'])

In [None]:
# Identifier of the pretrained checkpoint; the model and its tokenizer are
# both loaded from it and fine-tuned by the cells below.
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
# Last expression of the cell: displays the tokenizer length.
len(tokenizer)

Train

In [None]:
# Base learning rate; the schedulers below multiply it by a constant factor
# per training phase.
LR = 1e-4

model.cuda();
# Adafactor with a fixed external learning rate (relative_step=False,
# scale_parameter=False); only parameters that require gradients are optimized.
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=LR,
    clip_threshold=1.0,
    weight_decay=1e-3,
)

# A warmup scheduler (get_constant_schedule_with_warmup) used to be built here
# and was overwritten a few lines later. It was not harmless dead code:
# building it sets the optimizer's lr to 0 (warmup step 0), and the ConstantLR
# schedulers created afterwards derive their first value from the *current*
# lr. Whether training then really started at LR depended on the installed
# torch's SequentialLR resetting the lr, so the unused scheduler is gone.

# Intended piecewise-constant schedule (factor x LR), switching at `milestones`:
#   steps [0, 1000)       -> 1.0
#   steps [1000, 9000)    -> 0.9
#   steps [9000, 10000)   -> 0.8
#   steps [10000, ...)    -> 0.7
scheduler1 = ConstantLR(optimizer, factor=1, total_iters=1000)
scheduler2 = ConstantLR(optimizer, factor=0.9, total_iters=8000)
scheduler3 = ConstantLR(optimizer, factor=0.8, total_iters=1000)
scheduler4 = ConstantLR(optimizer, factor=0.7, total_iters=4000)
scheduler = SequentialLR(
    optimizer,
    schedulers=[scheduler1, scheduler2, scheduler3, scheduler4],
    milestones=[1000, 9000, 10000],
)

# Alternative schedules tried earlier, kept for reference:
# scheduler1 = ConstantLR(optimizer, factor=0.9, total_iters=100)
# scheduler2 = ExponentialLR(optimizer, gamma=0.9)
# scheduler = LinearLR(optimizer, start_factor=0.5, total_iters=1400)
# scheduler = SequentialLR(optimizer, schedulers=[scheduler1, scheduler2], milestones=[100])
# scheduler = scheduler1

In [None]:
# One (dataframe column, tokenizer language code) pair per side of the corpus.
# NOTE(review): the `kar` column is paired with the Finnish code `fi_FI` -
# presumably because the tokenizer has no dedicated code for it; confirm.
LANGS = [('rus', 'ru_RU'), ('kar', 'fi_FI')]

# TODO: turn this into a proper dataset
def get_batch_pairs(batch_size, data=df_train):
    """Draw a random training batch in a randomly chosen translation direction.

    Rows are sampled from `data` with replacement; both sides of every row go
    through preproc().

    Args:
        batch_size: number of sentence pairs to draw.
        data: dataframe with one column per entry of LANGS. The default is
            bound to `df_train` when this function is defined.

    Returns:
        (source texts, target texts, source language code, target language code)
    """
    # The direction is random so that the model learns to translate both ways.
    source_side, target_side = random.sample(LANGS, 2)
    src_column, src_code = source_side
    tgt_column, tgt_code = target_side

    src_texts, tgt_texts = [], []
    last_position = len(data) - 1
    for _ in range(batch_size):
        row = data.iloc[random.randint(0, last_position)]
        src_texts.append(preproc(row[src_column]))
        tgt_texts.append(preproc(row[tgt_column]))
    return src_texts, tgt_texts, src_code, tgt_code

print(get_batch_pairs(1))

In [None]:
# Number of sentence pairs drawn for each training step.
BATCH_SIZE = 16  # 32 already doesn't fit well to 15GB of GPU memory
# Tokenized sources and targets are truncated to this many tokens.
MAX_LENGTH = 128 
# Upper bound of the step counter in the training loop.
TRAINING_STEPS = 14000 

# Evaluation period (in steps) and size of the window of recent losses that
# is averaged into the reported training loss.
N_STEPS_TO_ESTIMATE = 1000
# Per-step training losses. The training loop appends to this list and starts
# its step counter at len(losses).
losses = list()

# Smoke check: display one preprocessed `kar` sentence from the dev split.
preproc(df_dev.iloc[0]['kar'])

In [None]:
def get_batched_validation(batch_size):
    """Yield the whole dev set in fixed-order batches, once per translation direction.

    Earlier the direction was picked at random on every call, so successive
    validation passes measured different tasks and their losses could not be
    compared when selecting the best checkpoint. Every ordered pair of
    languages in LANGS is now covered deterministically.

    Args:
        batch_size: maximum number of rows per batch (the last batch of each
            direction may be smaller).

    Yields:
        (xx, yy, lang1, lang2): preprocessed source texts, preprocessed target
        texts, source language code, target language code.
    """
    from itertools import permutations

    for (l1, lang1), (l2, lang2) in permutations(LANGS, 2):
        for start in range(0, len(df_dev), batch_size):
            chunk = df_dev.iloc[start:start + batch_size]
            xx = [preproc(text) for text in chunk[l1]]
            yy = [preproc(text) for text in chunk[l2]]
            yield xx, yy, lang1, lang2

In [None]:
def validation(model, data=df_dev, batch_size=16):
    """Compute the mean per-batch loss of `model` on the dev batches.

    The model is put in eval mode for the pass and is always switched back to
    train mode afterwards, even if a batch raises.

    Args:
        model: the seq2seq model to evaluate.
        data: NOTE(review): accepted for compatibility but not used - batches
            always come from get_batched_validation(), which reads `df_dev`.
        batch_size: batch size forwarded to get_batched_validation().

    Returns:
        The arithmetic mean of the batch losses (each batch counts equally).
    """
    model.eval()
    batch_losses = []
    try:
        with torch.no_grad():
            for xx, yy, lang1, lang2 in get_batched_validation(batch_size):
                # Tensors go to the model's own device, as in the training
                # loop, instead of a hard-coded 'cuda'.
                tokenizer.src_lang = lang1
                x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH).to(model.device)
                # The targets are tokenized through the same source-side path,
                # with src_lang switched to the target language first.
                tokenizer.src_lang = lang2
                y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH).to(model.device)
                # -100 is ignored by the loss, so padding positions do not
                # contribute (same masking as in the training loop).
                y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100
                loss = model(**x, labels=y.input_ids).loss
                batch_losses.append(loss.item())
    finally:
        model.train()
    return np.mean(batch_losses)

In [None]:
def get_lr(optimizer):
    """Return the `lr` entry of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

In [None]:
# get_lr(optimizer)

In [None]:
# Evaluation cadence: every N_STEPS_TO_ESTIMATE steps before
# FINE_EVAL_START_STEP, every FINE_EVAL_EVERY steps from it onwards.
FINE_EVAL_START_STEP = 9000
FINE_EVAL_EVERY = 100
# A checkpoint is written only while the averaged training loss is below this.
MAX_TRAIN_LOSS_TO_SAVE = 1.5

model.train()
x, y, loss = None, None, None
BEST_VAL_LOSS = 1e9
train_loss = 1e9
cleanup()


# The counter starts at len(losses) - presumably so that re-running this cell
# continues an interrupted run instead of starting again from step 0.
tq = trange(len(losses), TRAINING_STEPS)
for step in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(BATCH_SIZE)
    try:
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH).to(model.device)
        # Targets are tokenized with src_lang switched to the target language.
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH).to(model.device)
        # -100 is a magic value ignored in the loss function
        # because we don't want the model to learn to predict padding ids
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:
        # Best effort: drop this batch (gradients and tensor references), free
        # memory and move on. The step is skipped entirely, including any
        # evaluation that would have fallen on it.
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue
    # if step % 100 == 0:
    #     print(get_lr(optimizer))

    # One evaluate-and-checkpoint path for both phases; only the period (and
    # the matching averaging window for the training loss) differs.
    eval_every = N_STEPS_TO_ESTIMATE if step < FINE_EVAL_START_STEP else FINE_EVAL_EVERY
    if step % eval_every == 0:
        # average over the last `eval_every` recorded losses
        train_loss = np.mean(losses[-eval_every:])
        val_loss = validation(model)
        print(f'Train loss is {train_loss} and valid loss is {val_loss} at step {step}')
        # Keep only the checkpoint with the best validation loss so far.
        if train_loss < MAX_TRAIN_LOSS_TO_SAVE and val_loss < BEST_VAL_LOSS:
            BEST_VAL_LOSS = val_loss
            model.save_pretrained(MODEL_SAVE_PATH)
            tokenizer.save_pretrained(MODEL_SAVE_PATH)