##### Copyright 2024 Google LLC.

In [1]:
# %pip install --upgrade --no-cache-dir pip wheel setuptools black isort jupyterlab-code-formatter jupyterthemes jupyterlab_darkside_theme nvitop
# %pip install --upgrade --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install --upgrade --no-cache-dir torcheval optuna torchmetrics torchtnt
# %pip install --upgrade --no-cache-dir evaluate rouge_score datasets tensorboard accelerate flash-attn torchtnt bitsandbytes transformers
# %pip install --upgrade --no-cache-dir unsloth
# %pip install --upgrade --no-cache-dir trl
# # # !rm ~/.cache/matplotlib -rf

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting wheel
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting setuptools
  Downloading setuptools-75.8.0-py3-none-any.whl.metadata (6.7 kB)
Collecting black
  Downloading black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.2/79.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isort
  Downloading isort-5.13.2-py3-none-any.whl.metadata (12 kB)
Collecting jupyterlab-code-formatter
  Downloading jupyterlab_code_formatter-3.0.2-py3-none-any.whl.metadata (6.0 kB)
Collecting jupyterthemes
  Downloading jupyterthemes-0.20.0-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting jupyterlab_darkside_theme
  Downloading jupyterlab_darkside_theme-0.1.2-py3-none-any.whl.metadata (5.8 kB)
Collecting nvitop
  Downloading nvitop-1.4.0-py3-none-any.whl.metadata (80 

# FIXME
- Translate topic (aka wikipedia page) in a standard language (English)
- For venetian language, use "decoded" translation (no phonetic symbols)
- Remove batch of text that are below 50 words

In [1]:
import gc
import gzip
import json
import os
import pickle
import re
import string
from collections import defaultdict
from copy import copy
from typing import *

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    TrainingArguments,
)
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

batch_regex = re.compile(r"(?<=[.!?\n])\s+")
punctuation_set = set(string.punctuation)
number_regex = re.compile(r"\d*\.\d+|\d+", re.MULTILINE)

tqdm.pandas()
os.environ["HF_TOKEN"] = "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"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


def _collect():
    x = 0
    for i in range(3):
        x += gc.collect()
        torch.cuda.empty_cache()
    return x

2025-01-09 21:33:33.450742: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-09 21:33:33.450810: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-09 21:33:33.452219: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-09 21:33:33.460070: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from typing import *

import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu


def sentence_metric(translated: str, original: str, metric=gleu) -> float:
    method_name = [name for name in metric.__dict__.keys() if "sentence" in name][0]
    score_method = getattr(metric, method_name)
    return score_method([translated.split()], original.split())


def corpus_metric(translated: List[str], original: List[str], metric=gleu) -> float:
    assert len(translated) == len(original)
    method_name = [name for name in metric.__dict__.keys() if "corpus" in name][0]
    score_method = getattr(metric, method_name)
    translated = [[sentence.split()] for sentence in translated]
    original = [sentence.split() for sentence in original]
    return score_method(translated, original)


hyp = "she read the book because she was interested in world history"
ref_a = "she read the book because she was interested in world history"
ref_b = "she was interested in world history because she read the book"
sentence_metric(ref_b, hyp), corpus_metric([ref_b, ref_b], [hyp, hyp]), sentence_metric(
    ref_b, hyp, bleu
), corpus_metric([ref_b, ref_b], [hyp, hyp], bleu)

(0.7894736842105263,
 0.7894736842105263,
 0.7400828044922853,
 0.7400828044922853)

In [6]:
# FIXME: store these constant in a single file (are shared between dataset/train/inference)
max_seq_length = 1024
min_seq_length = 0


def load_base_model(model_id, max_seq_length, device="sequential"):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
        device_map=device,
        attn_implementation="flash_attention_2",
    )
    return model, tokenizer


model, tokenizer = load_base_model(
    # "models/gemma-2-2b-it_unsloth_ia_interlingua_translate_config_3.1-candidate/",
    "google/gemma-2-2b-it",
    max_seq_length,
)
_collect()
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.1.5: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla V100-SXM2-16GB. Max memory: 15.773 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear4bit(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layerno

In [7]:
def tokenize_for_inference(entries: List[str], tokenizer=tokenizer, to_gpu=False):
    tokenized = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": entry}], return_tensors="pt"
        )
        for entry in entries
    ]
    if to_gpu:
        tokenized = [entry.to("cuda") for entry in tokenized]
    return tokenized


MODEL_START_TOKEN = "<start_of_turn>model"


def extract_response(model_response: str):

    n = model_response.find(MODEL_START_TOKEN)
    response = (
        model_response[n:]
        .replace("### Response:\n", "")
        .removeprefix(MODEL_START_TOKEN)
        .removesuffix("<end_of_turn>")
        .strip()
    )

    return response


def create_inference_dataset(original_content, translated_content, ids, to_gpu=True):
    return (
        tokenize_for_inference(original_content, to_gpu=to_gpu),
        translated_content,
        ids,
    )

In [8]:
df_test = Dataset.load_from_disk("datasets/interlingua_translate_test.hf/").to_pandas()

In [9]:
df_test = df_test[df_test["task_type"] == "translate"]

In [10]:
df_test.sample(1).to_dict(orient="records")

[{'topic': 'Deriva genetic',
  'original_content': 'Mais illo pote etiam ser causate per le homine, in un gruppo de limitate de animales que ille ha domesticate. Le deriva genetic es un de vias de speciation del species (apparition de nove species), al minus a longe termino. Genetica',
  'translated_content': "Ma ciò potrebbe anche essere causato dall'uomo, in un gruppo limitato di animali che lui ha addomesticato. La derivazione genetica è una delle vie di speciazione delle specie (apparizione di nuove specie), almeno a lungo termine. Genetica",
  'prompt': "<start_of_turn>user\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nProvide a punctual translation of the following text from interlingua to italian, without any comments, explanations or interpretations.\n\n### Input:\nMais illo pote etiam ser causate per le homine, in un gruppo de limitate de animal

In [11]:
df_test["hash"] = df_test.apply(
    lambda x: hash(
        x["topic"]
        + "|"
        + x["original_content"]
        + "|"
        + x["translated_content"]
        + "|"
        + x["starting_language"]
        + "|"
        + x["translated_language"]
        + "|"
        + x["task_type"]
        + "|"
    ),
    axis=1,
)

In [12]:
instruction_translate = "Provide a punctual translation of the following text from {} to {}, without any comments, explanations or interpretations."
instruction_describe = "Describe the following topic in the following language: {}."


alpaca_prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}"""
_collect()

0

In [13]:
df_test["question"] = df_test.apply(
    lambda row: alpaca_prompt_template.format(
        instruction_translate.format(
            row["starting_language"], row["translated_language"]
        ),
        row["original_content"],
    ),
    axis=1,
)
print(df_test["question"].sample(1).item())
df_test.rename(columns={"translated_content": "answer"}, inplace=True)
df_test = df_test[["topic", "question", "answer", "starting_language", "hash"]]

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Provide a punctual translation of the following text from interlingua to italian, without any comments, explanations or interpretations.

### Input:
Le turre es un pecia de chacos que resimila un turre de castello. In le position initial, cata jocator ha duo turres, un in cata angulo inferior. Movimento
 Le turre se move sempre in avante, a retro o lateralmente. Illo pote prender un pecia del adversario si iste es super un quadrato ubi le turre pote mover se. Illo non pote saltar supra altere pecias. De plus, le turre pote exequer un movimento special appellate roc: le turre se move super le columna f si illo es initialmente super le columna h, o super le columna d si illo es initialmente super le columna a, durante que le rege se move per duo casos verso le position initial del turre. Valor
 Le turre ha un valor

In [14]:
test_datasets = {}
for group, _df in df_test.groupby("starting_language"):
    test_datasets[group] = _df.copy()
for k, v in test_datasets.items():
    print(f"{k} -> {len(v)}")

interlingua -> 233
italian -> 106


In [15]:
example_in_sentence = (
    test_datasets["italian"].sample(1, random_state=43)["question"].item()
)
example_out_sentence = (
    test_datasets["italian"].sample(1, random_state=43)["answer"].item()
)
print(example_in_sentence, "\n-------\n", example_out_sentence)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Provide a punctual translation of the following text from italian to interlingua, without any comments, explanations or interpretations.

### Input:
Frederick Sanger 
Frederick Sanger è stato il vincitore individuale del Premio Nobel per la chimica nel 1958 per la ricerca sulla composizione molecolare dell'insulina. Egli è stato il co-vincitore con Paul Berg e Walter Gilbert nel 1980 per la ricerca sul DNA ricombinante. Sanger è l'unica persona ad aver vinto due Premi Nobel per la chimica. Riferimenti 
Premio Nobel 
-------
 Frederick Sanger 
 Frederick Sanger era le ganiator indivise del Premio Nobel pro Chimia in 1958 pro recerca super le composition del molecular de insulina. Ille era le co-ganiator con Paul Berg e Walter Gilbert in 1980 pro recerca super ADN recombinante. Sanger es le sol persona a ganiar duo

In [48]:
# example_tokenized_sentence = tokenizer(example_in_sentence,return_tensors="pt").to('cuda')
example_tokenized_sentence = tokenizer.apply_chat_template(
    [{"role": "user", "content": example_in_sentence}], return_tensors="pt"
).to("cuda")
# example_tokenized_sentence = tokenizer(example_in_sentence, return_tensors="pt").to("cuda")

with torch.no_grad():
    example_model_res = model.generate(example_tokenized_sentence).detach().cpu()
    # print(tokenizer.decode(example_model_res[0], skip_special_tokens=True))
detokenized_example_model_res = tokenizer.decode(example_model_res[0][len(example_tokenized_sentence[0]):],skip_special_tokens=True).strip()

In [None]:
print(tokenizer.batch_decode(example_tokenized_sentence)[0])

<bos><start_of_turn>user
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Provide a punctual translation of the following text from italian to interlingua, without any comments, explanations or interpretations.

### Input:
Frederick Sanger 
Frederick Sanger è stato il vincitore individuale del Premio Nobel per la chimica nel 1958 per la ricerca sulla composizione molecolare dell'insulina. Egli è stato il co-vincitore con Paul Berg e Walter Gilbert nel 1980 per la ricerca sul DNA ricombinante. Sanger è l'unica persona ad aver vinto due Premi Nobel per la chimica. Riferimenti 
Premio Nobel<end_of_turn>



In [56]:
print(detokenized_example_model_res)
print('---'*30)
print(example_out_sentence)

### Output:
Frederick Sanger
Frederick Sanger, vincitore individuali del Premio Nobel per la chimica nel 1958 per la ricerca sulla composizione molecolare dell'insulina. Egli è stato il co-vincitore con Paul Berg e Walter Gilbert nel 1980 per la ricerca sul DNA ricombinante. Sanger è l'unica persona ad aver vinto due Premi Nobel per la chimica. Riferimenti 
Premio Nobel.
------------------------------------------------------------------------------------------
Frederick Sanger 
 Frederick Sanger era le ganiator indivise del Premio Nobel pro Chimia in 1958 pro recerca super le composition del molecular de insulina. Ille era le co-ganiator con Paul Berg e Walter Gilbert in 1980 pro recerca super ADN recombinante. Sanger es le sol persona a ganiar duo Premios Nobel pro Chimia. Referentias 
Premio Nobel


In [67]:
inference_datasets = {}
for k, ds in test_datasets.items():
    inputs, outputs, ids = create_inference_dataset(
        ds["question"].to_list(),
        ds["answer"].to_list(),
        ds["hash"].to_list(),
    )
    inference_datasets[k] = {"inputs": inputs, "outputs": outputs, "ids": ids}

In [69]:
inputs, outputs = (
    inference_datasets["italian"]["inputs"],
    inference_datasets["italian"]["outputs"],
)
for input_entry, output_entry in zip(inputs, outputs):
    print(tokenizer.batch_decode(input_entry,skip_special_tokens=False)[0])
    print("---" * 30)
    print(output_entry)
    print("===" * 30)
    break

<bos><start_of_turn>user
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Provide a punctual translation of the following text from italian to interlingua, without any comments, explanations or interpretations.

### Input:
Un gruppo scissionista di membri, con il nome DPA Gruppo 04, si è opposto alla dissoluzione e ha deciso di riaprire il sito web dell'associazione. In un comunicato pubblicato sullo stesso sito, il DPA Gruppo 04 spiega che i suoi membri si rifiutano di "essere dominati dalla pressione ingiusta dei giornalisti antidemocratici, dei politici e dei gruppi fanatici danesi e stranieri" e esprime la propria volontà di "utilizzare la libertà di espressione costituzionale che spetta a tutti i cittadini (...) e lavorare politicamente a questo scopo". Afferma inoltre che la dissoluzione dell'associazione e del suo sito web sarebbe equivalente a cedere al

In [None]:
_collect()
generation_config = {"max_new_tokens": max_seq_length} | {
    # "do_sample": True,
    # "early_stopping" : True,
    # "temperature": 0.1,
    # "top_k": 10,
    # "top_p": 0.1,
    # "max_new_tokens": 256,
    # "repetition_penalty": 1.3,
}
# with torch.inference_mode():
# res = defaultdict(list)
with torch.no_grad():
    for language in inference_datasets:
        inference_dataset = inference_datasets[language]
        inference_dataset["response"] = []
        for _input in tqdm(inference_dataset["inputs"],desc=language):
            inference_dataset["response"].append(
                model.generate(_input, **generation_config).detach().cpu()
            )

interlingua:   0%|          | 0/233 [00:00<?, ?it/s]

AUTOTUNE bmm(8x256x256, 8x256x256)
  bmm 0.0164 ms 100.0% 
  triton_bmm_6 0.0184 ms 88.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_5 0.0195 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_10 0.0195 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_14 0.0195 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_9 0.0225 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  trit

In [None]:
inference_datasets["interlingua"]["response"]

In [55]:
inference_datasets["italian"]["response"]

dict_keys(['inputs', 'outputs', 'ids', 'response'])

In [25]:
import numpy as np

res_decode = [tokenizer.batch_decode(v) for v in res]
res_decode = np.asarray(res_decode).flatten().tolist()

In [26]:
# pickle.dump(
#     dict(zip(ids, res_decode)),
#     open(f"{os.path.basename(model_id)}_base_prediction.pkl", "wb"),
#     pickle.HIGHEST_PROTOCOL,
# )

In [27]:
# pd.DataFrame([dict(zip(ids, res_decode))]).T.reset_index()

In [28]:
[print(v) for v in res_decode]

<bos><start_of_turn>user
### Instruction:
Provide a punctual translation of the following text from **interlingua** to **italian**, without any comments, explanations or interpretations.

### Input:
Le sultan esseva reprehendite plure vices per Pelagio e le Cruzatos marciava verso le sud verso Cairo in julio de 1221. Durante le cammino, illes ataccava un fortia de al-Kamil in le battalia de Mansurah, ma esseva sconfite e obligate a render se. Le conditiones de rendite includite le retrocedite de Damietta, abandonante Egypto completemente e un tregua de octo annos. Le Quinte Crusata terminava in septembre de 1221, un fracasso dels Cruzatos que non achieveva su objectivos. Cruciadas<end_of_turn>
<start_of_turn>model
### Response:
I crociati furono represi più volte da Pelagio e i crociati marciarono verso sud verso il Cairo nel luglio del 1221. Durante il cammino, attaccarono una forza di al-Kamil nella battaglia di Mansura, ma furono sconfitti e costretti a ritirarsi. Le condizioni di r

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [29]:
_results = []
for test_example, model_response in zip(outputs, res_decode):
    print("Test Example")
    print(test_example)
    print("---" * 30)
    print("Model Response")
    model_response = extract_response(model_response)
    print(model_response)
    m = sentence_metric(model_response, test_example, bleu)
    if m >= 0.1:
        _results.append((model_response, test_example, m))
    print(m)
    print("===" * 30)

Test Example
il sultano fu criticato più volte da Pelagio e i Crociati marciarono verso sud, verso Il Cairo, nel luglio 1221. Durante il cammino, attaccarono una fortezza di al-Kamil nella battaglia di Mansura, ma furono sconfitti e costretti alla resa. Le condizioni della resa includevano il ritiro da Damietta, l'abbandono completo dell'Egitto e una tregua di otto anni. La quinta crociata terminò nel settembre 1221, un fallimento dei Crociati che non raggiunsero i loro obiettivi.
------------------------------------------------------------------------------------------
Model Response
I crociati furono represi più volte da Pelagio e i crociati marciarono verso sud verso il Cairo nel luglio del 1221. Durante il cammino, attaccarono una forza di al-Kamil nella battaglia di Mansura, ma furono sconfitti e costretti a ritirarsi. Le condizioni di resa includevano il ritorno di Damietta, abbandonando completamente l'Egitto e una tregua di otto anni. La Quinta Crociata si concluse nel settembr

In [30]:
np.mean([v[2] for v in _results])

0.48078206535573687

In [31]:
corpus_metric([v[0] for v in _results], [v[1] for v in _results])

0.4995690310804644

In [None]:
_results = []
for test_example, model_response in zip(outputs, extract_response(res_decode)):
    print("Test Example")
    print(test_example)
    print("---" * 30)
    print("Model Response")
    print(model_response)
    m = sentence_metric(model_response, test_example, bleu)
    _results.append((model_response, test_example, m))
    print(m)
    print("===" * 30)

In [None]:
corpus_metric([v[0] for v in _results], [v[1] for v in _results])