##### Copyright 2024 Google LLC.

In [57]:
# %pip install --upgrade --no-cache-dir pip wheel setuptools black isort jupyterlab-code-formatter jupyterthemes jupyterlab_darkside_theme nvitop
# %pip install --upgrade --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install --upgrade --no-cache-dir torcheval optuna torchmetrics torchtnt
# %pip install --upgrade --no-cache-dir evaluate rouge_score datasets tensorboard accelerate flash-attn torchtnt bitsandbytes transformers
# %pip install --upgrade --no-cache-dir unsloth
# %pip install --upgrade --no-cache-dir trl
# # # !rm ~/.cache/matplotlib -rf

# FIXME
- Translate topic (aka wikipedia page) in a standard language (English)
- For venetian language, use "decoded" translation (no phonetic symbols)
- Remove batch of text that are below 50 words

In [1]:
import gc
import gzip
import json
import os
import pickle
import re
import string
from collections import defaultdict
from copy import copy
from typing import *

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    TrainingArguments,
)
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

batch_regex = re.compile(r"(?<=[.!?\n])\s+")
punctuation_set = set(string.punctuation)
number_regex = re.compile(r"\d*\.\d+|\d+", re.MULTILINE)

tqdm.pandas()
os.environ["HF_TOKEN"] = "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"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


def _collect():
    x = 0
    for i in range(3):
        x += gc.collect()
        torch.cuda.empty_cache()
    return x

2025-01-02 10:17:21.609672: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-02 10:17:21.609741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-02 10:17:21.611443: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 10:17:21.620902: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from typing import *

import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu


def sentence_metric(translated: str, original: str, metric=gleu) -> float:
    method_name = [name for name in metric.__dict__.keys() if "sentence" in name][0]
    score_method = getattr(metric, method_name)
    return score_method([translated.split()], original.split())


def corpus_metric(translated: List[str], original: List[str], metric=gleu) -> float:
    assert len(translated) == len(original)
    method_name = [name for name in metric.__dict__.keys() if "corpus" in name][0]
    score_method = getattr(metric, method_name)
    translated = [[sentence.split()] for sentence in translated]
    original = [sentence.split() for sentence in original]
    return score_method(translated, original)


hyp = "she read the book because she was interested in world history"
ref_a = "she read the book because she was interested in world history"
ref_b = "she was interested in world history because she read the book"
sentence_metric(ref_b, hyp), corpus_metric([ref_b, ref_b], [hyp, hyp]), sentence_metric(
    ref_b, hyp, bleu
), corpus_metric([ref_b, ref_b], [hyp, hyp], bleu)

(0.7894736842105263,
 0.7894736842105263,
 0.7400828044922853,
 0.7400828044922853)

In [3]:
max_seq_length = 1024
min_seq_length = 128
model_id = "google/gemma-2-2b-it"


def load_base_model(model_id, max_seq_length, device="sequential"):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
        device_map=device,
        attn_implementation="flash_attention_2",
    )
    return model, tokenizer


model, tokenizer = load_base_model(
    "gemma-2-2b-it_unsloth_ia_trans-desc_v7-3epoch-lm_head", max_seq_length
)
_collect()
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.12.12: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla V100-SXM2-16GB. Max memory: 15.773 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.12.12 patched 26 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.2, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
       

In [4]:
df_test = Dataset.load_from_disk("dataset_test.hf").to_pandas()

In [5]:
df_test.sample(1).to_dict(orient="records")

[{'topic': 'Ixola de Ariàn',
  'original_content': "Xe oncora parçialmente vixibiłi, soravisùe a atività scriteriàe de cava, alcune dune fosili, testimoniança dei vechi cordoni litoranei de confin col mar, che se ga formà in diverse faxe fin a più de 2000 ani fa. Aluvion \nFin a la costruçion de i grandi arzeri inte el XX secolo, l'Ixola xe stà sogeta a aluvion continue. L'Ixola de Ariàn no xe stà coinvolta inte l'aluvione del 1951; la ga parò subito in ani recenti do alluvion, dovùe a la rota de l'arzere del Po de Goro;\n el 20 de zugno 1957, la rota in località Ca' Vendramin ga meso sota çirca 7\xa0700 etari; \n el 2 de novenbre 1960, la rota a Riva' ga lagà çirca 800 ettari. Voxe corelae \n Delta del Po\n Tajo de Porto Viro\n Ariàn\nAriàn\nPàjine co erori inte łe mape",
  'translated_content': "It is still partially visible, surviving the excavations of, among other things, a fossil, testifying to the old coastline that was formed in different phases up to over 2000 years ago.\n\nUn

In [6]:
df_test["hash"] = df_test.apply(
    lambda x: hash(
        x["topic"]
        + "|"
        + x["original_content"]
        + "|"
        + x["translated_content"]
        + "|"
        + x["starting_language"]
        + "|"
        + x["translated_language"]
        + "|"
        + x["task_type"]
        + "|"
    ),
    axis=1,
)

In [7]:
instruction_translate = "Provide a direct translation of the following text from **{}** to **{}**, without any additions, explanations, or interpretations."
alpaca_prompt_template = """<start_of_turn>user
{}

{}<end_of_turn>
<start_of_turn>model"""

In [8]:
df_test["question"] = df_test.apply(
    lambda row: alpaca_prompt_template.format(
        instruction_translate.format(
            row["starting_language"], row["translated_language"]
        ),
        row["original_content"],
    ),
    axis=1,
)

In [17]:
test_dataset = pd.DataFrame()
for group, _df in df_test.groupby(
    ["starting_language", "translated_language", "task_type"]
):
    if 'task_type' == 'describe':
        continue
    test_dataset = pd.concat([test_dataset, _df.sample(3, random_state=42)])

In [20]:
def tokenize_for_inference(entries: List[str], tokenizer=tokenizer, to_gpu=False):
    tokenized = [
        tokenizer(
            entry,
            padding=False,
            truncation=False,
            return_tensors="pt",
            add_special_tokens=False,
        )
        for entry in entries
    ]
    if to_gpu:
        [entry.to("cuda") for entry in tokenized]
    return tokenized


MODEL_START_TOKEN = "<start_of_turn>model"


def extract_response(model_responses: List[str]):
    responses = []
    for response in model_responses:
        n = response.find(MODEL_START_TOKEN)
        responses.append(
            response[n:]
            .removeprefix(MODEL_START_TOKEN)
            .removesuffix("<end_of_turn>")
            .strip()
        )
    return responses


inputs = tokenize_for_inference(test_dataset["question"].to_list(), to_gpu=True)
outputs = test_dataset["translated_content"].to_list()
ids = test_dataset["hash"].to_list()

In [21]:
for input_entry, output_entry in zip(inputs, outputs):
    print(tokenizer.batch_decode(input_entry["input_ids"])[0])
    print("---" * 30)
    print(output_entry)
    print("===" * 30)
    break

<start_of_turn>user
Provide a direct translation of the following text from **english** to **interlingua**, without any additions, explanations, or interpretations.

Ike Turner struggled with cocaine addiction and legal problems in the following years . Tina Turner revealed that he had committed domestic violence during their marriage , which was depicted in her 1986 autobiography and the 1993 film What 's Love Got to Do with It . Ike Turner spent 18 months in prison in the early 1990s . In the final years of his life , Turner used drugs less often and had a commercial and artistic rebirth . He returned to blues music , released two acclaimed albums and went on concert tours . Turner is noted for his innovations in music . He won five Grammy Awards , an induction ( with his ex-wife ) into the Rock and Roll Hall of Fame in 1991 and many other awards . The magazine Rolling Stone included him in their list of the 100 Best Guitarists .<end_of_turn>
<start_of_turn>model
--------------------

In [None]:
_collect()
generation_config = {"max_new_tokens": max_seq_length // 2} | {
    # "do_sample": True,
    # "early_stopping" : True,
    # "temperature": 0.1,
    # "top_k": 10,
    # "top_p": 0.1,
    # "max_new_tokens": 256,
    # "repetition_penalty": 1.3,
}
with torch.inference_mode():
    res = [
        model.generate(**_input, **generation_config).detach().cpu()
        for _input in tqdm(inputs)
    ]

  0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
import numpy as np

res_decode = [tokenizer.batch_decode(v) for v in res]
res_decode = np.asarray(res_decode).flatten().tolist()

In [None]:
pickle.dump(
    dict(zip(ids, res_decode)),
    open(f"{os.path.basename(model_id)}_base_prediction.pkl", "wb"),
    pickle.HIGHEST_PROTOCOL,
)

In [54]:
# pd.DataFrame([dict(zip(ids, res_decode))]).T.reset_index()

In [49]:
_results = []
for test_example, model_response in zip(outputs, res_decode):
    print("Test Example")
    print(test_example)
    print("---" * 30)
    print("Model Response")
    print(model_response)
    model_response = extract_response([model_response])[0]
    m = sentence_metric(model_response, test_example, bleu)
    _results.append((model_response, test_example, m))
    print(m)
    print("===" * 30)

Test Example
R. le prince regente de Portugal e de Brasil, sur le citate de Olivença/Olivenza e le altere territorios cedite a Espania per le Tractato de Badajoz de 1801, e contemplante le restitution de tal objectos como un del mesuras proprie a assecurar inter le duo regnos del peninsula iste bon harmonia complete e stabile cuje conservation in tote le partes del Europa ha essite le objectivo constante de lor arrangiamentos, se ingagia formalmente a emplear in le vias del conciliation lor effortios plus efficace, a fin que le retrocession de tal territorios in favor de Portugal sia effectuate; e le Potentias recognosce, secundo lo que depende de cata un de illos, que iste arrangiamento debe haber loco le plus tosto possibile."
 Espania sustene que Olivenza esseva conquerite per Espania e non per le Francia napoleonic, ergo le firma del tractato de Vienna esseva facite como vincitor e non pro ceder territorios legitimemente acquirite. Hodie, in le 2006, illo es ancora sub administrati

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [50]:
corpus_metric([v[0] for v in _results], [v[1] for v in _results])

0.16540055248618785

In [55]:
_results = []
for test_example, model_response in zip(outputs, extract_response(res_decode)):
    print("Test Example")
    print(test_example)
    print("---" * 30)
    print("Model Response")
    print(model_response)
    m = sentence_metric(model_response, test_example, bleu)
    _results.append((model_response, test_example, m))
    print(m)
    print("===" * 30)

Test Example
Le majoritate de su obras explora le thema de solitate. García Márquez moriva de pneumonia al etate de 87 in Citate de Mexico. A su morte, Juan Manuel Santos, le presidente de Colombia, le describeva como "le Colombiano le plus magne qui unquam viveva."
 Gente colombian
Scriptores
Laureatos del Premio Nobel pro Litteratura
------------------------------------------------------------------------------------------
Model Response
La mayoría de su trabajo explora el tema de la soledad. García Márquez murió de pneumonia a los 87 años en la Ciudad de México. En su muerte, Juan Manuel Santos, el presidente de Colombia, describió como "el mayor colombiano que ha vivido". 
Colombianos
Escritores
Nobel laureates en Literatura
2.5473825510753416e-78
Test Example
La agricoltura no la ze mai stata la economia principal. Al paeset al ze famoss par la tradision dei consa che da qua i partia par l'Italia e par i Stati confinanti co pochi arte par far e inpajar carieghe. I consa par difend

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [56]:
corpus_metric([v[0] for v in _results], [v[1] for v in _results])

0.12051077414205906