##### Copyright 2024 Google LLC.

In [1]:
%pip install --upgrade --no-cache-dir pip wheel setuptools black isort jupyterlab-code-formatter jupyterthemes jupyterlab_darkside_theme nvitop
%pip install --upgrade --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --upgrade --no-cache-dir torcheval optuna torchmetrics torchtnt
%pip install --upgrade --no-cache-dir evaluate rouge_score datasets tensorboard accelerate flash-attn torchtnt bitsandbytes transformers
%pip install --upgrade --no-cache-dir unsloth
%pip install --upgrade --no-cache-dir trl
# # !rm ~/.cache/matplotlib -rf

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting wheel
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting setuptools
  Downloading setuptools-75.8.0-py3-none-any.whl.metadata (6.7 kB)
Collecting black
  Downloading black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.2/79.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isort
  Downloading isort-5.13.2-py3-none-any.whl.metadata (12 kB)
Collecting jupyterlab-code-formatter
  Downloading jupyterlab_code_formatter-3.0.2-py3-none-any.whl.metadata (6.0 kB)
Collecting jupyterthemes
  Downloading jupyterthemes-0.20.0-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting jupyterlab_darkside_theme
  Downloading jupyterlab_darkside_theme-0.1.2-py3-none-any.whl.metadata (5.8 kB)
Collecting nvitop
  Downloading nvitop-1.4.0-py3-none-any.whl.metadata (80 

# FIXME
- Translate topic (aka wikipedia page) in a standard language (English)
- For venetian language, use "decoded" translation (no phonetic symbols)
- Remove batch of text that are below 50 words

In [1]:
import gc
import gzip
import json
import os
import re
import string
from collections import defaultdict
from copy import copy
from pprint import pprint
from typing import *

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    TrainingArguments,
)
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

batch_regex = re.compile(r"(?<=[.!?\n])\s+")
punctuation_set = set(string.punctuation)
number_regex = re.compile(r"\d*\.\d+|\d+", re.MULTILINE)

tqdm.pandas()
os.environ["HF_TOKEN"] = "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"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


def _collect():
    x = 0
    for i in range(3):
        x += gc.collect()
        torch.cuda.empty_cache()
    return x


seed = 42
import random

import numpy as np

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

max_seq_length = 1024
min_seq_length = 128

2025-01-03 17:16:45.965433: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-03 17:16:45.965498: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-03 17:16:45.967011: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-03 17:16:45.976595: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# f = sys.argv[1]
# f = "configs/config_3.1-candidate-trans-desc.json"
config = json.load(open(f))
modules_to_save = config.get("modules_to_save")
dataset_name = config.get("dataset_name")
epochs = config.get("epochs")
model_id = config.get("model_id")
learning_rate = config.get("learning_rate")
r = config.get("r")
dropout = config.get("dropout")
lora_alpha = config.get("lora_alpha")
pprint(config)

{'dataset_name': 'interlingua_translate_describe',
 'dropout': 0.3,
 'epochs': 1,
 'learning_rate': 0.001,
 'lora_alpha': 16,
 'lr_scheduler_type': 'cosine',
 'model_id': 'google/gemma-2-2b-it',
 'modules_to_save': ['embed_tokens'],
 'r': 64}


In [3]:
# def load_base_model(model_id, max_seq_length, device="sequential"):
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name=model_id,
#         max_seq_length=max_seq_length,
#         dtype=None,
#         load_in_4bit=True,
#         device_map=device,
#         # attn_implementation="flash_attention_2",
#     )
#     return model, tokenizer


# model, tokenizer = load_base_model(model_id, max_seq_length)

# FIXME
- Remove entrie from "describe" dataset from test and add them into train.

In [10]:
dataset_train = Dataset.load_from_disk(f"datasets/{dataset_name}_train.hf")
dataset_test = Dataset.load_from_disk(f"datasets/{dataset_name}_test.hf")
ds = dataset_test.to_pandas()
ds.pop('__index_level_0__')
ds = ds[ds["task_type"] == "translate"]
dataset_test = Dataset.from_pandas(ds)
del ds
_collect()

631

In [8]:
ds

Unnamed: 0,topic,original_content,translated_content,prompt,starting_language,translated_language,task_type,new_lines_ratio,n_tokens,len_ratio,len,id,__index_level_0__
0,Mahatma Gandhi,Quando le independentia de India esseva atting...,"Quando l'indipendenza dell'India fu raggiunta,...",<start_of_turn>user\nBelow is an instruction t...,italian,interlingua,describe,0.080851,389,1.090395,235,4669,998
1,Luca Pacioli,Pacioli era membro della corte di Ludovico Sfo...,Pacioli esseva membro del corte de Ludovico Sf...,<start_of_turn>user\nBelow is an instruction t...,italian,interlingua,translate,0.048900,694,0.909091,409,7245,1621
2,Theologia,Varie ancian patres del ecclesia describeva al...,Vari antichi padri della Chiesa descrivono il ...,<start_of_turn>user\nBelow is an instruction t...,interlingua,italian,translate,0.103203,975,1.007874,562,1286,611
3,Referentia,(Isto duceva a Frege a distinguer inter le sen...,Ciò portava Frege a distinguere tra il senso d...,<start_of_turn>user\nBelow is an instruction t...,interlingua,italian,translate,0.052158,815,1.032258,556,646,514
4,Molecula,No molecula typic pote esser definite pro crys...,Nessun molecolo tipico può essere definito com...,<start_of_turn>user\nBelow is an instruction t...,interlingua,italian,translate,0.119691,432,1.029412,259,1328,413
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,Recuperation de Aristotele,"Le decision per le chefes religiose poterose, ...",La decisione dei capi religiosi di censurare u...,<start_of_turn>user\nBelow is an instruction t...,italian,interlingua,describe,0.118182,181,1.080645,110,4512,1130
1916,Barack Obama,"in politica estera , si ritirò dalle truppe am...","In su politica externe, ille retirava le milit...",<start_of_turn>user\nBelow is an instruction t...,italian,interlingua,translate,0.030631,816,1.122363,555,7691,1294
1917,Ginny Weasley,"Depost, Ginny ha periodos de amnesia que coinc...",Ginny soffre di periodi di amnesia che coincid...,<start_of_turn>user\nBelow is an instruction t...,italian,interlingua,describe,0.067358,278,1.048611,193,4938,860
1918,Franz Kafka,Ille aveva un rapporto complicato e disturbato...,Ille habeva un relation complicate e disturbat...,<start_of_turn>user\nBelow is an instruction t...,italian,interlingua,translate,0.049412,680,1.005376,425,7474,1459


### Set LoRA configuration

LoRA (Low-Rank Adaptation) allows for efficient fine-tuning by adapting only a subset of model parameters.

Here, you set the following parameters:
- `r` to 16, which controls the rank of the adaptation matrices.
- `lora_alpha` to 16 for scaling.
- `lora_dropout` to 0 since it is optimized.

To know more about LoRA parameters and their effects, check out the [LoRA parameters encyclopedia](https://github.com/unslothai/unsloth/wiki#lora-parameters-encyclopedia).

In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r=r,  # LoRA attention dimension
    # target_modules=[
    #     "q_proj",
    #     "k_proj",
    #     "v_proj",
    #     "o_proj",
    #     "gate_proj",
    #     "up_proj",
    #     "down_proj",
    # ],
    lora_alpha=lora_alpha,  # Alpha parameter for LoRA scaling
    lora_dropout=dropout,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=42,
    use_rslora=False,  # Rank stabilized LoRA
    loftq_config=None,  # LoRA-Fine-Tuning-Aware Quantization
    modules_to_save=modules_to_save,
)

Unsloth: Offloading output_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)
Unsloth 2024.12.12 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


Unsloth: Training lm_head in mixed precision to save VRAM


### Set training configuration

Set up the training arguments that define how the model will be trained.

Here, you'll define the following parameters:

- For training and evaluation:
  - `output directory`
  - `max steps`
  - `batch sizes`

- To optimize the training process:
  - `learning rate`
  - `optimizer`
  - `learning rate scheduler`

**Note:** `max_steps` is set as 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run.

In [13]:
batch_size = 2
grad_acc_batch_size = 8
# Perform eval at each 10% of training dataset
eval_steps = int((len(dataset_train) / (batch_size * grad_acc_batch_size)) // 10)
print(eval_steps)
training_args = TrainingArguments(
    # auto_find_batch_size=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=grad_acc_batch_size,
    eval_accumulation_steps=grad_acc_batch_size,
    eval_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=10,
    # warmup_steps=20,
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=42,
    output_dir="outputs",
    report_to=["tensorboard"],
)

26


<a name="Train"></a>
### Train the model

[Huggingface's TRL](https://huggingface.co/docs/trl/index) offers a user-friendly API for building SFT models and training them on your dataset with just a few lines of code. Here you will use Huggingface TRL's `SFTTrainer` class to train the model. This class inherits from the `Trainer` class available in the Transformers library, but is specifically optimized for supervised fine-tuning (instruction tuning). Read more about SFFTrainer from the [official TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer).

In [14]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

In [15]:
dl = trainer.get_train_dataloader()
# Ensure that the tokenizer respect the max length
for batch in dl:
    assert len(batch["input_ids"][0]) <= max_seq_length
    print(tokenizer.decode(batch["input_ids"][0]))
    del batch
    del dl
    break

<bos><start_of_turn>user
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Provide a punctual translation of the following text from **interlingua** to **italian**, without any comments, explanations or interpretations.

### Input:
Un corpore es un structura algebric con duo operationes. In 1871, le mathematico german Richard Dedekind introduceva iste notion como Körper, que significa corpore, alora un entitate claudite. In 1893, Eliakim Hastings Moore introduceva le parola field (= campo, agro) in le anglese. Évariste Galois e Ernst Steinitz dava resultatas importante re corpores. Definition 
Un corpore es un insimul  con duo operationes binari e interne  e  (que es nominate addition e multiplication), pro que es ver le conditiones sequente
 es un gruppo abelian (con le elemento neutre )
 es un gruppo abelian (con le elemento neutre )
 le leges distributive:
 p

Now, let's start the fine-tuning process by calling `trainer.train()`, which uses `SFTTrainer` to handle the training loop, including data loading, forward and backward passes, and optimizer steps, all configured according to the settings you've provided.

In [16]:
_collect()
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,179 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 261
 "-____-"     Number of trainable parameters = 755,957,760


Step,Training Loss,Validation Loss
26,1.7782,1.739214
52,1.5707,1.648928


Unsloth: Not an error, but Gemma2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient

KeyboardInterrupt



### Save the model locally

After training is complete, save the fine-tuned model by calling `save_pretrained(new_model)`. This saves the model weights and configuration files to the directory specified by `new_model` (**gemma_ft_unsloth**). You can reload and use the fine-tuned model later for inference or further training.

In [None]:
new_model = f"{os.path.basename(model_id)}_unsloth_ia_{dataset_name}_{os.path.basename(os.path.splitext(f)[0])}"
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)