##### Copyright 2024 Google LLC.

In [1]:
# %pip install --upgrade --no-cache-dir pip wheel setuptools black isort jupyterlab-code-formatter jupyterthemes jupyterlab_darkside_theme nvitop
# %pip install --upgrade --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install --upgrade --no-cache-dir torcheval optuna torchmetrics torchtnt
# %pip install --upgrade --no-cache-dir evaluate rouge_score datasets tensorboard accelerate flash-attn torchtnt bitsandbytes transformers
# %pip install --upgrade --no-cache-dir unsloth
# %pip install --upgrade --no-cache-dir trl
# # # !rm ~/.cache/matplotlib -rf

# FIXME
- Translate the topic (i.e. the Wikipedia page title) into a standard language (English).
- For the Venetian language, use the "decoded" translation (no phonetic symbols).
- Remove text batches that are shorter than 50 words.

In [2]:
import gc
import getpass
import gzip
import json
import os
import re
import string
from collections import defaultdict
from copy import copy
from typing import *

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    TrainingArguments,
)
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

# Matches a run of whitespace that directly follows '.', '!', '?' or a newline
# (presumably used to split text into sentence-like batches; usage not shown here).
batch_regex = re.compile(r"(?<=[.!?\n])\s+")
# The ASCII punctuation characters from `string.punctuation`, as a set.
punctuation_set = set(string.punctuation)
# Matches decimals such as "1.5" or ".5", otherwise plain integers.
# NOTE: re.MULTILINE is kept for fidelity; the pattern has no ^/$ anchors.
number_regex = re.compile(r"\d*\.\d+|\d+", re.MULTILINE)

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Credentials must never be hardcoded in a notebook. Use HF_TOKEN from the
# environment when present; otherwise ask for it without echoing. An empty
# answer leaves the variable unset so a cached Hugging Face login can be used.
if not os.environ.get("HF_TOKEN"):
    _hf_token = getpass.getpass(
        "Hugging Face token (leave empty to use a cached login): "
    ).strip()
    if _hf_token:
        os.environ["HF_TOKEN"] = _hf_token
    del _hf_token

# Ask the PyTorch CUDA caching allocator to use expandable segments.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


def _collect():
    x = 0
    for i in range(3):
        x += gc.collect()
        torch.cuda.empty_cache()
    return x

2025-01-02 01:42:20.566020: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-02 01:42:20.566102: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-02 01:42:20.567676: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 01:42:20.578031: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Upper bound on sequence length; passed to the model loader and to SFTTrainer,
# and checked against a sample batch further down.
max_seq_length = 1024
# NOTE(review): not referenced by any cell visible here — presumably a lower
# bound used during dataset preparation; confirm or remove.
min_seq_length = 128
# Model identifier handed to FastLanguageModel.from_pretrained; its basename is
# also used to build the output directory name when saving.
model_id = "google/gemma-2-2b-it"


def load_base_model(model_id, max_seq_length, device="sequential"):
    """Load a model and its tokenizer through Unsloth with 4-bit loading enabled.

    Args:
        model_id: Model name forwarded as ``model_name``.
        max_seq_length: Maximum sequence length forwarded to the loader.
        device: Value forwarded as ``device_map`` (default ``"sequential"``).

    Returns:
        A ``(model, tokenizer)`` tuple as produced by
        ``FastLanguageModel.from_pretrained``.
    """
    loader_kwargs = {
        "model_name": model_id,
        "max_seq_length": max_seq_length,
        "dtype": None,  # leave the dtype choice to the loader
        "load_in_4bit": True,
        "device_map": device,
        # "attn_implementation": "flash_attention_2",  # left disabled
    }
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)
    return loaded_model, loaded_tokenizer


# Load the base model and tokenizer with the default device map ("sequential").
model, tokenizer = load_base_model(model_id, max_seq_length)

==((====))==  Unsloth 2024.12.12: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla V100-SXM2-16GB. Max memory: 15.773 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
# Load the train and test splits from local directories saved on disk
# (presumably written by an earlier preprocessing notebook — not shown here).
dataset_train = Dataset.load_from_disk("dataset_train.hf")
# NOTE(review): dataset_test is loaded but not passed to the trainer below
# (the eval_dataset argument is commented out).
dataset_test = Dataset.load_from_disk("dataset_test.hf")

### Set LoRA configuration

LoRA (Low-Rank Adaptation) allows for efficient fine-tuning by adapting only a subset of model parameters.

Here, you set the following parameters:
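- `r` to 128, which controls the rank of the adaptation matrices.
- `lora_alpha` to 16 for scaling.
- `lora_dropout` to 0.2. Unsloth only optimizes the `0` case, so this setting trades some speed for dropout regularization.
- `modules_to_save` to `["lm_head"]`, so the output head is trained and saved alongside the LoRA adapters.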

To know more about LoRA parameters and their effects, check out the [LoRA parameters encyclopedia](https://github.com/unslothai/unsloth/wiki#lora-parameters-encyclopedia).

In [5]:
# LoRA settings collected in one place, then applied to the loaded model.
# `target_modules` is deliberately not passed (the explicit list of
# q/k/v/o/gate/up/down projections is left disabled), so the library default applies.
lora_kwargs = {
    "r": 128,  # LoRA attention dimension (rank)
    "lora_alpha": 16,  # alpha parameter for LoRA scaling
    "lora_dropout": 0.2,  # any value is supported, but only 0 is optimized
    "bias": "none",  # any value is supported, but only "none" is optimized
    "use_gradient_checkpointing": "unsloth",  # True or "unsloth" for very long context
    "random_state": 42,
    "use_rslora": False,  # rank-stabilized LoRA
    "loftq_config": None,  # LoRA-Fine-Tuning-Aware Quantization
    "modules_to_save": ["lm_head"],  # alternative: ["embed_tokens"]
}
# Rebinds `model` to the PEFT-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.2.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Offloading output_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)
Unsloth 2024.12.12 patched 26 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Training lm_head in mixed precision to save VRAM


### Set training configuration

Set up the training arguments that define how the model will be trained.

Here, you'll define the following parameters:

- For training and evaluation:
  - `output directory`
  - `number of training epochs`
  - `batch sizes`

- To optimize the training process:
  - `learning rate`
  - `optimizer`
  - `learning rate scheduler`

**Note:** This run trains for the full `num_train_epochs=3`. For a quick smoke test you can set `max_steps=60` instead, which stops training after 60 optimizer steps.

In [6]:
# Decide the mixed-precision mode once, using Unsloth's capability check
# (imported with the other dependencies at the top of the notebook).
# torch.cuda.is_bf16_supported() may count emulated bf16 as supported on
# older GPUs, so the Unsloth helper is the safer source of truth here.
use_bf16 = is_bfloat16_supported()

# TrainingArguments is already imported in the notebook's import cell.
training_args = TrainingArguments(
    # Start from the default per-device batch size and let the trainer shrink
    # it on CUDA out-of-memory errors. Gradient accumulation stays at 8, so
    # the effective batch size shrinks with it.
    auto_find_batch_size=True,
    # per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=20,
    num_train_epochs=3,
    learning_rate=0.0001,
    # Exactly one of fp16 / bf16 is enabled.
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    output_dir="outputs",
    report_to=["tensorboard"],
)

<a name="Train"></a>
### Train the model

[Hugging Face's TRL](https://huggingface.co/docs/trl/index) offers a user-friendly API for building SFT models and training them on your dataset with just a few lines of code. Here you will use Hugging Face TRL's `SFTTrainer` class to train the model. This class inherits from the `Trainer` class available in the Transformers library, but is specifically optimized for supervised fine-tuning (instruction tuning). Read more about `SFTTrainer` in the [official TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer).

In [7]:
from transformers import DataCollatorForSeq2Seq

# Arguments for the supervised fine-tuning trainer. No eval_dataset is passed
# (dataset_test stays unused here), so only the training split is consumed.
sft_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "train_dataset": dataset_train,
    "dataset_text_field": "prompt",  # dataset column holding the training text
    "max_seq_length": max_seq_length,
    "dataset_num_proc": 2,
    "packing": False,
    "args": training_args,
}
trainer = SFTTrainer(**sft_kwargs)

In [8]:
# Sanity check on a single training batch: its first sequence must not exceed
# max_seq_length, and its decoded text is printed for visual inspection.
dl = trainer.get_train_dataloader()
batch = next(iter(dl), None)
if batch is not None:
    assert len(batch["input_ids"][0]) <= max_seq_length
    print(tokenizer.decode(batch["input_ids"][0]))
    # The dataloader is only needed for this peek; drop the reference.
    del dl
# Drop the batch reference as well so it does not linger in the kernel.
del batch

<bos><start_of_turn>user
Provide a direct translation of the following text from **italian** to **interlingua**, without any additions, explanations, or interpretations.

L'Elba (localmente e ufficialmente: Isola d'Elba; latino classico: Ilva, dall'etrusco: iLUA "ferroso"; in greco: Αἰθαλία, Aithalía, da αἴθαλē "fuliggine") è un'isola mediterranea situata nel Canale di Piombino, circa 10 chilometri a est della costa continentale italiana, il Mar Tirreno a sud, il Mar Ligure a nord e il Canale di Corsica a ovest. L'Elba è la terza isola più grande d'Italia dopo la Sardegna e la Sicilia, e la sua capitale storica è Portoferraio, antico centro abitato di origine paleolitica, poi etrusco e romano, anche chiamata cosmopoli. La destinazione di intrattenimento e divertimento per le vacanze estive è quella di essere un luogo di mare e di montagna, nonché produttrice di buon vino autoctono, in realtà la sua prima fortuna venne storicamente dal commercio siderurgico, grazie alle colline ricche d

Now, let's start the fine-tuning process by calling `trainer.train()`, which uses `SFTTrainer` to handle the training loop, including data loading, forward and backward passes, and optimizer steps, all configured according to the settings you've provided.

In [None]:
# Run garbage collection and empty the CUDA cache before training starts.
_collect()
# Launch training; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 25,596 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 8
\        /    Total batch size = 64 | Total steps = 1,200
 "-____-"     Number of trainable parameters = 755,957,760
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 25,596 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 2,397
 "-____-"     Number of trainable parameters = 755,957,760


Step,Training Loss


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 25,596 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 4,797
 "-____-"     Number of trainable parameters = 755,957,760


Step,Training Loss
10,3.0337
20,2.542
30,2.2994
40,2.1868
50,2.1126
60,2.0695
70,2.0077
80,1.9594
90,1.9529
100,1.901


AUTOTUNE bmm(16x256x256, 16x256x256)
  bmm 0.0225 ms 100.0% 
  triton_bmm_5 0.0297 ms 75.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_13 0.0297 ms 75.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_15 0.0297 ms 75.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8
  triton_bmm_9 0.0348 ms 64.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_18 0.0348 ms 64.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8


### Save the model locally

After training is complete, save the fine-tuned model and its tokenizer by calling `save_pretrained(new_model)` on each. This writes the weights and configuration files to the directory named by `new_model` (here **gemma-2-2b-it_unsloth_ia_trans-desc_v7-3epoch-lm_head**). You can reload and use the fine-tuned model later for inference or further training.

In [None]:
# Output directory name: the last path component of model_id plus a run-specific suffix.
new_model = f"{os.path.basename(model_id)}_unsloth_ia_trans-desc_v7-3epoch-lm_head"
# Write the model files into that directory.
model.save_pretrained(new_model)
# Write the tokenizer files next to them; as the last expression of the cell,
# the returned tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained(new_model)

('gemma-2-2b-it_unsloth_ia_trans-desc_v7-3epoch-lm_head/tokenizer_config.json',
 'gemma-2-2b-it_unsloth_ia_trans-desc_v7-3epoch-lm_head/special_tokens_map.json',
 'gemma-2-2b-it_unsloth_ia_trans-desc_v7-3epoch-lm_head/tokenizer.model',
 'gemma-2-2b-it_unsloth_ia_trans-desc_v7-3epoch-lm_head/added_tokens.json',
 'gemma-2-2b-it_unsloth_ia_trans-desc_v7-3epoch-lm_head/tokenizer.json')