# General Notebook Configuration

In [None]:
# LoRA installation
%%capture
! pip install --no-deps peft bitsandbytes
! pip install -U trl==0.9.4

# Import LoRA Packages
from peft import LoraConfig, TaskType, get_peft_model
from trl import SFTTrainer
from transformers import BitsAndBytesConfig

In [None]:
# Import General Packages
from tqdm import tqdm
from datasets import Dataset
from typing import Union
from transformers import (
    NllbTokenizerFast,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    EarlyStoppingCallback,
    DataCollatorForSeq2Seq
)
import json
import io
import random
import numpy as np
import torch
import os
import datasets
import datetime
import gc

In [None]:
# Color messages function
def print_mess (mess:str, color="green") -> None:
  """Print `mess` wrapped in the ANSI escape sequence of the requested color.

  Args:
    mess: Text to print.
    color: One of "green", "red", "yellow" or "white".

  Any other `color` value prints the message without coloring instead of
  silently dropping it.
  """
  ansi_codes = {
      "green": "\033[92m",
      "red": "\033[91m",
      "yellow": "\033[93m",
      "white": "\033[97m",
  }
  # Non-string values (possibly unhashable) are treated like an unknown color
  code = ansi_codes.get (color) if isinstance (color, str) else None
  if code is None:
    print (mess)
  else:
    print (f"{code}{mess}\033[0m")

In [None]:
# Environment identification: an ImportError on the Colab / Google API client
# imports is taken to mean the notebook is running outside Google Colab.
try:
  from google.colab import auth
  from googleapiclient.errors import HttpError
  from googleapiclient.discovery import build
  from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload, MediaFileUpload
except ImportError:
  IS_COLAB = False
  print_mess ("Notebook not running in Gooogle Colaboratory Environment.", "yellow")
else:
  # Every import succeeded
  IS_COLAB = True
  print_mess ("Running in Google Colaboratoy Environment.")

[92mRunning in Google Colaboratoy Environment.[0m


In [None]:
# Google connection and authentication (only performed when IS_COLAB is True).
# `auth` and `build` are the names imported by the environment-identification cell.
if IS_COLAB:
  auth.authenticate_user ()
  # Drive v3 client; the search, download and upload helpers below read this global
  drive_service = build ("drive","v3")

In [None]:
# Set Seeds for Reproducibility: the same value goes to every generator in use
seed = 3407
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
  seed_fn (seed)
# CUDA generators are seeded only when a GPU is visible
if torch.cuda.is_available ():
  torch.cuda.manual_seed_all (seed)

# Downloading the Datasets

In [None]:
# Main folder ID: Google Drive folder whose children are searched for dataset folders
main_folder_id = "1jDVh28jOvtTU6Wsz_ZhlYrUT1plT9KXb"

# Common Dataset Folder Name: matched with `name contains` by search_folder
common_dataset_folder_name = "RMI_Intentions_Dataset_"

# Dataset Names. Order matters: the processing cell reads index 0 as the train
# split and index 1 as the test split.
dataset_names = [
    "RMI_Intentions_Train_Dataset.json",
    "RMI_Intentions_Test_Dataset.json"
]

# Dataset download directory (created here; an existing directory is accepted)
dataset_download_dir = "/content/Datasets"
os.makedirs(dataset_download_dir, exist_ok=True)

In [None]:
# Identify most recent Dataset Folder function.
def search_folder (main_folder_id:str, initial_folder_name:str) -> Union[None, str]:
  """Return the ID of the newest matching sub-folder in Google Drive.

  Lists, through the module-level `drive_service` client, the folders whose
  parent is `main_folder_id` and whose name contains `initial_folder_name`,
  ordered by creation time (descending).

  Returns:
    The ID of the first listed folder, or None when the listing is empty.
  """
  # Only folders, only direct children of the parent, only matching names
  folder_query = (f"'{main_folder_id}' in parents "
                  f"and mimeType = 'application/vnd.google-apps.folder' "
                  f"and name contains '{initial_folder_name}'")
  list_request = drive_service.files ().list (
      q = folder_query,
      spaces = "drive",
      fields = "nextPageToken, files(id, name, createdTime)",
      orderBy = "createdTime desc"
  )
  response = list_request.execute ()

  # The listing is requested newest first, so entry 0 is the most recent folder
  folders = response.get ("files", [])
  try:
    return folders[0]["id"]
  except IndexError:
    return None

In [None]:
# Identify Dataset function
def search_dataset (dataset_name:str, dataset_folder_id:str) -> Union[None, str]:
  """Return the ID of the newest file named `dataset_name` inside a Drive folder.

  Lists, through the module-level `drive_service` client, the children of
  `dataset_folder_id` whose name equals `dataset_name`, ordered by creation
  time (descending).

  Returns:
    The ID of the first listed file, or None when the listing is empty.
  """
  file_query = f"'{dataset_folder_id}' in parents and name = '{dataset_name}'"
  list_request = drive_service.files ().list (
      q = file_query,
      spaces = "drive",
      fields = "nextPageToken, files(id, name, createdTime)",
      orderBy = "createdTime desc"
  )
  response = list_request.execute ()

  # The listing is requested newest first, so entry 0 is the most recent file
  candidates = response.get ("files", [])
  try:
    return candidates[0]["id"]
  except IndexError:
    return None

In [None]:
# Download file function
def download_and_save_file (file_id:str, file_name:str, local_path="") -> Union[None, str]:
  """Download a Drive file, store it locally and return its content as text.

  Args:
    file_id: ID of the file, fetched through the module-level `drive_service`.
    file_name: Name used for the progress bar and for the local file.
    local_path: Directory the file is written into (joined with `file_name`).

  Returns:
    The downloaded bytes decoded as UTF-8, or None when an HttpError is raised.
  """
  # The whole file is buffered in memory before it is written to disk
  buffer = io.BytesIO ()
  downloader = MediaIoBaseDownload (buffer, drive_service.files ().get_media (fileId = file_id))

  try:
    # Download Process: advance the bar by the progress gained with each chunk
    with tqdm (total = 100, unit = "%", desc = f"Downloading {file_name} file") as pbar:
      finished = False
      while not finished:
        status, finished = downloader.next_chunk ()
        pbar.update (int (status.progress () * 100) - pbar.n)

    # Writing Process in the Environment
    payload = buffer.getvalue ()
    with open (os.path.join (local_path, file_name), "wb") as f:
      f.write (payload)

    return payload.decode ("utf-8")
  except HttpError:
    return None

In [None]:
# Download Dataset Files from Google Drive.
# Steps: locate the newest dataset folder, resolve the ID of every file listed in
# `dataset_names`, then download each file into `dataset_download_dir`.
# On success `dataset_file_list` holds the decoded file contents in the same
# order as `dataset_names`. Any failure is reported and stops the remaining steps.
if IS_COLAB:
  try:
    # Identify dataset folder ID
    print_mess ("Search datasets folder...", "yellow")
    dataset_folder_id = search_folder (main_folder_id = main_folder_id,
                                       initial_folder_name = common_dataset_folder_name)
    if dataset_folder_id is None:
      raise ValueError ("The folder was not found.")
    print_mess (f"The most recent dataset folder ID is: {dataset_folder_id}\n")

    # Search Train and Test Datasets
    print_mess ("Search datasets...", "yellow")
    dataset_id_list = []
    for name in dataset_names:
      dataset_id = search_dataset (dataset_folder_id = dataset_folder_id,
                                   dataset_name = name)
      if dataset_id is None:
        raise ValueError (f"The '{name}' was not found.")
      print_mess (f"The file '{name}' was found with ID: {dataset_id}")
      dataset_id_list.append(dataset_id)

    # Download and Save Dataset files
    print_mess ("\nDownload and Save Datasets...", "yellow")
    dataset_file_list = []
    # Names and IDs are paired by position: looking the ID up again with
    # `dataset_names.index (name)` rescans the list on every iteration and picks
    # the wrong ID when a name appears more than once.
    for name, dataset_id in zip (dataset_names, dataset_id_list):
      dataset_file = download_and_save_file (file_id = dataset_id,
                                             file_name = name,
                                             local_path = dataset_download_dir)
      if dataset_file is None:
        raise ValueError (f"The '{name}' file could not be downloaded and saved.")
      dataset_file_list.append(dataset_file)
  except Exception as e:
    print_mess ("Something went wrong.", "red")
    print_mess (f"Error: {e}", "yellow")

[93mSearch datasets folder...[0m
[92mThe most recent dataset folder ID is: 1PdTlqDHuvyHNZ0nkKl_JSr3_BFTr3iF8
[0m
[93mSearch datasets...[0m
[92mThe file 'RMI_Intentions_Train_Dataset.json' was found with ID: 12wBcKG5u0U8LE-wQOllStNyblhRjtvdD[0m
[92mThe file 'RMI_Intentions_Test_Dataset.json' was found with ID: 1j-GogIOl0CI2PDA65JKjPlG8AkzwUZFC[0m
[93m
Download and Save Datasets...[0m


Downloading RMI_Intentions_Train_Dataset.json file: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:00<00:00, 146.06%/s]
Downloading RMI_Intentions_Test_Dataset.json file: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:01<00:00, 94.91%/s]


# Load the Model

In [None]:
# Max Sequence Length: passed as `max_length` to the tokenizer in tokenize_function,
# which truncates and pads the inputs (source) and the labels (target) to these sizes.
max_source_length = 300
max_target_length = 1024 # 2048

In [None]:
# QLoRA Configuration: 4-bit NF4 quantization with double quantization
quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
}
# bfloat16 compute is requested only when a CUDA device is visible; otherwise
# the compute dtype is left at the library default
if torch.cuda.is_available ():
  quantization_kwargs["bnb_4bit_compute_dtype"] = torch.bfloat16

bnb_config = BitsAndBytesConfig (**quantization_kwargs)

In [None]:
# Define the model name (Hugging Face Hub identifier of the base checkpoint)
model_name = 'facebook/nllb-200-distilled-600M'

# Load the model and tokenizer.
# The model is loaded with the quantization settings from `bnb_config`;
# device placement is delegated to the library through device_map = "auto".
model = AutoModelForSeq2SeqLM.from_pretrained (
    model_name,
    quantization_config=bnb_config,
    device_map = "auto"
)
# Tokenizer of the same checkpoint; read as a global by tokenize_function
tokenizer = NllbTokenizerFast.from_pretrained (model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [None]:
# Clean Memory: run the Python garbage collector, then ask PyTorch to release
# its cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

# LoRA

In [None]:
# LoRA Parameters
# `rank` is passed as `r` (and doubled for `lora_alpha`) in the LoRA configuration cell.
rank = 16
# NOTE(review): "droupout" is a misspelling of "dropout"; the name is kept because
# the LoRA configuration cell reads this variable.
droupout = 0.05
# Names of the modules that receive LoRA adapters (passed as `target_modules`)
module_list = [
    "q_proj",
    "v_proj",
    "k_proj",
    "out_proj",
    "fc1",
    "fc2"
]

In [None]:
# LoRA Configuration, built from the values of the LoRA parameters cell
# (`rank`, `module_list`, `droupout`). The scaling factor `lora_alpha` is set to
# twice the rank and no bias parameters are trained.
lora_config = LoraConfig (
    r = rank,
    lora_alpha = rank * 2,
    target_modules = module_list,
    lora_dropout = droupout,
    bias = "none",
    task_type = TaskType.SEQ_2_SEQ_LM,
    use_rslora = False,
    loftq_config = None,
)

In [None]:
# Apply LoRA configuration
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# hands the already wrapped model to get_peft_model again; reload the base model
# first when the cell has to be executed a second time.
# model.gradient_checkpointing_enable()
model = get_peft_model (model, lora_config)
# Report how many parameters are trainable after wrapping
model.print_trainable_parameters ()

trainable params: 8,650,752 || all params: 623,724,544 || trainable%: 1.3870


# Processing Datasets

In [None]:
# Processing Train and Test Datasets Function
def processing_dataset (raw_dataset:dict) -> Union[None, list]:
  """Flatten raw conversations into input / target text pairs.

  Every item of `raw_dataset["conversations"]` is read as a sequence of turns
  whose "value" fields are, in order: system message, user message, model answer.

  Args:
    raw_dataset: Parsed dataset with a "conversations" entry.

  Returns:
    A list of dicts with the keys 'input_text' ("Instruction: " + system
    message + " " + user message) and 'target_text' (model answer), or None
    when the data does not have the expected structure: a missing key, a
    conversation with fewer than three turns, or values of the wrong type.
  """
  try:
    return [
        {
            # Input = system_message + user_operator
            'input_text': "Instruction: " + turns[0]["value"] + " " + turns[1]["value"],
            # Output = model_answer
            'target_text': turns[2]["value"]
        }
        for turns in raw_dataset["conversations"]
    ]
  except (KeyError, IndexError, TypeError):
    # Malformed data is reported with None, whichever lookup failed
    return None

In [None]:
# Dataset Tokenize Function
def tokenize_function (samples:datasets.formatting.formatting.LazyBatch) -> datasets.arrow_dataset.Dataset:
  """Tokenize a batch of `input_text` / `target_text` pairs for seq2seq training.

  Intended for `Dataset.map (..., batched = True)`: `samples["input_text"]` and
  `samples["target_text"]` are expected to be lists of strings. Uses the
  module-level `tokenizer`, `max_source_length` and `max_target_length`.

  Returns:
    The tokenizer output for the inputs with an extra "labels" entry holding the
    target token IDs, where every padding position is replaced by -100.
    NOTE(review): the value returned is the tokenizer's output mapping, not a
    `Dataset` as the return annotation states.
  """
  # Tokenize Input Text
  model_inputs = tokenizer (
      samples["input_text"],
      max_length = max_source_length,
      truncation = True,
      padding = "max_length"
  )

  # Tokenize Target Text (Labels)
  labels = tokenizer (
      samples["target_text"],
      max_length = max_target_length,
      truncation = True,
      padding = "max_length"
  )

  # The labels are padded here to `max_target_length` with the pad token, so the
  # data collator has nothing left to pad and never inserts its ignore index.
  # Padding positions are therefore masked with -100 at this point; otherwise
  # they would be scored by the loss like real target tokens.
  pad_id = tokenizer.pad_token_id
  model_inputs["labels"] = [
      [(token_id if token_id != pad_id else -100) for token_id in label_ids]
      for label_ids in labels["input_ids"]
  ]
  return model_inputs

In [None]:
# Processing Datasets
# `dataset_file_list` holds the downloaded JSON texts: index 0 is the train split
# and index 1 is the test split.
try:
  print_mess ("Processing Train Dataset...", "yellow")
  train_records = processing_dataset (json.loads (dataset_file_list[0]))
  # Validate before building the Dataset: processing_dataset signals malformed
  # data with None, which Dataset.from_list cannot take
  if train_records is None:
    raise ValueError ("Train dataset could not be processed.")
  train_dataset = Dataset.from_list (train_records)
  print_mess ("Train Dataset.")
  print (train_dataset)

  print_mess ("\nProcessing Test Dataset...", "yellow")
  test_records = processing_dataset (json.loads (dataset_file_list[1]))
  # The check has to look at the test split (not at the train split again)
  if test_records is None:
    raise ValueError ("Test dataset could not be processed.")
  test_dataset = Dataset.from_list (test_records)
  print_mess ("Test Dataset.")
  print (test_dataset)
except Exception as e:
  print_mess ("Something went wrong.", "red")
  print_mess (f"Error: {e}", "yellow")

[93mProcessing Train Dataset...[0m
[92mTrain Dataset.[0m
Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 242
})
[93m
Processing Test Dataset...[0m
[92mTest Dataset.[0m
Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 81
})


In [None]:
# Tokenize Datasets
# The raw text columns are removed so that only the tokenizer outputs remain
text_columns = ["input_text", "target_text"]

print_mess ("Tokenizing Train Dataset...", "yellow")
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=list (text_columns)
)

print_mess ("\nTokenizing Test Dataset...", "yellow")
tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=list (text_columns)
)

# Present Tokenized Datasets
for title, tokenized_split in (
    ("\nTrain Dataset.", tokenized_train_dataset),
    ("\nTest Dataset.", tokenized_test_dataset),
):
  print_mess (title)
  print (tokenized_split)

[93mTokenizing Train Dataset...[0m


Map:   0%|          | 0/242 [00:00<?, ? examples/s]

[93m
Tokenizing Test Dataset...[0m


Map:   0%|          | 0/81 [00:00<?, ? examples/s]

[92m
Train Dataset.[0m
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 242
})
[92m
Test Dataset.[0m
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 81
})


# Setting Up the Fine-Tuning

In [None]:
# Define TrainingArguments
training_args = TrainingArguments (
    # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step and device
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 4,
    max_grad_norm = 0.3,
    warmup_steps = 100, # 5, # 50, # 100,
    # The run length is bounded by `max_steps`; the epoch-based limit is disabled
    # num_train_epochs = 100,
    max_steps = 1300, # 100, # 500,
    learning_rate = 2e-4,
    logging_steps = 1,
    # Evaluate every 100 steps; the early-stopping callback and the best-model
    # selection below rely on these evaluations
    eval_strategy = "steps",
    eval_steps = 100,
    optim = "paged_adamw_8bit", # optim = "adamw_8bit", # optim = "adamw_torch"
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    # Same seed as the one set in the reproducibility cell
    seed = seed,
    output_dir="outputs",
    report_to = "none",
    # NOTE(review): the recorded run reports a Tesla T4 GPU; confirm that bf16 is
    # the intended precision for that hardware
    bf16 = True,
    dataloader_num_workers = 2,
    # Restore the checkpoint with the best `eval_loss` when training ends
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss"
)

In [None]:
# Callback to prevent Overfitting.
# Configured with a patience of 5 and an improvement threshold of 0.0001
# (presumably counted in evaluations of `metric_for_best_model`; see the
# EarlyStoppingCallback documentation).
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.0001
)

# Instantiate the Trainer with the training arguments, the already tokenized
# train / test datasets, the early-stopping callback and a seq2seq data collator.
trainer = SFTTrainer (
    model = model,
    tokenizer = tokenizer,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset,
    args = training_args,
    callbacks=[early_stopping_callback],
    # The collator pads each batch and uses -100 as the padding value for labels
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding=True,
        label_pad_token_id=-100
    ),
    packing=False,
)

  super().__init__(


# Performing Fine-Tuning

In [None]:
# Current memory stats of CUDA device 0, expressed in GB (1 GB = 1024**3 bytes here).
# `start_gpu_memory` and `max_memory` are read again by the final stats cell.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
3.588 GB of memory reserved.


In [None]:
# Start training.
# The returned object is kept because the final stats cell reads
# `trainer_stats.metrics['train_runtime']`.
trainer_stats = trainer.train()

Step,Training Loss,Validation Loss
100,4.1762,3.735336
200,3.0141,2.753447
300,3.8372,2.630855
400,3.1262,2.573314
500,2.453,2.546657
600,2.8915,2.528442


Step,Training Loss,Validation Loss
100,4.1762,3.735336
200,3.0141,2.753447
300,3.8372,2.630855
400,3.1262,2.573314
500,2.453,2.546657
600,2.8915,2.528442
700,3.4444,2.514242
800,2.8912,2.506902
900,3.1624,2.499517
1000,2.9743,2.49474


In [None]:
# Final memory and Time Stats
# Peak reserved memory after training, compared with the value measured before
# training (`start_gpu_memory`) and with the device total (`max_memory`).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
report_lines = (
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
  print(report_line)

15112.0412 seconds used for training.
251.87 minutes used for training.
Peak reserved memory = 14.559 GB.
Peak reserved memory for training = 10.971 GB.
Peak reserved memory % of max memory = 98.765 %.
Peak reserved memory for training % of max memory = 74.425 %.


# Save Model

---

Ajustar código de abajo, optimizarlo.

In [None]:
# Save file parameters
def gen_parameters() -> list:
  """Build the name and the destination used to store the trained model.

  Returns:
    A two-item list: a folder name of the form
    "Translation_Intent_Model_<YYYYmmdd_HHMMSS>" built from the current time,
    and the destination, which is a fixed identifier string when IS_COLAB is
    True (used by create_folder as the Drive parent) and the relative directory
    "Model_Versions_QLoRA" otherwise.
  """
  timestamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
  destination = "1hg2OzG02rISe2O0LvmPvwGZP6pvKKu-8" if IS_COLAB else "Model_Versions_QLoRA"
  return [f"Translation_Intent_Model_{timestamp}", destination]

In [None]:
# Create folder function
def create_folder() -> str:
  """Create the folder that will hold the saved model.

  The folder name and its parent come from gen_parameters(). When IS_COLAB is
  True the folder is created in Google Drive through `drive_service`; otherwise
  it is created on the local file system.

  Returns:
    The Drive folder ID (Colab) or the local folder path, or "" when the
    folder could not be created.
  """
  # Generate folder parameters
  [folder_name, folder_path] = gen_parameters()

  print_mess("Creating folder...", "green")
  if IS_COLAB:
    try:
      folder_metadata = {
          "name": folder_name,
          "mimeType": "application/vnd.google-apps.folder",
          "parents": [folder_path]
      }
      folder = drive_service.files().create(
          body = folder_metadata,
          fields = "id"
      ).execute()
      folder_id = folder.get("id")
      print_mess(f"Folder '{folder_name}' created correctly with ID: {folder_id}", "green")

      return folder_id
    except HttpError as e:
      print_mess("Something went wrong", "red")
      print_mess(f"Error: {e}", "yellow")
      return ""
  else:
    try:
      # Join the folder name exactly once: the returned path has to be the
      # directory that was actually created
      local_folder = os.path.join(folder_path, folder_name)
      os.makedirs(local_folder, exist_ok=True)
      print_mess(f"Folder '{folder_name}' created correctly", "green")

      return local_folder
    except Exception as e:
      print_mess("Something went wrong", "red")
      print_mess(f"Error: {e}", "yellow")
      return ""

In [None]:
def save_model_files(model_output_dir, folder_id):
  """Upload the files of a local directory to a Google Drive folder.

  Every regular file located directly inside `model_output_dir` is uploaded,
  through the module-level `drive_service`, with `folder_id` as its parent.
  Entries that are not regular files (such as sub-directories) are skipped.
  """
  for filename in os.listdir(model_output_dir):
    filepath = os.path.join(model_output_dir, filename)
    if not os.path.isfile(filepath):
      continue

    file_metadata = {'name': filename, 'parents': [folder_id]}
    media = MediaFileUpload(filepath, mimetype='application/octet-stream')
    upload_request = drive_service.files().create(
        body=file_metadata,
        media_body=media,
        fields='id')
    uploaded = upload_request.execute()
    print_mess(f"File '{filename}' uploaded to Google Drive with ID: {uploaded.get('id')}", "green")

In [None]:
# Save model and tokenizer: locally when not in Colab, otherwise in a temporary
# directory whose files are then uploaded to Google Drive
if not IS_COLAB:
  print_mess("Not running in Google Colaboratory Environment. Model will be saved locally.", "yellow")
  try:
    # Local destination: <base folder>/<timestamped folder name>
    folder_name, folder_path_base = gen_parameters()
    output_dir_local = os.path.join(folder_path_base, folder_name)
    os.makedirs(output_dir_local, exist_ok=True)

    # Write model and tokenizer files into the local directory
    model.save_pretrained(output_dir_local)
    tokenizer.save_pretrained(output_dir_local)
    print_mess(f"Model saved successfully in local directory: {output_dir_local}", "yellow")
  except Exception as err:
    print_mess("Something went wrong during local saving.", "red")
    print_mess(f"Error: {err}", "yellow")
else:
  print_mess("Saving model and uploading to Google Drive...", "yellow")
  try:
    # Destination folder in Drive; an empty result means it could not be created
    folder_id = create_folder()
    if not folder_id:
      raise ValueError("Could not create Google Drive folder.")

    # Temporary directory in the Colab environment
    output_dir = "model_trained_temp"
    os.makedirs(output_dir, exist_ok=True)

    # Write model and tokenizer files into the temporary directory
    model.save_pretrained(output_dir, max_shard_size="1GB", safe_serialization=True)
    tokenizer.save_pretrained(output_dir)
    print_mess(f"Model and tokenizer saved successfully in temporary environment directory: {output_dir}", "yellow")

    # Release cached CUDA memory and collect garbage before uploading
    torch.cuda.empty_cache()
    gc.collect()
    print_mess("Attempted memory cleanup after saving.", "yellow")

    # Send every saved file to the Drive folder
    print_mess("Uploading model files to Google Drive...", "white")
    save_model_files(output_dir, folder_id)
    print_mess("Model files saved successfully in Google Drive", "green")
  except Exception as err:
    print_mess("Something went wrong during saving or uploading.", "red")
    print_mess(f"Error: {err}", "yellow")

[93mSaving model and uploading to Google Drive...[0m
[92mCreating folder...[0m
[92mFolder 'Translation_Intent_Model_20251007_203533' created correctly with ID: 1fTHHVV1x6x1sbjJ-7AWttW6BbLUwyf7b[0m
[93mModel and tokenizer saved successfully in temporary environment directory: model_trained_temp[0m
[93mAttempted memory cleanup after saving.[0m
[97mUploading model files to Google Drive...[0m
[92mFile 'tokenizer.json' uploaded to Google Drive with ID: 1nfBILXLrXda0SzSNaPt3L0wTeAzS13CT[0m
[92mFile 'adapter_config.json' uploaded to Google Drive with ID: 1meSUb4QPavg7-D1ESH6562O0q-54YDv6[0m
[92mFile 'sentencepiece.bpe.model' uploaded to Google Drive with ID: 1FjVF3KvDrRWx7wfxU84z-QTS70m_D7TS[0m
[92mFile 'tokenizer_config.json' uploaded to Google Drive with ID: 1OsW-HLvlodW-KCL90o70SEpWFOLWkEIO[0m
[92mFile 'special_tokens_map.json' uploaded to Google Drive with ID: 1pBlSWvlvvXzhkUTW0SbhtRbS4-QNPP8N[0m
[92mFile 'README.md' uploaded to Google Drive with ID: 1lGat2c808maxTJ

In [None]:
# Closing message of the notebook (printed in green, the default color)
print_mess("All done!")

[92mAll done![0m
