In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and the output directories under
# /content/drive are reachable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install -q transformers datasets accelerate peft bitsandbytes trl sentencepiece

In [None]:
!pip install -U \
  transformers==4.40.2 \
  peft==0.11.1 \
  accelerate==0.30.1 \
  bitsandbytes==0.43.1 \
  datasets==2.16.1 \
  fsspec==2023.6.0 \
  gcsfs==2023.6.0

Collecting transformers==4.40.2
  Using cached transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
Collecting peft==0.11.1
  Using cached peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting accelerate==0.30.1
  Using cached accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes==0.43.1
  Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting fsspec==2023.6.0
  Downloading fsspec-2023.6.0-py3-none-any.whl.metadata (6.7 kB)
Collecting gcsfs==2023.6.0
  Downloading gcsfs-2023.6.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.2)
  Using cached tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow-hotfix (from datasets==2.16.1)
  Using cached pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Downloading transformers-

In [None]:
import json
import os

import torch
from datasets import Dataset, concatenate_datasets, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
# MODEL_NAME = "google/gemma-2b-it"  # Use gemma-4b-it if you have >24GB GPU (QLoRA recommended for that)
# Never paste a Hugging Face token into the notebook. Export HF_TOKEN in the
# environment (or store it as a Colab secret) before running. setdefault keeps
# a token that is already set instead of overwriting it with an empty string.
os.environ.setdefault('HF_TOKEN', "")
MODEL_NAME = "google/flan-t5-large"
# Root directory that is scanned for <lang_pair>/train.jsonl files.
DATASET_DIR = "/content/drive/MyDrive/DL_project/data/references/train"
# Maximum token length used for truncation/padding during tokenization.
MAX_LEN = 1024

In [None]:
def load_jsonl_dataset(data_dir):
    """Collect every ``<data_dir>/<lang_pair>/train.jsonl`` into one Dataset.

    Args:
        data_dir: Directory whose immediate entries are scanned; an entry
            contributes records only if it contains a ``train.jsonl`` file.

    Returns:
        A ``Dataset`` built from all parsed JSON records, concatenated in
        sorted entry-name order.
    """
    all_data = []
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the resulting dataset order differ between runs/filesystems.
    for lang_pair in sorted(os.listdir(data_dir)):
        train_path = os.path.join(data_dir, lang_pair, "train.jsonl")
        if not os.path.exists(train_path):
            continue
        with open(train_path, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads raise on an empty string.
            all_data.extend(json.loads(line) for line in f if line.strip())
    return Dataset.from_list(all_data)

In [None]:
# Read every train.jsonl found under DATASET_DIR into a single Dataset.
raw_dataset = load_jsonl_dataset(data_dir=DATASET_DIR)

In [None]:
def format_prompt(example):
    """Render one record as a single "text" string.

    The string is a source section followed, after a blank line, by a
    translation section; each section header carries the record's locale.
    """
    source_section = "### Source ({}):\n{}".format(
        example['source_locale'], example['source']
    )
    translation_section = "### Translation ({}):\n{}".format(
        example['target_locale'], example['target']
    )
    return {"text": "\n\n".join((source_section, translation_section))}

# Apply format_prompt to every record of raw_dataset.
formatted_dataset = raw_dataset.map(function=format_prompt)


Map:   0%|          | 0/32962 [00:00<?, ? examples/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Borrow EOS as the pad token only when the tokenizer has none. Overriding an
# existing pad token with EOS makes pad-masking collators also mask the genuine
# end-of-sequence token in the labels.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Same name-based architecture heuristic that load_model() uses.
IS_SEQ2SEQ = "t5" in MODEL_NAME or "flan" in MODEL_NAME


def tokenize(example):
    """Tokenize a batch of formatted records (called with batched=True).

    Causal models: the whole "text" column (prompt + translation) is encoded.

    Encoder-decoder models: the encoder input is only the prompt part of
    "text" (everything before the reference translation) and the translation
    is encoded separately as ``labels``, so the answer is not fed to the
    encoder. Inputs and labels are both padded to MAX_LEN, so every row has
    the same width.

    NOTE: the ``labels`` column only takes effect with a collator that keeps
    existing labels (e.g. DataCollatorForSeq2Seq);
    DataCollatorForLanguageModeling(mlm=False) replaces labels with a copy of
    input_ids.
    """
    if not IS_SEQ2SEQ:
        return tokenizer(example["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

    references = [str(target) for target in example["target"]]
    # format_prompt() ends "text" with the target, so slicing the target off
    # the end leaves exactly the prompt.
    prompts = [text[: len(text) - len(ref)] for text, ref in zip(example["text"], references)]

    model_inputs = tokenizer(prompts, truncation=True, padding="max_length", max_length=MAX_LEN)
    encoded_refs = tokenizer(
        text_target=references, truncation=True, padding="max_length", max_length=MAX_LEN
    )
    # -100 is the label index ignored by the loss, so padding does not contribute.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in ids]
        for ids in encoded_refs["input_ids"]
    ]
    return model_inputs


tokenized_dataset = formatted_dataset.map(tokenize, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/32962 [00:00<?, ? examples/s]

In [None]:
# Commented-out causal-LM loader variant for Gemma (not executed)
# def load_model(full_finetune=True):
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_NAME,
#         device_map="auto",
#         load_in_4bit=not full_finetune,
#         torch_dtype=torch.float16,
#         trust_remote_code=True
#     )

#     if full_finetune:
#         return model

#     model = prepare_model_for_kbit_training(model)

#     lora_config = LoraConfig(
#         r=8,
#         lora_alpha=16,
#         target_modules=["q_proj", "v_proj"],
#         lora_dropout=0.05,
#         bias="none",
#         task_type="CAUSAL_LM"
#     )

#     return get_peft_model(model, lora_config)


In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import get_peft_model, prepare_model_for_kbit_training

from peft import LoraConfig
# LoRA adapter hyper-parameters, consumed by the QLoRA branch of load_model().
_lora_settings = dict(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],  # adjust to the attention projection names of your model
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
lora_config = LoraConfig(**_lora_settings)

def load_model(full_finetune=True):
    """Load MODEL_NAME for full fine-tuning or as a 4-bit QLoRA model.

    Args:
        full_finetune: When True, load the checkpoint without quantization and
            return it as-is. When False, load it 4-bit (NF4) quantized,
            prepare it for k-bit training and wrap it with LoRA adapters.

    Returns:
        The loaded model; the PEFT-wrapped model when ``full_finetune`` is False.
    """
    model_name = MODEL_NAME  # module-level constant from the config cell

    # Name-based architecture detection, case-insensitive so spellings such as
    # "T5" are still recognised as encoder-decoder checkpoints.
    lowered_name = model_name.lower()
    is_seq2seq = "t5" in lowered_name or "flan" in lowered_name
    model_class = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM

    if full_finetune:
        # Full fine-tuning: no quantization, no adapters.
        return model_class.from_pretrained(model_name, device_map="auto")

    # QLoRA: 4-bit NF4 base weights with double quantization, fp16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    model = model_class.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
    )

    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    if is_seq2seq:
        peft_config = lora_config
    else:
        # The module-level lora_config targets "q"/"v" with a seq2seq task type.
        # For causal models reuse its hyper-parameters but switch the task type
        # and projection names. The names match the Gemma settings in the
        # commented-out loader; other causal architectures may name these
        # modules differently -- TODO confirm per model.
        peft_config = LoraConfig(
            r=lora_config.r,
            lora_alpha=lora_config.lora_alpha,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=lora_config.lora_dropout,
            bias=lora_config.bias,
            task_type="CAUSAL_LM",
        )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model


In [None]:
# Toggle the training mode: True -> full fine-tuning, False -> LoRA.
FULL_FINETUNE = True
model = load_model(full_finetune=FULL_FINETUNE)



In [None]:
# Mixed-precision choice. T5-family checkpoints overflow to inf/NaN under fp16,
# and the Trainer's NaN/inf logging filter then reports those steps as a
# training loss of 0.0. For such checkpoints use bf16 when the GPU supports it
# and plain fp32 otherwise; every other architecture keeps fp16.
_lowered_model_name = MODEL_NAME.lower()
_is_t5_family = "t5" in _lowered_model_name or "flan" in _lowered_model_name
_bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # effective batch size of 2 per device
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,  # keep only the two most recent checkpoints
    evaluation_strategy="no",
    learning_rate=2e-5,
    fp16=not _is_t5_family,
    bf16=_is_t5_family and _bf16_supported,
    gradient_checkpointing=True,
    report_to="none"
)

In [None]:
# Pick the collator that matches how the dataset was tokenized.
if "labels" in tokenized_dataset.column_names:
    # The dataset carries its own labels: DataCollatorForSeq2Seq keeps them
    # (padding with -100) and lets the model derive decoder_input_ids.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
else:
    # No labels column: DataCollatorForLanguageModeling(mlm=False) sets labels
    # to a copy of input_ids with padding masked out.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Wire model, hyper-parameters, data and batching together, then run training.
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**_trainer_kwargs)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0848
70,0.0
80,0.0
90,0.0
100,0.0


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [None]:
# Persist the model weights and the tokenizer files side by side on Drive.
FINETUNED_DIR = "/content/drive/MyDrive/DL_project/flan_t5_large_finetuned"
model.save_pretrained(FINETUNED_DIR)
tokenizer.save_pretrained(FINETUNED_DIR)