In [22]:
from transformers import TrainingArguments

# Print the variable names compiled into TrainingArguments.__init__
# (parameters first, followed by any locals) for the installed version.
init_code = TrainingArguments.__init__.__code__
print(init_code.co_varnames)

('self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'evaluation_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'no_cuda', 'use_cpu', 'use_mps_device', 'seed', 'data_seed', 'jit_mode_eval', 'use_ipex', 'bf16', 'fp16', 'fp16_opt_level', 'half_precision_backend', 'bf16_full_eval', 'fp16_full_eval', 'tf32', 'local_rank', 'ddp_

**What is PEFT?**
PEFT (Parameter-Efficient Fine-Tuning) is a method where only a small number of parameters are trained (e.g., adapters or LoRA layers), while keeping the large pretrained model frozen.

This is especially useful when working with large language models like BioGPT on limited compute resources (like Google Colab or Kaggle).

💡**Key Reasons for Using PEFT in This Project**
1. Memory Efficiency
Fully fine-tuning BioGPT (~347M parameters) is very memory-intensive.

With PEFT (e.g., using LoRA), you train only a small fraction of the parameters (under one million in this notebook), saving a lot of VRAM and allowing training on free-tier GPUs.

2. Faster Training
Since most of the model is frozen, training is much faster and more stable.

Ideal for iterative experimentation and hyperparameter tuning.

3. Lower Risk of Overfitting
You reduce the risk of overfitting to small datasets.

The pretrained knowledge of BioGPT is retained, and only task-specific layers are adapted.

4. Modularity and Reusability
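PEFT adapter weights can be saved separately from the base model and loaded back onto the same base model later, so the large pretrained checkpoint never has to be duplicated.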

You can mix and match adapters, or fine-tune for other related tasks without starting from scratch.

> 

In [23]:
from transformers.training_args import TrainingArguments

In [25]:
from transformers import DataCollatorForLanguageModeling

# Imports

In [None]:
!pip install transformers==4.39.3 peft==0.10.0 accelerate==0.28.0 bitsandbytes

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_from_disk
import torch
import os

In [None]:
!pip uninstall -y transformers
!pip install transformers[torch]

In [28]:
from datasets import load_from_disk

# Kaggle input locations of the tokenized train and validation splits
train_path = "/kaggle/input/tokenized-data/tokenized_train/tokenized_train"
val_path = "/kaggle/input/tokenized-data/tokenized_val/tokenized_val"

# Read both saved splits back from disk (train first, then validation)
tokenized_train, tokenized_val = (
    load_from_disk(split_path) for split_path in (train_path, val_path)
)

# Report how many examples each split holds
print(f"Train size: {len(tokenized_train)}")
print(f"Validation size: {len(tokenized_val)}")

Train size: 8400
Validation size: 1800


load_from_disk is used to load datasets that were previously saved to disk.

train_path and val_path specify the locations where the tokenized training and validation datasets are stored.

tokenized_train = load_from_disk(train_path) loads the preprocessed training dataset.

tokenized_val = load_from_disk(val_path) loads the preprocessed validation dataset.

print(f"Train size: {len(tokenized_train)}") prints the number of examples in the training dataset.

print(f"Validation size: {len(tokenized_val)}") prints the number of examples in the validation dataset.

# Device configuration

In [29]:
# Prefer CUDA when PyTorch reports it as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


torch.device(...) sets the device where your PyTorch tensors and models will be placed (either GPU or CPU).

"cuda" if torch.cuda.is_available() else "cpu" checks if a GPU (CUDA) is available:

If yes, it uses the GPU ("cuda").

If not, it falls back to the CPU ("cpu").

device stores this information for later use in your code (e.g., moving your model or data to the correct device).

print("Using device:", device) simply prints which device (GPU or CPU) is being used.

# Load model and tokenizer

In [5]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [30]:
# Checkpoint name handed to both from_pretrained calls
model_name = "microsoft/BioGPT"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM weights, then move the model onto the selected device
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [32]:
# Ensure tokenizer has a pad token: when none is defined, reuse the EOS token
# and resize the model's token embeddings to the tokenizer's length.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))

# ⚙️ Apply LoRA (PEFT)

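This section applies LoRA fine-tuning through PEFT to make training efficient. Small low-rank adapter matrices are attached to the q_proj and v_proj attention projections, and only those adapters are trained while the original model weights stay frozen. This reduces memory usage and speeds up training.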

In [33]:
# LoRA settings for causal language modeling
lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # modules that receive LoRA adapters
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report the trainable parameter count
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 786,432 || all params: 347,549,696 || trainable%: 0.22627900672944337


This means:

Total model parameters: 347,549,696 → the full size of BioGPT.

Trainable parameters: 786,432 → only this small portion is being updated.

Trainable %: 0.23% → less than 1% of the model is trained.

This shows that PEFT (LoRA) fine-tunes the model efficiently by updating a tiny subset of parameters, saving compute and memory.

In [None]:
!pip install -U transformers

In [34]:
# Collator that batches examples for the Trainer.
# mlm=False: For causal language modeling like BioGPT
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training

In [35]:
# 🧪 Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
    fp16=True  # Enable if you're using GPU on Kaggle
)

output_dir="./results": Saves model checkpoints here.

evaluation_strategy="epoch": Evaluates the model after every epoch.

save_strategy="epoch": Saves checkpoints after each epoch.

logging_strategy="steps": Logs progress every few steps.

per_device_train_batch_size=4: Batch size of 4 during training per GPU.

per_device_eval_batch_size=4: Same for evaluation.

num_train_epochs=3: Trains for 3 complete passes through the dataset.

learning_rate=5e-5: Sets the learning rate for optimization.

weight_decay=0.01: Adds regularization to avoid overfitting.

logging_dir="./logs": Saves logs in this directory.

logging_steps=100: Logs training loss every 100 steps.

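save_total_limit=2: Keeps at most 2 checkpoints on disk; because load_best_model_at_end=True, the best checkpoint is always retained alongside the most recent one.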

load_best_model_at_end=True: Automatically loads the best checkpoint at the end (based on validation loss).

report_to="none": Disables logging to tools like TensorBoard or WandB.

fp16=True: Uses mixed precision (faster, less memory) if GPU is available.

💡 What's new or helpful here:
Enables efficient GPU usage (fp16=True).

Auto-loads best model to avoid manually checking validation scores.

Regular evaluation/saving/logging, helpful for tracking performance.

# Train the model

In [36]:
# 🧠 Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [37]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss
1,3.2804,3.144971
2,3.1794,3.055067
3,3.1224,3.032788




TrainOutput(global_step=3150, training_loss=3.284864235529824, metrics={'train_runtime': 5248.5615, 'train_samples_per_second': 4.801, 'train_steps_per_second': 0.6, 'total_flos': 2.34641386635264e+16, 'train_loss': 3.284864235529824, 'epoch': 3.0})

**What it means:**
1. Training Loss: Measures how well the model is fitting the training data.

2. Validation Loss: Measures how well the model generalizes to unseen (validation) data.

🧠 Interpretation:
Both losses decrease gradually, which is a positive sign.

The gap between training and validation loss is small, suggesting:

No overfitting yet.

The model is learning useful general patterns from the data.

📌 Summary:
The model consistently improves with each epoch, and the learning process remains stable. You could optionally train for more epochs to check if it continues improving, but it's already showing solid convergence.

In [51]:
# Write the tokenizer files under the export folder; the call returns the
# paths of the files it saved, which are shown as the cell output.
tokenizer_dir = "finetuned_biogpt/tokenizer"
tokenizer.save_pretrained(tokenizer_dir)

('finetuned_biogpt/tokenizer/tokenizer_config.json',
 'finetuned_biogpt/tokenizer/special_tokens_map.json',
 'finetuned_biogpt/tokenizer/vocab.json',
 'finetuned_biogpt/tokenizer/merges.txt',
 'finetuned_biogpt/tokenizer/added_tokens.json')

In [None]:
import shutil

# Create a ZIP file from the model directory; the archive is written next to
# the directory as "<export_dir>.zip".
export_dir = "/kaggle/working/finetuned_biogpt"
shutil.make_archive(base_name=export_dir, format="zip", root_dir=export_dir)


In [None]:
# Save your model and tokenizer manually to /kaggle/working
model_output_path = "/kaggle/working/finetuned_biogpt"
os.makedirs(model_output_path, exist_ok=True)