In [1]:
!pip install trl transformers datasets

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.23.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
import numpy as np

In [4]:
# Load dataset and tokenizer
dataset = load_dataset("tatsu-lab/alpaca", split="train")
#Break this data into train, val, test in 80, 10, 10 percent splits after shuffling
dataset = dataset.shuffle(seed=42)
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, len(dataset)))

model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [7]:
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = model.to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

In [8]:
def build_prompt(example):
    if example["input"].strip():
        return f"Instruction: {example['instruction']}\nInput: {example['input']}\nResponse:"
    else:
        return f"Instruction: {example['instruction']}\nResponse:"

In [5]:
def compute_perplexity(model, tokenizer, dataset, device):
    model.eval()
    total_loss = 0
    total_tokens = 0

    for i, example in enumerate(dataset):
        prompt = build_prompt(example)
        full_text = prompt + " " + example["output"]

        encodings = tokenizer(full_text, return_tensors="pt").to(device)
        input_ids = encodings["input_ids"]

        # Mask prompt tokens if you want only response perplexity
        prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
        labels = input_ids.clone()
        labels[:, :prompt_ids.size(1)] = -100  # ignore prompt in loss

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=encodings["attention_mask"], labels=labels)
            loss = outputs.loss
            num_tokens = (labels != -100).sum().item()

        total_loss += loss.item() * num_tokens
        total_tokens += num_tokens

        print(f"Processed {i+1}/{len(dataset)} examples")

    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)
    return perplexity


In [17]:
def generate_output(model, tokenizer, prompt, device="cuda",
                    max_new_tokens=100, temperature=0.7, top_p=0.9):
    """
    Generate text from a fine-tuned causal LM.

    Args:
        model: Hugging Face model (e.g., AutoModelForCausalLM)
        tokenizer: Corresponding tokenizer
        prompt (str): The input text / instruction
        device (str): "cuda", "cpu", or "xla" (for TPU)
        max_new_tokens (int): Max tokens to generate
        temperature (float): Controls randomness (lower = more deterministic)
        top_p (float): Nucleus sampling parameter

    Returns:
        str: Generated text
    """
    model.eval()
    model.to(device)

    # Tokenize prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode output (skip the input part to show only generated continuation)
    generated_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return generated_text


In [13]:
ppl = compute_perplexity(model, tokenizer, test_dataset.select(range(100)), device)
print(f"Baseline Perplexity: {ppl}")

Processed 1/100 examples
Processed 2/100 examples
Processed 3/100 examples
Processed 4/100 examples
Processed 5/100 examples
Processed 6/100 examples
Processed 7/100 examples
Processed 8/100 examples
Processed 9/100 examples
Processed 10/100 examples
Processed 11/100 examples
Processed 12/100 examples
Processed 13/100 examples
Processed 14/100 examples
Processed 15/100 examples
Processed 16/100 examples
Processed 17/100 examples
Processed 18/100 examples
Processed 19/100 examples
Processed 20/100 examples
Processed 21/100 examples
Processed 22/100 examples
Processed 23/100 examples
Processed 24/100 examples
Processed 25/100 examples
Processed 26/100 examples
Processed 27/100 examples
Processed 28/100 examples
Processed 29/100 examples
Processed 30/100 examples
Processed 31/100 examples
Processed 32/100 examples
Processed 33/100 examples
Processed 34/100 examples
Processed 35/100 examples
Processed 36/100 examples
Processed 37/100 examples
Processed 38/100 examples
Processed 39/100 exam

In [8]:
# Set hyperparameters using Hugging Face TrainingArguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/TRL/SFT",                 # Where to save the model
    overwrite_output_dir=True,
    num_train_epochs=3,                     # Total training epochs
    per_device_train_batch_size=4,          # Batch size per GPU/CPU
    gradient_accumulation_steps=4,          # Steps to accumulate gradients before updating
    learning_rate=1e-4,                     # Optimizer learning rate
    logging_steps=100,                      # Log every N steps
    save_steps=500,                         # Save checkpoint every N steps
    save_total_limit=2,                     # Keep only last 2 checkpoints
    fp16=True,                              # Use mixed-precision if possible
)


In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
    processing_class=tokenizer,
)
trainer.train(resume_from_checkpoint=True)

Adding EOS to train dataset:   0%|          | 0/41601 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/41601 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/41601 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbhushansshah[0m ([33mbhushansshah-stony-brook-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,1.844
200,1.7908
300,1.789
400,1.7756
500,1.7855
600,1.7792
700,1.7686
800,1.7523
900,1.8404
1000,1.7384


TrainOutput(global_step=7803, training_loss=1.3291242003792236, metrics={'train_runtime': 7100.9085, 'train_samples_per_second': 17.576, 'train_steps_per_second': 1.099, 'total_flos': 4.542598846144512e+16, 'train_loss': 1.3291242003792236, 'entropy': 1.0294270184304979, 'num_tokens': 13917315.0, 'mean_token_accuracy': 0.7585629953278435, 'epoch': 3.0})

In [18]:
#load the finetuned model
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/TRL/SFT/checkpoint-7803")
model = model.to(device)


In [16]:
ppl = compute_perplexity(model, tokenizer, test_dataset.select(range(100)), device)
print(f"Finetuned Model Perplexity: {ppl}")

Processed 1/100 examples
Processed 2/100 examples
Processed 3/100 examples
Processed 4/100 examples
Processed 5/100 examples
Processed 6/100 examples
Processed 7/100 examples
Processed 8/100 examples
Processed 9/100 examples
Processed 10/100 examples
Processed 11/100 examples
Processed 12/100 examples
Processed 13/100 examples
Processed 14/100 examples
Processed 15/100 examples
Processed 16/100 examples
Processed 17/100 examples
Processed 18/100 examples
Processed 19/100 examples
Processed 20/100 examples
Processed 21/100 examples
Processed 22/100 examples
Processed 23/100 examples
Processed 24/100 examples
Processed 25/100 examples
Processed 26/100 examples
Processed 27/100 examples
Processed 28/100 examples
Processed 29/100 examples
Processed 30/100 examples
Processed 31/100 examples
Processed 32/100 examples
Processed 33/100 examples
Processed 34/100 examples
Processed 35/100 examples
Processed 36/100 examples
Processed 37/100 examples
Processed 38/100 examples
Processed 39/100 exam

In [25]:
#Testing the generations from pretrained model
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
prompt = '''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Write a short paragraph about the given topic.

### Input:
The importance of keeping our environment clean'''

output = generate_output(model, tokenizer, prompt, device)
print(output)

.

### Output:
A paragraph about the topic.

### Task:
Write a short paragraph about the given topic.

### Input:
The importance of keeping our environment clean.

### Output:
A paragraph about the topic.

### Task:
Write a short paragraph about the given topic.

### Input:
The importance of keeping our environment clean.

### Output:
A paragraph about the topic.

### Task:


In [27]:
#Testing the generations from finetuned model
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/TRL/SFT/checkpoint-7803")
model = model.to(device)
output = generate_output(model, tokenizer, prompt, device)
print(output)

.

### Response:
Keeping our environment clean is essential for maintaining a healthy and balanced lifestyle. By maintaining a clean environment, we can reduce our carbon footprint, conserve natural resources, and save money over time. By doing so, we can protect our natural habitats, reduce air pollution, and prevent the spread of diseases. Additionally, preserving our environment is essential for a healthier and more sustainable future.
