## Requirements and dependencies


In [None]:
%%capture
!pip install opacus
# !pip install -U bitsandbytes transformers accelerate
!pip install peft

In [None]:
!pip install pynvml



In [None]:
from random import sample
import numpy as np
print("NumPy version:", np.__version__)  # The recorded run of this notebook printed "1.26.4"


import torch
from torch.amp import autocast, GradScaler  # Import automatic mixed precision tools
from transformers import AutoTokenizer, AutoModelForCausalLM
from opacus import PrivacyEngine
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from peft import get_peft_model, LoraConfig, TaskType

# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# On a GPU runtime, report the card's name and its total memory in GiB.
if device.type == "cuda":
    print(
        f"GPU Device: {torch.cuda.get_device_name(0)}",
        f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB",
        sep="\n",
    )

NumPy version: 1.26.4
Using device: cuda
GPU Device: NVIDIA L4
Available GPU memory: 22.17 GB


In [None]:
## Clear GPU cache and storage
# Both calls are CUDA-only housekeeping. Skip them on the CPU fallback path so the
# cell does not try to initialise CUDA (and fail) on a runtime without a GPU.
if torch.cuda.is_available():
    torch.cuda.empty_cache()  # Frees unused memory
    torch.cuda.ipc_collect()  # Collects shared memory used in multiprocessing

In [None]:
from huggingface_hub import login
from google.colab import userdata

In [None]:
# Retrieve the Hugging Face token from Colab's secret store (never hard-code it).
# `userdata.get` raises when the secret does not exist or when this notebook has
# not been granted access to it, so both cases are caught and handled like an
# empty token instead of aborting the cell with a traceback.
try:
    hf_token = userdata.get("HF_TOKEN")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    hf_token = None

if hf_token:
    login(token=hf_token)
    print("Logged in successfully!")
else:
    # Secrets are managed through the Colab UI; there is no programmatic setter.
    print(
        "Hugging Face token not found. Add a secret named HF_TOKEN in the Colab "
        "Secrets panel and grant this notebook access to it."
    )

Logged in successfully!


## CPU and GPU util functions

In [None]:
import psutil
import torch

# Import the NVML bindings and initialise the library once for this session.
# NVML_AVAILABLE records whether GPU statistics can be queried at all.
try:
    from pynvml import (
        NVMLError,
        nvmlInit,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetUtilizationRates,
        nvmlSystemGetDriverVersion,
        nvmlDeviceGetName,
        nvmlShutdown,
    )
except ImportError:
    # The pynvml package is not installed in this environment.
    NVML_AVAILABLE = False
else:
    try:
        nvmlInit()
        NVML_AVAILABLE = True
    except NVMLError as nvml_err:
        # The bindings imported, but NVML itself could not start (for example on
        # a runtime without an NVIDIA driver). Treat that as "no GPU stats"
        # rather than failing the whole cell.
        print(f"NVML initialisation failed: {nvml_err}")
        NVML_AVAILABLE = False

def get_cpu_stats():
    """Print CPU utilisation, clock frequency and core counts.

    Utilisation is sampled over a one-second interval, so the call blocks for
    about a second. Nothing is returned; the figures are written to stdout.
    """
    cpu_usage = psutil.cpu_percent(interval=1)  # CPU usage % over a 1 s window
    # Query the frequency once; a falsy result means it could not be determined.
    freq_info = psutil.cpu_freq()
    num_cores = psutil.cpu_count(logical=False)  # Physical cores
    num_threads = psutil.cpu_count(logical=True)  # Logical cores

    print(f"CPU Usage: {cpu_usage}%")
    if freq_info:
        print(f"CPU Frequency: {freq_info.current} MHz")
    else:
        # No unit here: "Unknown MHz" would read as if a value had been measured.
        print("CPU Frequency: Unknown")
    print(f"Physical Cores: {num_cores}")
    print(f"Logical Cores: {num_threads}")

def get_ram_stats():
    """Print total, available and used system memory in GB (bytes / 1e9) and the usage percentage."""
    mem = psutil.virtual_memory()
    sizes = (
        ("Total RAM:", mem.total),
        ("Available RAM:", mem.available),
        ("Used RAM:", mem.used),
    )
    for label, n_bytes in sizes:
        print(label, round(n_bytes / 1e9, 2), "GB")
    print("RAM Usage:", mem.percent, "%")

def get_gpu_stats():
    """Print name, driver version, VRAM figures and utilisation for each GPU.

    The number of GPUs is taken from ``torch.cuda.device_count()`` and each
    index is queried through NVML.

    Returns:
        A dict with an ``"Error"`` message when NVML is unavailable; otherwise
        ``None`` (the statistics are printed, not returned).
    """
    if not NVML_AVAILABLE:
        return {"Error": "pynvml not installed. Run: pip install nvidia-ml-py3"}

    # Open an NVML session for this call and close it again in `finally`.
    # Pairing every shutdown with its own init keeps the function callable any
    # number of times and releases NVML even if one of the queries raises.
    nvmlInit()
    try:
        for i in range(torch.cuda.device_count()):
            handle = nvmlDeviceGetHandleByIndex(i)
            mem_info = nvmlDeviceGetMemoryInfo(handle)
            utilization = nvmlDeviceGetUtilizationRates(handle)

            print(f"GPU {i} - {nvmlDeviceGetName(handle)}")
            print(f"Driver Version: {nvmlSystemGetDriverVersion()}")
            print(f"Total VRAM: {round(mem_info.total / 1e9, 2)} GB")
            print(f"Used VRAM: {round(mem_info.used / 1e9, 2)} GB")
            print(f"Free VRAM: {round(mem_info.free / 1e9, 2)} GB")
            print(f"GPU Usage: {utilization.gpu}%")
            print()
    finally:
        nvmlShutdown()  # Clean up this call's NVML session

# Print the three section headers only. No statistics are gathered here: the
# get_cpu_stats / get_ram_stats / get_gpu_stats helpers are called separately.

print(
    "\n🔹 CPU Stats:",
    "\n🔹 RAM Stats:",
    "\n🔹 GPU Stats:",
    sep="\n",
)



🔹 CPU Stats:

🔹 RAM Stats:

🔹 GPU Stats:


## CPU & GPU specs

In [None]:
# Print CPU usage, frequency and core counts for this runtime.
get_cpu_stats()

CPU Usage: 1.0%
CPU Frequency: 2200.2180000000003 MHz
Physical Cores: 4
Logical Cores: 8


In [None]:
# Print total / available / used system RAM for this runtime.
get_ram_stats()

Total RAM: 33.67 GB
Available RAM: 31.59 GB
Used RAM: 1.59 GB
RAM Usage: 6.2 %


In [None]:
# Print per-GPU name, driver version, VRAM and utilisation (queried through NVML).
get_gpu_stats()

GPU 0 - NVIDIA L4
Driver Version: 535.104.05
Total VRAM: 24.15 GB
Used VRAM: 0.36 GB
Free VRAM: 23.8 GB
GPU Usage: 0%



## Model Loading and Tokenizer

In [None]:
# Checkpoint to fine-tune. Alternatives that were tried earlier:
#   "EleutherAI/gpt-neo-2.7B", "unsloth/llama-3-8b-Instruct-bnb-4bit"
model_name = "meta-llama/Llama-3.1-8B"

# Tokenizer for that checkpoint: turns text into token ids for the model and
# decodes generated ids back into text. Downloaded on first use.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Causal language model (architecture plus pre-trained weights) for the same
# identifier, loaded in half precision and placed on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # Automatically distributes model across devices if needed
    torch_dtype=torch.float16,   # Loads model in FP16
)

# Freeze the whole base model so that no gradients are computed for it.
model.requires_grad_(False)

# The tokenizer needs a pad token for batching; reuse the EOS token when the
# checkpoint does not define one.
if tokenizer.pad_token_id is None:
    print("pad, token doesnt exists, using EOS token")
    tokenizer.pad_token = tokenizer.eos_token

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# NOTE(review): reusing EOS as the pad token adds no new entry, so this is
# presumably a no-op here - confirm before relying on it.
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

pad, token doesnt exists, using EOS token


Embedding(128256, 4096)

## LoRA Configuration

In [None]:
# To get all the intermediate layer config of the model
# for name, module in model.named_modules():
#     print(name, ":", module)

In [None]:
# Enable gradient checkpointing to save memory.
#
# This technique reduces memory usage during training by not storing all intermediate
# activations during the forward pass. Instead, it saves only a subset of them and
# recomputes the missing ones during the backward pass.
#
# Assigning `model.config.gradient_checkpointing = True` on an already-loaded model does
# not switch the feature on; `gradient_checkpointing_enable()` is the supported call.
model.gradient_checkpointing_enable()
# The base weights are frozen, so make the embedding output require grad; otherwise the
# checkpointed segments would have no grad-requiring input to backpropagate through.
model.enable_input_require_grads()

# Configure LoRA: update only a small set of additional parameters.
# Tuning history: r=4 with lora_alpha=32 was tried first and may have destabilised
# training, hence r=8 / lora_alpha=16; lora_dropout was lowered from 0.1 to 0.05.
#
# Adapters are attached to the attention projections and to the MLP gate/up projections.
# NOTE(review): "down_proj" is not in the list although the intent noted for this cell
# was to apply LoRA to all layers - confirm whether the omission is deliberate.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Fine-tuning for causal language modeling.
    inference_mode=False,          # Training mode.
    r=8,                           # Rank of low-rank decomposition.
    lora_alpha=16,                 # Scaling factor.
    lora_dropout=0.05,             # Dropout rate for LoRA layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"]
)

# Wrap the pre-trained model with the LoRA configuration defined above, so that
# fine-tuning trains only the small set of adapter parameters instead of all weights.
model = get_peft_model(model, lora_config)
print("LoRA applied. Trainable parameters:")
model.print_trainable_parameters()

# Move the model to the chosen device and set to training mode.
model.to(device)
model.train()

LoRA applied. Trainable parameters:
trainable params: 16,252,928 || all params: 8,046,514,176 || trainable%: 0.2020


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(


## Data loading and preprocessing

In [None]:
# Load and Format Training Data
import json

formatted_strings = []

# The reviews contain non-ASCII punctuation, so decode explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open("finetuning/train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file) rather than failing inside json.loads.
            continue

        # Each non-empty line is one JSON object with Rating / Title / Review.
        data = json.loads(line)
        rating = data['Rating']
        title = data['Title']
        review = data['Review']

        # Single training string: fixed instruction, then the fields separated by " | ".
        formatted_string = f'"System prompt : Given the Rating and Title, you are required to generate the review" | "Rating": {rating} | "Title": {title} | "Review": {review}'

        formatted_strings.append(formatted_string)

In [None]:
# `formatted_strings` now holds one formatted training string per review.
print("Size: ",len(formatted_strings))
print(formatted_strings[0])
train_texts = formatted_strings

# Character lengths of the examples (characters, not tokens).
strs = [len(formatted_str) for formatted_str in formatted_strings]
# The value reported here is the mean, so label it as such, and report the
# actual maximum separately.
print("average string length is: ",sum(strs) / len(strs))
print("length of largest string is: ",max(strs))
# Recorded run: average of about 383 characters over 100000 examples.

Size:  100000
"System prompt : Given the Rating and Title, you are required to generate the review" | "Rating": 4 | "Title": No white background! It’s clear! | "Review": I bought this bc I thought it had the nice white background. Turns out it’s clear & since my phone is blue it doesn’t look anything like this.  If I had known that I would have purchased something else. It works ok.
length of largets string is:  383.47625


## Data tokenization and dataset creation

In [None]:
# Tokenize training texts with padding and truncation (at most 128 tokens each).
#
# Append the end-of-text token to every example so the model is trained to stop
# once the review is finished. Examples longer than 128 tokens lose it again
# through truncation, which is intended: they did not really end there.
# NOTE(review): assumes the tokenizer does not already append EOS on its own -
# confirm for this checkpoint.
texts_with_eos = [text + tokenizer.eos_token for text in train_texts]
encodings = tokenizer(texts_with_eos, return_tensors='pt', padding=True, truncation=True, max_length=128)
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']

# For causal language modeling, use input_ids as labels.
# Work on a copy so the original input tensor is left untouched.
labels = input_ids.clone()

# Ignore padding positions in the loss (-100 is the ignore index used by
# PyTorch's CrossEntropyLoss). Padding is identified through the attention
# mask rather than by token id: the pad token in this notebook IS the EOS
# token, so masking by id would also hide every genuine end-of-text label.
labels[attention_mask == 0] = -100

print("Training data shape:", input_ids.shape)


# Create a TensorDataset and DataLoader with a small batch size.
# Four worker processes and pinned host memory are used for the host-to-GPU copies.
train_dataset = TensorDataset(input_ids, attention_mask, labels)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, drop_last=True, num_workers=4, pin_memory=True)

Training data shape: torch.Size([100000, 128])


## Optimizer & Privacy engine setup

In [None]:
# Hand AdamW only the parameters that still require gradients, so frozen
# weights get no optimizer state at all.
# (Earlier variant: AdamW over model.parameters() with lr=2e-5.)
optimizer = AdamW((p for p in model.parameters() if p.requires_grad), lr=2e-4)

In [None]:
# privacy_engine = PrivacyEngine()
# model, optimizer, train_loader = privacy_engine.make_private(
#     module=model,
#     optimizer=optimizer,
#     data_loader=train_loader,
#     noise_multiplier=1.0,      # Adjust for your desired privacy guarantee.
#     max_grad_norm=1.5,         # Gradient clipping norm.
#     batch_first=True,
#     loss_reduction="mean",
#     poisson_sampling=False     # Use standard sampling.
# )

## Training loop

In [None]:
# Put the model in training mode and define the settings for the run.
model.train()
epochs = 4  # Number of passes over train_loader; use more epochs for a real task.

# Mixed-precision / accumulation settings. In the cells shown in this notebook they are
# only referenced by the commented-out loop; the active training loop does not use them.
scaler = GradScaler('cuda')  # Gradient scaler intended to manage FP16 stability
accumulation_steps = 1  # Gradient accumulation steps; use >1 to simulate larger batch sizes

In [None]:
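# # FINETUNING, NOT WORKING NOW - kept for reference only. NOTE(review): `autocast()` below is called without the `device_type` argument that `torch.amp.autocast` requires; TODO confirm that this is the failure.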
# for epoch in range(epochs):  # Loop over each epoch
#     total_loss = 0.0  # Initialize total loss accumulator for the epoch
#     optimizer.zero_grad()  # Zero gradients at the start of the epoch
#     for i, batch in enumerate(train_loader):  # Loop over mini-batches from the DataLoader
#         # Move each tensor in the batch to the device (GPU) asynchronously if pin_memory is True
#         input_ids_batch, attention_mask_batch, labels_batch = [
#             x.to(device, non_blocking=True) for x in batch
#         ]

#         # Determine the sequence length for the current batch and create position IDs accordingly
#         seq_len = input_ids_batch.size(1)  # Get the sequence length from the input tensor
#         # Create a tensor [0, 1, ..., seq_len-1] and repeat it for each item in the batch
#         position_ids = torch.arange(seq_len, device=device).unsqueeze(0).repeat(input_ids_batch.size(0), 1)

#         # Use mixed precision context for the forward pass to save memory and speed up computation
#         with autocast():
#             outputs = model(
#                 input_ids=input_ids_batch,        # Input token IDs for the model
#                 attention_mask=attention_mask_batch,  # Attention mask to differentiate padded tokens
#                 position_ids=position_ids,          # Positional IDs for the tokens
#                 labels=labels_batch                 # Labels for computing the loss (typically same as input_ids for causal LM)
#             )
#             # Compute the loss; if using gradient accumulation, scale down the loss accordingly
#             loss = outputs.loss / accumulation_steps

#         # Scale the loss and perform the backward pass using the GradScaler for FP16 stability
#         scaler.scale(loss).backward()

#         # Every 'accumulation_steps' iterations, update the model weights
#         if (i + 1) % accumulation_steps == 0:
#             scaler.step(optimizer)  # Update parameters using scaled gradients
#             scaler.update()         # Update the scale for the next iteration
#             optimizer.zero_grad()   # Reset gradients after updating

#         # Accumulate the loss (multiply back to undo the earlier division, so total_loss is in original scale)
#         total_loss += loss.item() * accumulation_steps

#         # Optionally, print progress every 50 batches
#         if i % 50 == 0:
#             print(f"Batch {i} processed.")

#     # Compute the average loss over the epoch
#     avg_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch+1}/{epochs} - Average loss: {avg_loss:.4f}")

In [None]:
for epoch in range(epochs):
    running_loss = 0.0
    for batch_idx, batch in enumerate(train_loader):
        # Lightweight progress marker: print the batch index every 50 batches.
        if batch_idx % 50 == 0:
            print(batch_idx)

        # Copy the three tensors of the batch to the training device.
        input_ids_batch, attention_mask_batch, labels_batch = (
            tensor.to(device, non_blocking=True) for tensor in batch
        )

        # Explicit position ids 0..seq_len-1 for every row: shape [batch_size, seq_len].
        batch_size, seq_len = input_ids_batch.shape
        position_ids = torch.arange(seq_len, device=device).unsqueeze(0).repeat(batch_size, 1)

        # Clear stale gradients, run the forward pass (the model returns the
        # loss because labels are supplied), then backpropagate and update.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids_batch,
            attention_mask=attention_mask_batch,
            position_ids=position_ids,
            labels=labels_batch
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Average loss: {avg_loss:.4f}")

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
6700
6750
6800
6850
6900
6950
7000
7050
7100
7150
7200
7250
7300
7350
7400
7450
7500
7550
7600
7650
7700
7750
7800
7850
7900
7950
8000
8050
8100
8150
8200
8250
8300
8350
8400
8450
8500
8550
8600
8650
8700
8750
8800
8850
8900
8950
9000
9050
9100
9150
9200
9250
9300
9350
9400
9450
9500
9550
9600
9650
9700
9750
9800
9850
9900
9950
10000
10050
10100
10150

In [None]:
# get the privacy budget
# epsilon = privacy_engine.accountant.get_epsilon(delta=1e-5)
# print(f"Achieved privacy budget: ε = {epsilon:.2f}")

## Model saving

In [None]:
# DP-only cleanup, not needed while the PrivacyEngine cell stays commented out:
# model.remove_hooks()
# model = model._module  # Unwrap the model.

# Target directory for the fine-tuned artefacts.
save_directory = "./finetuned_model_dp"

# Write the model first and then the tokenizer into the same directory, so the
# two can be reloaded together from one place.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./finetuned_model_dp


## Interactive testing

In [None]:
# Evaluate interactively: type a prompt to sample a continuation, or "bye" to stop.
# Switch to inference mode first. The training cells leave the model in train()
# mode, which would keep the LoRA dropout layers active while generating.
model.eval()

while True:
    try:
        sample_prompt = input("Input: ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell while it waits for input ends the session
        # cleanly instead of leaving a traceback in the output.
        break
    if sample_prompt.strip().lower() == "bye":
        break
    enc = tokenizer(sample_prompt, return_tensors='pt', padding=True, truncation=True)
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    with torch.no_grad():
        generated_ids = model.generate(
            **enc,
            max_length=512,   # upper bound on prompt + generated tokens
            do_sample=True,
            top_k=50,
            # Pass the pad id explicitly so generate() does not have to fall
            # back to the EOS id and emit a warning on every call.
            pad_token_id=tokenizer.pad_token_id,
        )
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("Generated text:", generated_text)

Input: System prompt : Given the Rating and Title, you are required to generate the review" | "Rating": 4 | "Title": No white background! It’s clear! | "Review":


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated text: System prompt : Given the Rating and Title, you are required to generate the review" | "Rating": 4 | "Title": No white background! It’s clear! | "Review": I was looking for a white background, but this is clear.  It’s a little bit thick but I like it.  It’s a little difficult to remove the case to put on my phone.  I also have a hard time removing it to put on a popsocket.  I think it’s because of the thickness of the case.  I like the feel of it.  I’m glad I purchased this case.  I like that it’s clear.  I’ve had a couple of cases that are white.  I didn’t like that it would get dirty quickly.  I would have to wash it to get it clean.  This case is clear so it’s easy to clean.  I like that.  I also like that it’s a little bit thicker.  I like the feel of it.  It’s a little bit difficult to put on my phone.  I’m glad I purchased this case.  I’m glad I purchased this case.  I’m glad I purchased this case.  I’m glad I purchased this case.  I’m glad I purchased this case. 

KeyboardInterrupt: Interrupted by user

In [None]:
# while True:
#     sample_prompt = input("Input: ")
#     if sample_prompt.lower() == "bye":
#         break
#     enc = tokenizer(sample_prompt, return_tensors='pt', padding=True, truncation=True)
#     enc = {k: v.to(device) for k, v in enc.items()}
#     generated_ids = model.generate(**enc, max_length=512, do_sample=True, top_k=50)
#     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
#     print("Generated text:", generated_text)

    #lora_alpha = 4
    #reduce loss to 2e-5