In [None]:
# Run configuration switch read by later cells:
#   "main" -> 8B model, full training file, 4 epochs
#   "test" -> 1B model, first 1000 training examples, 1 epoch
# RUN_MODE = "test"
RUN_MODE = "main"

## Requirements and dependencies


In [None]:
%%capture
!pip install opacus
# !pip install -U bitsandbytes transformers accelerate
!pip install peft

In [None]:
!pip install pynvml



In [None]:
from random import sample
import numpy as np
print("NumPy version:", np.__version__)  # NOTE: an earlier note expected "1.23.5"; the recorded run printed 1.26.4


import torch
from torch.amp import autocast, GradScaler  # Import automatic mixed precision tools
from transformers import AutoTokenizer, AutoModelForCausalLM
from opacus import PrivacyEngine
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from peft import get_peft_model, LoraConfig, TaskType

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# On a GPU runtime, also report the card name and its total memory in GiB.
if device.type == "cuda":
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"Available GPU memory: {gpu_props.total_memory / 1024**3:.2f} GB")

NumPy version: 1.26.4
Using device: cuda
GPU Device: NVIDIA L4
Available GPU memory: 22.17 GB


In [None]:
# Release GPU memory that may be left over from earlier cells or a previous run
# of this notebook in the same kernel.
torch.cuda.empty_cache()  # Frees unused memory
torch.cuda.ipc_collect()  # Collects shared memory used in multiprocessing

In [None]:
from huggingface_hub import login
from google.colab import userdata

In [None]:
# Read the Hugging Face access token from the Colab secrets store (never hard-code it)
# and log in so that gated checkpoints can be downloaded.
# NOTE(review): `userdata.get` may raise instead of returning a falsy value when the
# secret is missing or notebook access is denied — confirm; if so, the `else` branch
# below is only reached for an empty secret.
hf_token = userdata.get("HF_TOKEN")

if hf_token:
    login(token=hf_token)
    print("Logged in successfully!")
else:
    # NOTE(review): the message mentions `userdata.set`; verify that this API exists —
    # Colab secrets are presumably added through the notebook's secrets panel.
    print("Hugging Face token not found. Please set it using `userdata.set`.")

Logged in successfully!


## CPU and GPU util functions

In [None]:
import psutil
import torch

# Probe for NVML support. NVML_AVAILABLE ends up True only when pynvml can be
# imported AND the NVML library initialises.
try:
    from pynvml import (
        NVMLError,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlDeviceGetUtilizationRates,
        nvmlInit,
        nvmlShutdown,
        nvmlSystemGetDriverVersion,
    )
except ImportError:
    # pynvml is not installed in this environment.
    NVML_AVAILABLE = False
else:
    try:
        nvmlInit()
    except NVMLError:
        # pynvml is installed but NVML cannot start (for example a CPU-only
        # runtime without an NVIDIA driver); degrade instead of crashing the cell.
        NVML_AVAILABLE = False
    else:
        NVML_AVAILABLE = True

def get_cpu_stats():
    """Print CPU utilisation, clock frequency and core counts.

    The call blocks for about one second because ``psutil.cpu_percent`` is asked
    to sample utilisation over ``interval=1``. Nothing is returned; all values
    are printed.
    """
    cpu_usage = psutil.cpu_percent(interval=1)  # Utilisation in percent

    # Query the frequency exactly once and reuse the result: a separate
    # "check" call and "use" call could disagree if the second one returns None.
    freq_info = psutil.cpu_freq()
    cpu_freq = freq_info.current if freq_info else "Unknown"

    num_cores = psutil.cpu_count(logical=False)  # Physical cores
    num_threads = psutil.cpu_count(logical=True)  # Logical cores
    print(f"CPU Usage: {cpu_usage}%")
    print(f"CPU Frequency: {cpu_freq} MHz")
    print(f"Physical Cores: {num_cores}")
    print(f"Logical Cores: {num_threads}")

def get_ram_stats():
    """Print total, available and used system RAM in decimal GB, plus the usage percentage."""
    mem = psutil.virtual_memory()
    # The three byte counters share one output format, so print them from a table.
    byte_counters = (
        ("Total RAM:", mem.total),
        ("Available RAM:", mem.available),
        ("Used RAM:", mem.used),
    )
    for label, n_bytes in byte_counters:
        print(label, round(n_bytes / 1e9, 2), "GB")
    print("RAM Usage:", mem.percent, "%")

def get_gpu_stats():
    """Print name, driver version, VRAM usage and utilisation for every visible GPU.

    Returns:
        dict | None: ``{"Error": ...}`` when NVML support is unavailable,
        otherwise ``None`` (the statistics are printed, not returned).
    """
    if not NVML_AVAILABLE:
        return {"Error": "pynvml not installed. Run: pip install nvidia-ml-py3"}

    # Pair every use of NVML with its own init/shutdown. Shutting the library
    # down without re-initialising it would make the next call to this function
    # fail with an "uninitialized" NVML error.
    nvmlInit()
    try:
        for gpu_index in range(torch.cuda.device_count()):
            handle = nvmlDeviceGetHandleByIndex(gpu_index)
            mem_info = nvmlDeviceGetMemoryInfo(handle)
            utilization = nvmlDeviceGetUtilizationRates(handle)

            print(f"GPU {gpu_index} - {nvmlDeviceGetName(handle)}")
            print(f"Driver Version: {nvmlSystemGetDriverVersion()}")
            print(f"Total VRAM: {round(mem_info.total / 1e9, 2)} GB")
            print(f"Used VRAM: {round(mem_info.used / 1e9, 2)} GB")
            print(f"Free VRAM: {round(mem_info.free / 1e9, 2)} GB")
            print(f"GPU Usage: {utilization.gpu}%")
            print()
    finally:
        nvmlShutdown()  # Always release the NVML session opened above

# Section labels only: these statements print headings and do not call the
# get_*_stats() helpers defined above. The statistics themselves are printed
# by the cells in the next section.

print("\n🔹 CPU Stats:", )
print("\n🔹 RAM Stats:", )
print("\n🔹 GPU Stats:", )



🔹 CPU Stats:

🔹 RAM Stats:

🔹 GPU Stats:


## CPU & GPU specs

In [None]:
get_cpu_stats()

CPU Usage: 8.0%
CPU Frequency: 2200.2020000000007 MHz
Physical Cores: 6
Logical Cores: 12


In [None]:
get_ram_stats()

Total RAM: 50.53 GB
Available RAM: 48.34 GB
Used RAM: 1.55 GB
RAM Usage: 4.3 %


In [None]:
get_gpu_stats()

GPU 0 - NVIDIA L4
Driver Version: 535.104.05
Total VRAM: 24.15 GB
Used VRAM: 0.36 GB
Free VRAM: 23.8 GB
GPU Usage: 0%



## Model Loading and Tokenizer

In [None]:
# Load the pretrained model and tokenizer.
# Previously tried checkpoints:
# model_name = "EleutherAI/gpt-neo-2.7B"
# model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit"
LLAMA_1B = "meta-llama/Llama-3.2-1B"
LLAMA_8B = "meta-llama/Llama-3.1-8B"

# The full ("main") run uses the 8B checkpoint; any other RUN_MODE uses the 1B one.
model_name = LLAMA_8B if RUN_MODE == "main" else LLAMA_1B

# Download (if needed) and initialise the tokenizer for `model_name`.
# The tokenizer converts text into token ids the model can process and converts
# token ids back into human-readable text.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pretrained causal language model (architecture plus weights) for the
# same identifier.
# model = AutoModelForCausalLM.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,       # Loads model in FP16
    device_map="auto"                # Automatically distributes model across devices if needed
)

# Freeze every parameter of the base model so that no gradients are computed for it.
for param in model.parameters():
    param.requires_grad = False

# Make sure a pad token exists: when the tokenizer defines none, reuse the EOS
# token as the padding token.
if tokenizer.pad_token_id is None:
    print("pad, token doesnt exists, using EOS token")
    tokenizer.pad_token = tokenizer.eos_token

# Resize the model's token-embedding matrix to the tokenizer's vocabulary size.
# This matters when tokens were added; reusing the existing EOS token as the pad
# token presumably leaves the vocabulary size unchanged — TODO confirm.
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

pad, token doesnt exists, using EOS token


Embedding(128256, 4096)

## LoRA Configuration

In [None]:
# To get all the intermediate layer config of the model
# for name, module in model.named_modules():
#     print(name, ":", module)

In [None]:
# Intent: enable gradient checkpointing to save memory.
#
# Gradient checkpointing reduces memory use during training by storing only a
# subset of the intermediate activations in the forward pass and recomputing
# the missing ones during the backward pass.
# NOTE(review): assigning this config attribute after the model has been loaded
# may have no effect; `model.gradient_checkpointing_enable()` looks like the
# supported API — confirm, and also confirm that checkpointing is compatible
# with the per-sample-gradient hooks Opacus installs later before switching.
model.config.gradient_checkpointing = True

# Configure LoRA: train only a small set of additional adapter parameters.
# History of the values below: r=4 with lora_alpha=32 was tried first and may
# have destabilised training, so r=8 and lora_alpha=16 are used now.
# lora_dropout was initially 0.1 and was lowered to 0.05.

# Studies suggest applying LoRA to all layers.
# NOTE(review): the list below names attention projections plus `gate_proj` and
# `up_proj`; if the model also has a `down_proj` layer it is not adapted —
# confirm whether that omission is intentional.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Fine-tuning for causal language modeling.
    inference_mode=False,          # Training mode.
    r=8,                           # Rank of low-rank decomposition.
    lora_alpha=16,                 # Scaling factor.
    lora_dropout=0.05,               # Dropout rate for LoRA layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"]
)



# Wrap the pretrained model with the LoRA configuration defined above, so that
# fine-tuning updates only the LoRA adapters instead of all parameters.
model = get_peft_model(model, lora_config)
print("LoRA applied. Trainable parameters:")
model.print_trainable_parameters()

# Move the model to the chosen device and set it to training mode.
model.to(device)
model.train()

LoRA applied. Trainable parameters:
trainable params: 16,252,928 || all params: 8,046,514,176 || trainable%: 0.2020


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Line

## Data loading and preprocessing

In [None]:
# Load the training reviews and format each one as a single prompt string.
import json

# In "test" mode only this many examples are read, to keep runs short.
TEST_MODE_SAMPLE_LIMIT = 1000

formatted_strings = []

# Explicit UTF-8: the reviews contain non-ASCII characters (e.g. curly
# apostrophes), so the platform default encoding must not be relied on.
with open("finetuning/train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (such as a trailing newline at the end of
            # the file) instead of failing inside json.loads.
            continue

        # Each non-empty line is one JSON object with Rating, Title and Review.
        data = json.loads(line)
        rating = data['Rating']
        title = data['Title']
        review = data['Review']

        # Prompt layout used for training; inference prompts must follow it.
        formatted_string = f'"System prompt : Given the Rating and Title, you are required to generate the review" | "Rating": {rating} | "Title": {title} | "Review": {review}'
        formatted_strings.append(formatted_string)

        if RUN_MODE == "test" and len(formatted_strings) == TEST_MODE_SAMPLE_LIMIT:
            break

In [None]:
# `formatted_strings` now holds one formatted training example per review.
assert formatted_strings, "No training examples were loaded from finetuning/train.jsonl"

print("Size: ",len(formatted_strings))
print(formatted_strings[0])
train_texts = formatted_strings

# Character (not token) lengths of the formatted examples.
strs = [len(formatted_str) for formatted_str in formatted_strings]
# The value printed here is the mean length, so label it as such.
print("Average string length (chars): ", sum(strs) / len(strs))
print("Longest string length (chars): ", max(strs))
# Recorded full run: average of roughly 383 characters.

Size:  100000
"System prompt : Given the Rating and Title, you are required to generate the review" | "Rating": 4 | "Title": No white background! It’s clear! | "Review": I bought this bc I thought it had the nice white background. Turns out it’s clear & since my phone is blue it doesn’t look anything like this.  If I had known that I would have purchased something else. It works ok.
length of largets string is:  383.47625


## Data tokenization and dataset creation

In [None]:
# Tokenization and batching settings (sequences are padded/truncated to
# `max_length` tokens; `batch_size` is the DataLoader batch size).
DATA_PARAMS = {
  "max_length": 128,
  "batch_size": 4,
}

# Tokenize training texts with padding and truncation.
encodings = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True, max_length=DATA_PARAMS['max_length'])
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']

# For causal language modeling the labels are the input ids themselves.
# Work on a copy so the original input tensor stays untouched.
labels = input_ids.clone()

# Exclude padding positions from the loss by setting their label to -100, the
# index PyTorch's CrossEntropyLoss ignores. The attention mask is used to find
# them: the pad token may be the EOS token, so comparing against
# `pad_token_id` would also mask genuine EOS tokens inside the text.
labels[attention_mask == 0] = -100

print("Training data shape:", input_ids.shape)


# Wrap the tensors in a dataset and a shuffling DataLoader.
# drop_last=True keeps every batch at the full batch size; worker processes and
# pinned memory are used for host-to-device transfer.
train_dataset = TensorDataset(input_ids, attention_mask, labels)
train_loader = DataLoader(train_dataset, batch_size=DATA_PARAMS['batch_size'], shuffle=True, drop_last=True, num_workers=4, pin_memory=True)

Training data shape: torch.Size([100000, 128])


## Optimizer & Privacy engine setup

In [None]:
# Hand the optimizer only the parameters that still require gradients; frozen
# parameters are left out entirely.
# optimizer = AdamW(model.parameters(), lr=2e-5)
trainable_params = (p for p in model.parameters() if p.requires_grad)
optimizer = AdamW(trainable_params, lr=1e-5)

In [None]:
# Wrap model, optimizer and data loader with Opacus so that training runs with
# differential privacy. The three names are rebound to the wrapped objects, so
# this cell must not be executed twice on already-wrapped objects.
privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=0.3,      # Lower noise multiplier to reduce added noise
    max_grad_norm=5,           # Increase clipping norm to allow larger gradients
    batch_first=True,
    loss_reduction="mean",
    # Poisson sampling was tried but raised an error, so it is disabled.
    # NOTE(review): the privacy accountant presumably assumes Poisson-sampled
    # batches; with it disabled the reported epsilon may not be a strict
    # guarantee — confirm against the Opacus documentation.
    poisson_sampling=False
)



## Tracking

In [None]:
import json
import os
import datetime
import pytz # PST time zone
import pandas as pd
from pathlib import Path

class TrainingTracker:
    """Persist the configuration and results of one training run to disk.

    Each instance owns a run directory ``<base_dir>/run_<timestamp>``. Every
    ``record_*`` / ``save_*`` method updates the in-memory state and immediately
    rewrites the matching file in that directory, so the files on disk always
    reflect the latest recorded state.
    """

    def __init__(self, base_dir="./tracking_results"):
        """
        Initialize the training tracker and create its run directory.

        Args:
            base_dir: Directory under which the run directory is created
        """
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(exist_ok=True, parents=True)

        # Run ID: current time in the America/Los_Angeles time zone,
        # formatted as day-month_hour-minute-second.
        pst = pytz.timezone("America/Los_Angeles")
        pst_time = datetime.datetime.now(pytz.utc).astimezone(pst)
        timestamp = pst_time.strftime("%d-%m_%H-%M-%S")
        self.run_id = f"run_{timestamp}"

        # Create run directory
        self.run_dir = self.base_dir / self.run_id
        self.run_dir.mkdir(exist_ok=True)

        # Initialize tracking data structures
        self.params = {}
        self.epoch_metrics = []
        self.generated_samples = []
        self.privacy_metrics = {}

    def _write_json(self, filename, payload):
        """Serialise ``payload`` and write it to ``filename`` in the run directory.

        The payload is serialised before the file is opened, so a serialisation
        error cannot leave a truncated file behind. Values the ``json`` module
        cannot encode are stored as ``str(value)`` rather than aborting the
        caller (typically a long-running training loop).
        """
        text = json.dumps(payload, indent=4, default=str)
        with open(self.run_dir / filename, "w", encoding="utf-8") as f:
            f.write(text)

    def record_parameters(self, **kwargs):
        """
        Record training parameters for the current run.

        Parameters accumulate across calls; a repeated key overwrites the
        earlier value. The merged set is written to ``parameters.json``.

        Args:
            **kwargs: Key-value pairs of parameters to record
        """
        self.params.update(kwargs)
        self._write_json("parameters.json", self.params)

    def record_epoch_metrics(self, epoch, loss, batch_times=None, **kwargs):
        """
        Record metrics for a training epoch.

        Writes the full metric history to ``epoch_metrics.json`` and
        ``epoch_metrics.csv``.

        Args:
            epoch: Current epoch number
            loss: Loss value for the epoch
            batch_times: Optional list of batch processing times; when given
                and non-empty, its average, minimum and maximum are recorded
            **kwargs: Additional metrics to record
        """
        metrics = {
            "epoch": epoch,
            "loss": loss,
            **kwargs
        }

        if batch_times:
            metrics["avg_batch_time"] = sum(batch_times) / len(batch_times)
            metrics["min_batch_time"] = min(batch_times)
            metrics["max_batch_time"] = max(batch_times)

        self.epoch_metrics.append(metrics)
        self._write_json("epoch_metrics.json", self.epoch_metrics)

        # Also save as CSV for easier analysis
        pd.DataFrame(self.epoch_metrics).to_csv(
            self.run_dir / "epoch_metrics.csv", index=False)

    def record_privacy_budget(self, epsilon, delta=1e-5, **kwargs):
        """
        Record privacy budget metrics.

        Replaces any previously recorded privacy metrics and writes them to
        ``privacy_metrics.json``.

        Args:
            epsilon: Achieved epsilon value
            delta: Delta value used
            **kwargs: Additional privacy metrics
        """
        self.privacy_metrics = {
            "epsilon": epsilon,
            "delta": delta,
            **kwargs
        }
        self._write_json("privacy_metrics.json", self.privacy_metrics)

    def record_sample(self, prompt, generated_text):
        """
        Record a sample of generated text.

        The sample is appended to the history, which is written to
        ``generated_samples.json``.

        Args:
            prompt: Input prompt
            generated_text: Generated text output
        """
        entry = {
            "prompt": prompt,
            "generated_text": generated_text,
            "timestamp": datetime.datetime.now().isoformat()
        }

        self.generated_samples.append(entry)
        self._write_json("generated_samples.json", self.generated_samples)

    def save_model_info(self, model_path, model_type, tokenizer_info=None):
        """
        Record information about the saved model in ``model_info.json``.

        Args:
            model_path: Path where model was saved
            model_type: Type of model (e.g., "with_dp", "without_dp")
            tokenizer_info: Additional tokenizer information
        """
        model_info = {
            "model_path": str(model_path),
            "model_type": model_type,
            "tokenizer_info": tokenizer_info or {}
        }
        self._write_json("model_info.json", model_info)

    def generate_summary(self):
        """
        Generate a summary of the training run and write it to ``summary.txt``.

        The training and privacy sections are included only when the
        corresponding data has been recorded.

        Returns:
            str: Summary text
        """
        summary_lines = [
            f"Training Run: {self.run_id}",
            "=" * 50,
            "\nParameters:",
        ]
        summary_lines.extend(
            f"  {key}: {value}" for key, value in self.params.items()
        )

        if self.epoch_metrics:
            first_loss = self.epoch_metrics[0]['loss']
            last_loss = self.epoch_metrics[-1]['loss']
            summary_lines.extend([
                "\nTraining Results:",
                f"  Epochs completed: {len(self.epoch_metrics)}",
                f"  Final loss: {last_loss:.6f}",
                f"  Initial loss: {first_loss:.6f}",
                f"  Loss reduction: {first_loss - last_loss:.6f}"
            ])

        if self.privacy_metrics:
            summary_lines.extend([
                "\nPrivacy Budget:",
                f"  Epsilon: {self.privacy_metrics['epsilon']:.4f}",
                f"  Delta: {self.privacy_metrics['delta']}"
            ])

        summary_text = "\n".join(summary_lines)

        # Save summary to file
        with open(self.run_dir / "summary.txt", "w", encoding="utf-8") as f:
            f.write(summary_text)

        return summary_text

    # def compare_with_previous_runs(self, metric="loss"):
    #     """
    #     Compare this run with previous runs based on a specific metric.

    #     Args:
    #         metric: Metric to compare (default: "loss")

    #     Returns:
    #         DataFrame: Comparison data
    #     """
    #     # Collect data from all previous runs
    #     all_runs = []

    #     for run_dir in self.base_dir.iterdir():
    #         if not run_dir.is_dir() or run_dir == self.run_dir:
    #             continue

    #         params_file = run_dir / "parameters.json"
    #         metrics_file = run_dir / "epoch_metrics.json"

    #         if params_file.exists() and metrics_file.exists():
    #             with open(params_file, "r") as f:
    #                 params = json.load(f)

    #             with open(metrics_file, "r") as f:
    #                 metrics = json.load(f)

    #             if metrics:
    #                 final_metric = metrics[-1].get(metric)

    #                 run_data = {
    #                     "run_id": run_dir.name,
    #                     f"final_{metric}": final_metric,
    #                     **params
    #                 }

    #                 all_runs.append(run_data)

    #     # Add current run
    #     if self.epoch_metrics:
    #         current_run_data = {
    #             "run_id": self.run_id,
    #             f"final_{metric}": self.epoch_metrics[-1].get(metric),
    #             **self.params
    #         }
    #         all_runs.append(current_run_data)

    #     # Convert to DataFrame and sort
    #     if all_runs:
    #         df = pd.DataFrame(all_runs)
    #         df = df.sort_values(by=f"final_{metric}")

    #         # Save comparison to file
    #         df.to_csv(self.run_dir / f"comparison_{metric}.csv", index=False)

    #         return df

    #     return pd.DataFrame()

In [None]:
# Record initial parameters after setting them up
def record_initial_params():
    """Record the run configuration with the global ``tracker``.

    Reads the notebook-level objects (model, optimizer, data loader, LoRA
    config, tokenizer, dataset) and stores their settings through
    ``tracker.record_parameters``. Call it after all of those objects exist
    and before training starts.
    """
    # The model counts as DP-wrapped when it exposes `remove_hooks`.
    dp_enabled = hasattr(model, "remove_hooks")

    # Read the DP settings from the optimizer actually in use rather than from
    # hard-coded literals, so the record cannot drift from the values passed
    # to make_private.
    dp_noise_multiplier = getattr(optimizer, "noise_multiplier", None) if dp_enabled else None
    dp_max_grad_norm = getattr(optimizer, "max_grad_norm", None) if dp_enabled else None

    tracker.record_parameters(
        # Model configuration
        model_name=model_name,
        device=str(device),
        epochs=epochs,
        batch_size=train_loader.batch_size,
        learning_rate=optimizer.param_groups[0]['lr'],
        gradient_accumulation_steps=accumulation_steps,

        # LoRA parameters
        lora_r=lora_config.r,
        lora_alpha=lora_config.lora_alpha,
        lora_dropout=lora_config.lora_dropout,
        lora_target_modules=list(lora_config.target_modules),

        # Privacy parameters (None when the model is not DP-wrapped)
        using_differential_privacy=dp_enabled,
        noise_multiplier=dp_noise_multiplier,
        max_grad_norm=dp_max_grad_norm,

        # Dataset info
        dataset_size=len(formatted_strings),
        avg_sample_length=sum(len(s) for s in formatted_strings) / len(formatted_strings),
        tokenizer_max_length=DATA_PARAMS['max_length'],  # From tokenization step
        data_batch_size=DATA_PARAMS['batch_size'],

        # Tokenizer info
        tokenizer_vocab_size=len(tokenizer),
        tokenizer_model_max_length=tokenizer.model_max_length,

        # System info
        cuda_available=torch.cuda.is_available(),
        gpu_name=torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
    )

## Training setup


In [None]:
# Number of passes over the training data: 4 for the "main" run, 1 otherwise.
epochs = 4 if RUN_MODE == "main" else 1

# NOTE: `scaler` is only referenced by the commented-out mixed-precision loop
# in this notebook; the active training loop does not use it.
scaler = GradScaler('cuda')  # Create a gradient scaler to manage FP16 stability
# The active training loop steps the optimizer on every batch and does not read
# this value; it is stored with the run parameters by record_initial_params().
accumulation_steps = 1  # Set gradient accumulation steps; use >1 to simulate larger batch sizes

In [None]:
# Create the tracker for this run (this creates a new timestamped run directory).
tracker = TrainingTracker()
# Record the run configuration now that all parameters are set and before training starts.
record_initial_params()

In [None]:
# Training loop
model.train()

GradSampleModule(PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k

## Sanity check


In [None]:
model_name

'meta-llama/Llama-3.1-8B'

In [None]:
epochs

4

In [None]:
len(train_texts)

100000

## Training loop

In [None]:
# # FINETUNING, NOT WORKING NOW
# for epoch in range(epochs):  # Loop over each epoch
#     total_loss = 0.0  # Initialize total loss accumulator for the epoch
#     optimizer.zero_grad()  # Zero gradients at the start of the epoch
#     for i, batch in enumerate(train_loader):  # Loop over mini-batches from the DataLoader
#         # Move each tensor in the batch to the device (GPU) asynchronously if pin_memory is True
#         input_ids_batch, attention_mask_batch, labels_batch = [
#             x.to(device, non_blocking=True) for x in batch
#         ]

#         # Determine the sequence length for the current batch and create position IDs accordingly
#         seq_len = input_ids_batch.size(1)  # Get the sequence length from the input tensor
#         # Create a tensor [0, 1, ..., seq_len-1] and repeat it for each item in the batch
#         position_ids = torch.arange(seq_len, device=device).unsqueeze(0).repeat(input_ids_batch.size(0), 1)

#         # Use mixed precision context for the forward pass to save memory and speed up computation
#         with autocast():
#             outputs = model(
#                 input_ids=input_ids_batch,        # Input token IDs for the model
#                 attention_mask=attention_mask_batch,  # Attention mask to differentiate padded tokens
#                 position_ids=position_ids,          # Positional IDs for the tokens
#                 labels=labels_batch                 # Labels for computing the loss (typically same as input_ids for causal LM)
#             )
#             # Compute the loss; if using gradient accumulation, scale down the loss accordingly
#             loss = outputs.loss / accumulation_steps

#         # Scale the loss and perform the backward pass using the GradScaler for FP16 stability
#         scaler.scale(loss).backward()

#         # Every 'accumulation_steps' iterations, update the model weights
#         if (i + 1) % accumulation_steps == 0:
#             scaler.step(optimizer)  # Update parameters using scaled gradients
#             scaler.update()         # Update the scale for the next iteration
#             optimizer.zero_grad()   # Reset gradients after updating

#         # Accumulate the loss (multiply back to undo the earlier division, so total_loss is in original scale)
#         total_loss += loss.item() * accumulation_steps

#         # Optionally, print progress every 50 batches
#         if i % 50 == 0:
#             print(f"Batch {i} processed.")

#     # Compute the average loss over the epoch
#     avg_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch+1}/{epochs} - Average loss: {avg_loss:.4f}")

In [None]:
for epoch in range(epochs):
    # Per-epoch accumulators: wall-clock start, running loss, per-batch durations.
    start_time = datetime.datetime.now()
    total_loss, batch_times = 0.0, []

    for i, batch in enumerate(train_loader):
        batch_start = datetime.datetime.now()

        # Transfer the three batch tensors to the training device.
        input_ids_batch, attention_mask_batch, labels_batch = (
            tensor.to(device, non_blocking=True) for tensor in batch
        )

        # Explicit position ids 0..seq_len-1, one row per sequence in the batch.
        seq_len = input_ids_batch.size(1)
        position_ids = torch.arange(seq_len, device=device).repeat(input_ids_batch.size(0), 1)

        # Forward pass; passing labels makes the model return the loss.
        outputs = model(
            input_ids=input_ids_batch,
            attention_mask=attention_mask_batch,
            position_ids=position_ids,
            labels=labels_batch,
        )
        loss = outputs.loss

        # Clear stale gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Bookkeeping: running loss and how long this batch took.
        total_loss += loss.item()
        elapsed = datetime.datetime.now() - batch_start
        batch_times.append(elapsed.total_seconds())

        # Progress line every 50 batches.
        if not i % 50:
            print(f"Epoch {epoch+1}/{epochs} - Batch {i}/{len(train_loader)} - Loss: {loss.item():.4f}")

    # Epoch-level aggregates.
    avg_loss = total_loss / len(train_loader)
    epoch_duration = (datetime.datetime.now() - start_time).total_seconds()
    print(f"Epoch {epoch+1}/{epochs} - Average loss: {avg_loss:.4f} - Duration: {epoch_duration:.2f}s")

    # Persist this epoch's metrics with the tracker.
    tracker.record_epoch_metrics(
        epoch=epoch + 1,
        loss=avg_loss,
        batch_times=batch_times,
        epoch_duration=epoch_duration,
        timestamp=datetime.datetime.now().isoformat(),
    )

    # When the model is DP-wrapped, also report and record the privacy budget spent so far.
    if hasattr(model, "remove_hooks"):
        epsilon = privacy_engine.accountant.get_epsilon(delta=1e-5)
        print(f"Achieved privacy budget: ε = {epsilon:.2f}")
        tracker.record_privacy_budget(epsilon=epsilon)

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


Epoch 1/4 - Batch 0/25000 - Loss: 3.7414
Epoch 1/4 - Batch 50/25000 - Loss: 3.5234
Epoch 1/4 - Batch 100/25000 - Loss: 3.5384
Epoch 1/4 - Batch 150/25000 - Loss: 3.6427
Epoch 1/4 - Batch 200/25000 - Loss: 3.0998
Epoch 1/4 - Batch 250/25000 - Loss: 3.3532
Epoch 1/4 - Batch 300/25000 - Loss: 3.3488
Epoch 1/4 - Batch 350/25000 - Loss: 3.1736
Epoch 1/4 - Batch 400/25000 - Loss: 3.5294
Epoch 1/4 - Batch 450/25000 - Loss: 3.2866
Epoch 1/4 - Batch 500/25000 - Loss: 3.2757
Epoch 1/4 - Batch 550/25000 - Loss: 3.2063
Epoch 1/4 - Batch 600/25000 - Loss: 3.3168
Epoch 1/4 - Batch 650/25000 - Loss: 3.2022
Epoch 1/4 - Batch 700/25000 - Loss: 3.4977
Epoch 1/4 - Batch 750/25000 - Loss: 3.5639
Epoch 1/4 - Batch 800/25000 - Loss: 3.4709
Epoch 1/4 - Batch 850/25000 - Loss: 3.3943
Epoch 1/4 - Batch 900/25000 - Loss: 3.5366
Epoch 1/4 - Batch 950/25000 - Loss: 3.1521
Epoch 1/4 - Batch 1000/25000 - Loss: 3.3253
Epoch 1/4 - Batch 1050/25000 - Loss: 2.9724
Epoch 1/4 - Batch 1100/25000 - Loss: 3.2340
Epoch 1/4 -

In [None]:
from pathlib import Path

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Reload check: load the saved fine-tuned model from disk and generate one sample.
#
# The reloaded objects get their own names so that the in-memory training
# `model` / `tokenizer`, which the model-saving cell still needs, are not
# overwritten by this cell.

# Directory the model-saving cell writes to.
reload_directory = Path("./finetuned_model_dp")

# Set up the device: use CUDA if available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if not reload_directory.is_dir():
    # Nothing has been saved yet (for example when running top to bottom);
    # skip instead of failing so that later cells can still run.
    print(f"{reload_directory} not found - run the model saving cell first; skipping reload check.")
else:
    reloaded_tokenizer = AutoTokenizer.from_pretrained(str(reload_directory))

    # Load in half precision on GPU to halve the memory footprint. The recorded
    # run of this cell ended in a CUDA out-of-memory error; the training
    # objects may still have to be released before reloading — TODO confirm
    # on the target GPU.
    reloaded_model = AutoModelForCausalLM.from_pretrained(
        str(reload_directory),
        torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    )
    reloaded_model.to(device)
    reloaded_model.eval()

    # Sample prompt in the same layout as the training examples, ending where
    # the review text is expected to start.
    sample_prompt = ('"System prompt : Given the Rating and Title, you are required to generate the review" | '
                     '"Rating": 5 | "Title": Would definitely buy again | "Review":')

    # Tokenize the input text
    inputs = reloaded_tokenizer(sample_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate text using the model
    generated_ids = reloaded_model.generate(**inputs, max_length=128, do_sample=True, top_k=50)
    generated_text = reloaded_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print("Generated text:", generated_text)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using device: cuda


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 22.17 GiB of which 22.88 MiB is free. Process 232487 has 22.14 GiB memory in use. Of the allocated memory 20.93 GiB is allocated by PyTorch, and 996.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Model saving

In [None]:
# If the model is still DP-wrapped, remove the hooks and unwrap it so that the
# underlying model can be saved. After this step `model` no longer exposes
# `remove_hooks`, so re-running the cell skips the unwrapping.
if hasattr(model, "remove_hooks"):
    model.remove_hooks()
    model = model._module  # Unwrap the model

# Label the saved model by whether differential privacy was used.
# NOTE(review): this checks for an `accountant` attribute on `privacy_engine`,
# not whether `model` was DP-wrapped; it also requires `privacy_engine` to be
# defined — confirm this is the intended criterion.
model_type = "with_dp" if hasattr(privacy_engine, "accountant") else "without_dp"

# Directory that receives the fine-tuned model
save_directory = "./finetuned_model_dp"

# Save the model weights and configuration
model.save_pretrained(save_directory)

# Save the tokenizer (this ensures that any custom tokens are preserved)
tokenizer.save_pretrained(save_directory)

# Record where the model was saved and basic tokenizer facts with the tracker
tracker.save_model_info(
    model_path=save_directory,
    model_type=model_type,
    tokenizer_info={
        "vocab_size": len(tokenizer),
        "model_max_length": tokenizer.model_max_length,
    }
)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./finetuned_model_dp


## Interactive testing

In [None]:
# Interactive evaluation: read a prompt, generate a continuation, repeat until
# the user types "bye".

# Switch to evaluation mode first: the model may still be in training mode from
# the training cells, which would leave dropout active while generating.
model.eval()

while True:
    sample_prompt = input("Input: ")
    # Exit keyword, tolerant of surrounding whitespace and letter case.
    if sample_prompt.strip().lower() == "bye":
        break
    enc = tokenizer(sample_prompt, return_tensors='pt', padding=True, truncation=True)
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    # No gradients are needed for sampling.
    with torch.no_grad():
        generated_ids = model.generate(**enc, max_length=128, do_sample=True, top_k=50)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("Generated text:", generated_text)

In [None]:
# while True:
#     sample_prompt = input("Input: ")
#     if sample_prompt.lower() == "bye":
#         break
#     enc = tokenizer(sample_prompt, return_tensors='pt', padding=True, truncation=True)
#     enc = {k: v.to(device) for k, v in enc.items()}
#     generated_ids = model.generate(**enc, max_length=512, do_sample=True, top_k=50)
#     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
#     print("Generated text:", generated_text)

    #lora_alpha = 4
    #reduce loss to 2e-5