In [1]:
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from accelerate import Accelerator

# Initialize Accelerator
accelerator = Accelerator()

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set pad_token as eos_token for compatibility
tokenizer.pad_token = tokenizer.eos_token

# Low-Rank Approximation Function
def low_rank_approximation(weight_matrix, rank):
    u, s, v = torch.svd(weight_matrix)
    s[rank:] = 0
    low_rank_weight = torch.mm(u, torch.mm(torch.diag(s), v.t()))
    return low_rank_weight

# Apply Low-Rank Approximation to Model
def apply_low_rank_approximation(model, rank=50):
    # Example layer (adjust based on the model architecture)
    layer = model.transformer.h[0].attn.c_attn
    original_weight = layer.weight
    approx_weight = low_rank_approximation(original_weight, rank)
    layer.weight = torch.nn.Parameter(approx_weight)
    return model

# Load and prepare datasets
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")  # Use only 1% of the dataset

# Split dataset into train, validation, and test subsets
train_val_test_split = dataset.train_test_split(test_size=0.4, seed=42)
train_dataset = train_val_test_split['train']
val_test_split = train_val_test_split['test'].train_test_split(test_size=0.5, seed=42)
val_dataset = val_test_split['train']
test_dataset = val_test_split['test']

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Data collator to include labels
def data_collator(batch):
    input_ids = torch.stack([example['input_ids'] for example in batch])
    
    # Handle attention_mask
    if 'attention_mask' in batch[0]:
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
    else:
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
    
    # Labels are the same as input_ids, but shifted by one token to the right
    labels = input_ids.clone()
    
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Fine-Tuning Setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Disable evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=2,  # Reduce batch size during evaluation
    num_train_epochs=1,  # Set to 1 for a quick experiment; increase for more thorough training
    weight_decay=0.01,
    fp16=True if torch.cuda.is_available() else False,  # Use FP16 if available for faster training
    gradient_accumulation_steps=2,  # Use gradient accumulation
)

def fine_tune_model(model, model_name, rank=None):
    if rank:
        model = apply_low_rank_approximation(model, rank)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Fine-Tune the Model
    print(f"Training model: {model_name} {'with low-rank approximation' if rank else 'baseline'}")
    trainer.train()
    
    # Save the model to disk to reload later
    model.save_pretrained(f"./{model_name}_fine_tuned")
    tokenizer.save_pretrained(f"./{model_name}_fine_tuned")

    # Clear GPU memory after training
    del trainer
    torch.cuda.empty_cache()
    gc.collect()

def evaluate_model(model_name):
    # Reload the fine-tuned model
    model = AutoModelForCausalLM.from_pretrained(f"./{model_name}_fine_tuned").to(device)
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Evaluate the Model
    with torch.no_grad():  # Disable gradient calculation for evaluation
        results = trainer.evaluate()
    print(f"Evaluation Results for {model_name}: {results}")
    
    # # Measure Inference Time
    # import time
    # start_time = time.time()
    # with torch.no_grad():  # Disable gradient calculation for inference
    #     trainer.predict(tokenized_test_dataset)
    # end_time = time.time()
    # inference_time = end_time - start_time
    # print(f"Inference Time for {model_name}: {inference_time} seconds")
    
    # Measure GPU Memory Usage
    memory_usage = torch.cuda.max_memory_allocated(device) / (1024**2)  # in MB
    print(f"GPU Memory Usage for {model_name}: {memory_usage} MB")
    
    # Clear GPU memory after evaluation
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()

# Define and load the models
low_rank_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
baseline_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Fine-Tune and Evaluate Both Models
fine_tune_model(low_rank_model, model_name, rank=50)  # Adjust rank as needed



  from .autonotebook import tqdm as notebook_tqdm





  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training model: distilgpt2 with low-rank approximation


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 27/27 [00:49<00:00,  1.83s/it]


{'train_runtime': 49.3504, 'train_samples_per_second': 4.458, 'train_steps_per_second': 0.547, 'train_loss': 4.266697918927228, 'epoch': 0.98}


In [2]:
evaluate_model(model_name)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 37/37 [00:04<00:00,  8.88it/s]


Evaluation Results for distilgpt2: {'eval_loss': 0.2541678845882416, 'eval_model_preparation_time': 0.0, 'eval_runtime': 4.3334, 'eval_samples_per_second': 16.846, 'eval_steps_per_second': 8.538}


 38%|â–ˆâ–ˆâ–ˆâ–Š      | 14/37 [01:23<03:19,  8.66s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 5.75 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 6.71 GiB is allocated by PyTorch, and 2.60 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)