In [1]:
import torch
import numpy
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("mps" if 0 else "cpu")
print(device)

cpu


In [3]:

# Initialize Accelerator
accelerator = Accelerator()

# Check for GPU availability (using Metal Performance Shaders on Apple Silicon)
device = torch.device("mps" if 0 else "cpu")

# Load the tokenizer and model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set pad_token as eos_token for compatibility
tokenizer.pad_token = tokenizer.eos_token

# Low-Rank Approximation Function
def low_rank_approximation(weight_matrix, rank):
    u, s, v = torch.svd(weight_matrix)
    s[rank:] = 0
    low_rank_weight = torch.mm(u, torch.mm(torch.diag(s), v.t()))
    return low_rank_weight

# Apply Low-Rank Approximation to Model
def apply_low_rank_approximation(model, rank=50):
    # Example layer (adjust based on the model architecture)
    layer = model.transformer.h[0].attn.c_attn
    original_weight = layer.weight
    approx_weight = low_rank_approximation(original_weight, rank)
    layer.weight = torch.nn.Parameter(approx_weight)
    return model

# Load and prepare datasets
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")  # Use only 1% of the dataset

# Split dataset into train, validation, and test subsets
train_val_test_split = dataset.train_test_split(test_size=0.4, seed=42)
train_dataset = train_val_test_split['train']
val_test_split = train_val_test_split['test'].train_test_split(test_size=0.5, seed=42)
val_dataset = val_test_split['train']
test_dataset = val_test_split['test']

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Data collator to include labels
def data_collator(batch):
    input_ids = torch.stack([example['input_ids'] for example in batch])
    
    # Handle attention_mask
    if 'attention_mask' in batch[0]:
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
    else:
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
    
    # Labels are the same as input_ids, but shifted by one token to the right
    labels = input_ids.clone()
    
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Fine-Tuning Setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Disable evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=2,  # Reduce batch size during evaluation
    num_train_epochs=1,  # Set to 1 for a quick experiment; increase for more thorough training
    weight_decay=0.01,
    fp16=False,  # Disable FP16 on MPS (currently unsupported)
    gradient_accumulation_steps=2,  # Use gradient accumulation
)

def fine_tune_model(model, model_name, rank=None):
    if rank:
        model = apply_low_rank_approximation(model, rank)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Fine-Tune the Model
    print(f"Training model: {model_name} {'with low-rank approximation' if rank else 'baseline'}")
    trainer.train()
    
    # Save the model to disk to reload later
    model.save_pretrained(f"./{model_name}_fine_tuned")
    tokenizer.save_pretrained(f"./{model_name}_fine_tuned")

    # Clear GPU memory after training
    del trainer
    gc.collect()

def evaluate_model(model_name):
    # Reload the fine-tuned model
    model = AutoModelForCausalLM.from_pretrained(f"./{model_name}_fine_tuned").to(device)
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Evaluate the Model
    with torch.no_grad():  # Disable gradient calculation for evaluation
        results = trainer.evaluate()
    print(f"Evaluation Results for {model_name}: {results}")
    
    # Measure Inference Time
    import time
    start_time = time.time()
    with torch.no_grad():  # Disable gradient calculation for inference
        trainer.predict(tokenized_test_dataset)
    end_time = time.time()
    inference_time = end_time - start_time
    print(f"Inference Time for {model_name}: {inference_time} seconds")
    
    # Measure GPU Memory Usage (MPS backend doesn't provide memory metrics easily)
    print("GPU Memory Usage is currently not easily accessible on MPS backend.")
    
    # Clear GPU memory after evaluation
    del model, trainer
    gc.collect()



In [4]:
print("Baseline -\n")
baseline_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
fine_tune_model(baseline_model, model_name)  # Adjust rank as needed
evaluate_model(model_name)
del baseline_model

Baseline -

Training model: distilgpt2 baseline


100%|██████████| 27/27 [00:33<00:00,  1.25s/it]


{'train_runtime': 33.691, 'train_samples_per_second': 6.53, 'train_steps_per_second': 0.801, 'train_loss': 3.261283309371383, 'epoch': 0.98}


100%|██████████| 37/37 [00:03<00:00, 10.57it/s]


Evaluation Results for distilgpt2: {'eval_loss': 1.2017920017242432, 'eval_model_preparation_time': 0.0005, 'eval_runtime': 3.6254, 'eval_samples_per_second': 20.136, 'eval_steps_per_second': 10.206}


100%|██████████| 37/37 [02:37<00:00,  4.26s/it]


Inference Time for distilgpt2: 157.64915084838867 seconds
GPU Memory Usage is currently not easily accessible on MPS backend.


In [None]:
print(device)

In [5]:
n = 50
print("Rank ",n,"-\n")
low_rank_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
fine_tune_model(low_rank_model, model_name, rank=n)  # Adjust rank as needed
evaluate_model(model_name)
del low_rank_model

Rank  50 -

Training model: distilgpt2 with low-rank approximation


100%|██████████| 27/27 [00:36<00:00,  1.34s/it]


{'train_runtime': 36.173, 'train_samples_per_second': 6.082, 'train_steps_per_second': 0.746, 'train_loss': 3.0594360916702836, 'epoch': 0.98}


100%|██████████| 37/37 [00:03<00:00, 10.70it/s]


Evaluation Results for distilgpt2: {'eval_loss': 0.8591861724853516, 'eval_model_preparation_time': 0.0006, 'eval_runtime': 3.4987, 'eval_samples_per_second': 20.865, 'eval_steps_per_second': 10.575}


100%|██████████| 37/37 [02:32<00:00,  4.12s/it]


Inference Time for distilgpt2: 152.36383509635925 seconds
GPU Memory Usage is currently not easily accessible on MPS backend.


In [6]:
n = 100
print("Rank ",n,"-\n")
low_rank_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
fine_tune_model(low_rank_model, model_name, rank=n)  # Adjust rank as needed
evaluate_model(model_name)
del low_rank_model

Rank  100 -

Training model: distilgpt2 with low-rank approximation


100%|██████████| 27/27 [00:33<00:00,  1.26s/it]


{'train_runtime': 33.9655, 'train_samples_per_second': 6.477, 'train_steps_per_second': 0.795, 'train_loss': 3.0713224057798034, 'epoch': 0.98}


100%|██████████| 37/37 [00:03<00:00, 10.76it/s]


Evaluation Results for distilgpt2: {'eval_loss': 0.7671974897384644, 'eval_model_preparation_time': 0.0005, 'eval_runtime': 3.4734, 'eval_samples_per_second': 21.017, 'eval_steps_per_second': 10.652}


100%|██████████| 37/37 [03:11<00:00,  5.17s/it]


Inference Time for distilgpt2: 191.55461192131042 seconds
GPU Memory Usage is currently not easily accessible on MPS backend.


In [8]:
n = 150
print("Rank ",n,"-\n")
low_rank_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
fine_tune_model(low_rank_model, model_name, rank=n)  # Adjust rank as needed
evaluate_model(model_name)
del low_rank_model

Rank  150 -

Training model: distilgpt2 with low-rank approximation


                                               
100%|██████████| 27/27 [00:34<00:00,  1.29s/it]


{'train_runtime': 34.7999, 'train_samples_per_second': 6.322, 'train_steps_per_second': 0.776, 'train_loss': 3.1600098786530673, 'epoch': 0.98}


100%|██████████| 37/37 [00:03<00:00, 10.62it/s]


Evaluation Results for distilgpt2: {'eval_loss': 0.9068130254745483, 'eval_model_preparation_time': 0.0005, 'eval_runtime': 3.5134, 'eval_samples_per_second': 20.777, 'eval_steps_per_second': 10.531}




RuntimeError: MPS backend out of memory (MPS allocated: 30.05 GB, other allocations: 250.19 MB, max allowed: 36.27 GB). Tried to allocate 6.52 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
n = 200
print("Rank ",n,"-\n")
low_rank_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
fine_tune_model(low_rank_model, model_name, rank=n)  # Adjust rank as needed
evaluate_model(model_name)
del low_rank_model