In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from accelerate import Accelerator

# Clear CUDA cache
torch.cuda.empty_cache()

# Initialize Accelerator
accelerator = Accelerator()

# Reinitialize Accelerator State
accelerator.state._reset_state()

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set pad_token as eos_token for compatibility
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

# Load a synthetic dataset or use a subset of a standard dataset (for simplicity, we use wikitext here)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Data collator to include labels
def data_collator(batch):
    input_ids = torch.stack([example['input_ids'] for example in batch])
    
    # Handle attention_mask
    if 'attention_mask' in batch[0]:
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
    else:
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
    
    # Labels are the same as input_ids, but shifted by one token to the right
    labels = input_ids.clone()
    
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Fine-Tuning Setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Set to 1 for a quick experiment; increase for more thorough training
    weight_decay=0.01,
    fp16=True if torch.cuda.is_available() else False,  # Use FP16 if available for faster training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-Tune the Model
trainer.train()

# Evaluate the Model
results = trainer.evaluate()
print(f"Evaluation Results: {results}")

# Measure Inference Time
import time

start_time = time.time()
trainer.predict(tokenized_datasets)
end_time = time.time()

inference_time = end_time - start_time
print(f"Inference Time: {inference_time} seconds")

# Measure GPU Memory Usage
memory_usage = torch.cuda.max_memory_allocated(device) / (1024**2)  # in MB
print(f"GPU Memory Usage: {memory_usage} MB")


  from .autonotebook import tqdm as notebook_tqdm
  9%|▊         | 4/46 [05:08<54:25, 77.76s/it]