In [1]:

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import horovod.torch as hvd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Set up Horovod, the model, the tokenizer, and the device.

# Initialize Horovod first so rank information is available below.
hvd.init()

# Load the DistilGPT-2 model and tokenizer
# (use "gpt2" here instead for the non-distilled checkpoint).
model_name = 'distilgpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Pin each process to the GPU that matches its local rank. The previous
# condition (`not hvd.local_rank()`) only let local rank 0 use CUDA and
# silently pushed every other worker on the node onto the CPU.
if torch.cuda.is_available():
    local_gpu = hvd.local_rank()
    torch.cuda.set_device(local_gpu)
    device = torch.device("cuda", local_gpu)
else:
    device = torch.device("cpu")
model.to(device)

# Broadcast parameters from rank 0 to ensure all ranks start from the same weights
hvd.broadcast_parameters(model.state_dict(), root_rank=0)


In [3]:

from datasets import load_dataset

# Fetch the raw WikiText-2 train and validation splits in one pass.
train_dataset, val_dataset = (
    load_dataset("wikitext", "wikitext-2-raw-v1", split=split_name)
    for split_name in ("train", "validation")
)

# Collator for causal (non-masked) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Show the columns each split exposes, one list per line.
print(train_dataset.column_names, val_dataset.column_names, sep="\n")



['text']
['text']


In [None]:
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer
import torch
import horovod.torch as hvd

# Ensure padding token is set if it's not already; the data collator needs
# one to pad the sequences of a batch to a common length.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

MAX_LENGTH = 1024  # upper bound, in tokens, on any single training sequence


def _tokenize_split(split, max_length=MAX_LENGTH):
    """Convert a raw split with a 'text' column into truncated token sequences.

    Blank lines are dropped first because they would tokenize to empty
    sequences. Sequences are truncated to ``max_length`` but NOT padded here:
    padding is left to the data collator, which pads each batch only to the
    length of its longest member instead of padding every row to
    ``max_length``.

    A split without a 'text' column is returned unchanged, so re-running this
    cell after the datasets have already been tokenized is harmless.
    """
    if "text" not in split.column_names:
        return split
    non_blank = split.filter(lambda row: bool(row["text"].strip()))
    return non_blank.map(
        lambda rows: tokenizer(rows["text"], max_length=max_length, truncation=True),
        batched=True,
        remove_columns=split.column_names,  # keep only the tokenizer outputs
    )


# Tokenize train and validation datasets
train_dataset = _tokenize_split(train_dataset)
val_dataset = _tokenize_split(val_dataset)

is_primary = hvd.rank() == 0
world_size = hvd.size()

# Give every rank its own equally sized slice of the training data. Equal
# sizes matter: all ranks must perform the same number of optimizer steps,
# otherwise the ranks that finish early leave the others blocked in allreduce.
usable_rows = len(train_dataset) - len(train_dataset) % world_size
train_shard = train_dataset.select(range(usable_rows)).shard(
    num_shards=world_size, index=hvd.rank()
)

# TrainingArguments configuration
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Only rank 0 writes checkpoints so workers do not clobber each other's
    # files in the shared output directory.
    save_strategy="steps" if is_primary else "no",
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    remove_unused_columns=False,  # Ensure unused columns are kept
)

# Wrap optimizer with Horovod's DistributedOptimizer for synchronized gradient updates
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

# Broadcast the optimizer state from rank 0 to ensure all workers start with the same initial state
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Set up the Trainer with the Horovod-wrapped optimizer.
# NOTE(review): Trainer chooses its own device from TrainingArguments and is
# not Horovod-aware; confirm each rank really ends up on its own GPU (e.g. by
# launching with one visible GPU per process).
# NOTE(review): Trainer clips gradients before optimizer.step(), while Horovod
# documents synchronize() -> clip -> skip_synchronize() for clipping; verify
# or disable clipping (max_grad_norm) if results look off.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=train_shard,
    eval_dataset=val_dataset,
    optimizers=(optimizer, None),  # Pass the optimizer to Trainer
)

# Run training on EVERY rank. DistributedOptimizer combines gradients across
# all workers on each step, so training only on rank 0 (as before) makes
# rank 0 wait forever for peers that never join the allreduce.
trainer.train()


Step,Training Loss
