Import Required Libraries

In [None]:
import os
from accelerate import Accelerator, notebook_launcher
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
import torch
from lighteval import LightevalModel

Set up Global Configuration

In [None]:
# --- Model and data sources (Hugging Face Hub identifiers) ---
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"  # swap in the checkpoint to fine-tune
dataset_name = "AI-MO/NuminaMath-TIR"  # swap in the dataset to train on

# --- Filesystem locations ---
output_dir = "/mnt/st1/results/DeepSeek-Train"  # checkpoints and the final model
log_dir = "/mnt/st1/logs"  # passed to TrainingArguments as logging_dir

# --- Optimisation settings ---
num_train_epochs = 3
learning_rate = 2e-5
batch_size = 4  # used for both the per-device train and eval batch size
gradient_accumulation_steps = 4
max_seq_length = 4096  # tokenizer truncation / padding length

# --- Reporting cadence (in optimizer steps) ---
logging_steps = 5
eval_steps = 100

# --- Numeric precision ---
bf16 = True  # forwarded to TrainingArguments(bf16=...); disable on hardware without bf16

Load Dataset, Model, and Tokenizer

In [None]:
# Fetch the dataset, the pretrained causal LM, and its matching tokenizer,
# all identified by the names set in the configuration cell.
dataset = load_dataset(dataset_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenization cell pads every example with padding="max_length", which
# requires a pad token. Some causal-LM tokenizers ship without one, so fall
# back to the EOS token to keep the notebook working when model_name changes.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Tokenization of Dataset

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of dataset rows and attach causal-LM labels.

    Args:
        examples: A batched mapping of column name -> list of values, as
            passed by ``Dataset.map(..., batched=True)``. Either a ``text``
            column or both ``problem`` and ``solution`` columns must be
            present.

    Returns:
        The tokenizer output (padded/truncated to ``max_seq_length``) with an
        extra ``labels`` entry: a copy of ``input_ids`` in which every
        position whose attention mask is 0 (padding) is replaced by -100 so
        it is ignored by the loss.

    Raises:
        KeyError: If the batch has neither ``text`` nor ``problem``/``solution``.
    """
    if "text" in examples:
        texts = examples["text"]
    elif "problem" in examples and "solution" in examples:
        # Datasets such as NuminaMath-TIR expose problem/solution pairs rather
        # than a single text column; join them into one training string and
        # terminate with EOS so the end of the answer is part of the target.
        eos = tokenizer.eos_token or ""
        texts = [
            f"{problem}\n\n{solution}{eos}"
            for problem, solution in zip(examples["problem"], examples["solution"])
        ]
    else:
        raise KeyError(
            "Expected a 'text' column or 'problem'/'solution' columns, "
            f"got: {list(examples)}"
        )

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )

    # Without labels the causal LM returns no loss and Trainer cannot train.
    # Mask via attention_mask (not by comparing with the pad id), because the
    # pad token may be the same id as a real EOS token.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Tokenize every split in batches, then pull out the two splits used for
# training and for periodic evaluation.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset, eval_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "test")
)

Set up Training Arguments

In [None]:
# transformers renamed the `evaluation_strategy` argument to `eval_strategy`
# and deprecated the old spelling. Use whichever keyword the installed
# release declares so this cell works on both older and newer versions.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    eval_steps=eval_steps,
    logging_dir=log_dir,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    bf16=bf16,  # Enable bf16 if supported by your hardware
    # Checkpoint every 500 steps; kept a multiple of eval_steps because
    # load_best_model_at_end needs saves to line up with evaluations.
    save_steps=500,
    load_best_model_at_end=True,
    # Evaluate every `eval_steps` optimizer steps.
    **{eval_strategy_key: "steps"},
)

Initialize Trainer

In [None]:
# Collect the Trainer inputs first so each one is easy to inspect or swap.
# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — confirm against the installed version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

Fine-tune the Model

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell; the
# return value is left as the cell's last expression so the notebook shows it.
trainer.train()

Save the Fine-tuned Model

In [None]:
# Model weights and tokenizer files go into the same sub-directory of
# output_dir so the result can be reloaded from a single path.
fine_tuned_dir = os.path.join(output_dir, "fine_tuned_model")
for artifact in (model, tokenizer):
    artifact.save_pretrained(fine_tuned_dir)

Evaluate the Model

In [None]:
# Initialize a pipeline for evaluation
eval_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Evaluate with a sample input
input_text = "Given the equation x + y = 10, solve for x when y = 3."
output = eval_pipeline(input_text, max_length=50)
print("Model Output: ", output)

Using Accelerate for Multi-GPU Setup

In [None]:
# Path to a DeepSpeed ZeRO-3 Accelerate config. notebook_launcher takes no
# config-file argument, so nothing in this cell reads it.
# NOTE(review): presumably intended for `accelerate launch --config_file ...`
# from a terminal — confirm how this config is meant to be applied.
accelerate_config = "configs/zero3.yaml"  # Assuming you have a config for DeepSpeed


def fine_tune_model():
    """Fine-tune the model inside a process started by ``notebook_launcher``.

    Everything that touches the GPU or the distributed state (training
    arguments, model, Trainer) is created here rather than at notebook level,
    because each launched worker needs its own copy and Accelerate refuses a
    multi-GPU launch once an Accelerator already exists in the notebook
    process. Trainer sets up Accelerate internally, so no explicit
    ``Accelerator()`` is constructed.

    Reads the notebook-level configuration values, ``tokenizer``,
    ``train_dataset`` and ``eval_dataset``. Trained weights do not flow back
    to the notebook process, so the model and tokenizer are saved to
    ``<output_dir>/fine_tuned_model`` before returning.
    """
    # Same settings as the single-process TrainingArguments cell; keep the two
    # in sync when tuning hyper-parameters.
    strategy_key = (
        "eval_strategy"
        if "eval_strategy" in TrainingArguments.__dataclass_fields__
        else "evaluation_strategy"
    )
    worker_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        eval_steps=eval_steps,
        logging_dir=log_dir,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        bf16=bf16,
        save_steps=500,
        load_best_model_at_end=True,
        **{strategy_key: "steps"},
    )

    worker_model = AutoModelForCausalLM.from_pretrained(model_name)
    worker_trainer = Trainer(
        model=worker_model,
        args=worker_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )
    worker_trainer.train()

    # save_model is used so the Trainer handles saving under distributed training.
    worker_trainer.save_model(os.path.join(output_dir, "fine_tuned_model"))


# NOTE(review): Accelerate rejects a multi-GPU notebook launch if CUDA was
# already initialised in this kernel. The single-process training cells above
# likely do that, so this cell is probably an alternative to them that needs a
# fresh kernel (run only imports, config, and tokenization first) — confirm.
notebook_launcher(
    fine_tune_model,
    args=(),
    num_processes=torch.cuda.device_count() or 1,
)

Evaluating with Lighteval

In [None]:

# Run the "aime24" task and write results under <output_dir>/eval_results.
# NOTE(review): this passes `model_name` (the Hub id from the configuration
# cell), not the directory the fine-tuned model was saved to — confirm whether
# the baseline or the fine-tuned checkpoint is meant to be evaluated.
# NOTE(review): `LightevalModel` and its `.evaluate(task=..., output_dir=...)`
# signature could not be verified from this file — confirm they exist in the
# installed lighteval release.
lighteval_model = LightevalModel(model_name)
lighteval_model.evaluate(task="aime24", output_dir=os.path.join(output_dir, "eval_results"))