In [None]:
# Placeholder prefix that later cells interpolate into their `base_job_name`
# values (presumably meant to be replaced with your own alias before running).
alias = "<alias>"

# ModelTrainer
The ModelTrainer is a new training interface designed to tackle many of the challenges that exist in today's Estimator class. Some key features include:

1. Improved design principles that make the interface much more user-friendly and approachable than the previous Estimator.
2. Removal of the dependency on the Training Toolkit for Script Mode and BYOC, bringing the training toolkit's runtime driver code into the PySDK for a smoother one-stop experience and an extensible distributed runner.
3. Training Recipe support for easier setup of LLM training with the PySDK.

## ModelTrainer - Script Mode Case - 1

This case showcases the minimal setup for a ModelTrainer. A user needs only to provide the desired training image and the commands they wish to execute in the container, using the `SourceCode` config object.


In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import SourceCode

# Training container image the job runs in (CPU PyTorch image, per its tag).
pytorch_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310"

# Minimal source configuration: no source directory or entry script, only the
# command string to execute.
source_code = SourceCode(command="echo 'Hello World' && env")

model_trainer = ModelTrainer(
    base_job_name=f"{alias}-simple-case-1",
    training_image=pytorch_image,
    source_code=source_code,
)

In [None]:
# Run training with the ModelTrainer configured in the previous cell.
model_trainer.train()

## ModelTrainer - Script Mode Case - 2

This case showcases an abstracted setup for script mode, where a user provides their training image and a `SourceCode` config object with the path to their `source_dir`, the `entry_script`, and any additional `requirements` to install in the training container.

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import SourceCode

# Training container image the job runs in (CPU PyTorch image, per its tag).
pytorch_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310"

# Script-mode source configuration: a source directory, the entry script to
# run, and a requirements file.
source_code = SourceCode(
    entry_script="custom_script.py",
    source_dir="basic-script-mode",
    requirements="requirements.txt",
)

model_trainer = ModelTrainer(
    base_job_name=f"{alias}-simple-case-2",
    training_image=pytorch_image,
    source_code=source_code,
)

In [None]:
# Run training with the ModelTrainer configured in the previous cell.
model_trainer.train()

# Distributed Training Setup

In [None]:
!pip install "datasets[s3]" "requests<2.32.0"

In [None]:
import os
from getpass import getpass

# Never commit a Hugging Face token to the notebook. The token is taken from the
# HUGGING_FACE_HUB_TOKEN environment variable when it is already set; otherwise
# the user is prompted for it (the typed value is not echoed).
# NOTE(review): any token that was previously hardcoded in this cell should be
# treated as leaked and revoked.
access_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or getpass(
    "Hugging Face access token: "
)
os.environ["HUGGING_FACE_HUB_TOKEN"] = access_token

# Hub identifier of the model whose tokenizer is loaded and that is passed to
# the training jobs as the `model_id` hyperparameter.
model_id = "meta-llama/Llama-2-7b-hf"

# Hub identifier of the dataset that is downloaded and pre-processed below.
dataset_name = "tatsu-lab/alpaca"

#### Load Data Set

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

from huggingface_hub.hf_api import HfFolder

# Load the tokenizer. `token` is the keyword `from_pretrained` documents for Hub
# authentication; an unrecognised keyword such as `access_token` is not used to
# authenticate the download.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)


# Load dataset from huggingface.co
dataset = load_dataset(dataset_name)

# Shuffle with a fixed seed so the ordering is reproducible between runs.
dataset = dataset.shuffle(seed=42)

In [None]:
# The dataset may not ship with a validation split. Carve 1% out of the
# already-loaded (and already-shuffled) "train" split instead of calling
# `load_dataset` twice more: re-loading from the Hub would throw away the
# seeded shuffle applied earlier and repeat the dataset load for no benefit.
if "validation" not in dataset:
    # `train_test_split` returns a DatasetDict with "train" and "test" keys;
    # the fixed seed keeps the split reproducible.
    train_validation = dataset["train"].train_test_split(test_size=0.01, seed=42)
    dataset["train"] = train_validation["train"]
    dataset["validation"] = train_validation["test"]

#### Prepare Dataset

In [None]:
from itertools import chain
from functools import partial


def group_texts(examples, block_size=2048):
    """Concatenate every column of a batch and re-cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of sequences. It must
            contain an ``"input_ids"`` column, which is used for the labels.
        block_size: Length of each output block.

    Returns:
        A dict with the same columns as ``examples``, each holding a list of
        ``block_size``-long slices of the concatenated column, plus a
        ``"labels"`` entry that is a shallow copy of the ``"input_ids"`` list.
        The usable length is taken from the first column. When it reaches
        ``block_size`` the trailing remainder is dropped; when it is shorter,
        a single short block is kept.
    """
    # Flatten each column into one long list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    # Drop the remainder only when at least one full block is available;
    # padding could be used instead if the model supported it.
    if usable_length >= block_size:
        usable_length -= usable_length % block_size

    # Cut every column at the same block boundaries.
    block_starts = range(0, usable_length, block_size)
    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start : start + block_size] for start in block_starts]

    result["labels"] = result["input_ids"].copy()
    return result


# Columns of the raw "train" split; they are dropped once tokenised.
column_names = dataset["train"].column_names

# Step 1: tokenise the "text" column in batches, without token type ids, and
# remove the original columns.
tokenized_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"], return_token_type_ids=False),
    remove_columns=list(column_names),
    batched=True,
)

# Step 2: regroup the tokenised batches into 2048-token blocks.
lm_dataset = tokenized_dataset.map(
    partial(group_texts, block_size=2048),
    batched=True,
)

#### Save Dataset

In [None]:
# Save the processed dataset locally; this path is later used as the data
# source of the training input channel.
# (Plain string literal: the original f-string had no placeholders.)
training_input_path = "distributed-training/processed/data/"
lm_dataset.save_to_disk(training_input_path)

print(f"Saved data to: {training_input_path}")

## ModelTrainer - Distributed Training - Case 1

This use case shows how a user could perform a more complex setup for distributed training using `torchrun`, setting up the commands for execution directly via the `command` parameter of the `SourceCode` class.

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute, SourceCode, InputData

# Environment variables handed to the ModelTrainer via `environment=`.
env = {
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_DISABLE": "1",
    "NCCL_DEBUG": "WARN",
}

# Single-instance compute configuration.
compute = Compute(
    instance_type="ml.g5.48xlarge",
    instance_count=1,
    volume_size_in_gb=96,
    keep_alive_period_in_seconds=3600,
)

# Hugging Face PyTorch training image (GPU build, per its tag).
hugging_face_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04"

# Hyperparameters handed to the ModelTrainer; presumably consumed by the
# training script -- verify against `run_clm_no_trainer.py`.
# NOTE(review): the Hugging Face token is passed as a plain hyperparameter --
# confirm this is acceptable, since hyperparameters are typically visible in
# the job's metadata.
hyperparameters = {
    "dataset_path": "/opt/ml/input/data/dataset",
    "epochs": 1,
    "max_steps": 100,
    "fsdp": "full_shard auto_wrap",
    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
    "gradient_checkpointing": True,
    "optimizer": "adamw_torch",
    "per_device_train_batch_size": 1,
    "model_id": model_id,
    "access_token": access_token,
}

In [None]:
# The launch command is spelled out by hand. The backslash continuations sit
# inside the string literal, so the command is one logical line with runs of
# spaces between the arguments.
source_code = SourceCode(
    command="torchrun --nnodes 1 \
            --nproc_per_node 8 \
            --master_addr algo-1 \
            --master_port 7777 \
            --node_rank $SM_CURRENT_HOST_RANK \
            run_clm_no_trainer.py",
    source_dir="distributed-training/scripts",
    requirements="requirements.txt",
)

# Combine image, compute, environment, hyperparameters and source code.
model_trainer = ModelTrainer(
    base_job_name=f"{alias}-distributed-case-1",
    training_image=hugging_face_image,
    source_code=source_code,
    compute=compute,
    environment=env,
    hyperparameters=hyperparameters,
)

In [None]:
# Expose the locally saved dataset to the job as the "dataset" channel (the
# `dataset_path` hyperparameter ends in the same name).
test_data = InputData(data_source=training_input_path, channel_name="dataset")

model_trainer.train(input_data_config=[test_data])

## ModelTrainer - Distributed Training - Case 2

This distributed training case showcases how a user could perform distributed training using an abstracted approach provided by the PySDK via the `Torchrun` distributed runner object.

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import (
    Compute, SourceCode, InputData
)

# Two-instance compute configuration.
compute = Compute(
    instance_type="ml.g5.48xlarge",
    instance_count=2,
    volume_size_in_gb=96,
    keep_alive_period_in_seconds=3600,
)

# Hugging Face PyTorch training image (GPU build, per its tag).
hugging_face_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04"

# Hyperparameters handed to the ModelTrainer; presumably consumed by the
# training script -- verify against `run_clm_no_trainer.py`.
# NOTE(review): the Hugging Face token is passed as a plain hyperparameter --
# confirm this is acceptable, since hyperparameters are typically visible in
# the job's metadata.
hyperparameters = {
    "dataset_path": "/opt/ml/input/data/dataset",
    "epochs": 1,
    "max_steps": 100,
    "fsdp": "full_shard auto_wrap",
    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
    "gradient_checkpointing": True,
    "optimizer": "adamw_torch",
    "per_device_train_batch_size": 1,
    "model_id": model_id,
    "access_token": access_token,
}

In [None]:
from sagemaker.modules.distributed import Torchrun, SMP

# Script-mode source: entry script, its source directory and a requirements file.
source_code = SourceCode(
    entry_script="run_clm_no_trainer.py",
    source_dir="distributed-training/scripts",
    requirements="requirements.txt",
)

# Plain Torchrun runner -- this is the one handed to the ModelTrainer below.
torchrun = Torchrun()

# Alternative runner: Torchrun with SMP options. It is constructed here for
# illustration only and is not passed to the ModelTrainer in this cell.
torchrun_smp = Torchrun(
    smp=SMP(activation_loading_horizon=2, sm_activation_offloading=True),
)

model_trainer = ModelTrainer(
    base_job_name=f"{alias}-distributed-case-2",
    training_image=hugging_face_image,
    source_code=source_code,
    distributed_runner=torchrun,
    compute=compute,
    hyperparameters=hyperparameters,
)

In [None]:
# Expose the locally saved dataset to the job as the "dataset" channel (the
# `dataset_path` hyperparameter ends in the same name).
test_data = InputData(data_source=training_input_path, channel_name="dataset")

model_trainer.train(input_data_config=[test_data])

## ModelTrainer - Distributed Training - Case 3

In this case, we show how a distributed training job can be set up and run using the more generic `MPI` distributed runner, with some additional MPI options set through the `mpi_additional_options` parameter.

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import (
    Compute, SourceCode, InputData
)

# Two-instance compute configuration.
compute = Compute(
    instance_type="ml.g5.48xlarge",
    instance_count=2,
    volume_size_in_gb=96,
    keep_alive_period_in_seconds=3600,
)

# Hugging Face PyTorch training image (GPU build, per its tag).
hugging_face_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04"

# Hyperparameters handed to the ModelTrainer; presumably consumed by the
# training script -- verify against `run_clm_no_trainer.py`.
# NOTE(review): the Hugging Face token is passed as a plain hyperparameter --
# confirm this is acceptable, since hyperparameters are typically visible in
# the job's metadata.
hyperparameters = {
    "dataset_path": "/opt/ml/input/data/dataset",
    "epochs": 1,
    "max_steps": 100,
    "fsdp": "full_shard auto_wrap",
    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
    "gradient_checkpointing": True,
    "optimizer": "adamw_torch",
    "per_device_train_batch_size": 1,
    "model_id": model_id,
    "access_token": access_token,
}

In [None]:
from sagemaker.modules.distributed import MPI

# Script-mode source: entry script, its source directory and a requirements file.
source_code = SourceCode(
    entry_script="run_clm_no_trainer.py",
    source_dir="distributed-training/scripts",
    requirements="requirements.txt",
)

# MPI runner with extra command-line options; each `-x NAME=value` pair
# presumably exports that variable to the MPI processes -- confirm against the
# MPI launcher in use.
mpi_options = ["-x", "MASTER_ADDR=algo-1", "-x", "MASTER_PORT=7777"]
mpi = MPI(mpi_additional_options=mpi_options)

model_trainer = ModelTrainer(
    base_job_name=f"{alias}-distributed-case-3",
    training_image=hugging_face_image,
    source_code=source_code,
    distributed_runner=mpi,
    compute=compute,
    hyperparameters=hyperparameters,
)

In [None]:
# Expose the locally saved dataset to the job as the "dataset" channel (the
# `dataset_path` hyperparameter ends in the same name).
test_data = InputData(data_source=training_input_path, channel_name="dataset")

model_trainer.train(input_data_config=[test_data])

# ModelTrainer Recipes

Training recipes are an abstracted way to create a ModelTrainer for training LLMs via a `recipe.yaml` that holds the configuration for the trainer, model weights, etc.

## ModelTrainer - Training Recipes - Case 1

This example showcases how a user could leverage SageMaker pre-defined training recipe `training/llama/hf_llama3_8b_seq8192_gpu` for training a llama3 model using synthetic data.

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute

# Values that override the corresponding entries of the pre-defined recipe.
recipe_overrides = {
    "run": {"results_dir": "/opt/ml/model"},
    "exp_manager": {
        "exp_dir": "",
        "explicit_log_dir": "/opt/ml/output/tensorboard",
        "checkpoint_dir": "/opt/ml/checkpoints",
        "export_full_model": {"save_last": False},
    },
    "model": {
        "data": {
            "train_dir": "/opt/ml/input/data/train",
            "val_dir": "/opt/ml/input/data/val",
            # Synthetic data is used, so no input channels are passed to train().
            "use_synthetic_data": True,
        },
        "train_batch_size": 1,
        "num_hidden_layers": 4,
        "fp8": False,
    },
    "trainer": {"num_nodes": 1},
}

# Container image used for the recipe-based job.
training_image = "438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu"

model_trainer = ModelTrainer.from_recipe(
    base_job_name=f"{alias}-recipe-case-1",
    training_recipe="training/llama/hf_llama3_8b_seq8192_gpu",
    recipe_overrides=recipe_overrides,
    training_image=training_image,
    compute=Compute(instance_type="ml.g5.48xlarge"),
)

In [None]:
# Run training with the recipe-based ModelTrainer created in the previous cell.
model_trainer.train()

## ModelTrainer - Training Recipes - Case 2

This example showcases how a user can leverage the sagemaker recipe adaptors to train a model with configurations in a custom local `custom-recipe.yaml`.

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute

# Container image used for the recipe-based job.
training_image = "438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu"

# Build the trainer from a local recipe file; no overrides are supplied here.
model_trainer = ModelTrainer.from_recipe(
    base_job_name=f"{alias}-recipe-case-2",
    training_recipe="recipes/custom-recipe.yaml",
    training_image=training_image,
    compute=Compute(instance_type="ml.g5.48xlarge"),
)

In [None]:
# Run training with the recipe-based ModelTrainer created in the previous cell.
model_trainer.train()

## ModelTrainer - Training Recipes - Case 3

This use case shows how a `neuronx-distributed-training` recipe from a GitHub URL can be trained on a Trainium instance using custom data in an S3 bucket.

In [None]:
from sagemaker.session import Session

# Import the class directly so that `session` is only ever bound to the Session
# instance, rather than first to the `sagemaker.session` module and then to an
# instance of its class under the same name.
session = Session()

# Used here only to build the compiler-cache prefix inside the session's
# default bucket.
base_job_name = "trn-llama"
compiler_cache_bucket = f"s3://{session.default_bucket()}/{base_job_name}/compiler-cache"
print(f"Compiler cache: {compiler_cache_bucket}")

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute, InputData, StoppingCondition

# Values that override the corresponding entries of the remote recipe.
recipe_overrides = {
    "data": {"train_dir": "/opt/ml/input/data/train"},
    "model": {"model_config": "/opt/ml/input/data/train/config.json"},
    "trainer": {"max_epochs": 1},
    # S3 location computed in the previous cell.
    "compiler_cache_url": compiler_cache_bucket,
}

# Environment variables handed to the trainer via `environment=`.
env = {"FI_EFA_FORK_SAFE": "1"}

# Container image used for the Neuron recipe job.
training_image = "438465156666.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron"

# NOTE(review): unlike the other examples in this notebook, no `base_job_name`
# is passed here -- confirm this is intentional.
model_trainer = ModelTrainer.from_recipe(
    training_recipe="https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/refs/heads/main/examples/conf/hf_llama3_8B_config.yaml",
    training_image=training_image,
    recipe_overrides=recipe_overrides,
    environment=env,
    compute=Compute(instance_count=2, instance_type="ml.trn1.32xlarge"),
    stopping_condition=StoppingCondition(max_runtime_in_seconds=3600),
)

In [None]:
# Training data is read from S3 and exposed as the "train" channel (the recipe
# overrides point `train_dir` at a path ending in the same name).
train = InputData(
    data_source="s3://sagemaker-recipes-438465156666-data/data_llama3/",
    channel_name="train",
)

# `wait=False` is passed, so this call is not asked to wait for the job.
model_trainer.train(wait=False, input_data_config=[train])