In [None]:
import sys, os

# Resolve the directory three levels above the current working directory and
# put it at the front of the import search path.
three_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
root_dir = os.path.abspath(three_levels_up)
sys.path.insert(0, root_dir)

## Simple Case: Minimal ModelTrainer Setup That Executes a Command

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import SourceCode

# Training container image (PyTorch 2.0.0 CPU build, pulled from the us-west-2 registry).
pytorch_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310"

# No source directory in this case: the job is only given a shell command to run.
source_code = SourceCode(command="echo 'Hello World' && env")

# Everything not specified here is left to ModelTrainer's defaults.
model_trainer = ModelTrainer(source_code=source_code, training_image=pytorch_image)

In [None]:
# Launch the training job configured in the previous cell (no input data channels are passed).
model_trainer.train()

Successful Run - https://tiny.amazon.com/3maxeyb/IsenLink

## Simple Script Mode Case - 1

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import SourceCode

# Training container image (PyTorch 2.0.0 CPU build, pulled from the us-west-2 registry).
pytorch_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310"

# NOTE(review): these look like dummy values used to exercise hyperparameter and
# environment passing -- confirm no real secret is ever placed here.
hyperparameters = {"secret_token": "123456"}
env_vars = {"PASSWORD": "123456"}

# Upload the local "basic-script-mode" directory and run the script with an explicit command.
source_code = SourceCode(
    command="python custom_script.py",
    source_dir="basic-script-mode",
)

model_trainer = ModelTrainer(
    source_code=source_code,
    training_image=pytorch_image,
    environment=env_vars,
    hyperparameters=hyperparameters,
)

# The job is started in the same cell as its configuration.
model_trainer.train()

Successful Run - https://tiny.amazon.com/6uy7pmpj/IsenLink

## Simple Script Mode Case - 2


In [None]:
from sagemaker.modules.configs import SourceCode

# Script-mode variant: name an entry script and a requirements file instead of
# spelling out a raw command.
source_code = SourceCode(
    entry_script="custom_script.py",
    requirements="requirements.txt",
    source_dir="basic-script-mode",
)

# `ModelTrainer` and `pytorch_image` are reused from the earlier cells.
model_trainer = ModelTrainer(source_code=source_code, training_image=pytorch_image)

In [None]:
# Launch the entry-script based training job configured in the previous cell.
model_trainer.train()

Successful Run - https://tiny.amazon.com/7n4n4ogr/IsenLink

## Distributed Training Case

In [None]:
!pip install "datasets[s3]"

In [None]:
# Read the Hugging Face token from the environment only. A real token must never
# be committed as a fallback default: anyone with access to the notebook could use it.
access_token = os.environ.get("HF_TOKEN")
if not access_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face access token that can read "
        "the model repository before running this notebook."
    )

# Model and dataset identifiers used by the cells below.
model_id = "meta-llama/Llama-2-7b-hf"

dataset_name = "tatsu-lab/alpaca"

#### Load Dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

from huggingface_hub.hf_api import HfFolder

# Load the tokenizer. The Hub credential has to be passed as `token`;
# `access_token` is not an authentication argument of `from_pretrained`, so the
# download would not be authenticated with it.
# NOTE(review): older transformers releases expect `use_auth_token` instead --
# confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)


# Load dataset from huggingface.co
dataset = load_dataset(dataset_name)

# A fixed seed keeps the shuffle order the same on every run.
dataset = dataset.shuffle(seed=42)

In [None]:
# When the dataset ships without a validation split, carve one out of "train":
# the first 1% becomes validation and the remaining 99% becomes the new train split.
# NOTE(review): both slices are loaded again by name, so they presumably do not
# reflect the shuffle applied in the previous cell -- confirm this is intended.
if "validation" not in dataset:
    split_slices = {"validation": "train[:1%]", "train": "train[1%:]"}
    for split_name, split_slice in split_slices.items():
        dataset[split_name] = load_dataset(dataset_name, split=split_slice)

#### Prepare Dataset

In [None]:
from itertools import chain
from functools import partial


def group_texts(examples, block_size=2048):
    """Concatenate each column of a tokenized batch and cut it into fixed-size blocks.

    Args:
        examples: Mapping of column name to a list of token lists. It must
            contain an "input_ids" column.
        block_size: Number of tokens per output block.

    Returns:
        A dict with the same columns, each a list of blocks, plus a "labels"
        column that is a shallow copy of the "input_ids" blocks.
    """
    # Flatten every column into one long token list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    usable_length = len(flattened[list(examples.keys())[0]])
    # Drop the trailing remainder so that every block is full. A batch shorter
    # than one block is kept as it is and yields a single short block.
    if usable_length >= block_size:
        usable_length -= usable_length % block_size

    # Slice each column at the same block boundaries.
    block_starts = range(0, usable_length, block_size)
    result = {
        column: [tokens[start : start + block_size] for start in block_starts]
        for column, tokens in flattened.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


# Columns present before tokenization; all of them are removed by the first map.
column_names = dataset["train"].column_names

# Stage 1: tokenize the "text" column in batches, without token type ids.
tokenized_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"], return_token_type_ids=False),
    batched=True,
    remove_columns=list(column_names),
)

# Stage 2: pack the tokenized samples into blocks of 2048 tokens.
lm_dataset = tokenized_dataset.map(
    partial(group_texts, block_size=2048),
    batched=True,
)

#### Save Dataset

In [None]:
# Save the processed dataset locally. The same path is later used as the data
# source of the training job's "dataset" input channel.
# (Plain string literal: the original f-string had no placeholders.)
training_input_path = "distributed-training/processed/data/"
lm_dataset.save_to_disk(training_input_path)

print(f"Saved data to: {training_input_path}")

#### Model Trainer Torchrun

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute, SourceCode, InputData

# Environment variables set in the training containers (EFA / NCCL settings).
env = {
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_DISABLE": "1",
    "NCCL_DEBUG": "WARN",
    "FI_EFA_USE_DEVICE_RDMA": "1",
    "RDMAV_FORK_SAFE": "1",
}

# Two ml.p4d.24xlarge instances with 96 GB volumes, kept alive for an hour after the job.
compute = Compute(
    instance_type="ml.p4d.24xlarge",
    instance_count=2,
    volume_size_in_gb=96,
    keep_alive_period_in_seconds=3600,
)

# Hugging Face training container (PyTorch 2.0.0, transformers 4.28.1, CUDA 11.8), us-west-2 registry.
hugging_face_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04"

# Arguments handed to the training script.
# NOTE(review): "access_token" travels as a plain hyperparameter, which is
# presumably visible in the job description -- consider a safer channel.
hyperparameters = {
    # Matches the "dataset" input channel name used when the job is started.
    "dataset_path": "/opt/ml/input/data/dataset",
    "epochs": 1,
    "max_steps": 100,
    "fsdp": "full_shard auto_wrap",
    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
    "gradient_checkpointing": True,
    "optimizer": "adamw_torch",
    "per_device_train_batch_size": 1,
    "model_id": model_id,
    "access_token": access_token,
}

In [None]:
# Launch torchrun by hand on every node. The trailing backslashes continue the
# string literal, so the command is a single line (indentation spaces included).
# `--nnodes 2` has to stay in sync with the instance count of `compute`.
source_code = SourceCode(
    requirements="requirements.txt",
    source_dir="distributed-training/scripts",
    command="torchrun --nnodes 2 \
            --nproc_per_node 8 \
            --master_addr algo-1 \
            --master_port 7777 \
            --node_rank $SM_CURRENT_HOST_RANK \
            run_clm_no_trainer.py",
)

# Combine image, compute, environment, hyperparameters and source code into one trainer.
model_trainer = ModelTrainer(
    source_code=source_code,
    training_image=hugging_face_image,
    compute=compute,
    hyperparameters=hyperparameters,
    environment=env,
)

In [None]:
# Expose the locally saved dataset to the job as the "dataset" channel.
test_data = InputData(data_source=training_input_path, channel_name="dataset")

# wait=False: start the job without blocking the notebook on it.
model_trainer.train(wait=False, input_data_config=[test_data])

Successful Run - https://tiny.amazon.com/10wljn1yu/IsenLink

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import (
    Compute, SourceCode, InputData
)
from sagemaker.modules.distributed import (
    Torchrun,
    MPI
)

# Hugging Face training container (PyTorch 2.0.0, transformers 4.28.1, CUDA 11.8), us-west-2 registry.
hugging_face_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04"

# Two ml.p4d.24xlarge instances with 96 GB volumes, kept alive for an hour after the job.
compute = Compute(
    instance_type="ml.p4d.24xlarge",
    instance_count=2,
    keep_alive_period_in_seconds=3600,
    volume_size_in_gb=96,
)

# Arguments handed to the training script; `model_id` and `access_token` come
# from an earlier cell.
hyperparameters = {
    # Matches the "dataset" input channel name used when the job is started.
    "dataset_path": "/opt/ml/input/data/dataset",
    "epochs": 1,
    "max_steps": 100,
    "fsdp": "full_shard auto_wrap",
    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
    "gradient_checkpointing": True,
    "optimizer": "adamw_torch",
    "per_device_train_batch_size": 1,
    "model_id": model_id,
    "access_token": access_token,
}

In [None]:
# Only the entry script is named here; the launch is left to the distributed runner.
source_code = SourceCode(
    entry_script="run_clm_no_trainer.py",
    requirements="requirements.txt",
    source_dir="distributed-training/scripts",
)

# Option 1: Torchrun runner (constructed here, but not the one passed to the trainer below).
torchrun = Torchrun()

# Option 2: MPI runner, passing MASTER_ADDR / MASTER_PORT through `-x` options.
mpi = MPI(
    mpi_additional_options=["-x", "MASTER_ADDR=algo-1", "-x", "MASTER_PORT=7777"],
)

# This trainer uses the MPI runner.
model_trainer = ModelTrainer(
    source_code=source_code,
    training_image=hugging_face_image,
    compute=compute,
    distributed_runner=mpi,
    hyperparameters=hyperparameters,
)

In [None]:
# Expose the locally saved dataset to the job as the "dataset" channel.
test_data = InputData(data_source=training_input_path, channel_name="dataset")

# wait=False: start the job without blocking the notebook on it.
model_trainer.train(wait=False, input_data_config=[test_data])