In [None]:
import sys, os

# Get the absolute path of the root directory (three levels above the current
# working directory) and put it first on sys.path so modules located under it
# can be imported from this notebook.
root_dir = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
sys.path.insert(0, root_dir)

# NOTE(review): `image_uri` is not referenced anywhere in the visible notebook and
# looks like an accidental auto-import — confirm and remove if so. It is kept here,
# but placed after the sys.path setup: before that point the `tests` package is
# only importable if it already happens to be on the path.
from tests.integ.test_processing import image_uri

## Simple Case Minimally Setup ModelTrainer and Execute Commands

1. The API limit is no longer an issue
2. Maintain contract with container (no secret handshake)
3. Decouple Training Toolkit

In [None]:
from sagemaker.modules.train.model_trainer import ModelTrainer

# Minimal setup: the only thing supplied is the training image.
model_trainer = ModelTrainer(
    training_image="python:3.10.15-slim",
)


In [None]:
from sagemaker.modules.configs import SourceCodeConfig

# No source directory or script: the job is configured with a single shell
# command that prints a greeting and then the environment variables.
source_code_config = SourceCodeConfig(command="echo 'Hello World' && env")
model_trainer.train(
    source_code_config=source_code_config,
)

## Simple Script Mode Case - 1

1. Users no longer have to hard-code the container path `/opt/ml/input/data/code/custom_script.py` in the command (a code smell), as in:
```python
source_code_config = SourceCodeConfig(
    source_dir="script-mode-basic",
    command="python /opt/ml/input/data/code/custom_script.py",
)
```

2. No more risk of API Limit being reached

In [None]:
from sagemaker.modules.configs import SourceCodeConfig

# The command refers to the script by its name inside `source_dir`.
source_code_config = SourceCodeConfig(
    command="python custom_script.py",
    source_dir="script-mode-basic",
)

# Two named input channels, each given as a local directory path.
model_trainer.train(
    input_data_channels=dict(
        train="script-mode-basic/data/train",
        test="script-mode-basic/data/test",
    ),
    source_code_config=source_code_config,
)

## Simple Script Mode Case - 2


In [None]:
from sagemaker.modules.configs import SourceCodeConfig

# Instead of a shell command, name an entry script and a requirements file;
# both are given by file name alongside `source_dir`.
source_code_config = SourceCodeConfig(
    entry_script="custom_script.py",
    requirements="requirements.txt",
    source_dir="script-mode-basic",
)

# Same two input channels as a plain channel-name -> local-path mapping.
channels = {
    "train": "script-mode-basic/data/train",
    "test": "script-mode-basic/data/test",
}
model_trainer.train(source_code_config=source_code_config, input_data_channels=channels)
del channels  # keep the notebook namespace identical to the inline-dict form

## Distributed Training Case

In [None]:
!pip install "datasets[s3]"

In [None]:
# Read the Hugging Face access token from the environment only. A token must never
# be hard-coded in the notebook: anything committed here is effectively public.
# NOTE(review): the token that used to be embedded on this line as a fallback has
# been exposed and should be revoked.
access_token = os.environ.get("HF_TOKEN")
if not access_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face access token in the "
        "HF_TOKEN environment variable before running this cell."
    )
model_id = "meta-llama/Llama-2-7b-hf"

dataset_name = "tatsu-lab/alpaca"


#### Load Data Set

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

from huggingface_hub.hf_api import HfFolder

# Load the tokenizer for `model_id`.
# The documented keyword for passing a Hub access token to `from_pretrained` is
# `token`; `access_token` is not a recognised authentication argument, so the
# token would not be used for the download.
# NOTE(review): transformers releases that predate `token=` expect
# `use_auth_token=` instead — confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)


# Load dataset from huggingface.co
dataset = load_dataset(dataset_name)

# Fixed seed so the shuffle is reproducible across runs.
dataset = dataset.shuffle(seed=42)


In [None]:
# When the loaded dataset has no "validation" split, build one from "train":
# the first 1% becomes "validation" and the remaining 99% becomes "train".
# NOTE(review): both splits are re-loaded with load_dataset here, so they replace
# the shuffled "train" split rather than slicing it — confirm this is intended.
if "validation" not in dataset:
    dataset["validation"] = load_dataset(dataset_name, split="train[:1%]")
    dataset["train"] = load_dataset(dataset_name, split="train[1%:]")

#### Prepare Dataset

In [None]:
from itertools import chain
from functools import partial

def group_texts(examples, block_size=2048):
    """Concatenate every column of a batch and re-cut it into fixed-size blocks.

    Args:
        examples: Mapping of column name to a list of sequences (for example
            lists of token ids). Must contain an "input_ids" column.
        block_size: Length of each output block.

    Returns:
        A dict with the same columns, each a list of blocks of ``block_size``
        items, plus a "labels" column that is a shallow copy of the
        "input_ids" blocks.

    The length used for chunking is taken from the first column. When it is at
    least ``block_size`` the trailing remainder is dropped; when it is shorter,
    the single short block is kept as is.
    """
    # Flatten each column into one long list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain(*examples[column]))

    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    # Trim to a whole number of blocks (only when at least one full block exists).
    if usable_length >= block_size:
        usable_length -= usable_length % block_size

    # Cut every column at the same block boundaries.
    block_starts = range(0, usable_length, block_size)
    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start : start + block_size] for start in block_starts]

    result["labels"] = result["input_ids"].copy()
    return result

column_names = dataset["train"].column_names

# Two batched passes over the dataset:
#   1. tokenize the "text" column (without token type ids) and drop the
#      original columns listed in `column_names`;
#   2. pack the tokenized output into blocks of 2048 with `group_texts`.
lm_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"],return_token_type_ids=False),
    batched=True,
    remove_columns=list(column_names),
).map(
    partial(group_texts, block_size=2048),
    batched=True,
)

#### Save Dataset

In [None]:
# Save the processed dataset to a local directory

training_input_path = "distributed-training/processed/data/"
lm_dataset.save_to_disk(training_input_path)

print("Saved data to: " + training_input_path)

#### Model Trainer Torchrun

In [None]:
from sagemaker import image_uris
from sagemaker.modules.train.model_trainer import ModelTrainer
from sagemaker.modules.configs import ResourceConfig

# Environment variables passed to the ModelTrainer (EFA / NCCL / RDMA settings).
env = {
    'FI_PROVIDER': 'efa',
    'NCCL_PROTO': 'simple',
    'NCCL_SOCKET_IFNAME': "eth0",
    'NCCL_IB_DISABLE': "1",
    'NCCL_DEBUG': "WARN",
    'FI_EFA_USE_DEVICE_RDMA': '1',
    'RDMAV_FORK_SAFE': '1',
}

# Two ml.p4d.24xlarge instances, each with a 96 GB volume.
resource_config = ResourceConfig(
    instance_type="ml.p4d.24xlarge",
    instance_count=2,
    volume_size_in_gb=96,
)

# Look up the Hugging Face training image (region is fixed to us-west-2) and
# build the trainer from it together with the environment and resources above.
model_trainer = ModelTrainer(
    training_image=image_uris.retrieve(
        framework="huggingface",
        region="us-west-2",
        version="4.28.1",
        base_framework_version="pytorch2.0.0",
        py_version="py310",
        image_scope="training",
        distribution={"torch_distributed": {"enabled": True}},
    ),
    resource_config=resource_config,
    environment=env,
)


In [None]:
from sagemaker.modules.configs import SourceCodeConfig


# Hyperparameters handed to the training job.
# NOTE(review): `access_token` is passed as an ordinary hyperparameter, so the
# secret travels with the job configuration — confirm this is acceptable or
# switch to a secrets mechanism.
hyper_parameters = dict(
    dataset_path='/opt/ml/input/data/dataset',
    epochs=1,
    max_steps=100,
    fsdp='"full_shard auto_wrap"',
    fsdp_transformer_layer_cls_to_wrap="LlamaDecoderLayer",
    gradient_checkpointing=True,
    optimizer="adamw_torch",
    per_device_train_batch_size=1,
    model_id=model_id,
    access_token=access_token,
)

# torchrun launch command. `--nnodes 2` is hard-coded and has to stay in sync
# with `instance_count=2` in the ResourceConfig. The backslash continuations sit
# inside the string literal, so the indentation of the following lines is part
# of the command text.
source_code_config = SourceCodeConfig(
    source_dir="distributed-training/scripts",
    requirements="requirements.txt",
    command="torchrun --nnodes 2 \
            --nproc_per_node $SM_NUM_GPUS \
            --master_addr algo-1 \
            --master_port 7777 \
            --node_rank $SM_CURRENT_HOST_INDEX \
            run_clm_no_trainer.py",
)

# The processed dataset directory is supplied as the "dataset" input channel.
model_trainer.train(
    input_data_channels={"dataset": training_input_path},
    hyper_parameters=hyper_parameters,
    source_code_config=source_code_config,
)

## Problems to Solve

### How can we make it easier for user to build an image?
* This is challenging because installing some ML frameworks differs across operating systems: they depend on lower-level dependencies, such as CUDA, being installed first - https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Amazon-Linux&target_version=2023&target_type=rpm_network

```python
custom_image = custom_image_builder(
    image_name="my_custom_image",
    requirements="requirements.txt",
    base_image="python:3.10.15-slim",
    docker_file="Dockerfile",
    environment={"SOME_VAR": 4},
    upload_ecr=True,
    local_snapshot=True,
)

model_trainer = ModelTrainer(training_image=custom_image)
```

In [None]:
from sagemaker.modules.train.model_trainer import ModelTrainer

# The python:3.10.15-slim image, referenced via the public ECR registry path
# and via its short Docker-style name.
python_ecr_image = "public.ecr.aws/docker/library/python:3.10.15-slim"
python_docker_image = "python:3.10.15-slim"

# This does not work
model_trainer = ModelTrainer(
    training_image=python_ecr_image,
)


In [None]:
# Run a single echo command with the trainer built from the ECR image.
source_code_config = SourceCodeConfig(command="echo 'Hello World'")
model_trainer.train(
    source_code_config=source_code_config,
)