## Prerequisites

In [None]:
!pip uninstall sagemaker-core -y

In [None]:
!pip install sagemaker ../../dist/sagemaker_core-1.0.0.tar.gz

## Torchrun - Distributed Training

In [None]:
import os

# Hugging Face Hub access token: read from the HF_TOKEN environment variable when it is
# set; otherwise the "<hard_code>" placeholder is used and has to be replaced by hand.
access_token = os.getenv("HF_TOKEN", "<hard_code>")

# Hub identifiers for the models and the dataset.
model_id = "meta-llama/Llama-2-7b-hf"
model_id_2 = "facebook/opt-13b"
dataset_name = "tatsu-lab/alpaca"


In [None]:
!pip install "transformers" "datasets[s3]" "sagemaker" "boto3" --upgrade --quiet

In [None]:
!pip install -r scripts/requirements.txt

In [None]:
import sagemaker
import boto3
# Start from a default session so its default bucket can be looked up.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs. SageMaker creates it automatically
# if it does not exist. Leave as None to use the session's default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError, look the
# role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Re-create the session bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer 

from huggingface_hub.hf_api import HfFolder;

# Load the tokenizer. The Hub credential must be passed as `token`: from_pretrained
# does not read an `access_token` keyword for authentication, so the previous call
# never sent the token with the download request.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

# Load dataset from huggingface.co
dataset = load_dataset(dataset_name)

# Shuffle the dataset with a fixed seed (no downsampling happens here).
dataset = dataset.shuffle(42)


In [None]:
# When the dataset has no "validation" split, build one from the first 1% of "train"
# and reload the remaining 99% as the new "train" split.
if "validation" not in dataset:
    for split_name, split_spec in (("validation", "train[:1%]"), ("train", "train[1%:]")):
        dataset[split_name] = load_dataset(dataset_name, split=split_spec)

In [None]:
from itertools import chain
from functools import partial


def group_texts(examples, block_size=2048):
    """Concatenate a batch of token sequences and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of sequences. It must contain
            an "input_ids" column, which is copied into "labels".
        block_size: Length of every emitted block.

    Returns:
        A dict with the same columns, each a list of blocks, plus a "labels" column
        that is a shallow copy of the "input_ids" blocks. When the concatenated
        length is at least ``block_size`` the trailing remainder is dropped;
        otherwise the single short block is returned unchanged.
    """
    # Flatten each column into one long sequence.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # The length of the first column decides where every column is cut.
    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    if usable_length >= block_size:
        # Drop the remainder that does not fill a whole block.
        usable_length -= usable_length % block_size

    # Slice every column at the same block boundaries.
    block_starts = range(0, usable_length, block_size)
    grouped = {}
    for column, tokens in flattened.items():
        grouped[column] = [tokens[start : start + block_size] for start in block_starts]

    grouped["labels"] = grouped["input_ids"].copy()
    return grouped

# Remember the raw columns so they can be dropped once the text is tokenized.
column_names = dataset["train"].column_names

# Step 1: tokenize the "text" column in batches and remove the raw columns.
tokenized_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"], return_token_type_ids=False),
    batched=True,
    remove_columns=list(column_names),
)

# Step 2: pack the tokenized batches into 2048-token blocks (group_texts also adds "labels").
lm_dataset = tokenized_dataset.map(
    partial(group_texts, block_size=2048),
    batched=True,
)

In [None]:
# Write the processed dataset to a local directory.
training_input_path = 'processed/data/'
lm_dataset.save_to_disk(training_input_path)
print(f"Saved data to: {training_input_path}")

In [None]:
training_input_path = f's3://{sess.default_bucket()}/processed/data/'
print(f"training dataset to: {training_input_path}")# save train_dataset to s3
lm_dataset.save_to_disk(training_input_path)

print(f"uploaded data to: {training_input_path}")

In [None]:
import time
from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch import PyTorch
# Timestamped name used as the prefix of the training job name.
job_name = f'huggingface-fsdp-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'


# Hyperparameters, which are passed into the training job.
hyperparameters={
    'model_id': model_id, # model id from huggingface.co/models
    'dataset_path': '/opt/ml/input/data/train', # path where sagemaker will save training dataset
    'valid_path':"/opt/ml/input/data/valid",
    'gradient_checkpointing': True, # enable gradient checkpointing
    'bf16': True, # enable mixed precision training
    'optimizer': "adamw_torch", # optimizer
    'per_device_train_batch_size': 1, # batch size per device during training
    'epochs': 1, # number of epochs to train
    'fsdp': '"full_shard auto_wrap"', # fully sharded data parallelism
    'fsdp_transformer_layer_cls_to_wrap': "LlamaDecoderLayer", # transformer layer to wrap
    'max_steps':100,
    # NOTE(review): the Hub token travels as a plain hyperparameter — confirm that is acceptable.
    'access_token': access_token
}

# These environment variables are required for P4d instances to enable EFA.
env = {
    'FI_PROVIDER': 'efa',
    'NCCL_PROTO': 'simple',
    'FI_EFA_USE_DEVICE_RDMA': '1',
    'RDMAV_FORK_SAFE': '1',
}

# Estimator describing the training job.
huggingface_estimator = HuggingFace(
    entry_point='run_clm_lora.py',
    source_dir='./scripts',
    instance_type="ml.p4d.24xlarge",
    instance_count=1,
    volume_size=96,
    role=role,
    # The estimator constructor takes the name prefix as `base_job_name`; a `job_name`
    # keyword is not one of its parameters, so the name above was never applied.
    base_job_name=job_name,
    transformers_version='4.28.1',
    pytorch_version='2.0.0',
    py_version="py310",
    environment=env,
    hyperparameters=hyperparameters,
    disable_output_compression=True,
    keep_alive_period_in_seconds=600,
    distribution={"torch_distributed": {"enabled": True}} # enable torchrun
)

In [None]:
# Map the "train" input channel to the S3 location the dataset was uploaded to.
data = dict(train=training_input_path)

# Launch the training job on that channel and block until it finishes.
huggingface_estimator.fit(inputs=data, wait=True)

In [None]:
from sagemaker_core.model_trainer.ModelTrainer import ModelTrainer, FrameworkImageSpec
from sagemaker_core.shapes import ResourceConfig


# Sketch of the same training setup using the sagemaker_core ModelTrainer interface.
# NOTE(review): every constructor below is called without arguments and
# `instance_type` is not passed to any of them — this cell looks unfinished;
# confirm the required arguments against the sagemaker_core API.
instance_type = "ml.p4d.24xlarge"
resource_config = ResourceConfig(
)

framework_image = FrameworkImageSpec(
)

model_trainer = ModelTrainer()