In [None]:
# Upgrade the SageMaker Python SDK in the kernel's environment before it is imported below.
# NOTE(review): the version is unpinned, so reruns may pick up a different SDK release.
%pip install --upgrade sagemaker


In [None]:
%%time
import os

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.huggingface import HuggingFace

# IAM role the training job will run under. A pre-existing role ARN can be
# assigned here as an alternative to creating a new role.
role = get_execution_role()
print(f"SageMaker Execution Role: {role}")

# Ask STS who the current caller is and keep the account ID.
client = boto3.client("sts")
caller_identity = client.get_caller_identity()
account = caller_identity["Account"]
print(f"AWS account: {account}")

# The region is whatever the boto3 session resolved from its configuration.
session = boto3.Session()
region = session.region_name
print(f"AWS region: {region}")

# Low-level SageMaker client, plus the SDK session wrapping the boto3 session above.
sm_boto_client = boto3.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=session)

# Default S3 bucket of this SageMaker session; reused for the job output location.
default_bucket = sagemaker_session.default_bucket()
print()
print("Default bucket for this session: ", default_bucket)

In [None]:
# S3 locations of the datasets. Replace the "<path>" placeholder with your own
# bucket/prefix before running the cells below.

# Specify your S3 bucket path for training dataset
s3_train_bucket = "s3://<path>/train/openweb/bloom/train/"

# Specify your S3 bucket path for test dataset
s3_test_bucket = "s3://<path>/train/openweb/bloom/val/"



In [None]:
# S3 prefix under the session's default bucket; it is passed to the estimator
# as `output_path` further down.
s3_output_location = f"s3://{default_bucket}/output/"

# Print the variable itself (not a second copy of the f-string) so the message
# can never drift from the location that is actually used.
print(f"your output data will be stored in: {s3_output_location}")

In [None]:
# Both channels share the same input settings: the source is an S3 prefix and
# the distribution mode is "FullyReplicated".
channel_settings = {"distribution": "FullyReplicated", "s3_data_type": "S3Prefix"}

train = sagemaker.inputs.TrainingInput(s3_train_bucket, **channel_settings)
test = sagemaker.inputs.TrainingInput(s3_test_bucket, **channel_settings)

# Channel name -> input definition; handed to `fit()` as the job inputs.
data_channels = {"train": train, "test": test}

print(data_channels)

In [None]:
# Hyperparameters handed to the estimator below. How each value is interpreted
# is up to the training script (train_bloom_ds.py), which is not shown here.
hyperparameters = {
    # Epoch count and random seed.
    "num_train_epochs": 1,
    "seed": 100,
    # Learning-rate schedule settings.
    "lr_scheduler_type": "linear",
    "num_warmup_steps": 1,
    # Step limits for training and evaluation
    # (presumably caps that keep this run short; confirm in the script).
    "max_train_steps": 1000,
    "max_eval_steps": 100,
}

In [None]:
# Instance type used for every node of the training cluster.
instance_type = "ml.p4d.24xlarge"

# NOTE(review): the sizing notes below mention GPT-2 and look carried over from
# another example; confirm the requirements for the BLOOM script trained here.
# for gpt2 30b, you need at least 16 p4d instances
# gpt2 xl can be run using a single p4d at the minimum
# gpt2 small can be run using a single p3.16 at the minimum
instance_count = 1

# Number of GPUs on each supported instance type. Add an entry here when
# switching to an instance type that is not listed.
GPUS_PER_INSTANCE = {
    "ml.p3.2xlarge": 1,
    "ml.p3.8xlarge": 4,
    "ml.p3.16xlarge": 8,
    "ml.p3dn.24xlarge": 8,
    "ml.p4d.24xlarge": 8,
    "ml.p4de.24xlarge": 8,
}

# Fail fast with a clear message instead of launching a job whose process
# count silently mismatches the hardware.
if instance_type not in GPUS_PER_INSTANCE:
    raise ValueError(
        f"Unknown GPU count for instance type {instance_type!r}; "
        "add it to GPUS_PER_INSTANCE."
    )

# set to the number of GPUs on that instance
# (derived from instance_type so the two settings cannot get out of sync)
processes_per_host = GPUS_PER_INSTANCE[instance_type]

In [None]:
# Storage volume size passed to the estimator's `volume_size` argument
# (the SageMaker Estimator API documents this value as being in GB).
volume_size = 1024

In [None]:
from sagemaker.pytorch import PyTorch

# Estimator describing the training job: the script ./dscode/train_bloom_ds.py
# is run with the PyTorch 1.12.0 / py38 framework settings on the configured instances.
smp_estimator = PyTorch(
    # --- training code and its arguments ---
    source_dir="./dscode",
    entry_point="train_bloom_ds.py",
    hyperparameters=hyperparameters,
    # --- framework runtime ---
    framework_version="1.12.0",
    py_version="py38",
    # --- identity and cluster shape ---
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size=volume_size,
    # --- MPI launch: `processes_per_host` processes on every instance ---
    distribution={
        "mpi": {
            "enabled": True,
            "processes_per_host": processes_per_host,
            "custom_mpi_options": "-verbose --NCCL_DEBUG=INFO",
        }
    },
    # --- outputs and instrumentation ---
    output_path=s3_output_location,
    debugger_hook_config=False,  # no SageMaker Debugger hook
    disable_profiler=True,  # no SageMaker profiler
)

In [None]:
# Launch the training job with the train/test channels defined above.
# wait=True keeps this cell running until the job completes.
smp_estimator.fit(inputs=data_channels, wait=True)