## SageMaker Training jobs demo

## 3. Preparing Notebook 

#### Install Required dependencies 

In [None]:
! pip install datasets transformers
! pip install -U sagemaker boto3

## 4. Preparing Dataset

For this example we will use the Food-101 dataset, which originally contains 76k food images spanning 101 labels. For this run we will limit the number of samples to 11k, with 10k used for training and 1k used for validation.

In [None]:
from datasets import load_dataset,DatasetDict

# Load the 'food101' train and validation splits and collect them in a single
# DatasetDict keyed by split name.
# NOTE(review): the accompanying text mentions limiting the data to 10k/1k samples;
# no subsetting happens in this cell — confirm whether the training script does it.
ds = DatasetDict(
    {
        split_name: load_dataset('food101', split=split_name)
        for split_name in ("train", "validation")
    }
)

### Upload Dataset to S3 for Training

In [None]:
import sagemaker
import boto3
# Bucket used for uploading data, models and logs. Leave as None to fall back to
# the session's default bucket (which SageMaker creates if it does not exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sagemaker_session_bucket is None:
    # No explicit bucket given: use the session default.
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN. When get_execution_role() raises ValueError, look the
# role up by its name ('sagemaker_execution_role') through the IAM API instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    role = boto3.client('iam').get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session so it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


In [None]:

# S3 prefix, inside the session's default bucket, that receives the dataset.
training_input_path = "s3://{}/dataset/food101".format(sess.default_bucket())
print(f"training dataset to: {training_input_path}")

# Save the DatasetDict (both splits) to S3.
# NOTE(review): writing directly to an s3:// URI presumably relies on an fsspec
# S3 backend (s3fs) being installed in this environment — confirm.
ds.save_to_disk(training_input_path)

print(f"uploaded data to: {training_input_path}")

### Run training on SageMaker

In [None]:
# Hyperparameters handed to the PyTorch estimator below.
#
# To switch models, change "model_name_or_path":
#   resnet: "microsoft/resnet-101"
#   vit:    "google/vit-base-patch16-224-in21k"
#
# NOTE(review): both a step cap (max_train_steps) and an epoch count
# (num_train_epochs) are set; which one takes precedence depends on the
# training script — confirm.
hyperparameters = {
    "model_name_or_path": "google/vit-base-patch16-224-in21k",
    "seed": 100,
    "per_device_train_batch_size": 128,
    "per_device_eval_batch_size": 128,
    "learning_rate": 5e-5,
    "max_train_steps": 10000,
    "num_train_epochs": 25,
    "output_dir": "/opt/ml/model",
}


In [None]:
from sagemaker.pytorch import PyTorch
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import TensorBoardOutputConfig
import os

In [None]:
## Tensorboard configuration

# S3 URIs always use "/" as the separator, so they are assembled with explicit
# "/" joins. os.path.join uses the local OS separator and would produce
# backslashes on Windows, yielding an invalid URI.
output_path = "/".join(
    ["s3://" + sagemaker_session_bucket, "sagemaker-output", "01102024", "vit-img-classification-ddp"]
)

# Directory inside the training container given to TensorBoardOutputConfig as
# container_local_output_path.
# NOTE(review): the training script is presumably expected to write its
# TensorBoard event files here — confirm against the script.
LOG_DIR = "/opt/ml/output/tensorboard"

tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=output_path + "/tensorboard",
    container_local_output_path=LOG_DIR,
)

In [None]:
# Profiler configuration

from sagemaker import ProfilerConfig, Profiler
# Profiler settings passed to the estimator: a Profiler with
# cpu_profiling_duration=3600 (presumably seconds — confirm against the SDK docs).
profiler_config = ProfilerConfig(profile_params=Profiler(cpu_profiling_duration=3600))

In [None]:
import os

# Prefix used for the training job name.
base_job_name = "vit-img-classification-ddp"

# PyTorch estimator describing the training job. Keyword arguments are grouped
# by purpose; their order has no effect on the resulting estimator.
estimator = PyTorch(
    # --- what to run ---
    entry_point="train_ddp.py",
    source_dir="scripts",
    hyperparameters=hyperparameters,
    # --- identity / permissions ---
    base_job_name=base_job_name,
    role=role,
    # --- container image ---
    # An explicit image is used instead of framework_version/py_version.
    # NOTE(review): the URI is pinned to us-west-2; confirm it matches the
    # session region printed earlier.
    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-gpu-py310-cu118-ubuntu20.04-sagemaker",
    #framework_version="2.0.0",
    #py_version="py310",
    # --- compute ---
    instance_type="ml.p4de.24xlarge",
    instance_count=1,
    distribution={"pytorchddp": {"enabled": True}},
    keep_alive_period_in_seconds=900,
    # --- observability / debugging ---
    tensorboard_output_config=tensorboard_output_config,
    profiler_config=profiler_config,
    enable_remote_debug=True,
    # --- output handling ---
    disable_output_compression=True,
)

In [None]:
# Launch the training job with the uploaded dataset as the "train" input
# and wait for it to finish (wait=True).
estimator.fit(inputs={"train": training_input_path}, wait=True)

In [None]:
# To SSH into the training nodes, use the command below
# aws ssm start-session --target sagemaker-training-job:<training_job_name>_algo-1