In [None]:
! pip install transformers boto3 "sagemaker-core==1.0.41" "datasets[s3]==2.18.0" "sagemaker>=2.190.0" --upgrade --quiet

In [None]:
import sagemaker
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer
import boto3
import os

# Create the SageMaker session object that the later cells reuse.
sagemaker_session = sagemaker.Session()
# Read the session's default bucket; later cells build S3 URIs from `bucket_name`.
bucket_name = sagemaker_session.default_bucket()
# Read the session's default bucket prefix.
# NOTE(review): `default_prefix` is not used by any cell visible here — confirm
# whether the S3 paths below are supposed to include it.
default_prefix = sagemaker_session.default_bucket_prefix

In [None]:

# Provide hf_token value to get models and data.
#
# Do not paste the token into this notebook. Either export `hf_token` before
# starting the kernel, or type it at the prompt below (the input is not echoed
# and is never saved in the notebook). An already-exported value is kept as is
# instead of being overwritten with an empty string.
from getpass import getpass

if not os.environ.get('hf_token'):
    os.environ['hf_token'] = getpass("Hugging Face token: ")

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token held in the
# `hf_token` environment variable.
login(token=os.environ['hf_token'])

### Get Data and Upload to Amazon S3

In [None]:
# Load the dataset from the Hugging Face Hub. Despite its name, `train_dataset`
# is indexed by split name ('train', 'validation', 'test') in the upload cell.
train_dataset = load_dataset(path="benjamin-paine/imagenet-1k-256x256")

In [None]:
# Save the dataset splits to S3 under the SageMaker session's default bucket and
# record the S3 location of every split for the training job.

# Set to True the first time this notebook runs against a bucket (or whenever
# the splits must be refreshed). When False, nothing is written: the cell only
# computes the S3 locations and relies on files uploaded by an earlier run.
UPLOAD_DATASETS = False

# Maximum number of leading rows of the 'train' split written to S3.
TRAIN_SAMPLE_SIZE = 500_000

input_path = f's3://{bucket_name}/datasets/imagenet-1k-256x256-sample'

train_dataset_s3_path = f"{input_path}/train/train_dataset.parquet"
validation_dataset_s3_path = f"{input_path}/validation/validation_dataset.parquet"
test_dataset_s3_path = f"{input_path}/test/test_dataset.parquet"

if UPLOAD_DATASETS:
    # Clamp the sample size so a smaller 'train' split does not raise in select().
    train_rows = min(TRAIN_SAMPLE_SIZE, len(train_dataset['train']))
    train_dataset['train'].select(range(train_rows)).to_parquet(train_dataset_s3_path)
    train_dataset['validation'].to_parquet(validation_dataset_s3_path)
    train_dataset['test'].to_parquet(test_dataset_s3_path)

# Report what actually happened: an upload, or merely the expected locations.
status = "uploaded to" if UPLOAD_DATASETS else "expected at"
print(f"Training data {status}:")
print(train_dataset_s3_path)
print(f"validation data {status}:")
print(validation_dataset_s3_path)
print(f"Test data {status}:")
print(test_dataset_s3_path)
# Console link to the prefix; split('/', 3)[-1] strips the leading "s3://<bucket>/".
print(f"https://s3.console.aws.amazon.com/s3/buckets/{bucket_name}/?region={sagemaker_session.boto_region_name}&prefix={input_path.split('/', 3)[-1]}/")


In [None]:
# os.environ['train_dataset'] = train_dataset_s3_path
# os.environ['validation_dataset'] = validation_dataset_s3_path
# os.environ['test_dataset'] = test_dataset_s3_path

## Model fine-tuning

We are now ready to fine-tune our model. We will use the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) from transformers to fine-tune our model. We prepared a script [run_training.py](./scripts/run_training.py) which loads the dataset from disk, prepares the model and tokenizer, and starts the training.

For configuration we use `TrlParser`, which allows us to provide hyperparameters in a `yaml` file. This yaml file will be uploaded and provided to Amazon SageMaker in the same way as our datasets. Below is the config file for fine-tuning the model on `ml.p4d.24xlarge`. We save the config file as `args.yaml` and upload it to S3.

In [None]:
from sagemaker.config import load_sagemaker_config
# Load the SageMaker SDK configuration and keep the result in `configs`.
# NOTE(review): `configs` is not referenced by any cell visible here — confirm
# whether it is still needed.
configs = load_sagemaker_config()

#### Get PyTorch image_uri and set compute resources

We are going to use the native PyTorch container image, pre-built for Amazon SageMaker, and set the compute resources we need.

In [None]:
from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute, SourceCode, InputData, StoppingCondition, CheckpointConfig

# Environment variables handed to the training job (passed to ModelTrainer as
# `environment=env`): networking/NCCL settings, the Hugging Face token, and the
# S3 locations of the three dataset splits.
# NOTE(review): the token is forwarded as a plain environment value — confirm
# this is acceptable for your account's security requirements.
env = {
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_DISABLE": "1",
    "NCCL_DEBUG": "WARN",
    "HF_token": os.environ['hf_token'],
    "train_data_location": train_dataset_s3_path,
    "validation_data_location": validation_dataset_s3_path,
    "test_data_location": test_dataset_s3_path,
}

# Instance type shared by the compute definition and the image lookup.
# Other candidates noted by the author: ml.g6.24xlarge, ml.g6.48xlarge,
# ml.p4de.24xlarge.
instance_type = "ml.p4d.24xlarge"

# Single-instance compute configuration for the job.
compute = Compute(
    instance_type=instance_type,
    instance_count=1,
    volume_size_in_gb=96,
    keep_alive_period_in_seconds=3600,
)

# Look up the PyTorch 2.6.0 training image for the session's region and the
# chosen instance type.
image_uri = sagemaker.image_uris.retrieve(
    framework="pytorch",
    version="2.6.0",
    image_scope="training",
    instance_type=instance_type,
    region=sagemaker_session.boto_session.region_name,
)

# Display the resolved image URI as the cell output.
image_uri


In [None]:
# Base name of the training job; also used as the last path segment of the
# checkpoint location when the trainer is configured.
job_prefix = "model-trainer-distributed-imagenet-1k-256x256"
# S3 prefix in the session's default bucket under which checkpoints are stored.
checkpoint_s3_path = f"s3://{bucket_name}/imagenet-checkpoints/checkpoints"


#### Setup ModelTrainer

Set up the hyperparameters for the SageMaker training job and the source code, which comes from [scripts](./scripts/) and contains the recipe, DeepSpeed configs, and training scripts.

In [None]:
# Hyperparameters forwarded to the training job.
# NOTE(review): the input channels declared in this notebook are named
# "training_dataset", "validation_dataset" and "test_dataset"; none is named
# "dataset". Confirm that "dataset_path" matches what the training script reads.
hyperparameters = dict(
    dataset_path="/opt/ml/input/data/dataset",
    model_dir="/opt/ml/model",
)

# Local source bundle: everything under ./scripts, started through run.sh.
source_code = SourceCode(
    source_dir="./scripts",
    entry_script="run.sh",
    requirements="requirements.txt",
)

# Upper bound on the job's runtime, in seconds.
runtime_limit = StoppingCondition(max_runtime_in_seconds=9000)

# Checkpoints go under the checkpoint prefix, in a folder named after the job prefix.
checkpointing = CheckpointConfig(s3_uri=f"{checkpoint_s3_path}/{job_prefix}")

# Assemble the trainer from the pieces defined above and in earlier cells.
model_trainer = ModelTrainer(
    base_job_name=job_prefix,
    training_image=image_uri,
    source_code=source_code,
    compute=compute,
    environment=env,
    hyperparameters=hyperparameters,
    stopping_condition=runtime_limit,
    checkpoint_config=checkpointing,
)

Set up the input channels for the model, which come from S3.

In [None]:
# One input channel per dataset split; each channel points at the S3 location
# computed in the upload cell.
training_data = InputData(data_source=train_dataset_s3_path, channel_name="training_dataset")
validation_data = InputData(data_source=validation_dataset_s3_path, channel_name="validation_dataset")
test_data = InputData(data_source=test_dataset_s3_path, channel_name="test_dataset")

### Run Training Job

In [None]:
# Start the training job with all three input channels; wait=True is passed so
# the call waits for the job.
input_channels = [training_data, validation_data, test_data]
model_trainer.train(input_data_config=input_channels, wait=True)