_API Reference: https://sagemaker.readthedocs.io/en/stable/workflows/pipelines/sagemaker.workflow.pipelines.html#steps_

In [1]:
!pip install sagemaker transformers==4.44.2 --quiet

In [None]:
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import TrainingStep, ProcessingStep, CreateModelStep, CacheConfig
from sagemaker.workflow.parameters import ParameterString
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.model_step import ModelStep
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
import boto3

# Bootstrap session: only used here to discover the default bucket.
sess = sagemaker.Session()

# Leave as None to fall back to the session's default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role. If get_execution_role() raises ValueError,
# fall back to looking up the role named 'sagemaker_execution_role' through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    role = boto3.client('iam').get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Session used by the rest of the notebook, bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summarise the resolved configuration, one item per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

## 取得 Training Datasets

In [None]:
import json
import boto3

# S3 clients for the cross-region copy.
# Source: the workshop dataset bucket is read through the ap-northeast-1 endpoint.
s3_source = boto3.client('s3', region_name="ap-northeast-1")
# Target: objects are written to the session's default bucket, so use the
# session's own region instead of a hard-coded one (previously "us-west-2").
s3_target = boto3.client('s3', region_name=sess.boto_region_name)

def parse_s3_uri(uri):
    """Split an S3 URI into its bucket name and object key.

    Only a leading ``s3://`` scheme is removed; any later occurrence of that
    text is part of the key and is preserved.

    Args:
        uri: A string such as ``s3://bucket/path/to/object``. A value without
            the ``s3://`` prefix is treated as ``bucket/path/to/object``.

    Returns:
        A ``(bucket, key)`` tuple. ``key`` is an empty string when the URI
        names only a bucket; a trailing slash in the URI is kept in the key.
    """
    scheme = "s3://"
    # Strip the scheme only when it is a prefix, never inside the key.
    path = uri[len(scheme):] if uri.startswith(scheme) else uri
    # Everything before the first "/" is the bucket; the remainder is the key.
    bucket, _sep, key = path.partition("/")
    return bucket, key

def copy_s3_object(source_uri, target_bucket):
    """Copy one S3 object into ``target_bucket`` under the same object key.

    The object is read with the module-level ``s3_source`` client and written
    with the module-level ``s3_target`` client. Failures are reported with a
    printed message rather than raised, so a failed copy does not stop the cell.

    Args:
        source_uri: URI of the object to copy, e.g. ``s3://bucket/path/file``.
        target_bucket: Name of the destination bucket.

    Returns:
        ``True`` when the object was copied, ``False`` when the copy failed.
    """
    source_bucket, source_key = parse_s3_uri(source_uri)
    try:
        # Open the object in the source bucket
        response = s3_source.get_object(Bucket=source_bucket, Key=source_key)
        body = response['Body']
        try:
            # Stream the body into the upload instead of buffering the whole
            # object in memory first.
            s3_target.upload_fileobj(body, target_bucket, source_key)
        finally:
            body.close()
        print(f"Copied {source_key} to {target_bucket}")
        return True
    except Exception as e:
        print(f"Error copying {source_key}: {str(e)}")
        return False

# Location of the workshop datasets in the source bucket
base_uri = 's3://aws-educate-09-28-sagemaker-workshop/datasets/phi-3/'
train_uri = base_uri + 'train_dataset.json'
test_uri = base_uri + 'test_dataset.json'

# Destination: the SageMaker session's default bucket
target_bucket = sess.default_bucket()


# Copy the train and test files into the destination bucket
copy_s3_object(train_uri, target_bucket)
copy_s3_object(test_uri, target_bucket)

# copy_s3_object keeps the source object key, so the copies sit under the same
# prefix as in base_uri. Derive that prefix instead of repeating it as a literal.
datasets_prefix = parse_s3_uri(base_uri)[1]
datasets_s3_uri = f"s3://{target_bucket}/{datasets_prefix}"


## Pipeline Parameters

In [11]:
# Pipeline parameter holding the S3 location of the training datasets;
# its default is the dataset prefix computed in the dataset-copy cell.
# NOTE(review): the parameter name contains a typo ("Datasetes"); it is kept
# unchanged because renaming it would change the pipeline's parameter name.
training_datasets_s3_uri = ParameterString(
    name="TrainingDatasetesS3Uri",
    default_value=datasets_s3_uri,
)

# Model package group that the register step writes to
model_package_group_name = "Demo-SageMaker-Pipeline-Group"

# Step caching configuration, passed to the training step
cache_config = CacheConfig(
    enable_caching=True,
    expire_after="30d",
)

## TrainingStep

In [12]:
from sagemaker.huggingface import HuggingFace
from transformers import AutoTokenizer

model_id = "microsoft/Phi-3.5-mini-instruct"

# Hyperparameters passed into the training job (consumed by the training script)
hyperparameters = dict(
    # --- model and data ---
    model_id=model_id,                              # pre-trained model
    dataset_path='/opt/ml/input/data/training',     # path where sagemaker will save training dataset
    # --- schedule and batch shape ---
    num_train_epochs=3,                             # number of training epochs
    per_device_train_batch_size=1,                  # batch size for training
    gradient_accumulation_steps=2,                  # number of update steps to accumulate
    gradient_checkpointing=True,                    # save memory but slower backward pass
    fp16=True,                                      # fp16 flag forwarded to the training script
    # --- optimisation ---
    learning_rate=2e-4,                             # learning rate
    max_grad_norm=0.3,                              # maximum norm (for gradient clipping)
    warmup_ratio=0.03,                              # warmup ratio
    lr_scheduler_type="constant",                   # learning rate scheduler
    # --- checkpoints and logging ---
    save_strategy="epoch",                          # save strategy for checkpoints
    logging_steps=10,                               # log every x steps
    # --- adapters, attention, output ---
    merge_adapters=True,                            # whether to merge LoRA into the model (needs more memory)
    use_flash_attn=True,                            # whether to use Flash Attention
    output_dir='/tmp/run',                          # output directory, where to save assets during training
)

# Training job name prefix: the model id with "/" and "." both mapped to "-"
job_name = "huggingface-qlora-" + hyperparameters["model_id"].translate(str.maketrans("/.", "--"))


# Hugging Face estimator describing the training job.
# NOTE(review): the hyperparameters enable `use_flash_attn` while the job runs
# on ml.p3.2xlarge - confirm the training script supports that GPU.
huggingface_estimator = HuggingFace(
    # --- training code ---
    source_dir='../scripts',                # directory which includes all the files needed for training
    entry_point='run_qlora.py',             # train script
    hyperparameters=hyperparameters,        # the hyperparameters passed to the training job
    # --- framework versions used in the training job ---
    transformers_version='4.36',
    pytorch_version='2.1',
    py_version='py310',
    # --- compute ---
    instance_type='ml.p3.2xlarge',          # instance type used for the training job
    instance_count=1,                       # number of instances used for training
    volume_size=300,                        # size of the EBS volume in GB
    max_run=2 * 24 * 60 * 60,               # maximum runtime in seconds (days * hours * minutes * seconds)
    # --- job identity and access ---
    base_job_name=job_name,                 # the name of the training job
    role=role,                              # IAM role used in the training job to access AWS resources, e.g. S3
    # --- container environment ---
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache"},  # cache models in /tmp
)


# Define the training step.
# The 'training' channel reads JSON data from the S3 location held by the
# TrainingDatasetesS3Uri pipeline parameter.
training_channel = TrainingInput(
    s3_data=training_datasets_s3_uri,
    content_type="application/json",
)

train_step = TrainingStep(
    name="TrainModel",
    estimator=huggingface_estimator,
    inputs={'training': training_channel},
    cache_config=cache_config,
)


## Register Model Step
Documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-steps.html#step-type-register-model

In [None]:
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the Hugging Face LLM container image URI for this session
llm_image = get_huggingface_llm_image_uri(backend="huggingface", session=sess)

# Environment variables for the inference container; every value is a string.
_numeric_settings = (
    ('SM_NUM_GPUS', 1),            # Number of GPU used per replica
    ('MAX_INPUT_LENGTH', 1024),    # Max length of input text
    ('MAX_TOTAL_TOKENS', 2048),    # Max length of the generation (including input text)
)
config = {'HF_MODEL_ID': "/opt/ml/model"}  # path to where sagemaker stores the model
for _setting_name, _setting_value in _numeric_settings:
    config[_setting_name] = json.dumps(_setting_value)

# Inference model that serves the artifacts produced by the TrainModel step.
# HuggingFaceModel reference: https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html#hugging-face-model
# ModelArtifacts.S3ModelArtifacts is a DescribeTrainingJob field: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeTrainingJob.html
model = HuggingFaceModel(
    image_uri=llm_image,
    env=config,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    sagemaker_session=sess,
)

# Register the model in the model package group.
register_step = RegisterModel(
    name="DemoSageMakerPipelineModel",
    model=model,
    model_package_group_name=model_package_group_name,
    approval_status="Approved",  # registered with status "Approved" directly
    inference_instances=["ml.g4dn.xlarge", "ml.g5.xlarge"],
    content_types=["application/json"],
    response_types=["application/json"],
)

## Deploy Model Step

In [15]:
# Disabled draft: this whole cell is commented out and does not run.
# It sketches building a generic Model from the training step's artifacts.
# from sagemaker.model import Model
# from sagemaker.workflow.pipeline_context import PipelineSession
# from sagemaker.huggingface import get_huggingface_llm_image_uri

# # retrieve the llm image uri
# llm_image = get_huggingface_llm_image_uri(
#   "huggingface",
#   session=sess,
# )

# # print ecr image uri
# print(f"llm image uri: {llm_image}")

# # Create the model
# model = Model(
#     image_uri=llm_image,
#     model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
#     role=role,
#     sagemaker_session=PipelineSession()
# )

# model

In [16]:
# Disabled draft: this whole cell is commented out and does not run.
# NOTE(review): the step is named 'ModelDeployment' but is built from
# model.create(...) - confirm whether it deploys an endpoint or only creates a model.
# from sagemaker.workflow.model_step import ModelStep

# # Define the model deployment step
# deploy_step = ModelStep(
#     name='ModelDeployment',
#     step_args=model.create(instance_type="ml.m5.large"),
# )

## Pipeline Definition

In [None]:
# Define the pipeline: train the model, then register it.
pipeline = Pipeline(
    name="TrainingAndRegisterPipeline",
    steps=[train_step, register_step],
    parameters=[training_datasets_s3_uri],
)

# Create or update the pipeline definition. Starting an execution is left
# commented out below.
pipeline.upsert(role_arn=role)
# execution = pipeline.start()


In [18]:
# Disabled: `execution` only exists once the `pipeline.start()` call in the
# pipeline-definition cell is uncommented.
# # Get the execution status
# execution.describe()

# # Wait for the pipeline execution to finish
# execution.wait()


In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the Hugging Face LLM container image URI for this session
llm_image = get_huggingface_llm_image_uri(backend="huggingface", session=sess)

# Show the ECR image URI that was resolved
print(f"llm image uri: {llm_image}")

## Manually Deploy a Model from the Registry
Documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/model-registry-deploy.html#model-registry-deploy-smsdk

In [None]:
from sagemaker import ModelPackage # https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.ModelPackage
from time import gmtime, strftime

# Find the most recently created *approved* model package in the group the
# pipeline registers into, instead of hard-coding an account-specific ARN.
approved_packages = sess.sagemaker_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    ModelApprovalStatus="Approved",
    SortBy="CreationTime",
    SortOrder="Descending",
    MaxResults=1,
)["ModelPackageSummaryList"]

# Fail with a clear message when nothing has been registered/approved yet
# (for example, when the pipeline has not been executed).
if not approved_packages:
    raise RuntimeError(
        f"No approved model package found in group '{model_package_group_name}'. "
        "Run the pipeline (or approve a model version) before deploying."
    )

model_package_arn = approved_packages[0]["ModelPackageArn"]
print(f"deploying model package: {model_package_arn}")

# ModelPackage reference: https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.ModelPackage
model = ModelPackage(
    role=role,
    model_package_arn=model_package_arn,
    sagemaker_session=sess,
)

# Model.deploy reference: https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
model.deploy(
    initial_instance_count=1,
    instance_type='ml.g5.xlarge',
    container_startup_health_check_timeout=1000,
)