# 모델 훈련 및 모델을 Model Registry 에 등록을 위한 SageMaker Pipeline 생성
이 노트북은 다음과 같은 작업을 합니다.
- SageMaker Pipeline 입력 변수 설정 및 Cache Config 설정
- 모델 훈련 스텝 생성
- SageMaker Model 생성 스텝 생성
- Model Registry 의 Model Group 생성
- SageMaker Model 을 Model Group 에 등록
- SageMaker Pipeline 실행

    
---

# 1.환경 설정 



### HF 토큰 설정

In [1]:
%load_ext autoreload
%autoreload 2

import os
from getpass import getpass
 
is_sagemaker_notebook = True
# is_sagemaker_notebook = False # use VS Code

if is_sagemaker_notebook:
    HF_TOKEN = getpass("Enter HUGGINGFACE Access Token: ")
else: # VS Code
    from dotenv import load_dotenv
    HF_TOKEN = os.getenv('HF_TOKEN') or getpass("Enter HUGGINGFACE Access Token: ")
    print("token: ", HF_TOKEN)

# Log in to HF
!huggingface-cli login --token {HF_TOKEN}

Enter HUGGINGFACE Access Token:  ········


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/ec2-user/SageMaker/.cache/token
Login successful


### SageMaker 기본 변수 가져오기

In [2]:
import sagemaker
import boto3
from sagemaker.workflow.pipeline_context import LocalPipelineSession, PipelineSession
from sagemaker.workflow.pipeline_context import PipelineSession

# Two sessions are created: a PipelineSession (source of the region and the
# default bucket below) and a plain sagemaker.Session (source of the bucket
# name printed at the end).
pipeline_session = PipelineSession()
sess = sagemaker.Session()

region = pipeline_session.boto_region_name
default_bucket = pipeline_session.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# fall back to looking up the IAM role named 'sagemaker_execution_role'.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Echo the resolved settings for a quick sanity check.
print("region :", region)
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
region : us-east-1
sagemaker role arn: arn:aws:iam::057716757052:role/gen_ai_gsmoon
sagemaker bucket: sagemaker-us-east-1-057716757052


# 2. 훈련 설정 값 준비

#### 훈련 인스턴스 및 로깅할 메트릭 등 설정


In [3]:
import torch

# Training compute: one ml.g5.4xlarge instance.
instance_count = 1
instance_type = 'ml.g5.4xlarge'

# Name/Regex pairs later handed to the estimator's metric_definitions
# argument, one for the training loss and one for the validation loss.
metric_definitions = [
    dict(Name="train:loss", Regex="'train_loss':(.*?),"),
    dict(Name="validation:loss", Regex="'eval_loss':(.*?),"),
]

### 훈련 데이터 S3 위치 로딩

In [4]:
# Restore the `data` variable from IPython's %store storage and show it.
%store -r data
print("data: \n", data)

data: 
 {'train': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json', 'validation': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json', 'config': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml'}


# 3. 세이지 메이커 파이프라인 생성

## 3.1. 모델 빌딩 파이프라인 설정

### 파이프라인 변수 설정
이 노트북에서 파이프라인에 인자로 넘기는 변수는 아래 1가지입니다.
- 모델 레지스트리에 모델을 등록할 때 사용할 모델 승인 상태 값


In [5]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Pipeline parameter named "ModelApprovalStatus"; when no value is supplied
# for an execution it defaults to "PendingManualApproval".
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

### 캐싱 정의
참고: 캐싱 파이프라인 단계: [Caching Pipeline Steps](https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/pipelines-caching.html)

In [6]:
from sagemaker.workflow.steps import CacheConfig

# Step cache settings: caching switched on, with an expiry of "1d".
cache_config = CacheConfig(enable_caching=True, expire_after="1d")

## 3.2. 파이프라인 스텝 단계 정의

### 3.2.1 모델 훈련 스텝

####  Estimator 생성

Estimator 생성시에 인자가 필요 합니다. 주요한 인자만 보겠습니다.


In [7]:
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

import time
# Base name of the training job, suffixed with the local time at which this
# cell is executed.
# NOTE(review): SageMaker may add its own suffix to base_job_name - confirm
# that embedding a timestamp here is intended.
job_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
job_name = f'llama3-8b-naver-news-{job_timestamp}'

# Hyperparameters forwarded to the training script.
training_hyperparameters = {
    # path to the TRL config which was uploaded to S3
    "config": "/opt/ml/input/data/config/sm_llama_3_8b_fsdp_qlora.yaml",
}

# Environment variables set inside the training container.
# NOTE(review): HF_TOKEN is a secret passed in clear text - confirm this is
# acceptable for the target account.
training_environment = {
    "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",  # cache models in /tmp
    "HF_TOKEN": HF_TOKEN,                    # token for gated models, e.g. llama 3
    "ACCELERATE_USE_FSDP": "1",              # enable FSDP
    "FSDP_CPU_RAM_EFFICIENT_LOADING": "1",   # enable CPU RAM efficient loading
}

# Create the estimator that describes the training job.
huggingface_estimator = HuggingFace(
    # --- training code ---
    entry_point='sm_run_fsdp_qlora_llama3_mlflow.py',  # train script
    source_dir='src',                  # directory with all files needed for training
    hyperparameters=training_hyperparameters,
    environment=training_environment,
    # --- framework versions used in the training job ---
    transformers_version='4.36.0',
    pytorch_version='2.1.0',
    py_version='py310',
    # --- compute and storage ---
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size=256,                   # size of the EBS volume in GB
    max_run=2 * 24 * 60 * 60,          # maximum runtime in seconds (2 days)
    distribution={"torch_distributed": {"enabled": True}},  # enables torchrun
    # --- job identity, access and reporting ---
    base_job_name=job_name,
    role=role,                         # IAM role used by the training job
    sagemaker_session=pipeline_session,
    metric_definitions=metric_definitions,
    disable_output_compression=True,   # skip compressing output to save time and cost
)

  from .autonotebook import tqdm as notebook_tqdm


#### 모델 훈련 스텝 생성


In [8]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep


# Training step of the pipeline: runs `huggingface_estimator` on the `data`
# inputs. Step caching is left off here; pass cache_config=cache_config to
# turn it on.
step_train = TrainingStep(
    name="llama3-8b-naver-news-Training",
    estimator=huggingface_estimator,
    inputs=data,
)

### 3.2.2 SageMaker 모델 생성 스텝

#### 추론 이미지 정의

In [9]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the Hugging Face LLM inference image URI for version "2.0.2"
# through the plain SageMaker session `sess`, then print it.
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  session=sess,
  version="2.0.2",
)
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.0.2-gpu-py310-cu121-ubuntu22.04


### SageMaker Model 생성 스텝
- 일반적으로 모델 아티펙트의 결과인 model.tar.gz 의 s3 경로를 model_data 로 지정을 하나, 파라미터가 많은 LLM 의 경우는 압축을 하지 않고, 원본 모델 파라미터 파일들을 model_data 에 등록을 함

In [10]:
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.workflow.model_step import ModelStep
import optparse
from sagemaker.workflow.functions import Join

# Append a trailing slash to the training step's model-artifact URI. The URI
# is a step property, so the concatenation is expressed with Join instead of
# Python string addition.
s3_uri_with_slash = Join(
    on='',
    values=[
        step_train.properties.ModelArtifacts.S3ModelArtifacts,
        '/',  # trailing slash
    ],
)

# model_data pointing at the uncompressed artifacts under that S3 prefix.
model_data = {
    'S3DataSource': {
        'S3Uri': s3_uri_with_slash,
        'S3DataType': 'S3Prefix',
        'CompressionType': 'None',
    }
}

# Create the HuggingFaceModel with the LLM inference image URI.
# NOTE(review): the framework version arguments do not obviously match the
# image in `llm_image` - confirm they are still needed.
huggingface_model = HuggingFaceModel(
    model_data=model_data,
    image_uri=llm_image,
    transformers_version="4.28.1",
    pytorch_version="2.0.0",
    py_version="py310",
    model_server_workers=1,
    role=role,
    sagemaker_session=pipeline_session,
)

# Candidate inference instance types.
inference_instance_type = ["ml.g5.4xlarge", "ml.g5.12xlarge"]

# create() expects a single instance-type string, not a list, so hand it one
# entry of the candidate list.
create_step_args = huggingface_model.create(instance_type=inference_instance_type[0])
step_create_model = ModelStep(
    name="CreateModel",
    step_args=create_step_args,
)




### Model Registry 등록 스텝

#### 모델 그룹 생성

- 참고
    - 모델 그룹 리스팅 API:  [ListModelPackageGroups](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ListModelPackageGroups.html)
    - 모델 지표 등록: [Model Quality Metrics](https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/model-monitor-model-quality-metrics.html)

In [11]:
sm_client = boto3.client('sagemaker', region_name=region)

# Name and description of the Model Registry group that receives the model.
model_package_group_name = "Llama3-8b-Naver-News-Summarization"
model_package_group_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageGroupDescription": "Sample model package group",
}

# NameContains is a substring filter, so a differently named group that merely
# contains this name would also be returned. Walk every result page and accept
# only an exact name match before deciding the group already exists.
group_pages = sm_client.get_paginator('list_model_package_groups').paginate(
    NameContains=model_package_group_name
)
model_group_exists = any(
    summary['ModelPackageGroupName'] == model_package_group_name
    for page in group_pages
    for summary in page['ModelPackageGroupSummaryList']
)

if not model_group_exists:
    print("No model group exists")
    print("Create model group")

    create_model_pacakge_group_response = sm_client.create_model_package_group(**model_package_group_input_dict)
    print('ModelPackageGroup Arn : {}'.format(create_model_pacakge_group_response['ModelPackageGroupArn']))
else:
    print(f"{model_package_group_name} exitss")

Llama3-8b-Naver-News-Summarization exitss


### 모델을 Model Group에  등록

In [12]:
# Free-form key/value metadata attached to the registered model package.
# Some values are pipeline step properties rather than plain strings.
customer_metadata = {
    'Model-S3-URI': s3_uri_with_slash,
    "training-image-uri": huggingface_estimator.training_image_uri(),
    "model-name": "llama3-8b-naver-news",
    "training-job-name": step_train.properties.TrainingJobName,
    "base-model": "meta-llama/Llama-3-8b",
    "fine-tuning-dataset": "naver-news-summarization",
    "created-by": "ML-team"
}

# Arguments for registering the model in the model package group.
register_args = huggingface_model.register(
    content_types=["application/json"],
    response_types=["application/json"],
    inference_instances=[
        "ml.g5.12xlarge",
    ],
    customer_metadata_properties=customer_metadata,
    model_package_group_name=model_package_group_name,
    # Pass the ModelApprovalStatus pipeline parameter; without this argument
    # the parameter is declared on the pipeline but never takes effect.
    approval_status=model_approval_status,
)
step_register = ModelStep(name="RegisterModel", step_args=register_args)

# 4.모델 빌딩 파이프라인 정의 및 실행
위에서 정의한 3개의 스텝(모델 훈련, SageMaker 모델 생성, 모델 등록)으로 파이프라인을 정의합니다.


In [13]:
from sagemaker.workflow.pipeline import Pipeline

# The pipeline takes its name from the project prefix.
project_prefix = 'llama3-8b-naver-neews-summarization'
pipeline_name = project_prefix

# Parameters exposed by the pipeline and the steps it is built from.
pipeline_parameters = [model_approval_status]
pipeline_steps = [step_train, step_create_model, step_register]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=pipeline_session,
)

In [14]:
# Upsert the pipeline definition with `role`, then start an execution and
# keep its handle in `execution`.
pipeline.upsert(role_arn=role)
execution = pipeline.start()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


### 파이프라인 운영: 파이프라인 대기 및 실행상태 확인

실행이 완료될 때까지 기다립니다.

In [15]:
# Wait for the pipeline execution started above to finish.
execution.wait()

실행된 단계들을 리스트업합니다. 파이프라인의 단계실행 서비스에 의해 시작되거나 완료된 단계를 보여줍니다.

In [16]:
# List the steps of this pipeline execution.
execution.list_steps()

[{'StepName': 'CreateModel-CreateModel',
  'StartTime': datetime.datetime(2024, 8, 27, 4, 25, 9, 436000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 8, 27, 4, 25, 10, 792000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:057716757052:model/pipelines-dzkdufzqivop-CreateModel-CreateMo-lnuu8eRBSU'}},
  'AttemptCount': 1},
 {'StepName': 'RegisterModel-RegisterModel',
  'StartTime': datetime.datetime(2024, 8, 27, 4, 25, 9, 436000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 8, 27, 4, 25, 10, 556000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'RegisterModel': {'Arn': 'arn:aws:sagemaker:us-east-1:057716757052:model-package/Llama3-8b-Naver-News-Summarization/10'}},
  'AttemptCount': 1},
 {'StepName': 'llama3-8b-naver-news-Training',
  'StartTime': datetime.datetime(2024, 8, 27, 4, 12, 45, 707000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 8, 27, 4, 25, 8, 369000, tzinfo=tzloc