# SageMaker 와 MLflow 를 이용한 Llama 3 파인 튜닝

## 1. 환경 설정

In [25]:
# import os
# from getpass import getpass
 
# is_sagemaker_notebook = True
# # is_sagemaker_notebook = False # use VS Code

# if is_sagemaker_notebook:
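#     # Never hard-code the access token in the notebook: a committed token is leaked and must be revoked.
#     HF_TOKEN = getpass("Enter HUGGINGFACE Access Token: ")
# else: # VS Code
#     from dotenv import load_dotenv
#     load_dotenv()  # load HF_TOKEN from a local .env file, if present
#     HF_TOKEN = os.getenv('HF_TOKEN') or getpass("Enter HUGGINGFACE Access Token: ")
#     print("token is set: ", bool(HF_TOKEN))  # never echo the secret itself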

# # Log in to HF
# !huggingface-cli login --token {HF_TOKEN}


### 저장된 데이터 불러오기

In [26]:
%store -r data_folder
%store -r train_data_json 
%store -r validation_data_json 
%store -r test_data_json 
%store -r full_train_data_json 
%store -r full_validation_data_json 
%store -r full_test_data_json


print("data_folder: ", data_folder)
print("train_data_json: ", train_data_json)
print("validation_data_json: ", validation_data_json)
print("test_data_json: ", test_data_json)
print("full_train_data_json: ", full_train_data_json)
print("full_validation_data_json: ", full_validation_data_json)
print("full_test_data_json: ", full_test_data_json)

data_folder:  ../data/naver-news-summarization-ko
train_data_json:  ../data/naver-news-summarization-ko/train/train_dataset.json
validation_data_json:  ../data/naver-news-summarization-ko/validation/validation_dataset.json
test_data_json:  ../data/naver-news-summarization-ko/test/test_dataset.json
full_train_data_json:  ../data/naver-news-summarization-ko/full_train/train_dataset.json
full_validation_data_json:  ../data/naver-news-summarization-ko/full_validation/validation_dataset.json
full_test_data_json:  ../data/naver-news-summarization-ko/full_test/test_dataset.json


### SageMaker 기본 변수 가져오기

In [27]:
import sagemaker
import boto3
# Bootstrap session, needed only to look up the default bucket name.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# None means "use the session's default bucket" (which, per the original
# note, SageMaker creates automatically when it does not exist yet).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# fall back to looking the role up by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Session used by the rest of the notebook, bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)


sagemaker role arn: arn:aws:iam::057716757052:role/gen_ai_gsmoon
sagemaker bucket: sagemaker-us-east-1-057716757052
sagemaker session region: us-east-1


## 2. 데이터 준비
- 두 가지 종류의 데이터셋을 업로드합니다.
    - Full Dataset: 전체 데이터를 업로드합니다.
    - Sample Dataset: 디버깅 용도의 일부 데이터를 업로드합니다.

### S3 데이터 셋 경로 생성

In [28]:
def create_s3_path(sess, is_full, data_folder, train_data_json, validation_data_json, test_data_json, verbose=True):
    """Derive the S3 URIs under which the dataset splits are stored.

    Nothing is uploaded here; the function only builds the destination URIs.

    Args:
        sess: Object exposing ``default_bucket()`` (the notebook passes a
            SageMaker session).
        is_full: When true, the ``full_train`` / ``full_validation`` /
            ``full_test`` prefixes are used; otherwise ``train`` /
            ``validation`` / ``test``.
        data_folder: Local dataset folder. Its last path component becomes
            the dataset name; a trailing ``/`` is ignored.
        train_data_json: Local path of the train file (only its file name is used).
        validation_data_json: Local path of the validation file (only its file name is used).
        test_data_json: Local path of the test file (only its file name is used).
        verbose: Print the derived URIs. When false, nothing is printed.

    Returns:
        Tuple ``(train_uri, validation_uri, test_uri, input_path)`` where
        ``input_path`` is the dataset root
        ``s3://<default bucket>/datasets/<dataset name>``.
    """
    # Strip trailing slashes first so "path/to/ds/" still yields "ds"
    # instead of an empty dataset name.
    dataset_name = data_folder.rstrip('/').rsplit('/', 1)[-1]
    input_path = f's3://{sess.default_bucket()}/datasets/{dataset_name}'

    # The full and sample layouts differ only by the "full_" folder prefix.
    folder_prefix = "full_" if is_full else ""

    def _split_uri(split_name, local_json):
        # Keep only the file name of the local path.
        file_name = local_json.split('/')[-1]
        return f"{input_path}/{folder_prefix}{split_name}/{file_name}"

    train_dataset_s3_path = _split_uri("train", train_data_json)
    validation_dataset_s3_path = _split_uri("validation", validation_data_json)
    test_dataset_s3_path = _split_uri("test", test_data_json)

    if verbose:
        print("input_path: \n", input_path)
        print("train_dataset_s3_path: \n", train_dataset_s3_path)
        print("validation_dataset_s3_path: \n", validation_dataset_s3_path)
        print("test_dataset_s3_path: \n", test_dataset_s3_path)

    return train_dataset_s3_path, validation_dataset_s3_path, test_dataset_s3_path, input_path

# S3 URIs for the small sample (debug) splits.
(
    train_dataset_s3_path,
    validation_dataset_s3_path,
    test_dataset_s3_path,
    input_path,
) = create_s3_path(
    sess=sess,
    is_full=False,
    data_folder=data_folder,
    train_data_json=train_data_json,
    validation_data_json=validation_data_json,
    test_data_json=test_data_json,
)
print("")
# S3 URIs for the full splits. input_path is rebound here; it is derived only
# from sess and data_folder, which are the same in both calls.
(
    full_train_dataset_s3_path,
    full_validation_dataset_s3_path,
    full_test_dataset_s3_path,
    input_path,
) = create_s3_path(
    sess=sess,
    is_full=True,
    data_folder=data_folder,
    train_data_json=full_train_data_json,
    validation_data_json=full_validation_data_json,
    test_data_json=full_test_data_json,
)

# full_train_data_json

input_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko
train_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json
validation_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json
test_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json

input_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko
train_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train/train_dataset.json
validation_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_validation/validation_dataset.json
test_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test/test_dataset.json


In [29]:
# Persist input_path with IPython's %store so it can be restored later via `%store -r input_path`.
%store input_path

Stored 'input_path' (str)


### 데이터를 S3 에 업로드

In [30]:
def get_s3_prefix_name(s3_path, verbose=True):
    """Return the prefix (parent "folder") of an S3 object URI.

    The URI is cut at its LAST ``/`` only, so a file name that also occurs
    earlier in the path (e.g. ``s3://bucket/data/data``) does not truncate
    the result.

    Args:
        s3_path: Object URI such as ``s3://bucket/prefix/file.json``.
        verbose: Print the file-name part (with its leading ``/``) and the
            resulting prefix.

    Returns:
        Everything before the last ``/``. A string without any ``/`` is
        returned unchanged.
    """
    parts = s3_path.rsplit('/', 1)
    # The leading slash is kept so the printed value matches earlier output.
    file_name = '/' + parts[-1]
    desired_s3_uri = parts[0]

    if verbose:
        print("file_name: ", file_name)
        print("desired_s3_uri: ", desired_s3_uri)
    return desired_s3_uri

from sagemaker.s3 import S3Uploader

def upload_data_s3(desired_s3_uri, file_name, verbose=True):
    """Upload a local file to S3 with ``S3Uploader.upload``.

    Args:
        desired_s3_uri: Destination S3 prefix the file is uploaded into.
        file_name: Path of the local file to upload.
        verbose: Print the local path and the resulting S3 location.

    Returns:
        The value returned by ``S3Uploader.upload`` (the notebook output
        shows it to be the S3 URI of the uploaded object).
    """
    file_s3_path = S3Uploader.upload(local_path=file_name, desired_s3_uri=desired_s3_uri)

    # Honor the verbose flag; all existing callers pass verbose=True.
    if verbose:
        print(f"{file_name} is uploaded to:")
        print(file_s3_path)

    return file_s3_path


### Debug 용 작은 데이터셋 S3 업로딩

In [31]:

######## Train File
# Derive the destination prefix from the object URI,
# e.g. s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train
train_desired_s3_uri = get_s3_prefix_name(train_dataset_s3_path)
# Upload the local sample train file into that prefix.
upload_data_s3(desired_s3_uri=train_desired_s3_uri, file_name=train_data_json, verbose=True)
######## Validation File
print("")
validation_desired_s3_uri = get_s3_prefix_name(validation_dataset_s3_path)
upload_data_s3(desired_s3_uri=validation_desired_s3_uri, file_name=validation_data_json, verbose=True)
######## Test File
print("")
test_desired_s3_uri = get_s3_prefix_name(test_dataset_s3_path)
# Last expression of the cell: the notebook displays the returned S3 URI.
upload_data_s3(desired_s3_uri=test_desired_s3_uri, file_name=test_data_json, verbose=True)

file_name:  /train_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train
../data/naver-news-summarization-ko/train/train_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json

file_name:  /validation_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation
../data/naver-news-summarization-ko/validation/validation_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json

file_name:  /test_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test
../data/naver-news-summarization-ko/test/test_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json


's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json'

### 평가용 큰 데이터셋 S3 업로딩

In [32]:

######## Train File
# Derive the destination prefix from the object URI,
# e.g. s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train
full_train_desired_s3_uri = get_s3_prefix_name(full_train_dataset_s3_path)
# Upload the local full train file into that prefix.
upload_data_s3(desired_s3_uri=full_train_desired_s3_uri, file_name=full_train_data_json, verbose=True)
######## Validation File
print("")
full_validation_desired_s3_uri = get_s3_prefix_name(full_validation_dataset_s3_path)
upload_data_s3(desired_s3_uri=full_validation_desired_s3_uri, file_name=full_validation_data_json, verbose=True)
######## Test File
print("")
full_test_desired_s3_uri = get_s3_prefix_name(full_test_dataset_s3_path)
# Last expression of the cell: the notebook displays the returned S3 URI.
upload_data_s3(desired_s3_uri=full_test_desired_s3_uri, file_name=full_test_data_json, verbose=True)

file_name:  /train_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train
../data/naver-news-summarization-ko/full_train/train_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train/train_dataset.json

file_name:  /validation_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_validation
../data/naver-news-summarization-ko/full_validation/validation_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_validation/validation_dataset.json

file_name:  /test_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test
../data/naver-news-summarization-ko/full_test/test_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test/test_dataset.json


's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test/test_dataset.json'

#### 업로드 확인

In [33]:
# List every object under input_path (with human-readable sizes) to confirm the uploads.
! aws s3 ls {input_path}  --recursive --human-readable

2024-08-24 01:56:42    2.6 KiB datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml
2024-08-24 01:58:17    8.4 MiB datasets/naver-news-summarization-ko/full_test/test_dataset.json
2024-08-24 01:58:16   68.0 MiB datasets/naver-news-summarization-ko/full_train/train_dataset.json
2024-08-24 01:58:17    7.6 MiB datasets/naver-news-summarization-ko/full_validation/validation_dataset.json
2024-08-24 01:58:14   33.4 KiB datasets/naver-news-summarization-ko/test/test_dataset.json
2024-08-24 01:58:14   28.1 KiB datasets/naver-news-summarization-ko/train/train_dataset.json
2024-08-24 01:58:14   26.1 KiB datasets/naver-news-summarization-ko/validation/validation_dataset.json


In [34]:
# ! aws s3 rm {input_path} --recursive

In [35]:
# ! aws s3 cp {train_data_json} {train_dataset_s3_path}
# ! aws s3 cp {validation_data_json} {validation_dataset_s3_path}
# ! aws s3 cp {test_data_json} {test_dataset_s3_path}


## 3. 훈련 준비

In [36]:
import os
# Local folder that holds the training configuration YAML.
config_folder_name = "accelerator_config"
# exist_ok=True keeps the cell safe to re-run when the folder already exists.
os.makedirs(config_folder_name, exist_ok=True)

### 훈련 설정 파일 준비
- 목적에 맞게 아래 두 부분 중 하나만 활성화하고, 나머지는 주석 처리하여 사용하세요.
    - For Debug: 일부 샘플 데이터로 빠르게 디버깅하기 위한 파라미터 값입니다.
    - For evaluation: 전체 데이터로 학습하기 위한 최적의 파라미터 값입니다.
```
###########################             
# For Debug
###########################             
num_train_epochs: 1                    # number of training epochs
per_device_train_batch_size: 1         # batch size per device during training
per_device_eval_batch_size: 1          # batch size for evaluation
gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
###########################             
# For evaluation
###########################             
# num_train_epochs: 3                    # number of training epochs
# per_device_train_batch_size: 16         # batch size per device during training
# per_device_eval_batch_size: 8          # batch size for evaluation
# gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
###########################             
```

In [37]:
%%writefile accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml
# script parameters
model_id:  "meta-llama/Meta-Llama-3-8B" # Hugging Face model id
max_seq_len:  2048              # max sequence length for model and packing of the dataset
# sagemaker specific parameters
train_dataset_path: "/opt/ml/input/data/train/" # path to where SageMaker saves train dataset
validation_dataset_path: "/opt/ml/input/data/validation/" # path to where SageMaker saves validation dataset
# NOTE(review): this notebook passes no `test` channel to fit() — confirm the training script tolerates a missing path
test_dataset_path: "/opt/ml/input/data/test/"   # path to where SageMaker saves test dataset
output_dir: "/tmp/llama3"            # where the LoRA adapter weight is
# training parameters
# report_to: "tensorboard" 
report_to: "mlflow"  # report metrics to MLflow (use the commented line above for TensorBoard instead)
mlflow_experiment_name: "llama3-naver-news-fine-tuning"
# ARN of the SageMaker MLflow tracking server (looks account-specific — replace with your own)
MLFLOW_TRACKING_ARN: "arn:aws:sagemaker:us-east-1:057716757052:mlflow-tracking-server/my-setup-test3"
learning_rate: 0.0002                  # learning rate 2e-4
lr_scheduler_type: "constant"          # learning rate scheduler
###########################
# For Debug (small sample dataset)
###########################
num_train_epochs: 1                    # number of training epochs
per_device_train_batch_size: 1         # batch size per device during training
per_device_eval_batch_size: 1          # batch size for evaluation
gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
###########################
# For evaluation (full dataset): comment out the Debug values above and enable these
###########################
# num_train_epochs: 3                    # number of training epochs
# per_device_train_batch_size: 16         # batch size per device during training
# per_device_eval_batch_size: 8          # batch size for evaluation
# gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
###########################
optim: adamw_torch                     # use torch adamw optimizer
logging_steps: 10                      # log every 10 steps
save_strategy: epoch                   # save checkpoint every epoch
evaluation_strategy: epoch             # evaluate every epoch
max_grad_norm: 0.3                     # max gradient norm
warmup_ratio: 0.03                     # warmup ratio
bf16: true                             # use bfloat16 precision
tf32: true                             # use tf32 precision
gradient_checkpointing: true           # use gradient checkpointing to save memory
# FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
fsdp: "full_shard auto_wrap offload" # remove offload if enough GPU memory
fsdp_config:
  backward_prefetch: "backward_pre"
  forward_prefetch: "false"
  use_orig_params: "false"

Overwriting accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml


### 설정 파일을 S3 에 업로드
- 위에 정의한 파일을 업로드 합니다.


In [38]:

# Local YAML produced by the %%writefile cell, and the S3 prefix it is uploaded into.
config_model_name = "accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml"
config_desired_s3_uri = f"{input_path}/config"

# Keep the returned location: it becomes the `config` input channel of the training job.
train_config_s3_path = upload_data_s3(
    desired_s3_uri=config_desired_s3_uri,
    file_name=config_model_name,
    verbose=True,
)


accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml


### 데이터 입력 구성

#### 데이터 사이즈 조정 
- 디버그 용도이면 run_debug_sample = True, 전체 데이터이면 False 로 설정하세요.

In [39]:

# Choose which dataset feeds the training job:
#   True  -> small sample dataset (fast debugging)
#   False -> full dataset
run_debug_sample = True
# run_debug_sample = False

if run_debug_sample:
    # Input channels for SageMaker local mode (file:// URIs).
    local_data = {
        'train': f'file://{train_data_json}',
        'validation': f'file://{validation_data_json}',
        'config': f'file://{config_model_name}',
    }
    # Input channels for a cloud training job (S3 URIs).
    s3_data = {
        'train': train_dataset_s3_path,
        'validation': validation_dataset_s3_path,
        'config': train_config_s3_path,
    }
else:
    # Local mode must read the full_* files here too, so that local and
    # cloud runs train on the same (full) data.
    local_data = {
        'train': f'file://{full_train_data_json}',
        'validation': f'file://{full_validation_data_json}',
        'config': f'file://{config_model_name}',
    }
    s3_data = {
        'train': full_train_dataset_s3_path,
        'validation': full_validation_dataset_s3_path,
        'config': train_config_s3_path,
    }
# Last expression of the cell: display the selected S3 channels.
s3_data

{'train': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json',
 'validation': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json',
 'config': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml'}

### Cloud 모드 및 Local 모드 사용
- 현재 로컬 모드에서는 에러가 발생합니다. 확인 중입니다.

In [40]:
# Toggle between a SageMaker local-mode run and a managed cloud training job.
# USE_LOCAL_MODE = True
USE_LOCAL_MODE = False

import torch

# Both modes train on a single instance.
instance_count = 1

if not USE_LOCAL_MODE:
    # Candidate instance types; keep exactly one enabled.
    instance_type = "ml.g5.4xlarge"
    # instance_type = "ml.g5.12xlarge"
    # instance_type = "ml.g5.48xlarge"
    # instance_type = "ml.p4d.24xlarge"

    # Regexes that extract the losses from the training log. Sample log lines:
    # {'train_runtime': 37.2985, 'train_samples_per_second': 0.375, 'train_steps_per_second': 0.054, 'train_loss': 2.3541293144226074, 'epoch': 1.0}
    # {'eval_loss': 2.50766658782959, 'eval_runtime': 3.4741, 'eval_samples_per_second': 3.454, 'eval_steps_per_second': 0.864, 'epoch': 1.0}
    metric_definitions = [
        {"Name": "train:loss", "Regex": "'train_loss':(.*?),"},
        {"Name": "validation:loss", "Regex": "'eval_loss':(.*?),"},
    ]
    sagemaker_session = sagemaker.session.Session()
    data = s3_data
    nKeepAliveSeconds = 3600  # warm pool keep-alive: 1 hour
    print(f"## Cloud mode is set with {instance_type} and {instance_count} of instance_count")
else:
    # Use the GPU flavour of local mode only when CUDA is available.
    if torch.cuda.is_available():
        instance_type = "local_gpu"
    else:
        instance_type = "local"

    from sagemaker.local import LocalSession
    sagemaker_session = LocalSession()
    sagemaker_session.config = {"local": {"local_code": True}}
    data = local_data
    # data = s3_data
    metric_definitions = None
    nKeepAliveSeconds = None  # warm pool is not used in local mode
    print("## Local mode is set")

print("dataset: \n", data)

## Cloud mode is set with ml.g5.4xlarge and 1 of instance_count
dataset: 
 {'train': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json', 'validation': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json', 'config': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml'}


In [41]:
# Persist the selected input channels with IPython's %store so they can be restored later via `%store -r data`.
%store data

Stored 'data' (dict)


### 훈련 Estimator 생성

In [104]:
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

import time
# define Training Job Name
# NOTE: SageMaker appends its own timestamp to base_job_name (see the created
# job name in the fit() output), so the final job name carries two timestamps.
job_name = f'llama3-8b-naver-news-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'
# chkpt_s3_path = f's3://{sess.default_bucket()}/{s3_prefix}/native/checkpoints'

# Resolve the Hugging Face access token without relying on a hard-coded value:
# 1) an HF_TOKEN variable already defined in this notebook session,
# 2) the HF_TOKEN environment variable,
# 3) the token saved by `huggingface-cli login`.
hf_token = globals().get("HF_TOKEN") or os.environ.get("HF_TOKEN") or HfFolder.get_token()
if not hf_token:
    raise RuntimeError(
        "Hugging Face access token not found: set the HF_TOKEN environment "
        "variable or run `huggingface-cli login` before creating the estimator."
    )

# create the Estimator
os.environ['USE_SHORT_LIVED_CREDENTIALS']="1"
huggingface_estimator = HuggingFace(
    entry_point          = 'sm_run_fsdp_qlora_llama3_mlflow.py',      # train script
    source_dir           = '../../scripts',  # directory which includes all the files needed for training
    instance_type        = instance_type,  # instances type used for the training job
    instance_count       = instance_count,                 # the number of instances used for training
    sagemaker_session    = sagemaker_session,
    max_run              = 2*24*60*60,        # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # IAM role used in training job to access AWS resources, e.g. S3
    volume_size          = 256,               # the size of the EBS volume in GB
    transformers_version = '4.36.0',          # the transformers version used in the training job
    pytorch_version      = '2.1.0',           # the pytorch_version version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    metric_definitions = metric_definitions,
    hyperparameters      =  {
        "config": "/opt/ml/input/data/config/sm_llama_3_8b_fsdp_qlora.yaml" # path to TRL config which was uploaded to s3
    },
    disable_output_compression = True,        # not compress output to save training time and cost
    keep_alive_period_in_seconds = nKeepAliveSeconds,     # warm pool
    distribution={"torch_distributed": {"enabled": True}},   # enables torchrun
    environment  = {
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache", # set env variable to cache models in /tmp
        "HF_TOKEN": hf_token,       # huggingface token to access gated models, e.g. llama 3
        "ACCELERATE_USE_FSDP": "1",             # enable FSDP
        "FSDP_CPU_RAM_EFFICIENT_LOADING": "1"   # enable CPU RAM efficient loading
    },
)

## 4. 훈련 실행
- 로컬 모드시에는 모델 저장을 하지 않습니다. 훈련 스크립트에서 처리 합니다. (현재 모델 저장시에 /tmp 의 용량이 차서 에러가 발생 합니다.)

In [105]:
# Submit the training job with the input channels in `data`.
# wait=False presumably returns without blocking until the job finishes — confirm;
# the logs are followed separately via huggingface_estimator.logs().
huggingface_estimator.fit(data, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: llama3-8b-naver-news-2024-08-23-15-11-2-2024-08-23-15-11-28-213


In [106]:
# Stream the training job's log output into the notebook.
huggingface_estimator.logs()

2024-08-23 15:18:37 Starting - Starting the training job...
2024-08-23 15:18:56 Downloading - Downloading the training image
2024-08-23 15:18:56 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2024-08-23 15:18:58,188 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-08-23 15:18:58,206 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-08-23 15:18:58,218 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-08-23 15:18:58,221 sagemaker_pytorch_container.training INFO     Invoking TorchDistributed...[0m
[34m2024-08-23 15:18:58,221 sagemaker_pytorch_container.training INFO     Invoking user training script.[

## 5. 모델 경로 저장

In [None]:
# model_s3_path = huggingface_estimator.model_data
# print("model_s3_path: \n", model_s3_path)

# %store model_s3_path