# 파이프라인 데이터 준비 및 훈련 단계 준비

## 1. 환경 설정

### 경로 설정

In [1]:
%load_ext autoreload
%autoreload 2

%load_ext autoreload
%autoreload 2

import sys, os

def add_python_path(module_path):
    """Append the absolute form of ``module_path`` to ``sys.path`` if missing.

    Prints whether the path was added or was already present, and then
    prints the resulting ``sys.path``.
    """
    resolved = os.path.abspath(module_path)
    already_present = resolved in sys.path
    if already_present:
        print(f"python path: {resolved} already exists")
    else:
        sys.path.append(resolved)
        print(f"python path: {resolved} is added")
    print("sys.path: ", sys.path)

# Register the directory two levels above the current working directory on
# sys.path (presumably so project-local modules can be imported — confirm).
module_path = "../.."
add_python_path(module_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
python path: /home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/30_fine_tune/03-fine-tune-llama3 is added
sys.path:  ['/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/30_fine_tune/03-fine-tune-llama3/notebook/03-naver-news-lllama3-mlops', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python310.zip', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python3.10', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python3.10/lib-dynload', '', '/home/ec2-user/SageMaker/.cs/conda/envs/llama3_puy310/lib/python3.10/site-packages', '/home/ec2-user/SageMaker/huggingface-inferentia2-samples/llama3-70b/llmperf/src', '/home/ec2-user/SageMaker/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/30_fine_tune/03-fine-tune-llama3']


### 저장된 데이터 불러오기

In [2]:


%store -r data_folder
%store -r train_data_json 
%store -r validation_data_json 
%store -r test_data_json 
%store -r full_train_data_json 
%store -r full_validation_data_json 
%store -r full_test_data_json


# Echo each value restored by the `%store -r` lines above to confirm it loaded.
print("data_folder: ", data_folder)
print("train_data_json: ", train_data_json)
print("validation_data_json: ", validation_data_json)
print("test_data_json: ", test_data_json)
print("full_train_data_json: ", full_train_data_json)
print("full_validation_data_json: ", full_validation_data_json)
print("full_test_data_json: ", full_test_data_json)

data_folder:  ../data/naver-news-summarization-ko
train_data_json:  ../data/naver-news-summarization-ko/train/train_dataset.json
validation_data_json:  ../data/naver-news-summarization-ko/validation/validation_dataset.json
test_data_json:  ../data/naver-news-summarization-ko/test/test_dataset.json
full_train_data_json:  ../data/naver-news-summarization-ko/full_train/train_dataset.json
full_validation_data_json:  ../data/naver-news-summarization-ko/full_validation/validation_dataset.json
full_test_data_json:  ../data/naver-news-summarization-ko/full_test/test_dataset.json


### SageMaker 기본 변수 가져오기

In [3]:
import sagemaker
import boto3
# Create an initial session so the default bucket can be looked up.
sess = sagemaker.Session()
# SageMaker session bucket -> used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # Fall back to the session's default bucket when no bucket name is given.
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: look the role ARN up through IAM
    # by its hard-coded role name instead.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session, now bound explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::057716757052:role/gen_ai_gsmoon
sagemaker bucket: sagemaker-us-east-1-057716757052
sagemaker session region: us-east-1


## 2. 데이터 준비
- 두 가지 종류의 데이터셋을 업로드합니다.
    - Full Dataset: 전체 데이터를 업로드합니다.
    - Sample Dataset: 디버깅 용도의 일부 데이터를 업로드합니다.

### S3 데이터 셋 경로 생성

In [4]:
def create_s3_path(sess, is_full, data_folder, train_data_json,
                   validation_data_json, test_data_json, verbose=True):
    """Build the S3 URIs for the train / validation / test dataset files.

    Args:
        sess: Object exposing ``default_bucket()`` (a SageMaker session in
            this notebook).
        is_full: When true, the files are placed under ``full_train``,
            ``full_validation`` and ``full_test``; otherwise under ``train``,
            ``validation`` and ``test``.
        data_folder: Dataset folder path; its last ``/``-separated component
            is used as the dataset name.
        train_data_json: Path whose last component is the train file name.
        validation_data_json: Path whose last component is the validation
            file name.
        test_data_json: Path whose last component is the test file name.
        verbose: Print the three dataset URIs when true.

    Returns:
        Tuple ``(train_uri, validation_uri, test_uri, input_path)`` where
        ``input_path`` is ``s3://<bucket>/datasets/<dataset name>``.
    """
    def last_component(path):
        # Text after the final "/" (the whole string when there is none).
        return path.split('/')[-1]

    dataset_name = last_component(data_folder)
    input_path = f's3://{sess.default_bucket()}/datasets/{dataset_name}'
    # The dataset root is printed regardless of ``verbose``.
    print("input_path: \n", input_path)

    train_name = last_component(train_data_json)
    validation_name = last_component(validation_data_json)
    test_name = last_component(test_data_json)

    # Full datasets live in sibling folders carrying a "full_" prefix.
    folder_prefix = "full_" if is_full else ""
    train_dataset_s3_path = f"{input_path}/{folder_prefix}train/{train_name}"
    validation_dataset_s3_path = f"{input_path}/{folder_prefix}validation/{validation_name}"
    test_dataset_s3_path = f"{input_path}/{folder_prefix}test/{test_name}"

    if verbose:
        for label, uri in (
            ("train_dataset_s3_path: \n", train_dataset_s3_path),
            ("validation_dataset_s3_path: \n", validation_dataset_s3_path),
            ("test_dataset_s3_path: \n", test_dataset_s3_path),
        ):
            print(label, uri)

    return train_dataset_s3_path, validation_dataset_s3_path, test_dataset_s3_path, input_path

# Sample (debug) dataset: URIs under train/, validation/ and test/.
(
    train_dataset_s3_path,
    validation_dataset_s3_path,
    test_dataset_s3_path,
    input_path,
) = create_s3_path(
    sess=sess,
    is_full=False,
    data_folder=data_folder,
    train_data_json=train_data_json,
    validation_data_json=validation_data_json,
    test_data_json=test_data_json,
)
print("")
# Full dataset: URIs under full_train/, full_validation/ and full_test/.
# input_path is rebound here by the second call.
(
    full_train_dataset_s3_path,
    full_validation_dataset_s3_path,
    full_test_dataset_s3_path,
    input_path,
) = create_s3_path(
    sess=sess,
    is_full=True,
    data_folder=data_folder,
    train_data_json=full_train_data_json,
    validation_data_json=full_validation_data_json,
    test_data_json=full_test_data_json,
)

# full_train_data_json

input_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko
train_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json
validation_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json
test_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json

input_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko
train_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train/train_dataset.json
validation_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_validation/validation_dataset.json
test_dataset_s3_path: 
 s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test/test_dataset.json


In [5]:
%store input_path

Stored 'input_path' (str)


### 데이터를 S3에 업로드

In [6]:
def get_s3_prefix_name(s3_path, verbose=True):
    """Return the S3 prefix (parent "folder" URI) of an object URI.

    Everything after the last ``/`` is treated as the file name and dropped,
    e.g. ``s3://bucket/datasets/train/train_dataset.json`` becomes
    ``s3://bucket/datasets/train``.

    Args:
        s3_path: S3 URI (or any ``/``-separated path) of a file.
        verbose: Print the file name (shown with its leading ``/``) and the
            resulting prefix when true.

    Returns:
        ``s3_path`` without its final component. A string that contains no
        ``/`` is returned unchanged.
    """
    # Split only at the LAST separator. The earlier implementation split on
    # "/<file name>" wherever it first occurred, which cut the prefix short
    # when a folder name began with the file name
    # (e.g. "s3://b/data/data" -> "s3://b").
    parts = s3_path.rsplit('/', 1)
    desired_s3_uri = parts[0]
    file_name = '/' + parts[-1]

    if verbose:
        print("file_name: ", file_name)
        print("desired_s3_uri: ", desired_s3_uri)
    return desired_s3_uri

from sagemaker.s3 import S3Uploader

def upload_data_s3(desired_s3_uri, file_name, verbose=True):
    """Upload a local file to S3 and return what the uploader reports.

    Args:
        desired_s3_uri: Destination S3 prefix handed to ``S3Uploader.upload``.
        file_name: Local path of the file to upload.
        verbose: Print the local path and the uploaded location when true.

    Returns:
        The value returned by ``S3Uploader.upload`` (the notebook prints it
        as the uploaded S3 location).
    """
    file_s3_path = S3Uploader.upload(local_path=file_name, desired_s3_uri=desired_s3_uri)

    # ``verbose`` used to be accepted but ignored; the default (True) keeps
    # the previous output, while verbose=False now silences it.
    if verbose:
        print(f"{file_name} is uploaded to:")
        print(file_s3_path)

    return file_s3_path


### Debug 용 작은 데이터셋 S3 업로딩

In [7]:

######## Train File
# Derive the destination prefix from the object URI,
# e.g. s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train
train_desired_s3_uri = get_s3_prefix_name(train_dataset_s3_path)    
# Upload the local sample train file under that prefix.
upload_data_s3(desired_s3_uri=train_desired_s3_uri, file_name=train_data_json, verbose=True)
######## Validation File
print("")
validation_desired_s3_uri = get_s3_prefix_name(validation_dataset_s3_path)    
upload_data_s3(desired_s3_uri=validation_desired_s3_uri, file_name=validation_data_json, verbose=True)
######## Test File
print("")
test_desired_s3_uri = get_s3_prefix_name(test_dataset_s3_path)    
# Last expression of the cell: its return value is what the notebook displays.
upload_data_s3(desired_s3_uri=test_desired_s3_uri, file_name=test_data_json, verbose=True)

file_name:  /train_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train
../data/naver-news-summarization-ko/train/train_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json

file_name:  /validation_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation
../data/naver-news-summarization-ko/validation/validation_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json

file_name:  /test_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test
../data/naver-news-summarization-ko/test/test_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json


's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/test/test_dataset.json'

### 평가용 큰 데이터셋 S3 업로딩

In [8]:

######## Train File
# Derive the destination prefix from the object URI,
# e.g. s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train
full_train_desired_s3_uri = get_s3_prefix_name(full_train_dataset_s3_path)    
# Upload the local full train file under that prefix.
upload_data_s3(desired_s3_uri=full_train_desired_s3_uri, file_name=full_train_data_json, verbose=True)
######## Validation File
print("")
full_validation_desired_s3_uri = get_s3_prefix_name(full_validation_dataset_s3_path)    
upload_data_s3(desired_s3_uri=full_validation_desired_s3_uri, file_name=full_validation_data_json, verbose=True)
######## Test File
print("")
full_test_desired_s3_uri = get_s3_prefix_name(full_test_dataset_s3_path)    
# Last expression of the cell: its return value is what the notebook displays.
upload_data_s3(desired_s3_uri=full_test_desired_s3_uri, file_name=full_test_data_json, verbose=True)

file_name:  /train_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train
../data/naver-news-summarization-ko/full_train/train_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_train/train_dataset.json

file_name:  /validation_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_validation
../data/naver-news-summarization-ko/full_validation/validation_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_validation/validation_dataset.json

file_name:  /test_dataset.json
desired_s3_uri:  s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test
../data/naver-news-summarization-ko/full_test/test_dataset.json is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test/test_dataset.json


's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/full_test/test_dataset.json'

#### 업로드 확인

In [9]:
! aws s3 ls {input_path}  --recursive --human-readable

2024-08-26 02:52:14    2.6 KiB datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml
2024-08-27 02:28:55    8.4 MiB datasets/naver-news-summarization-ko/full_test/test_dataset.json
2024-08-27 02:28:54   68.0 MiB datasets/naver-news-summarization-ko/full_train/train_dataset.json
2024-08-27 02:28:55    7.6 MiB datasets/naver-news-summarization-ko/full_validation/validation_dataset.json
2024-08-27 02:28:53   33.4 KiB datasets/naver-news-summarization-ko/test/test_dataset.json
2024-08-27 02:28:53   28.1 KiB datasets/naver-news-summarization-ko/train/train_dataset.json
2024-08-27 02:28:53   26.1 KiB datasets/naver-news-summarization-ko/validation/validation_dataset.json


## 3. 훈련 준비

### 훈련 설정 파일 준비

In [10]:
import os
# Local folder for the training config file; exist_ok=True keeps the cell
# re-runnable when the folder already exists.
config_folder_name = "accelerator_config"
os.makedirs(config_folder_name, exist_ok=True)

In [11]:
%%writefile accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml
# script parameters
model_id:  "meta-llama/Meta-Llama-3-8B" # Hugging Face model id
max_seq_len:  2048              # max sequence length for model and packing of the dataset
# sagemaker specific parameters
train_dataset_path: "/opt/ml/input/data/train/" # path to where SageMaker saves train dataset
validation_dataset_path: "/opt/ml/input/data/validation/" # path to where SageMaker saves validation dataset
test_dataset_path: "/opt/ml/input/data/test/"   # path to where SageMaker saves test dataset
output_dir: "/tmp/llama3"            # where the LoRA adapter weight is
report_to: "mlflow" 
mlflow_experiment_name: "llama3-naver-news-fine-tuning"
MLFLOW_TRACKING_ARN: "arn:aws:sagemaker:us-east-1:057716757052:mlflow-tracking-server/my-setup-test3"
learning_rate: 0.0002                  # learning rate 2e-4
lr_scheduler_type: "constant"          # learning rate scheduler
###########################             
# For Debug
###########################             
num_train_epochs: 1                    # number of training epochs
per_device_train_batch_size: 1         # batch size per device during training
per_device_eval_batch_size: 1          # batch size for evaluation
gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
###########################             
# For evaluation
###########################             
# num_train_epochs: 3                    # number of training epochs
# per_device_train_batch_size: 16         # batch size per device during training
# per_device_eval_batch_size: 8          # batch size for evaluation
# gradient_accumulation_steps: 2         # number of steps before performing a backward/update pass
###########################             
optim: adamw_torch                     # use torch adamw optimizer
logging_steps: 10                      # log every 10 steps
save_strategy: epoch                   # save checkpoint every epoch
evaluation_strategy: epoch             # evaluate every epoch
max_grad_norm: 0.3                     # max gradient norm
warmup_ratio: 0.03                     # warmup ratio
bf16: true                             # use bfloat16 precision
tf32: true                             # use tf32 precision
gradient_checkpointing: true           # use gradient checkpointing to save memory
# FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
fsdp: "full_shard auto_wrap offload" # remove offload if enough GPU memory
fsdp_config:
  backward_prefetch: "backward_pre"
  forward_prefetch: "false"
  use_orig_params: "false"

Overwriting accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml


### 설정 파일을 S3 에 업로드
- 위에 정의한 파일을 업로드 합니다.


In [12]:

# Upload the training config YAML under "<input_path>/config" and keep the
# value returned by the upload helper for the training input channels.
config_desired_s3_uri = f"{input_path}/config"
config_model_name = "accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml"
train_config_s3_path = upload_data_s3(desired_s3_uri=config_desired_s3_uri, file_name=config_model_name, verbose=True)


accelerator_config/sm_llama_3_8b_fsdp_qlora.yaml is uploaded to:
s3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml


### 데이터 입력 구성

#### 데이터 사이즈 조정 
- 디버그 용도이면 run_debug_sample = True, 전체 데이터이면 False 로 조절하세요.

In [13]:

# Toggle: True -> small debug sample, False -> full dataset.
run_debug_sample = True
# run_debug_sample = False
if run_debug_sample:
    # Local (file://) inputs for the sample dataset.
    local_data = {
        'train': f'file://{train_data_json}',
        'validation': f'file://{validation_data_json}',
        'config': f'file://{config_model_name}',
    }
    # S3 inputs for the sample dataset.
    s3_data = {
        'train': train_dataset_s3_path,
        'validation': validation_dataset_s3_path,
        'config': train_config_s3_path,
    }
else:
    # Fix: the full run previously reused the sample files for local_data;
    # it now points at the full dataset, matching s3_data below.
    local_data = {
        'train': f'file://{full_train_data_json}',
        'validation': f'file://{full_validation_data_json}',
        'config': f'file://{config_model_name}',
    }
    # S3 inputs for the full dataset.
    s3_data = {
        'train': full_train_dataset_s3_path,
        'validation': full_validation_dataset_s3_path,
        'config': train_config_s3_path,
    }
# Last expression of the cell: display the selected S3 channels.
s3_data

{'train': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/train/train_dataset.json',
 'validation': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/validation/validation_dataset.json',
 'config': 's3://sagemaker-us-east-1-057716757052/datasets/naver-news-summarization-ko/config/sm_llama_3_8b_fsdp_qlora.yaml'}

In [14]:
# Use the S3 channels as the training input and persist them with %store
# (presumably for a later notebook to restore with `%store -r data` — confirm).
data = s3_data
%store data

Stored 'data' (dict)
