# Fine-tuning the Llama-3.3 70B model with the SageMaker PyTorch estimator on ml.g6e.48xlarge

### Install dependencies

In [None]:
# Install/upgrade the Hugging Face libraries used by this notebook (datasets is pinned to 2.18.0).
!pip install transformers "datasets[s3]==2.18.0" "huggingface_hub[cli]" --upgrade --quiet

### Log in to Hugging Face with your access token

In [None]:
# The empty string is a placeholder: put your own Hugging Face access token between the quotes.
!huggingface-cli login --token ""

In [None]:
import sagemaker
import boto3
# Bucket that receives the uploaded data, model artifacts and logs.
# Leave it as None to use the session's default bucket, which SageMaker
# creates automatically when it does not exist yet.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Local directory that holds the training script and its YAML config.
source_dir = "./fsdp_v2"

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: fall back to looking the role
    # up in IAM by its name.
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Re-create the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

## Data Preparation

### Download the databricks-dolly-15k dataset from Hugging Face

In [None]:
# Fetch the raw JSON Lines file; the next cell opens it by its bare file name.
!wget https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl

### Format and split train/test dataset

In [None]:
from sklearn.model_selection import train_test_split
import json
import pandas as pd

def generate_prompt(row):
    """Render one record as a single training string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``instruction``, ``context`` and ``response``.

    Returns:
        The three fields, each prefixed with its label and separated by
        newlines. An empty field still produces its labelled line.
    """
    return (
        f"Instruction: {row['instruction']}\n"
        f"Context: {row['context']}\n"
        f"Response: {row['response']}"
    )

# Load the raw records: the file is JSON Lines (one JSON object per line).
# Decode explicitly as UTF-8 so any non-ASCII text is read the same way on
# every platform, and skip blank lines, which json.loads would reject.
with open('databricks-dolly-15k.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

# Add the single training string built from each record.
df['text'] = df.apply(generate_prompt, axis=1)

# Hold out 20% of the records for evaluation; the fixed seed keeps the
# split reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Write each split as a JSON array of records, keeping non-ASCII characters
# unescaped.
train.to_json("train_dataset.json", orient="records", force_ascii=False)
test.to_json("test_dataset.json", orient="records", force_ascii=False)

### Upload the train/test dataset to S3 bucket

In [None]:
from sagemaker.s3 import S3Uploader

# Common S3 prefix for everything this notebook uploads.
input_path = f's3://{sagemaker_session_bucket}/datasets/llama3'

# Push both splits to their own sub-prefix and keep the resulting S3 URIs.
train_dataset_s3_path = S3Uploader.upload(
    local_path="./train_dataset.json",
    desired_s3_uri=f"{input_path}/train_v3",
)
test_dataset_s3_path = S3Uploader.upload(
    local_path="./test_dataset.json",
    desired_s3_uri=f"{input_path}/test_v3",
)

print("Training data uploaded to:")
print(train_dataset_s3_path, test_dataset_s3_path, sep="\n")


### Upload config.yaml from source_dir to S3

In [None]:
from sagemaker.s3 import S3Uploader
 
# Copy the training YAML that lives in source_dir to S3 and keep its URI.
model_yaml = f"{source_dir}/llama_3_70b_fsdp_qlora.yaml"
train_config_s3_path = S3Uploader.upload(
    local_path=model_yaml,
    desired_s3_uri=f"{input_path}/config_v2",
)

print("Training config uploaded to:")
print(train_config_s3_path)

In [None]:
#from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch import PyTorch
from huggingface_hub import HfFolder

# PyTorch 2.4.0 GPU training image (Python 3.11, CUDA 12.4), pulled from ECR
# in the region of the current session.
train_dlc_image = "763104351884.dkr.ecr.{}.amazonaws.com/pytorch-training:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker".format(sess.boto_region_name)

# Base name for the training job.
job_name = 'llama3-3-70b-exp1'

# The token is handed to the training container through the environment.
# Fail here with a clear message instead of passing None as an environment
# value, which would only surface as an error when the job is submitted.
hf_token = HfFolder.get_token()
if not hf_token:
    raise RuntimeError(
        "No Hugging Face token found. Log in with `huggingface-cli login` "
        "(using a valid token) before creating the estimator."
    )

# create the Estimator
pytorch_estimator = PyTorch(
    entry_point          = 'run_fsdp_qlora.py',  # training script inside source_dir
    source_dir           = source_dir,           # directory with all files needed for training
    instance_type        = 'ml.g6e.48xlarge',    # instance type used for the training job
    instance_count       = 2,                    # number of training instances
    base_job_name        = job_name,             # base name of the training job
    role                 = role,                 # IAM role the job uses to access AWS resources, e.g. S3
    volume_size          = 500,                  # size of the EBS volume in GB
    py_version           = 'py311',              # Python version used in the training job
    image_uri            = train_dlc_image,      # training container image
    hyperparameters      = {
        # path of the YAML config inside the container; it arrives through
        # the "config" input channel
        "config": "/opt/ml/input/data/config/llama_3_70b_fsdp_qlora.yaml"
    },
    keep_alive_period_in_seconds = 1800,         # warm pool retention (30 minutes)
    disable_output_compression   = True,         # skip output compression to save training time and cost
    distribution = {"torch_distributed": {"enabled": True}},  # launch the script with torchrun
    environment  = {
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",  # cache downloaded models under /tmp
        "HF_TOKEN": hf_token,                    # token for gated models, e.g. Llama 3
        "ACCELERATE_USE_FSDP": "1",              # enable FSDP
        "FSDP_CPU_RAM_EFFICIENT_LOADING": "1",   # enable CPU-RAM-efficient loading
    },
)

In [None]:
# Map each input channel name to the S3 location uploaded earlier.
data = dict(
    train=train_dataset_s3_path,
    test=test_dataset_s3_path,
    config=train_config_s3_path,
)

# Launch the training job on those channels and block until it finishes.
pytorch_estimator.fit(inputs=data, wait=True)

## Deploy the fine-tuned model to a SageMaker endpoint

In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri
 
# Look up the URI of the Hugging Face LLM inference container for this session.
llm_image = get_huggingface_llm_image_uri(backend="huggingface", session=sess)

# Show which ECR image will serve the model.
print(f"llm image uri: {llm_image}")

In [None]:
from huggingface_hub import HfFolder
from sagemaker.huggingface import HuggingFaceModel
 
# Hosting instance and how long the container may take to report healthy.
instance_type = "ml.g6e.12xlarge"
health_check_timeout = 20 * 60  # seconds (20 minutes)

# Environment variables handed to the serving container.
# NOTE(review): with MESSAGES_API_ENABLED set, some container versions may
# expect OpenAI-style "messages" requests on the endpoint — confirm against
# the request format used for inference.
config = {
    "HF_MODEL_ID": "/opt/ml/model",        # model location inside the container
    "SM_NUM_GPUS": "4",                    # GPUs used per replica
    "MAX_INPUT_LENGTH": "1024",            # maximum length of the input text
    "MAX_TOTAL_TOKENS": "2048",            # maximum length of input plus generation
    "MAX_BATCH_PREFILL_TOKENS": "4096",    # cap on tokens processed in parallel during generation
    "MESSAGES_API_ENABLED": "true",        # enable the OpenAI Messages API
}

# Build the model from the training job's artifacts. The artifacts are not
# compressed, so model_data is expected to describe an S3 prefix, e.g.
# {'S3DataSource': {'S3Uri': "s3://...", 'S3DataType': 'S3Prefix', 'CompressionType': 'None'}}
llm_model = HuggingFaceModel(
    model_data=pytorch_estimator.model_data,
    image_uri=llm_image,
    role=role,
    env=config,
)

In [None]:
# Create a single-instance endpoint from the model. The generous health-check
# timeout (20 minutes) gives SageMaker time to download and merge the model.
llm = llm_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=health_check_timeout,
)

#### Test Inference

In [None]:
# Text sent to the endpoint for the smoke test.
prompt = "Tell me about AWS SageMaker"

# Sampling and stopping settings passed along with the prompt.
inference_params = dict(
    do_sample=True,
    top_p=0.6,
    temperature=0.9,
    top_k=50,
    max_new_tokens=512,
    repetition_penalty=1.03,
    stop=["</s>"],
    return_full_text=False,
)

# Request body: the prompt plus its generation parameters.
payload = {"inputs": prompt, "parameters": inference_params}

In [None]:
# Invoke the deployed endpoint with the test payload.
llm.predict(payload)

#### Delete endpoint and model

In [None]:
# Clean up: remove the model, then the endpoint together with its endpoint configuration.
# NOTE(review): the model is deleted first — presumably the model lookup relies on the
# endpoint still existing; confirm before reordering these calls.
llm.delete_model()
llm.delete_endpoint(delete_endpoint_config=True)