# Fine Tuning Llama-4 Scout 

### Install dependencies

In [None]:
!pip install transformers "huggingface_hub[cli]" scikit-learn --upgrade --quiet

### Log in to Hugging Face using your access token

In [None]:
# Paste your Hugging Face access token between the quotes below. The account
# that owns the token must have been granted access to download
# meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer.model from Hugging Face.
# NOTE(review): clear the token again before sharing or committing this notebook.
!huggingface-cli login --token "" 

### Import modules and define SM IAM role, source_dir and s3 bucket

In [None]:
import sagemaker
import boto3
# Bootstrap session: only needed to discover the account's default bucket
sess = sagemaker.Session()

# Bucket used for uploaded data, models and logs. Leave it as None to use the
# session's default bucket, or replace None with the name of your own bucket.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    # No bucket name was given, so fall back to the session default
    sagemaker_session_bucket = sess.default_bucket()

# Local directory holding the training entry point and config.yaml
source_dir = "./src"

# Resolve the IAM role for the training job and the endpoint.
# get_execution_role() raises ValueError when no execution role can be
# determined; in that case the role is looked up by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam_client = boto3.client('iam')
    role_description = iam_client.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Re-create the session bound to the chosen bucket
# (update sagemaker_session_bucket above to use a different S3 bucket)
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

## Data Preparation

### Download databricks-dolly-15k dataset from Huggingface

In [None]:
!wget --no-check-certificate https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl

### Format and split train/test dataset

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

def data_transform(row):
    """Convert one raw dataset record into a prompt/response pair.

    Args:
        row: Mapping that provides the 'instruction', 'context' and
            'response' entries of a single record.

    Returns:
        A dict with two keys: "prompt", which combines the instruction and the
        context into one text, and "response", copied unchanged from the record.
    """
    prompt_text = f"Instruction: {row['instruction']} \n Context: {row['context']}"
    return {"prompt": prompt_text, "response": row["response"]}
    
# Set to True to write the complete train/test splits. The default keeps the
# run short by writing only the first 500 training and 100 test records.
USE_FULL_DATASET = False

# Read the JSON Lines file explicitly as UTF-8 so the result does not depend on
# the platform's default encoding (the output below is written with
# force_ascii=False, i.e. non-ASCII text is expected). Blank lines are skipped
# instead of making json.loads fail.
with open('databricks-dolly-15k.jsonl', 'r', encoding='utf-8') as f:
    data = [data_transform(json.loads(line)) for line in f if line.strip()]

df = pd.DataFrame(data)

# Fixed random_state keeps the 80/20 split reproducible between runs
train, test = train_test_split(df, test_size=0.2, random_state=42)

if not USE_FULL_DATASET:
    train = train[:500]
    test = test[:100]

train.to_json("train_dataset.json", orient="records", force_ascii=False)
test.to_json("test_dataset.json", orient="records", force_ascii=False)

### Upload the train/test dataset to S3 bucket

In [None]:
from sagemaker.s3 import S3Uploader

# S3 prefix inside the session bucket under which the dataset files are stored
input_path = f's3://{sagemaker_session_bucket}/datasets/llama4'

# Upload both splits; each call returns the S3 URI of the uploaded file
train_dataset_s3_path = S3Uploader.upload(
    local_path="./train_dataset.json",
    desired_s3_uri=f"{input_path}/train_v3",
)
test_dataset_s3_path = S3Uploader.upload(
    local_path="./test_dataset.json",
    desired_s3_uri=f"{input_path}/test_v3",
)

print("Training data uploaded to:")
for uploaded_uri in (train_dataset_s3_path, test_dataset_s3_path):
    print(uploaded_uri)


### Upload config.yaml from source_dir to S3

In [None]:
from sagemaker.s3 import S3Uploader

# The training configuration lives next to the training script in source_dir
model_yaml = f"{source_dir}/config.yaml"

# Upload it below the same S3 prefix as the datasets
train_config_s3_path = S3Uploader.upload(
    local_path=model_yaml,
    desired_s3_uri=f"{input_path}/config_v2",
)

print("Training config uploaded to:")
print(train_config_s3_path)

### Define PyTorch estimator with all required variables

In [None]:
from sagemaker.pytorch import PyTorch
from huggingface_hub import get_token  # replaces the deprecated HfFolder.get_token()

# Resolve the Hugging Face token up front. Without it the training job cannot
# download the gated Llama 4 weights, and a None value is not a valid
# environment variable for the training job, so fail here with a clear message.
hf_token = get_token()
if not hf_token:
    raise ValueError(
        "No Hugging Face token found. Run `huggingface-cli login` (or set the "
        "HF_TOKEN environment variable) before creating the estimator."
    )

# PyTorch training Deep Learning Container image in the session's region
train_dlc_image = (
    f"763104351884.dkr.ecr.{sess.boto_region_name}.amazonaws.com/"
    "pytorch-training:2.7.1-gpu-py312-cu128-ubuntu22.04-sagemaker"
)

# Base name of the training job
job_name = 'llama4-scout'

# Instance type used for the training job - alternatively use p5 or p5en instances
instance_type = 'ml.p4de.24xlarge'

# Create the estimator
pytorch_estimator = PyTorch(
    entry_point='run_l4_lora.sh',     # training script inside source_dir
    source_dir=source_dir,            # directory with all files needed for training
    instance_type=instance_type,
    instance_count=1,                 # number of instances used for training
    base_job_name=job_name,           # prefix of the generated training job name
    role=role,                        # IAM role the job uses to access AWS resources, e.g. S3
    volume_size=1024,                 # size of the EBS volume in GB
    py_version='py312',               # Python version used in the training job
    image_uri=train_dlc_image,
    disable_output_compression=True,  # skip output compression to save training time and cost
    environment={
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",  # cache downloaded models under /tmp
        "HF_TOKEN": hf_token,                    # token for gated models, e.g. Llama 4
        "FSDP_CPU_RAM_EFFICIENT_LOADING": "1",   # enable CPU RAM efficient loading
    },
)

### Initiate SM training

In [None]:
# Input channels for the training job: channel name -> S3 URI uploaded earlier
data = dict(
    train=train_dataset_s3_path,
    test=test_dataset_s3_path,
    config=train_config_s3_path,
)

# Start the training job on those channels; wait=True is passed so the call
# does not return while the job is still running
pytorch_estimator.fit(inputs=data, wait=True)

## Deploy the fine-tuned model to a SageMaker endpoint

In [None]:
# Tag of the Large Model Inference container to deploy. Newer tags are listed at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers
CONTAINER_VERSION = '0.33.0-lmi15.0.0-cu128'

# Instance type that will host the endpoint
instance_type = "ml.g6e.48xlarge"

# Full image URI of the vLLM-based container in the session's region
ecr_registry = f'763104351884.dkr.ecr.{sess.boto_region_name}.amazonaws.com'
container_uri = f'{ecr_registry}/djl-inference:{CONTAINER_VERSION}'

print(f"Container URI: {container_uri}")
print(f"Instance Type: {instance_type}")

In [None]:
# Environment variables passed to the inference container. All values are
# strings because they are handed over as environment variables.
# NOTE(review): the meaning of each OPTION_* key is defined by the container;
# confirm against the Large Model Inference documentation before changing them.
vllm_config = dict(
    HF_MODEL_ID="/opt/ml/model",          # local path the model is loaded from
    OPTION_MAX_MODEL_LEN="2048",
    OPTION_MAX_ROLLING_BATCH_SIZE="8",
    OPTION_MODEL_LOADING_TIMEOUT="1500",
    SERVING_FAIL_FAST="true",
    OPTION_ROLLING_BATCH="disable",
    OPTION_ASYNC_MODE="true",
    OPTION_ENTRYPOINT="djl_python.lmi_vllm.vllm_async_service",
)

In [None]:
from sagemaker import Model, image_uris, serializers, deserializers

# Describe the deployable model: the inference container image, the artifacts
# produced by the training job, and the container's environment settings
model = Model(
    image_uri=container_uri,
    model_data=pytorch_estimator.model_data,
    env=vllm_config,
    role=role,
)

In [None]:
# Derive the endpoint name from a fixed base name via name_from_base
endpoint_name = sagemaker.utils.name_from_base("L4-Scout-FineTuned")
print(endpoint_name)

# Create the endpoint on a single instance; the startup health-check timeout
# is set to 1800 to leave time for the container to start
llm = model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=1800,
)

#### Test Inference

In [None]:
# Invoke the deployed endpoint with a chat request and stream the reply
import json
import boto3
import time


def _parse_stream_line(raw_line):
    """Decode one line of the response stream into a dict, or return None.

    Accepts server-sent-event lines ("data: {...}") as well as bare JSON
    lines. Blank lines, the "[DONE]" sentinel, bytes that are not valid UTF-8
    and anything that is not a JSON object yield None so the caller can skip
    them.
    """
    try:
        text = raw_line.decode('utf-8').strip()
    except UnicodeDecodeError:
        return None
    if text.startswith('data:'):
        text = text[len('data:'):].strip()
    if not text or text == '[DONE]':
        return None
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def _iter_stream_chunks(event_stream):
    """Yield every JSON object carried by a streaming endpoint response.

    The bytes of the PayloadPart events are buffered because one part is not
    assumed to hold exactly one JSON document: a part may contain several
    newline-separated documents, or a document may continue in the next part.
    """
    pending = b""
    for event in event_stream:
        if 'PayloadPart' not in event:
            continue
        pending += event['PayloadPart']['Bytes']
        # Everything before the last newline is complete; the rest stays buffered
        *complete_lines, pending = pending.split(b"\n")
        for raw_line in complete_lines:
            chunk = _parse_stream_line(raw_line)
            if chunk is not None:
                yield chunk
        # A part may also carry one whole document without a trailing newline
        chunk = _parse_stream_line(pending)
        if chunk is not None:
            pending = b""
            yield chunk


def _extract_token_text(chunk):
    """Return the generated text carried by one stream chunk ('' if none)."""
    choices = chunk.get('choices')
    if choices:
        # Messages-compatible format: choices[0].delta.content (may be null)
        delta = choices[0].get('delta') or {}
        return delta.get('content') or ''
    token = chunk.get('token')
    if isinstance(token, dict):
        # TGI format: token.text
        return token.get('text') or ''
    return ''


# Create SageMaker Runtime client
smr_client = boto3.client('sagemaker-runtime')
##Add your endpoint here
#endpoint_name = ''

# Request in messages format with streaming enabled
body = {
    "messages": [
        {"role": "user", "content": "Name popular places to visit in London?"}
    ],
    "temperature": 0.9,
    "max_tokens": 256,
    "stream": True,
}

start_time = time.time()
ttft = None           # time to first token; stays None if no text arrives
token_count = 0       # number of streamed chunks that carried text
response_pieces = []

print(f"Prompt: {body['messages'][0]['content']}\n")
print("Response:", end=' ', flush=True)

# Invoke endpoint with streaming
resp = smr_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    Body=json.dumps(body),
    ContentType="application/json",
)

# Print the text as it arrives and record the metrics
for chunk in _iter_stream_chunks(resp['Body']):
    token_text = _extract_token_text(chunk)
    if not token_text:
        # Chunks without text are not counted as tokens
        continue
    if ttft is None:
        ttft = time.time() - start_time
    token_count += 1
    response_pieces.append(token_text)
    print(token_text, end='', flush=True)

full_response = "".join(response_pieces)
end_time = time.time()
total_latency = end_time - start_time

print("\n\nMetrics:")
if ttft is not None:
    print(f"Time to First Token (TTFT): {ttft:.2f} seconds")
else:
    print("Time to First Token (TTFT): No tokens received")
# Each text-carrying chunk is counted once, which approximates the token count
print(f"Total Tokens Generated: {token_count}")
print(f"Total Latency: {total_latency:.2f} seconds")


#### Delete endpoint and model

In [None]:

print(f"Deleting SageMaker resources for endpoint: {endpoint_name}")

# Look up the model(s) behind the endpoint before its configuration is deleted,
# so they can be removed as well and not only the endpoint and its config.
endpoint_config = sess.sagemaker_client.describe_endpoint_config(
    EndpointConfigName=endpoint_name
)
model_names = [
    variant['ModelName']
    for variant in endpoint_config['ProductionVariants']
    if variant.get('ModelName')
]

sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_name)
for model_name in model_names:
    sess.delete_model(model_name)