# Fine-tune `meta-llama/Llama-3.2-1B-Instruct` for Customer Support Triage
---
This notebook fine-tunes **Llama-3.2-1B-Instruct** to transform customer support tickets into structured internal bug reports using Amazon SageMaker.

**What this notebook does:**
1. Load the source data 
2. Format and upload training data to S3
3. Launch a SageMaker training job using LoRA (PEFT)

---

**Model:** [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)  
**Training Method:** PEFT LoRA (bf16 base with LoRA adapters)  
**Instance:** ml.g5.xlarge (A10G 24GB)

## 1. Setup and Dependencies

In [None]:
import os
import json
import random
from collections import defaultdict
import boto3
import sagemaker
from sagemaker.s3 import S3Uploader

In [None]:
# Seed Python's RNG so anything using `random` in this notebook is repeatable
random.seed(42)

# Build a SageMaker session bound to the region of the default boto3 session
region = boto3.Session().region_name
sess = sagemaker.Session(boto3.Session(region_name=region))

# Leave as None to fall back to the session's default bucket
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role that will be handed to the training job
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: use an explicit SageMaker execution role ARN instead
    role = "arn:aws:iam::889772146711:role/SageMakerExecutionRole"

In [None]:
# Echo the resolved role, bucket and region so a misconfiguration is visible early
print("sagemaker role arn:", role)
print("sagemaker bucket:", sess.default_bucket())
print("sagemaker session region:", sess.boto_region_name)

## 2. Load and Preview Data

In [None]:
# Local scratch directory, under the current working directory, for the formatted dataset
dataset_parent_path = os.path.join(os.getcwd(), "tmp_cache_local_dataset")
# exist_ok=True: re-running this cell is harmless when the directory is already there
os.makedirs(name=dataset_parent_path, exist_ok=True)

In [None]:
# Load source data: one JSON object per non-blank line of source_data.jsonl
source_file = os.path.join(os.getcwd(), "source_data.jsonl")
data = []
# Read explicitly as UTF-8 so parsing does not depend on the platform's default encoding
with open(source_file, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # tolerate blank lines
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as err:
            # Name the offending line of the file; JSONDecodeError is itself a
            # ValueError, so code catching ValueError keeps working.
            raise ValueError(
                f"Invalid JSON on line {line_number} of {source_file}: {err}"
            ) from err

print(f"Loaded {len(data)} records")

# Show category distribution (every record must carry a "category" key)
category_counts = defaultdict(int)
for entry in data:
    category_counts[entry["category"]] += 1

print("\nCategory distribution:")
for category, count in sorted(category_counts.items()):
    print(f"  {category}: {count}")

In [None]:
# Show the first record: its category, then each message clipped to 200 characters
print("Sample entry:")
print("=" * 60)
sample = data[0]
print(f"Category: {sample['category']}")
for msg in sample["messages"]:
    if len(msg["content"]) > 200:
        # Long message: keep the first 200 characters and mark the cut
        content_preview = msg["content"][:200] + "..."
    else:
        content_preview = msg["content"]
    print(f"{msg['role'].upper()}: {content_preview}")

## 3. Format and Save Data

Format all data for training. The SFT training script expects data in the `messages` format (without the category field):
```json
{
  "messages": [
    { "role": "user", "content": "..." },
    { "role": "assistant", "content": "..." }
  ]
}
```

In [None]:
def format_for_training(entries):
    """Return a new list of records that keep only each entry's "messages".

    Every other field of an entry (such as "category") is dropped. The
    "messages" value itself is reused as-is, not copied.
    """
    formatted = []
    for record in entries:
        formatted.append({"messages": record["messages"]})
    return formatted

# Apply the training format to every loaded record
train_formatted = format_for_training(data)

print(f"Formatted {len(train_formatted)} samples for training\n\nExample entry:")
# Pretty-print the first sample, clipped to its first 500 characters
print(f"{json.dumps(train_formatted[0], indent=2)[:500]}...")

In [None]:
# Persist the formatted samples as JSON Lines: one JSON object per line
train_filename = os.path.join(dataset_parent_path, "train_data.jsonl")

with open(train_filename, "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in train_formatted)

print(f"Saved training data to: {train_filename}")

## 4. Upload Training Data to S3

In [None]:
# Single source of truth for the dataset location: the existence check and the
# upload target are both derived from `bucket` and `prefix`, so they cannot drift apart.
bucket = sess.default_bucket()
prefix = "llama-customer-support-finetune/dataset/"
data_s3_uri = f"s3://{bucket}/{prefix.rstrip('/')}"

# Check if data already exists at this S3 location.
# The client comes from the SageMaker session's boto session so it uses the
# same region and credentials as the rest of the notebook.
s3_client = sess.boto_session.client("s3")

# MaxKeys=1 is enough to detect existing data; at most one key is listed in the error
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
if response.get("KeyCount", 0) > 0:
    existing_files = [obj["Key"] for obj in response.get("Contents", [])]
    raise FileExistsError(
        "S3 path already contains data!\n"
        f"Location: s3://{bucket}/{prefix}\n"
        f"Found: {existing_files}\n\n"
        "To overwrite, manually delete the existing data first:\n"
        f"  aws s3 rm s3://{bucket}/{prefix} --recursive"
    )

# Upload through the same session instead of letting the uploader create a new default one
uploaded_s3_uri = S3Uploader.upload(
    local_path=train_filename,
    desired_s3_uri=data_s3_uri,
    sagemaker_session=sess,
)
print(f"Uploaded training data to: {uploaded_s3_uri}")

## 5. Configure Training Job

In [None]:
from sagemaker.modules.configs import (
    CheckpointConfig,
    Compute,
    InputData,
    OutputDataConfig,
    SourceCode,
    StoppingCondition,
)
from sagemaker.modules.train import ModelTrainer
from getpass import getpass

In [None]:
# Enter and validate HuggingFace token (required for gated models like Llama).
# Surrounding whitespace (e.g. a newline picked up when pasting) is stripped so
# it cannot slip past the format checks and end up in the job's environment.
hf_token = getpass("Enter your HuggingFace token: ").strip()

# Validate token format; each failed check raises ValueError with a hint
if not hf_token:
    raise ValueError("❌ HuggingFace token cannot be empty!")
if not hf_token.startswith("hf_"):
    raise ValueError(
        "❌ Invalid HuggingFace token format!\n"
        f"   Token should start with 'hf_' but starts with '{hf_token[:3]}...'\n"
        "   Get a valid token at: https://huggingface.co/settings/tokens"
    )
if len(hf_token) < 20:
    raise ValueError(
        "❌ HuggingFace token too short!\n"
        f"   Token is only {len(hf_token)} characters (expected 37+)\n"
        "   Make sure you copied the full token from: https://huggingface.co/settings/tokens"
    )

print("✓ HuggingFace token accepted")
print("  - Format: Valid (starts with 'hf_')")
print(f"  - Length: {len(hf_token)} characters")
# NOTE(review): this echoes the first 7 and last 4 characters of the secret into
# the cell output; clear the output before sharing or committing the notebook.
print(f"  - Preview: {hf_token[:7]}...{hf_token[-4:]}")

In [None]:
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
reports_to = "tensorboard"  # Metrics will be reported to tensorboard
# Job name: the model id with "/" turned into "--" and "." into "-"
job_name = MODEL_ID.translate(str.maketrans({"/": "--", ".": "-"}))
print("job_name:", job_name)

In [None]:
# Environment variables for the training job.
# HF_TOKEN carries the HuggingFace token entered above.
# NOTE(review): the FI_* / NCCL_* entries look like EFA/NCCL networking settings;
# confirm they are needed (and harmless) for the instance type used for training.
training_env = dict(
    HF_TOKEN=hf_token,
    FI_EFA_USE_DEVICE_RDMA="1",
    NCCL_DEBUG="INFO",
    NCCL_SOCKET_IFNAME="eth0",
    FI_PROVIDER="efa",
    NCCL_PROTO="simple",
    NCCL_NET_GDR_LEVEL="5",
)

In [None]:
# Command-line arguments for the training script: the PEFT LoRA recipe file
args = [
    "--config",
    "hf_recipes/meta-llama/Llama-3.2-1B-Instruct--vanilla-peft-lora.yaml",
]

# Instance configuration: a single ml.g5.xlarge (A10G 1GPU 24GB)
training_instance_type, training_instance_count = "ml.g5.xlarge", 1

print("Training instance:", training_instance_type, "x", training_instance_count)

In [None]:
# Look up the PyTorch 2.7.1 training image URI for the session's region and the
# chosen training instance type
pytorch_image_uri = sagemaker.image_uris.retrieve(
    framework="pytorch",
    version="2.7.1",
    image_scope="training",
    instance_type=training_instance_type,
    region=sess.boto_session.region_name,
)
print("Using image:", pytorch_image_uri)

In [None]:
# Configure the ModelTrainer
source_code = SourceCode(
    source_dir="./sagemaker_code",
    command=f"bash sm_accelerate_train.sh {' '.join(args)}",
)

compute_configs = Compute(
    instance_type=training_instance_type,
    instance_count=training_instance_count,
    keep_alive_period_in_seconds=1800,
    volume_size_in_gb=125
)

import time
timestamp = int(time.time())
# Keep job name short - SageMaker has 63 char limit!
base_job_name = f"llama1b-cs-ft-{timestamp}"
output_path = f"s3://{sess.default_bucket()}/{base_job_name}"

model_trainer = ModelTrainer(
    training_image=pytorch_image_uri,
    source_code=source_code,
    base_job_name=base_job_name,
    compute=compute_configs,
    stopping_condition=StoppingCondition(max_runtime_in_seconds=18000),
    output_data_config=OutputDataConfig(
        s3_output_path=output_path,
    ),
    checkpoint_config=CheckpointConfig(
        s3_uri=os.path.join(
            output_path,
            "customer-support",
            job_name,
            "checkpoints"
        ), 
        local_path="/opt/ml/checkpoints"
    ),
    role=role,
    environment=training_env
)

print(f"base_job_name: {base_job_name}")
print(f"output_path: {output_path}")

In [None]:
## 6. Launch Training Job

In [None]:
# Submit the training job; the uploaded dataset is attached as the "training" channel
model_trainer.train(
    wait=False,  # Set to True to wait for completion
    input_data_config=[
        InputData(channel_name="training", data_source=uploaded_s3_uri),
    ],
)

print("\nTraining job launched!")

In [None]:
# Wait for the training job to complete. wait() is called with logs=False,
# so this cell presumably does not stream the job logs; set logs=True if
# streaming is wanted (TODO confirm against the SDK).
# NOTE(review): _latest_training_job is a private ModelTrainer attribute;
# confirm it is available in the installed SageMaker SDK version.
training_job = model_trainer._latest_training_job
print(f"Training job name: {training_job.training_job_name}")
print("Waiting for training to complete...\n")
training_job.wait(logs=False)

## Next Steps

Once the training job completes:

1. The fine-tuned model artifacts will be written under the S3 `output_path` configured in section 5 (the value printed by that cell)
2. Run notebook 2 (`02_deploy_and_evaluate.ipynb`) to deploy the model and demo it