In [None]:
from pprint import pprint

from sagemaker.core.resources import TrainingJob, HubContent, InferenceComponent, ModelPackage
from sagemaker.core.utils.utils import Unassigned
# Update credentials for the `default` profile with the `ada` CLI (Admin role, single run via --once).
# `<>` looks like a placeholder for the account id — fill it in before running.
! ada credentials update --provider=isengard --account=<> --role=Admin --profile=default --once
# Set the AWS CLI's configured region to us-west-2.
! aws configure set region  us-west-2

In [None]:
from sagemaker.core.resources import Endpoint

# Cleanup: walk every endpoint returned by Endpoint.get_all() and delete the
# ones whose name begins with 'e2e-' (the prefix this notebook uses for the
# resources it creates).
for candidate in Endpoint.get_all():
    # Guard clause: anything outside the 'e2e-' namespace is left untouched.
    if not candidate.endpoint_name.startswith('e2e-'):
        continue
    candidate.delete()


In [None]:
from sagemaker.core.resources import TrainingJob, HubContent, InferenceComponent, ModelPackage
from sagemaker.core.utils.utils import Unassigned

# Scan every training job in us-west-2 and print the job ARN plus container
# image for each job whose output model package declares an image on the first
# container of its inference specification.
for training_job in TrainingJob.get_all(region="us-west-2"):
    package_arn = training_job.output_model_package_arn
    if isinstance(package_arn, Unassigned):
        # This job has no output model package to inspect.
        continue

    try:
        model_package = ModelPackage.get(package_arn)
        image = model_package.inference_specification.containers[0].image
    except Exception as exc:
        # Best-effort scan: one unreadable or incomplete package must not stop
        # the loop, but say why it was skipped instead of hiding the failure.
        # `Exception` (not a bare `except`) lets Ctrl-C / SystemExit through.
        print(f"Skipping {training_job.training_job_arn}: {exc!r}")
        continue

    if image is not None and not isinstance(image, Unassigned):
        print(training_job.training_job_arn)
        print(image)


In [None]:
import random

from sagemaker.core.resources import TrainingJob
from sagemaker.serve import ModelBuilder

# Look up the training job by name and show the model package ARN it reports.
training_job = TrainingJob.get(training_job_name="meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832")
print(training_job.output_model_package_arn)

# Random 'e2e-' name: used for the model here and reused as the endpoint name
# by the deploy cell that follows. The prefix matches the endpoint-cleanup
# cell's 'e2e-' filter.
name = f"e2e-{random.randint(100, 10000)}"

# Build a model from the training job; `model_builder` is reused by the
# following deploy cell.
model_builder = ModelBuilder(model=training_job)
model = model_builder.build(model_name=name)
print(model.model_arn)

In [None]:
# Deploy with the `model_builder` from the preceding cell; the endpoint is given
# the same `name` that was generated there. `endpoint` is used by later cells.
endpoint = model_builder.deploy(endpoint_name=name)

In [None]:
from sagemaker.core.resources import InferenceComponent, Tag
from pprint import pprint

# For each inference component returned for endpoint name "e2e-2358":
# print its ARN, then pretty-print every tag attached to that ARN.
components = InferenceComponent.get_all(endpoint_name_equals="e2e-2358")
for component in components:
    component_arn = component.inference_component_arn
    print(component_arn)
    for component_tag in Tag.get_all(resource_arn=component_arn):
        pprint(component_tag)



In [None]:
import json

# NOTE: this call is expected to fail, because Endpoint invoke is only
# available to authorized users. `invoke` here is the sagemaker-core
# Endpoint.invoke method.
print(endpoint.endpoint_arn)
request_body = json.dumps({"inputs": "What is the capital of France?", "parameters": {"max_new_tokens": 50}})
endpoint.invoke(body=request_body)

In [None]:
from sagemaker.core.resources import TrainingJob
from sagemaker.serve import ModelBuilder

# Re-create the builder from the same training job, then ask it for the
# endpoint names associated with the base model. The call is left as the
# cell's final expression so the notebook displays its result.
sft_training_job = TrainingJob.get(
    training_job_name="meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832"
)
model_builder = ModelBuilder(model=sft_training_job)
model_builder.fetch_endpoint_names_for_base_model()

In [None]:
import random

# Fresh 'e2e-' name for this deployment; it is assigned to the builder, used
# as the endpoint name, and suffixed with '-adapter' for the inference
# component name.
name = f"e2e-{random.randint(100, 10000)}"
model_builder.name = name
endpoint = model_builder.deploy(endpoint_name=name, inference_component_name=f"{name}-adapter")

## Part 2: Deploy from ModelPackage

This section demonstrates an alternative deployment workflow that uses the SageMaker Model Registry. This approach is well suited to production environments, for the reasons listed below.

**Model Registry Benefits:**
- **Version Control**: Track multiple versions of your models
- **Governance**: Implement approval workflows before deployment
- **Reproducibility**: Deploy the exact same model version across environments
- **Metadata Management**: Store model metrics, lineage, and documentation
- **CI/CD Integration**: Automate deployment pipelines with versioned artifacts

**When to Use ModelPackages:**
- Production deployments requiring approval gates
- Multi-environment deployments (dev, staging, prod)
- Models shared across teams or accounts
- Compliance and audit requirements

ModelPackages are automatically created when training jobs complete, or can be registered manually.

### Create ModelPackage Resource

Instantiate a ModelPackage resource from the SageMaker Model Registry. This represents a versioned, registered model with:

**ModelPackage Metadata:**
- **Group**: 'test-finetuned-models-gamma' (collection of related model versions)
- **Version**: 68 (specific iteration of the fine-tuned model)
- **Status**: Completed (ready for deployment)

**Inference Specification:**
- Model artifacts location in S3
- Base model reference (Llama 3.2 1B Instruct v0.0.3)
- Recipe name for fine-tuning configuration
- Container and runtime requirements

This ModelPackage was automatically created by the training job in Part 1, demonstrating the integration between training and model registry.

### Build Model from ModelPackage

Use ModelBuilder with a ModelPackage resource instead of a TrainingJob. The process is similar but with key differences:

**ModelPackage vs TrainingJob Deployment:**
- **ModelPackage**: Uses versioned, approved artifacts from Model Registry
- **TrainingJob**: Uses artifacts directly from training output

**Advantages of ModelPackage Approach:**
- Deploy any approved version, not just the latest training run
- Rollback to previous versions easily
- Deploy the same version across multiple environments
- Leverage approval workflows and governance policies

ModelBuilder automatically resolves all necessary metadata from the ModelPackage, including model artifacts, base model references, and inference configurations.

In [None]:
import random
from sagemaker.serve import ModelBuilder
from sagemaker.core.resources import ModelPackage

# Random 'e2e-' name; the deploy cell further down uses it as the endpoint name.
name = f"e2e-{random.randint(100, 1000000)}"

# Fetch the registered model package by ARN and build from it. The build call
# is left as the final expression so the notebook displays its result.
model_package = ModelPackage.get(
    model_package_name="arn:aws:sagemaker:us-west-2:<>:model-package/test-finetuned-models-gamma/68"
)
model_builder = ModelBuilder(model=model_package)
model_builder.build()

### Deploy ModelPackage to Endpoint

Deploy the versioned ModelPackage to a new SageMaker real-time endpoint. This deployment:

**Deployment Characteristics:**
- Uses the exact model version specified in the ModelPackage
- Maintains full traceability to the original training job
- Can be deployed to multiple endpoints simultaneously
- Supports the same deployment patterns (standalone or multi-adapter)

**Production Best Practices:**
- Use ModelPackages for all production deployments
- Implement approval workflows before deployment
- Tag endpoints with model version for tracking
- Monitor model performance and drift

The deployment process is identical to Part 1, but with the confidence that you're deploying a versioned, approved model artifact.

In [None]:
# Deploy with the ModelPackage-backed `model_builder`; the endpoint is named
# with the `name` generated in the build cell.
endpoint = model_builder.deploy(endpoint_name=name)


## Bedrock Model Builder


In [None]:
import boto3
import json

# Destination of the generated file.
config_bucket = 'open-models-testing-pdx'
config_key = 'output/meta-textgeneration-llama-3-2-1b-instruct-sft-20251114104310/output/model/config.json'

# config.json contents for the Llama 3.2 1B model. Key order is kept as-is
# because json.dumps serialises keys in insertion order.
config = {
    "architectures": ["LlamaForCausalLM"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 8192,
    "max_position_embeddings": 131072,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 16,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 500000.0,
    "tie_word_embeddings": True,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.45.0",
    "use_cache": True,
    "vocab_size": 128256
}

# Serialise with a 2-space indent and upload to S3 as application/json.
s3 = boto3.client('s3')
config_body = json.dumps(config, indent=2)
s3.put_object(Bucket=config_bucket, Key=config_key, Body=config_body, ContentType='application/json')

print("config.json uploaded successfully")


In [None]:
import boto3
import json

s3 = boto3.client('s3', region_name='us-west-2')

# Tokenizer settings, written next to config.json under the same S3 prefix.
tokenizer_config = {
    "add_bos_token": True,
    "add_eos_token": False,
    "bos_token": "<|begin_of_text|>",
    "eos_token": "<|end_of_text|>",
    "pad_token": "<|end_of_text|>",
    "model_max_length": 131072,
    "tokenizer_class": "LlamaTokenizer",
}

# ContentType is set explicitly so this object is stored as JSON, matching
# the config.json upload.
s3.put_object(
    Bucket="open-models-testing-pdx",
    Key="output/meta-textgeneration-llama-3-2-1b-instruct-sft-20251114104310/output/model/tokenizer_config.json",
    Body=json.dumps(tokenizer_config),
    ContentType="application/json",
)

print("tokenizer_config.json uploaded successfully")


In [None]:
# Update credentials for the `default` profile again with the `ada` CLI (Admin role, single run via --once).
# `<>` looks like a placeholder for the account id — fill it in before running.
! ada credentials update --provider=isengard --account=<> --role=Admin --profile=default --once


In [None]:
from sagemaker.core.resources import TrainingJob
import random


# Fetch the training job by name from us-west-2.
training_job = TrainingJob.get(training_job_name="11-21-llama33-70b-bbh-v1-2025-11-21-18-47-09-200", region="us-west-2")
# Random 'e2e-' name. The commented-out deploy below would pass it as both
# job_name and imported_model_name; the next cell passes it as the Bedrock modelId.
name = f"e2e-{random.randint(100, 10000)}"

# NOTE(review): the Bedrock import step is commented out and BedrockModelBuilder
# is not imported in this cell — with it disabled, no model is imported under
# `name` by this notebook. Confirm whether it should be re-enabled.
# bedrock_builder = BedrockModelBuilder(model=training_job)
# bedrock_builder.deploy(job_name=name, imported_model_name=name, role_arn="arn:aws:iam::<>:role/Admin")

In [None]:
import json

import boto3

# Assuming you previously did something like:
#   bedrock_builder = BedrockModelBuilder(model_trainer)
#   import_response = bedrock_builder.deploy(imported_model_name="my-custom-model-name", ...)
#
# `name` (set in the preceding cell) is used as the modelId for Bedrock inference.
# NOTE(review): confirm that the imported model accepts the
# inputText/textGenerationConfig request schema used below, and that the
# imported model *name* (rather than its ARN) is accepted as modelId.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-west-2')

response = bedrock_runtime.invoke_model(
    modelId=name,  # This is the imported_model_name from your deploy call
    body=json.dumps({
        "inputText": "What is the capital of France?",
        "textGenerationConfig": {
            "maxTokenCount": 50
        }
    })
)

# The response payload arrives as a stream under 'body'; read and decode it so
# the result of the call is actually shown.
print(json.loads(response["body"].read()))


## Summary

This notebook provided a comprehensive guide to deploying fine-tuned LLMs on Amazon SageMaker using two distinct workflows:

### Key Takeaways

**Deployment Approaches:**
1. **TrainingJob → Endpoint**: Direct deployment for rapid iteration and testing
2. **ModelPackage → Endpoint**: Versioned deployment for production governance

**Deployment Patterns:**
- **Standalone Endpoints**: Dedicated resources, full isolation, simple management
- **Multi-Adapter Endpoints**: Shared base model, cost-efficient, dynamic routing

**Best Practices:**
- Use TrainingJob deployment for development and experimentation
- Use ModelPackage deployment for production with approval workflows
- Leverage multi-adapter deployment to reduce costs when serving multiple variants
- Always test endpoints with sample requests before production traffic

**Next Steps:**
- Implement monitoring and logging for production endpoints
- Set up auto-scaling policies based on traffic patterns
- Create CI/CD pipelines for automated model deployment
- Explore model monitoring for drift detection and performance tracking

In [None]:
import boto3

bedrock = boto3.client('bedrock', region_name='us-west-2')


def _list_all(list_call, result_key):
    """Return the `result_key` items from every page of a nextToken-paginated list call.

    `list_call` is a bound client method such as `bedrock.list_imported_models`.
    All pages are collected before returning, so callers can stop or delete
    resources without affecting a listing that is still in progress.
    """
    items = []
    request = {}
    while True:
        page = list_call(**request)
        items.extend(page.get(result_key, []))
        next_token = page.get('nextToken')
        if not next_token:
            return items
        request = {'nextToken': next_token}


# Stop model import jobs that are still in flight; jobs in any other status
# are only reported.
# NOTE(review): confirm the installed boto3 Bedrock client exposes
# `stop_model_import_job`, and that 'Submitted' is a status it can report.
for job in _list_all(bedrock.list_model_import_jobs, 'modelImportJobSummaries'):
    job_arn = job['jobArn']
    if job['status'] in ['InProgress', 'Submitted']:
        print(f"Stopping import job: {job_arn}")
        bedrock.stop_model_import_job(jobIdentifier=job_arn)
    else:
        print(f"Leaving import job ({job['status']}): {job_arn}")

# WARNING: this deletes EVERY imported model the list call returns for this
# client, not only the ones created by this notebook.
for imported_model in _list_all(bedrock.list_imported_models, 'modelSummaries'):
    model_arn = imported_model['modelArn']
    print(f"Deleting imported model: {model_arn}")
    bedrock.delete_imported_model(modelIdentifier=model_arn)
