# Development Environment and Permissions


_**Use at least a `t3.large` instance; otherwise, preprocessing will take ages.**_

## Installation

_**Note:** We only install the required libraries from Hugging Face and AWS. You also need PyTorch or TensorFlow if it is not already installed._

In [None]:
!pip install sagemaker boto3 torch transformers


Collecting sagemaker
  Downloading sagemaker-2.238.0-py3-none-any.whl.metadata (16 kB)
Collecting boto3
  Downloading boto3-1.36.10-py3-none-any.whl.metadata (6.7 kB)
Collecting attrs<24,>=23.1.0 (from sagemaker)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting docker (from sagemaker)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastapi (from sagemaker)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting importlib-metadata<7.0,>=1.4.0 (from sagemaker)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting omegaconf<=2.3,>=2.2 (from sagemaker)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pathos (from sagemaker)
  Downloading pathos-0.3.3-py3-none-any.whl.metadata (11 kB)
Collecting sagemaker-core<2.0.0,>=1.0.17 (from sagemaker)
  Downloading sagemaker_core-1.0.19-py3-none-any.whl.metadata (4.9 kB)
Collecting schema (from sagemaker)
  Downloading schema

## Permissions

_If you are going to use SageMaker in a local environment, you need access to an IAM role with the required SageMaker permissions. You can find out more about this [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html)._

In [None]:
import os
from getpass import getpass

# Keep credentials that are already exported in the environment and prompt only
# for the ones that are missing. Prompting with getpass keeps the keys out of
# the saved notebook; unconditionally assigning "" would also blank out any
# credentials that were configured before this cell ran.
# Leaving a prompt empty stores an empty string, as the original placeholder did.
for _var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if not os.environ.get(_var):
        os.environ[_var] = getpass(f"{_var}: ")

os.environ["AWS_DEFAULT_REGION"] = "us-east-1"  # Change to your AWS region

In [None]:
import boto3
import sagemaker

# Region handed to the boto3 session below (and reused by later cells)
aws_region = "us-east-1"  # Set the correct AWS region

# Placeholder for the IAM role ARN (replace with your actual ARN)
role_arn = ""

# Wrap a region-scoped boto3 session in a SageMaker session
boto_session = boto3.Session(region_name=aws_region)
sess = sagemaker.Session(boto_session=boto_session)

# Report the resolved configuration one value at a time
print(f"SageMaker Role ARN: {role_arn}")
default_bucket = sess.default_bucket()
print(f"SageMaker Default Bucket: {default_bucket}")
session_region = sess.boto_region_name
print(f"SageMaker Session Region: {session_region}")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
SageMaker Role ARN: arn:aws:iam::437552477415:role/sageMaker
SageMaker Default Bucket: sagemaker-us-east-1-437552477415
SageMaker Session Region: us-east-1


In [None]:
# Smoke-test the credentials and region by asking SageMaker for a few endpoints
sagemaker_client = boto3.client("sagemaker", region_name=aws_region)

try:
    response = sagemaker_client.list_endpoints(MaxResults=5)
    print("✅ Successfully connected to SageMaker!")
    found_endpoints = response["Endpoints"]
    print("SageMaker Endpoints:", found_endpoints)
except Exception as err:
    # Any failure is reported as text so the cell never raises
    print("❌ Failed to connect to SageMaker. Error:", str(err))


✅ Successfully connected to SageMaker!
SageMaker Endpoints: [{'EndpointName': 'huggingface-pytorch-inference-2025-01-30-22-48-14-753', 'EndpointArn': 'arn:aws:sagemaker:us-east-1:437552477415:endpoint/huggingface-pytorch-inference-2025-01-30-22-48-14-753', 'CreationTime': datetime.datetime(2025, 1, 30, 22, 48, 15, 365000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2025, 1, 30, 22, 51, 27, 236000, tzinfo=tzlocal()), 'EndpointStatus': 'InService'}, {'EndpointName': 'huggingface-pytorch-inference-2025-01-30-22-46-50-835', 'EndpointArn': 'arn:aws:sagemaker:us-east-1:437552477415:endpoint/huggingface-pytorch-inference-2025-01-30-22-46-50-835', 'CreationTime': datetime.datetime(2025, 1, 30, 22, 46, 51, 588000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2025, 1, 30, 22, 50, 3, 196000, tzinfo=tzlocal()), 'EndpointStatus': 'InService'}]


In [None]:
import boto3

# Initialize SageMaker client
sagemaker_client = boto3.client("sagemaker", region_name="us-east-1")

# List all deployed endpoints
endpoints = sagemaker_client.list_endpoints(MaxResults=5)

for ep in endpoints["Endpoints"]:
    endpoint_name = ep["EndpointName"]
    endpoint_desc = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)

    # The variant summaries returned by describe_endpoint do not include a
    # ModelName, so reading it from there always yields "Unknown". The model
    # name is recorded on the endpoint *config* the endpoint was created from.
    config_name = endpoint_desc.get("EndpointConfigName")
    variants = []
    if config_name:
        endpoint_config = sagemaker_client.describe_endpoint_config(
            EndpointConfigName=config_name
        )
        variants = endpoint_config.get("ProductionVariants", [])

    # Report the first variant's model; fall back to "Unknown" when the config
    # lists no variants or the variant has no ModelName
    model_name = variants[0].get("ModelName", "Unknown") if variants else "Unknown"

    print(f"🔹 Endpoint: {endpoint_name} is running model: {model_name}")

    # Only describe model if model_name is valid
    if model_name != "Unknown":
        model_details = sagemaker_client.describe_model(ModelName=model_name)
        # Models without a PrimaryContainer entry simply report no environment
        env_vars = model_details.get("PrimaryContainer", {}).get("Environment", {})

        print(f"   ➡️ HF_MODEL_ID: {env_vars.get('HF_MODEL_ID', 'Unknown')}")


🔹 Endpoint: huggingface-pytorch-inference-2025-01-31-00-17-37-220 is running model: Unknown
🔹 Endpoint: huggingface-pytorch-inference-2025-01-30-22-48-14-753 is running model: Unknown
🔹 Endpoint: huggingface-pytorch-inference-2025-01-30-22-46-50-835 is running model: Unknown


In [None]:
import boto3
import json

# Runtime client used for inference calls (distinct from the control-plane client)
sagemaker_runtime = boto3.client("sagemaker-runtime", region_name="us-east-1")

# Endpoint to query (pick one from the listing above)
endpoint_name = "huggingface-pytorch-inference-2025-01-30-22-48-14-753"

# Request body, serialized to JSON (adjust to the model task)
request_body = {"inputs": "The capital of France is [MASK]."}
payload = json.dumps(request_body)

# Send the request to the endpoint
response = sagemaker_runtime.invoke_endpoint(
    Body=payload,
    ContentType="application/json",
    EndpointName=endpoint_name,
)

# The body is a stream: read it, decode it, then parse the JSON
raw_body = response["Body"].read()
result = json.loads(raw_body.decode())
print("🔍 Model Response:", result)


🔍 Model Response: [{'score': 0.1426878571510315, 'token': 16766, 'token_str': 'marseille', 'sequence': 'the capital of france is marseille.'}, {'score': 0.09020467847585678, 'token': 25387, 'token_str': 'nantes', 'sequence': 'the capital of france is nantes.'}, {'score': 0.08808308094739914, 'token': 17209, 'token_str': 'toulouse', 'sequence': 'the capital of france is toulouse.'}, {'score': 0.08617904037237167, 'token': 3000, 'token_str': 'paris', 'sequence': 'the capital of france is paris.'}, {'score': 0.07720649987459183, 'token': 10241, 'token_str': 'lyon', 'sequence': 'the capital of france is lyon.'}]


In [None]:
!pip install bitsandbytes accelerate transformers

In [None]:
!pip install -U bitsandbytes

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub id of the checkpoint to load
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Change if needed

# Tokenizer first, then the weights
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Loading options: float16 weights, with every module mapped to the CPU
load_options = {
    "torch_dtype": torch.float16,
    "device_map": {"": "cpu"},
}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

print(f"✅ Model '{model_name}' instantiated successfully in CPU mode!")


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

✅ Model 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B' instantiated successfully in CPU mode!


In [None]:
import torch

# Define input prompt
input_text = "Write a Python function to calculate the factorial of a number."

# Tokenize input and place the tensors on the device the model lives on.
# The model is loaded with an explicit CPU device map, so sending the inputs to
# "cuda" whenever a GPU happens to be present would cause a device mismatch.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate response from the model (max_length counts prompt + generated tokens)
output = model.generate(**inputs, max_length=200)

# Decode the output
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("🔍 Model Output:\n", decoded_output)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [None]:
# from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
# from string import ascii_uppercase
# from random import choice

# def generate_random_label():
#     letters = ascii_uppercase
#     return ''.join(choice(letters) for i in range(10))

In [None]:
# image_uri = get_huggingface_llm_image_uri(
#   backend="huggingface",
#   region=aws_region,
#   version="1.1.0"
# )

# model_name = "MistralLite-" + generate_random_label()

In [None]:
# image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'

In [None]:
#  model_name

'MistralLite-DRJIANCKKB'

In [None]:
# hub_env = {
#   'HF_MODEL_ID': 'amazon/MistralLite',
#   'HF_TASK': 'text-generation',
#   'SM_NUM_GPUS': '1',
#   "MAX_INPUT_LENGTH": '16000',
#   "MAX_TOTAL_TOKENS": '16384',
#   "MAX_BATCH_PREFILL_TOKENS": '16384',
#   "MAX_BATCH_TOTAL_TOKENS":  '16384',
# }

In [None]:
# model = HuggingFaceModel(
#     name=model_name,
#     env=hub_env,
#     role=role,
#     image_uri=image_uri
# )

In [None]:
# predictor = model.deploy(
#   initial_instance_count=1,
#   instance_type="ml.g5.2xlarge",
#   endpoint_name=model_name,
# )

In [None]:
# List all SageMaker endpoints
endpoints = sagemaker_client.list_endpoints(MaxResults=10)

print("🔍 Active Endpoints:")
for ep in endpoints["Endpoints"]:
    print(f"🟢 {ep['EndpointName']} - Status: {ep['EndpointStatus']}")


🔍 Active Endpoints:


In [None]:
# Loop through endpoints and delete any with "huggingface" in the name
for ep in endpoints["Endpoints"]:
    endpoint_name = ep["EndpointName"]
    if "huggingface" in endpoint_name.lower():  # Only delete Hugging Face endpoints
        sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
        print(f"🛑 Deleted Endpoint: {endpoint_name}")


🛑 Deleted Endpoint: huggingface-pytorch-inference-2025-01-31-00-17-37-220
🛑 Deleted Endpoint: huggingface-pytorch-inference-2025-01-30-22-48-14-753
🛑 Deleted Endpoint: huggingface-pytorch-inference-2025-01-30-22-46-50-835


In [None]:
from sagemaker.huggingface.model import HuggingFaceModel
import sagemaker

# Container environment: a small DistilBERT checkpoint served for the fill-mask task
hub = dict(
    HF_MODEL_ID="distilbert-base-uncased",  # Lighter model for testing
    HF_TASK="fill-mask",  # Simple NLP task
)

# IAM role the endpoint runs under (manually set ARN)
role_arn = "arn:aws:iam::437552477415:role/sageMaker"

# Initialize SageMaker session
sess = sagemaker.Session()

# Describe the model: environment, role and the framework versions of the container
container_versions = {
    "transformers_version": "4.26",
    "pytorch_version": "1.13",
    "py_version": "py39",
}
huggingface_model = HuggingFaceModel(env=hub, role=role_arn, **container_versions)

# Deploy on a single small CPU instance
predictor = huggingface_model.deploy(
    instance_type="ml.m5.large",  # Lighter CPU instance
    initial_instance_count=1,
)

# Send one masked sentence through the endpoint as a smoke test
sample_request = {"inputs": "The capital of France is [MASK]."}
response = predictor.predict(sample_request)

print("Model Response:", response)


------!Model Response: [{'score': 0.1426878571510315, 'token': 16766, 'token_str': 'marseille', 'sequence': 'the capital of france is marseille.'}, {'score': 0.09020467847585678, 'token': 25387, 'token_str': 'nantes', 'sequence': 'the capital of france is nantes.'}, {'score': 0.08808308094739914, 'token': 17209, 'token_str': 'toulouse', 'sequence': 'the capital of france is toulouse.'}, {'score': 0.08617904037237167, 'token': 3000, 'token_str': 'paris', 'sequence': 'the capital of france is paris.'}, {'score': 0.07720649987459183, 'token': 10241, 'token_str': 'lyon', 'sequence': 'the capital of france is lyon.'}]


In [None]:
from sagemaker.huggingface.model import HuggingFaceModel
import sagemaker

# Define DeepSeek model configuration
hub = {
    "HF_MODEL_ID": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",  # Smaller model
    "HF_TASK": "text-generation"
}


# Use the manually set role ARN
role_arn = "arn:aws:iam::437552477415:role/sageMaker"

# Initialize SageMaker session
sess = sagemaker.Session()

# Create a Hugging Face model in SageMaker.
# This checkpoint uses the Qwen2 architecture, which transformers 4.26 does not
# know about, so the container must ship a release that includes it (4.37+).
# NOTE(review): bump further if the container still reports tokenizer or
# config errors for this checkpoint.
huggingface_model = HuggingFaceModel(
    env=hub,
    role=role_arn,
    transformers_version="4.37.0",
    pytorch_version="2.1.0",
    py_version="py310",
)

# Deploy the DeepSeek model on a GPU instance (adjust as needed).
# The startup health-check timeout is a deploy() argument, not a model
# argument; it is raised because the weights take a while to download and load.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.xlarge",  # Single-GPU instance
    container_startup_health_check_timeout=600,  # Allow up to 10 minutes
)

# Test the deployed DeepSeek model
response = predictor.predict({
    "inputs": "Explain reinforcement learning in simple terms."
})

print("🔍 Model Response:", response)

---------!