In [1]:
import json
import os
import tarfile
import time

import boto3
import sagemaker
from dotenv import load_dotenv
load_dotenv()

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/abhijitdeshpande/Library/Application Support/sagemaker/config.yaml


True

In [2]:
## Config
# Names and locations shared by the cells below.
model_name = 'flan-t5-finetuned'
endpoint_name = f"{model_name}-endpoint"  # also reused as the endpoint-config name
s3_model_path = 's3://sagemaker-us-east-1-720332985926/huggingface-pytorch-training-2025-06-28-15-47-21-142/output/model.tar.gz'
region = 'us-east-1'

# ROLE is read from the environment (load_dotenv() runs in the import cell) and is
# later passed as ExecutionRoleArn to create_model. Fail here with a clear message
# instead of letting a None value surface as a parameter-validation error there.
role = os.getenv("ROLE")
if not role:
    raise RuntimeError(
        "Environment variable ROLE is not set; define it (e.g. in .env) with the "
        "execution role ARN before running this notebook."
    )

In [5]:
## Initialize Clients
# `sm` is the SageMaker control-plane client (models, endpoint configs, endpoints);
# `runtime` is the client used to invoke the deployed endpoint.
sm, runtime = (
    boto3.client(service_name, region_name=region)
    for service_name in ('sagemaker', 'sagemaker-runtime')
)

In [6]:
## Hugging Face Inference Container

# Arguments that identify the inference image to look up.
hf_inference_image_spec = {
    "framework": 'huggingface',
    "region": region,
    "version": "4.49.0",
    "image_scope": 'inference',
    "base_framework_version": 'pytorch2.6.0',
    "instance_type": 'ml.m5.xlarge',
}

# Used as the "Image" of the model's primary container in the next cell.
image_uri = sagemaker.image_uris.retrieve(**hf_inference_image_spec)

In [7]:
## Create Model
print(f"Creating model: {model_name}")

# Container definition: inference image, model artifact in S3, and the
# HF_TASK environment variable passed to the container.
primary_container = {
    "Image": image_uri,
    "ModelDataUrl": s3_model_path,
    "Environment": {"HF_TASK": "text2text-generation"},
}

# Left as the last expression so the API response is shown as the cell output.
sm.create_model(
    ModelName=model_name,
    PrimaryContainer=primary_container,
    ExecutionRoleArn=role,
)

Creating model: flan-t5-finetuned


{'ModelArn': 'arn:aws:sagemaker:us-east-1:720332985926:model/flan-t5-finetuned',
 'ResponseMetadata': {'RequestId': 'e1bcf027-492b-4bb7-91ab-a5be80fb329e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e1bcf027-492b-4bb7-91ab-a5be80fb329e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '79',
   'date': 'Sun, 29 Jun 2025 04:29:09 GMT'},
  'RetryAttempts': 0}}

In [13]:
# s3 = boto3.client('s3')
# s3.put_object(Bucket='gen-ai-repository',Key='async-inference-output')

In [16]:
## Create endpoint Config
print(f"Creating endpoint config: {endpoint_name}")

# One variant backed by a single ml.m5.xlarge instance serving `model_name`.
all_traffic_variant = {
    "VariantName": "AllTraffic",
    "ModelName": model_name,
    "InstanceType": "ml.m5.xlarge",
    "InitialInstanceCount": 1,
}

# Asynchronous-inference settings: where outputs are written in S3 and the
# per-instance concurrency limit.
async_inference_settings = {
    "OutputConfig": {
        "S3OutputPath": "s3://gen-ai-repository/async-inference-output/",
    },
    "ClientConfig": {
        "MaxConcurrentInvocationsPerInstance": 1,
    },
}

# The config is named after the endpoint; the response is the cell output.
sm.create_endpoint_config(
    EndpointConfigName=endpoint_name,
    ProductionVariants=[all_traffic_variant],
    AsyncInferenceConfig=async_inference_settings,
)

Creating endpoint config: flan-t5-finetuned-endpoint


{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:720332985926:endpoint-config/flan-t5-finetuned-endpoint',
 'ResponseMetadata': {'RequestId': '585d31c9-ee6d-472f-981a-09806968d0c9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '585d31c9-ee6d-472f-981a-09806968d0c9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '107',
   'date': 'Sun, 29 Jun 2025 04:39:38 GMT'},
  'RetryAttempts': 0}}

In [17]:
## Creating Endpoint
print(f"Deploying endpoint: {endpoint_name}")

# The endpoint and its endpoint config share the same name in this notebook.
create_endpoint_args = {
    "EndpointName": endpoint_name,
    "EndpointConfigName": endpoint_name,
}
sm.create_endpoint(**create_endpoint_args)

Deploying endpoint: flan-t5-finetuned-endpoint


{'EndpointArn': 'arn:aws:sagemaker:us-east-1:720332985926:endpoint/flan-t5-finetuned-endpoint',
 'ResponseMetadata': {'RequestId': '55494e64-122f-4b23-8cc1-6d9bef88f603',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '55494e64-122f-4b23-8cc1-6d9bef88f603',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '94',
   'date': 'Sun, 29 Jun 2025 04:40:00 GMT'},
  'RetryAttempts': 0}}

In [18]:
## Wait for Endpoint to become InService......
# Poll describe_endpoint every 30 seconds until the endpoint is InService.
# Stops with an error if the endpoint reports Failed or if the overall wait
# exceeds the deadline, so the cell cannot spin forever.
max_wait_seconds = 30 * 60
wait_deadline = time.monotonic() + max_wait_seconds

while True:
    endpoint_description = sm.describe_endpoint(EndpointName=endpoint_name)
    status = endpoint_description["EndpointStatus"]
    if status =='InService':
        print(f"Endpoint is live: {endpoint_name}")
        break
    elif status =='Failed':
        # Surface the service-provided reason so the failure is actionable.
        failure_reason = endpoint_description.get("FailureReason", "no FailureReason reported")
        raise RuntimeError(f"Endpoint creation failed. Reason: {failure_reason}")
    elif time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"Endpoint {endpoint_name} not InService after {max_wait_seconds} seconds "
            f"(last status: {status})"
        )
    else:
        print(f"Still creating.....current status: {status}")
        time.sleep(30)

Still creating.....current status: Creating
Still creating.....current status: Creating
Still creating.....current status: Creating
Still creating.....current status: Creating
Still creating.....current status: Creating
Still creating.....current status: Creating
Still creating.....current status: Creating
Endpoint is live: flan-t5-finetuned-endpoint


In [19]:
prompt = "Classify the intent: what is the la's time zone"

In [24]:
response = runtime.invoke_endpoint_async(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Input=f'{{"inputs":"{prompt}"}}'.encode('utf-8')
)

In [None]:
result = response["Body"].read().decode().split('"')[3]

In [None]:
print(f"Model output: {result}")

In [25]:
def delete_endpoint(endpoint_name,model_name):
    """Delete the endpoint, then its endpoint config, then the model.

    Uses the notebook-level SageMaker client `sm`. The endpoint config is
    looked up under the same name as the endpoint. A confirmation line is
    printed after each delete call returns.

    Args:
        endpoint_name: Name of the endpoint and of its endpoint config.
        model_name: Name of the SageMaker model to delete.

    Returns:
        None.
    """
    # (client method, keyword arguments, confirmation message), in teardown order.
    teardown_steps = (
        ("delete_endpoint", {"EndpointName": endpoint_name}, f"Deleted endpoint: {endpoint_name}"),
        ("delete_endpoint_config", {"EndpointConfigName": endpoint_name}, f"Deleted endpoint config: {endpoint_name}"),
        ("delete_model", {"ModelName": model_name}, f"Deleted model: {model_name}"),
    )
    for method_name, call_kwargs, confirmation in teardown_steps:
        getattr(sm, method_name)(**call_kwargs)
        print(confirmation)

In [28]:
# delete_endpoint(endpoint_name, model_name)