In [None]:
import sagemaker
from sagemaker.transformer import Transformer
from sagemaker.model import Model

## Build and Deploy the Inference container

Because this embedding-extraction job depends on several additional libraries (OpenSlide, OpenCV, ...), we use a custom SageMaker model container. Build and push the image with:

`./build_and_push.sh wsi-embedding`

In [None]:
# Resolve the IAM role first so the Model definition below reads as plain
# configuration.
execution_role = sagemaker.get_execution_role()

# Describe the custom inference container as a model named "wsi-embeddings".
# NOTE(review): the `xxx` registry prefix in image_uri looks like a
# placeholder; confirm it is replaced with a real ECR registry before running.
model = Model(
    role=execution_role,
    image_uri="xxx.dkr.ecr.us-east-1.amazonaws.com/patch:latest",
    name="wsi-embeddings",
)
model.create()

In [None]:
import boto3
import json

def create_manifest_file(bucket_name, prefix="", manifest_key="manifest.jsonl"):
    """Write a JSON Lines manifest of the .svs slides found in an S3 bucket.

    Lists the objects in ``bucket_name`` under ``prefix``, keeps the keys that
    end in ``.svs`` (case-insensitive, so ``.SVS`` is included), and uploads a
    file with one JSON object per slide, ``{"source": "s3://<bucket>/<key>"}``,
    to ``manifest_key`` in the same bucket.

    Args:
        bucket_name: Name of the S3 bucket to scan. The manifest is written
            back to this same bucket.
        prefix: Key prefix passed to the listing call. The default ``""``
            scans the whole bucket.
        manifest_key: Object key the manifest is uploaded to. Defaults to
            ``"manifest.jsonl"`` (bucket root), which is the original
            hard-coded location.

    Returns:
        The ``s3://`` URI of the uploaded manifest.

    Warns:
        UserWarning: If no ``.svs`` object was found. The (empty) manifest is
            still uploaded and its URI returned, as before.
    """
    import warnings

    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')

    slide_uris = [
        f"s3://{bucket_name}/{obj['Key']}"
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
        # A page may have no 'Contents' key; treat that as an empty page.
        for obj in page.get('Contents', [])
        if obj['Key'].lower().endswith('.svs')
    ]

    if not slide_uris:
        # Surface the problem here rather than in a downstream job that would
        # be started on an empty manifest.
        warnings.warn(
            f"No .svs objects found under s3://{bucket_name}/{prefix}; "
            "uploading an empty manifest.",
            stacklevel=2,
        )

    # One JSON document per line, no trailing newline (unchanged format).
    manifest_content = "\n".join(
        json.dumps({"source": uri}) for uri in slide_uris
    )
    s3_client.put_object(
        Bucket=bucket_name,
        Key=manifest_key,
        Body=manifest_content.encode('utf-8'),
    )

    return f"s3://{bucket_name}/{manifest_key}"

# Scan the slide bucket and upload the manifest; keep the returned s3:// URI
# in `manifest_path`.
slide_bucket = "pathologybenchmark-s3bucket-u7pe00xtbplu"
manifest_path = create_manifest_file(bucket_name=slide_bucket)


In [None]:
# Environment variables handed to the inference container. Both are set to
# 3600, the same value as the invocation timeout passed to transform() below.
container_env = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT': '3600',
    'SAGEMAKER_SERVING_TIME': '3600',
}

# Transformer for the "wsi-embeddings" model: one GPU instance, one request at
# a time, one record per request.
# NOTE(review): output_path uses an `xxxx` bucket placeholder; confirm it is
# replaced before running.
transformer = Transformer(
    model_name="wsi-embeddings",
    base_transform_job_name="wsi-embeddings",
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    strategy="SingleRecord",
    max_concurrent_transforms=1,
    accept="application/x-embeddings",
    assemble_with="None",
    output_path="s3://xxxx/embeddings/",
    env=container_env,
)

# Start the job on the manifest object itself: it is read as JSON Lines and
# split per line, so each {"source": ...} entry becomes one record.
client_config = {'InvocationsTimeoutInSeconds': 3600}
transformer.transform(
    data=manifest_path,
    split_type="Line",
    content_type="application/jsonlines",
    model_client_config=client_config,
)
