# Stable Video Diffusion XT 1.1 on Amazon SageMaker

Stability AI's [Stable Video Diffusion](https://stability.ai/stable-video) foundation model is a diffusion model that takes a single still image as a conditioning frame and generates a short, 4-second video with multiple frames.

This notebook walks you through invoking an [Asynchronous Inference Endpoint](https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference.html) on Amazon SageMaker using the SVD-XT-1.1 model by Stability AI.

![architecture diagram](architecture/async_inference.png)

## Import Packages and Set SageMaker Variables

Import all needed packages to make sure they are installed correctly.

In [None]:
import os
import json
import shutil

import boto3
from botocore.exceptions import ClientError

import sagemaker
from sagemaker.huggingface.model import HuggingFaceModel
from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig
from sagemaker.s3 import s3_path_join

from diffusers.utils import export_to_video, make_image_grid
import ffmpeg

In [None]:
# Name of the S3 bucket to use. Leave as None to fall back to the SageMaker
# session's default bucket (resolved just below).
sm_session_bucket = None

sm_session = sagemaker.Session()

if sm_session_bucket is None and sm_session is not None:
    # set to default bucket if a bucket name is not given
    sm_session_bucket = sm_session.default_bucket()

# Resolve the IAM role ARN. First ask the SageMaker SDK; if that raises
# ValueError, look up the role named "sagemaker_execution_role" through the
# IAM API and take its ARN instead.
try:
    sm_role = sagemaker.get_execution_role()
except ValueError:
    iam_client = boto3.client("iam")
    sm_role = iam_client.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

In [None]:
# Echo the resolved role, bucket and region so they can be checked before
# anything is uploaded or invoked.
print(f"sagemaker role arn: {sm_role}")
print(f"sagemaker bucket: {sm_session_bucket}")
print(f"sagemaker session region: {sm_session.boto_region_name}")

## Choosing Conditioning Images and Inference Parameters

### Method for Encoding Conditioning Image

Use this to embed a base64-encoded image from a local file. Otherwise, simply pass the `https://...` URL directly.

In [None]:
import json
import base64


def encode_image(image_path):
    """Return the contents of a local file as a base64 data URI string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        A string of the form ``data:text/plain;base64,<base64 payload>``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    # NOTE(review): the URI declares the media type as text/plain although the
    # payload is an image -- confirm how the endpoint parses the prefix before
    # changing it.
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return f"data:text/plain;base64,{encoded}"

def generate_request_payload(data):
    """Write a request payload to a JSON file and return the file's path.

    The file is created at ``tmp/request_payloads/<movie_title>.json``
    (relative to the current working directory); an existing file with the
    same name is overwritten.

    Args:
        data: JSON-serializable dict; must contain the key ``"movie_title"``,
            which determines the file name.

    Returns:
        The relative path of the JSON file that was written.
    """
    payload_dir = "tmp/request_payloads"
    os.makedirs(payload_dir, exist_ok=True)

    file_name = f"{payload_dir}/{data['movie_title']}.json"
    with open(file_name, "w") as payload_file:
        json.dump(data, payload_file)

    return file_name

### Set Movie Name and Inference Parameters

Select one of the sets of inference parameters below and run that cell, or create your own set of parameters. Each variation creates a different video.

You can use the `encode_image(path_to_local_file)` function to embed an image from a local file.

Alternatively, you can directly pass an `https://...` URL of a file available on the Internet, which the SageMaker endpoint downloads during inference.

In [None]:
# Payload "rocket_1": conditioning image embedded from a local file.
# Written to tmp/request_payloads/rocket_1.json.
generate_request_payload({
    "movie_title": "rocket_1",
    "image": encode_image("images_scaled/rocket.png"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 42,
})

In [None]:
# Payload "rocket_2": identical to "rocket_1" except that motion_bucket_id is
# 180 instead of 127. Written to tmp/request_payloads/rocket_2.json.
generate_request_payload({
    "movie_title": "rocket_2",
    "image": encode_image("images_scaled/rocket.png"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 180,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 42,
})

In [None]:
# Payload "smoke": the image is passed as an https URL instead of being
# embedded, with 576x1024 output and 50 inference steps.
# Written to tmp/request_payloads/smoke.json.
generate_request_payload({
    "movie_title": "smoke",
    "image": "https://raw.githubusercontent.com/aws-samples/sagemaker-hosted-stable-video-diffusion-img2vid-xt/main/images_scaled/smoke.jpg",
    "width": 576,
    "height": 1024,
    "num_frames": 25,
    "num_inference_steps": 50,
    "min_guidance_scale": 0.5,
    "max_guidance_scale": 1.0,
    "fps": 6,
    "motion_bucket_id": 25,
    "noise_aug_strength": 0.8,
    "decode_chunk_size": 8,
    "seed": 111142,
})

In [None]:
# Payload "colored_smoke": same parameters as the "smoke" payload, but with a
# different image URL. Written to tmp/request_payloads/colored_smoke.json.
generate_request_payload({
    "movie_title": "colored_smoke",
    "image": "https://raw.githubusercontent.com/aws-samples/sagemaker-hosted-stable-video-diffusion-img2vid-xt/main/images_scaled/colored_smoke.jpg",
    "width": 576,
    "height": 1024,
    "num_frames": 25,
    "num_inference_steps": 50,
    "min_guidance_scale": 0.5,
    "max_guidance_scale": 1.0,
    "fps": 6,
    "motion_bucket_id": 25,
    "noise_aug_strength": 0.8,
    "decode_chunk_size": 8,
    "seed": 111142,
})

In [None]:
# Payload "beach_bike_1": local image, seed 1234567890.
# Written to tmp/request_payloads/beach_bike_1.json.
generate_request_payload({
    "movie_title": "beach_bike_1",
    "image": encode_image("images_scaled/beach_bike.jpg"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 1234567890,
})

In [None]:
# Payload "beach_bike_2": identical to "beach_bike_1" except for the seed
# (123 instead of 1234567890). Written to tmp/request_payloads/beach_bike_2.json.
generate_request_payload({
    "movie_title": "beach_bike_2",
    "image": encode_image("images_scaled/beach_bike.jpg"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 123,
})

In [None]:
# Payload "waterfall": local image, 1024x576, seed 1234567890.
# Written to tmp/request_payloads/waterfall.json.
generate_request_payload({
    "movie_title": "waterfall",
    "image": encode_image("images_scaled/waterfall.jpg"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 1234567890,
})

In [None]:
# Payload "boat_ocean": local image, 1024x576, seed 42.
# Written to tmp/request_payloads/boat_ocean.json.
generate_request_payload({
    "movie_title": "boat_ocean",
    "image": encode_image("images_scaled/boat_ocean.jpg"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 42,
})

In [None]:
# Payload "red_car": local image, 1024x576, seed 42.
# Written to tmp/request_payloads/red_car.json, which is the file the upload
# cell further down selects.
generate_request_payload({
    "movie_title": "red_car",
    "image": encode_image("images_scaled/red_car.jpg"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 42,
})

In [None]:
# Payload "coffee_stream": local image, 576x1024 (width smaller than height),
# seed 42. Written to tmp/request_payloads/coffee_stream.json.
generate_request_payload({
    "movie_title": "coffee_stream",
    "image": encode_image("images_scaled/coffee_stream.jpg"),
    "width": 576,
    "height": 1024,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 42,
})

In [None]:
# Payload "koi": local image, 1024x576, seed 9288258982.
# Written to tmp/request_payloads/koi.json.
generate_request_payload({
    "movie_title": "koi",
    "image": encode_image("images_scaled/koi.jpg"),
    "width": 1024,
    "height": 576,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 9288258982,
})

In [None]:
# Payload "champagne2": local image, 576x1024 (width smaller than height),
# seed 42. Written to tmp/request_payloads/champagne2.json.
generate_request_payload({
    "movie_title": "champagne2",
    "image": encode_image("images_scaled/champagne2.jpg"),
    "width": 576,
    "height": 1024,
    "num_frames": 25,
    "num_inference_steps": 25,
    "min_guidance_scale": 1.0,
    "max_guidance_scale": 3.0,
    "fps": 6,
    "motion_bucket_id": 127,
    "noise_aug_strength": 0.02,
    "decode_chunk_size": 8,
    "seed": 42,
})

## Upload Request Payload and Invoke Endpoint

Upload the JSON request payload to Amazon S3 and invoke the endpoint for inference. Invocation time for a video with 25 inference steps is about 2 minutes.


In [None]:
def upload_file(input_location):
    """Upload a local request payload file to the session's S3 bucket.

    The file is uploaded to ``sm_session_bucket`` under the key prefix
    ``async_inference/input`` with content type ``application/json``.

    Args:
        input_location: Local path of the file to upload.

    Returns:
        Whatever ``sm_session.upload_data`` returns; later cells pass this
        value to the endpoint as the S3 input location.
    """
    upload_options = {
        "bucket": sm_session_bucket,
        "key_prefix": "async_inference/input",
        "extra_args": {"ContentType": "application/json"},
    }
    return sm_session.upload_data(input_location, **upload_options)

# select one of the previously generated request payload files
# (generate_request_payload writes them to tmp/request_payloads/<movie_title>.json)
file_name = "tmp/request_payloads/red_car.json"

# ... and upload it to S3; the returned location is what the endpoint is
# later given as its input
input_s3_location = upload_file(file_name)
print(f"Request payload location: {input_s3_location}")

## Invoke the deployed Amazon SageMaker Endpoint

If the model was previously deployed to an endpoint, set the `endpoint_name` variable.

In [None]:
# read endpoint name from local config file
# (strip() removes surrounding whitespace such as a trailing newline)
with open("deployed_endpoint_name.txt", "r") as f:
    endpoint_name = f.read().strip()

# or set manually
# endpoint_name = "<YOUR_MODEL_ENDPOINT_NAME>"

print(f"Using SageMaker endpoint: {endpoint_name}")

Invoke the model asynchronously using the uploaded request JSON file from S3.

In [None]:
# Client for the SageMaker runtime API, used to invoke the endpoint.
sm_runtime_client = boto3.client("sagemaker-runtime")

# Submit the uploaded request payload to the endpoint. The generated frames
# are not read from this response directly: get_output() below polls the S3
# locations named by the response's "OutputLocation" and "FailureLocation"
# entries.
invoke_response = sm_runtime_client.invoke_endpoint_async(
    EndpointName=endpoint_name,
    InputLocation=input_s3_location,
    InvocationTimeoutSeconds=3600,
)
print(f"Model invocation response payload: {invoke_response}")

### Wait and Poll for Model Response

Poll the Amazon S3 bucket for a response from the model invocation.


In [None]:
import urllib
import time


def _split_s3_uri(s3_uri):
    """Split an ``s3://bucket/key`` URI into a ``(bucket, key)`` tuple."""
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlparse

    parsed = urlparse(s3_uri)
    # The path starts with "/", which is not part of the object key.
    return parsed.netloc, parsed.path[1:]


def _read_s3_text_or_none(bucket, key):
    """Return the S3 object's content, or None while the key does not exist."""
    try:
        return sm_session.read_s3_file(bucket=bucket, key_prefix=key)
    except ClientError as err:
        if err.response["Error"]["Code"] == "NoSuchKey":
            return None
        raise


# function reference:
# https://github.com/aws/amazon-sagemaker-examples/blob/main/async-inference/Async-Inference-Walkthrough-SageMaker-Python-SDK.ipynb
def get_output(invoke_response, poll_interval=15, timeout=None):
    """Poll S3 until the asynchronous invocation has produced a result.

    Each polling round first tries to read the object at the response's
    ``OutputLocation``; if it does not exist yet, the object at
    ``FailureLocation`` is checked. If neither exists, the function sleeps
    and tries again.

    Args:
        invoke_response: Mapping with the keys ``"OutputLocation"`` and
            ``"FailureLocation"``, each an S3 URI (``s3://bucket/key``).
        poll_interval: Seconds to sleep between polling rounds.
        timeout: Maximum number of seconds to keep polling, or ``None``
            (the default) to poll until a result or a failure shows up.

    Returns:
        The content of the output object, or ``None`` if a failure object
        was found (its content is printed before returning).

    Raises:
        ClientError: If reading either location fails for any reason other
            than the key not existing yet.
        TimeoutError: If ``timeout`` is set and expires before an output or
            failure object appears.
    """
    bucket, key = _split_s3_uri(invoke_response["OutputLocation"])
    failure_bucket, failure_key = _split_s3_uri(invoke_response["FailureLocation"])

    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        output = _read_s3_text_or_none(bucket, key)
        if output is not None:
            return output

        failure = _read_s3_text_or_none(failure_bucket, failure_key)
        if failure is not None:
            print("Invocation failed:", failure)
            return None

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"No output or failure object appeared within {timeout} seconds"
            )

        print("Waiting for model output...")
        time.sleep(poll_interval)

In [None]:
%%time

output = get_output(invoke_response)

# get_output() returns None after reporting a failed invocation; slicing None
# would raise a TypeError, so only print a sample when there is output.
if output is None:
    print("No output to sample: the invocation failed.")
else:
    print(f"Sample of output: {output[:500]}")

## Frames to MP4 Video

### Frames to MP4 Video

Convert the binary objects in the response to JPEGs of each frame, then combine them into an MP4 using Hugging Face's `diffusers.utils.export_to_video` method.


In [None]:
import os
import base64
from PIL import Image
from diffusers.utils import export_to_video


def load_video_frames(video_frames):
    """Decode base64-encoded frames, save them to disk and load them as images.

    Each frame is written to ``tmp/frames_out/frame_NN.jpg`` (numbered from
    01, relative to the current working directory; existing files are
    overwritten) and then opened with PIL.

    Args:
        video_frames: Iterable of base64-encoded image strings, one per frame.

    Returns:
        A list of fully loaded PIL images, in the order of ``video_frames``.
    """
    frames_dir = "tmp/frames_out"
    os.makedirs(frames_dir, exist_ok=True)

    loaded_video_frames = []
    for frame_number, video_frame in enumerate(video_frames, start=1):
        frame_name = f"{frames_dir}/frame_{frame_number:02}.jpg"
        with open(frame_name, "wb") as fh:
            fh.write(base64.b64decode(video_frame))

        image = Image.open(frame_name)
        # Image.open() is lazy and keeps the file open. Reading the pixel data
        # now avoids holding one open file handle per frame and makes the
        # image independent of the file, which a later call overwrites.
        image.load()
        loaded_video_frames.append(image)

    return loaded_video_frames

In [None]:
output = get_output(invoke_response)
if output is None:
    # get_output() returns None after reporting a failed invocation. Stop with
    # a clear error instead of letting json.loads(None) raise a TypeError.
    raise RuntimeError("Model invocation failed; see the failure message printed above.")

# The response carries the encoded frames plus the configuration they were
# generated with; title and frame rate are taken from that configuration.
data = json.loads(output)
frames = data["frames"]
movie_title = data["config"]["movie_title"]
fps = data["config"]["fps"]

loaded_video_frames = load_video_frames(frames)

os.makedirs("tmp/video_out", exist_ok=True)
export_to_video(loaded_video_frames, f"tmp/video_out/{movie_title}.mp4", fps=fps)
print(f"Video created: {movie_title}.mp4")

### Display Frames as Grid

Display the 25 frames as a 5x5 grid using Hugging Face's `diffusers.utils.make_image_grid` method.


In [None]:
from diffusers.utils import make_image_grid

# Arrange the loaded frames in a 5 x 5 grid.
# NOTE(review): presumably this needs exactly 25 frames (num_frames in the
# request payloads) -- confirm before changing num_frames.
image = make_image_grid(loaded_video_frames, 5, 5)
# Scale the grid to half its width and height for display.
(width, height) = (image.width // 2, image.height // 2)
im_resized = image.resize((width, height))
# `display` is not imported in this notebook; presumably it is the helper
# that the notebook environment provides.
display(im_resized)
# Also keep a copy of the resized grid in the working directory.
im_resized.save("frames.png")

### Display Video

Convert video CODEC to H.264 and display in notebook at 50% of actual size.


In [None]:
import ffmpeg
from IPython.display import Video

# Options passed as keyword arguments to the ffmpeg output below; "vcodec"
# selects libx264 for the H.264 conversion described above.
output_options = {
    "crf": 20,
    "preset": "slower",
    "movflags": "faststart",
    "pix_fmt": "yuv420p",
    "vcodec": "libx264",
}

# Re-encode the exported video into tmp/video_out/tmp.mp4, replacing any
# previous tmp.mp4.
ffmpeg.input(f"tmp/video_out/{movie_title}.mp4").output("tmp/video_out/tmp.mp4", **output_options).run(overwrite_output=True, quiet=True)

# Show the re-encoded file at half the width of the first frame. As the last
# expression of the cell, the Video object is what the notebook renders.
Video(
    url="tmp/video_out/tmp.mp4",
    width=(loaded_video_frames[0].width // 2),
    html_attributes="controls muted autoplay loop",
)

## Generating Multiple Video Variations

Generate multiple video variations by combining the above code in a loop. In this example we create three variations, changing the seed each time.


In [None]:
import random
import json
from diffusers.utils import export_to_video

sm_runtime_client = boto3.client("sagemaker-runtime")

# Number of variations to generate; each one uses a fresh random seed.
num_variations = 3

# The conditioning image and the output directory are the same for every
# variation, so prepare them once instead of on every iteration.
conditioning_image = encode_image("images_scaled/red_car.jpg")
os.makedirs("tmp/video_out", exist_ok=True)

for _ in range(num_variations):
    seed = random.randrange(1, 9999999999)
    movie_title = f"red_car_{seed}"
    request_payload = {
        "movie_title": movie_title,
        "image": conditioning_image,
        "width": 1024,
        "height": 576,
        "num_frames": 25,
        "num_inference_steps": 25,
        "min_guidance_scale": 1.0,
        "max_guidance_scale": 3.0,
        "fps": 6,
        "motion_bucket_id": 127,
        "noise_aug_strength": 0.02,
        "decode_chunk_size": 8,
        "seed": seed,
    }
    file_name = generate_request_payload(request_payload)
    input_s3_location = upload_file(file_name)

    response = sm_runtime_client.invoke_endpoint_async(
        EndpointName=endpoint_name,
        InputLocation=input_s3_location,
        InvocationTimeoutSeconds=3600,
    )

    # Wait for this invocation to finish before starting the next one.
    output = get_output(response)
    if output is None:
        # get_output() returns None after reporting a failed invocation; skip
        # this variation rather than crash in json.loads(None).
        print(f"Skipping {movie_title}: invocation failed.")
        continue

    data = json.loads(output)
    loaded_video_frames = load_video_frames(data["frames"])

    # Export at the frame rate that was requested for this variation.
    export_to_video(
        loaded_video_frames,
        f"tmp/video_out/{movie_title}.mp4",
        fps=request_payload["fps"],
    )
    print(f"Video created: {movie_title}.mp4")

## Cleanup

Once you are done with your experiments, consider deleting the SageMaker endpoint by uncommenting and running the code below.

In [None]:
# client_sm = boto3.client("sagemaker")
# client_sm.delete_endpoint(EndpointName=endpoint_name)