This notebook allows you to test the retrieval API endpoints and workflow integration for the audio/video embeddings system. It demonstrates how to:

1. Upload video files to the S3 bucket for processing
2. Monitor Step Functions workflow executions
3. Test the retrieval API endpoints with different query parameters

In [None]:
import requests
import boto3
import os
import pandas as pd
import json
from PIL import Image as PILImage

# Region shared by every AWS client below; falls back to us-west-2 when
# AWS_DEFAULT_REGION is not set in the environment.
region = os.environ.get("AWS_DEFAULT_REGION", "us-west-2")

# Pin all clients to the same region so the S3 and Step Functions calls target the
# same region the SSM parameters are read from (previously only `ssm` was pinned,
# and the other two could resolve a different region or raise NoRegionError).
ssm = boto3.client(service_name="ssm", region_name=region)
s3_client = boto3.client("s3", region_name=region)
# NOTE: despite its name, `sns_client` is a Step Functions client; the name is kept
# because later cells reference it.
sns_client = boto3.client("stepfunctions", region_name=region)

# HTTP verb for the retrieval API (the request cells call requests.post directly).
api_method = "POST"

def get_ssm_parameter(name):
    """Return the value of the SSM parameter called ``name``.

    The lookup goes through the module-level ``ssm`` client and requests
    decryption (``WithDecryption=True``).
    """
    parameter = ssm.get_parameter(Name=name, WithDecryption=True)["Parameter"]
    return parameter["Value"]



In [None]:
# Resolve the deployment settings from SSM in one pass:
# retrieval API URL, S3 bucket name, and Step Functions state machine ARN.
api_url, bucket_name, state_machine_arn = map(
    get_ssm_parameter,
    (
        "/videopgvector/api_retrieve",
        "/videopgvector/bucket_name",
        "/videopgvector/state_machine_arn",
    ),
)

In [None]:
# Upload Video to Amazon S3 bucket
def upload_file_to_s3(video_path, bucket_name, s3_key):
    """Upload the local file ``video_path`` to ``bucket_name`` under ``s3_key``.

    Uses the module-level ``s3_client`` and prints a confirmation once the
    upload call returns. Returns ``None``.
    """
    s3_client.upload_file(Filename=video_path, Bucket=bucket_name, Key=s3_key)
    print("Upload successful!")

# Local directory prefix (note the trailing slash) where downloaded images are stored.
base_path = "images/"

def download_file(base_path, bucket, key, filename):
    """Download ``s3://bucket/key`` to the local path ``base_path + filename``.

    Args:
        base_path: Local path prefix; it is concatenated with ``filename`` as-is,
            so it should end with a path separator.
        bucket: Name of the S3 bucket.
        key: Object key inside the bucket.
        filename: File name appended to ``base_path``.

    Returns:
        ``True`` once the object has been written.

    Raises:
        Whatever ``s3_client.download_fileobj`` or the local file system raises.
    """
    target_path = base_path + filename
    s3_uri = "s3://{}/{}".format(bucket, key)
    print("Download file from {}".format(s3_uri))

    # Create the destination directory on first use; open() would otherwise fail
    # with FileNotFoundError on a fresh checkout.
    target_dir = os.path.dirname(target_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(target_path, "wb") as data:
        s3_client.download_fileobj(bucket, key, data)
    print("Downloaded {} to {}".format(s3_uri, target_path))
    return True

def read_image_from_s3(s3_key):
    """Fetch an S3 object and return its raw bytes.

    ``s3_key`` is a location such as ``s3://<bucket>/<key>``: the text after the
    last ``s3://`` is split at its first ``/`` into bucket name and object key.
    A location without a key part raises ``IndexError``. Errors from the S3 call
    are printed and then re-raised unchanged.
    """
    location = s3_key.rpartition('s3://')[2]
    pieces = location.split('/', 1)
    bucket_name, image_key = pieces[0], pieces[1]
    try:
        return s3_client.get_object(Bucket=bucket_name, Key=image_key)['Body'].read()
    except Exception as e:
        print(f'Error reading image from {s3_key}: {str(e)}')
        raise


In [None]:
video_path = "<your-video-path>"  # local path of the video file to upload
# Key the object by file name only, so directories in the local path (or a leading
# "/" from an absolute path) do not end up in the S3 key:
# "/data/clip.mp4" -> "video/clip.mp4" instead of "video//data/clip.mp4".
s3_key = f"video/{os.path.basename(video_path)}"
upload_file_to_s3(video_path, bucket_name, s3_key)

### Check the status of the Step Functions workflow processing your video

In [None]:
# Look up the state machine and show its headline fields. Ending the cell on an
# expression makes the notebook render the result instead of silently discarding it.
response = sns_client.describe_state_machine(stateMachineArn=state_machine_arn)
{field: response.get(field) for field in ("name", "status", "type", "creationDate")}

In [None]:
# Fetch up to 12 executions of the workflow; list_executions returns the most
# recent execution first.
response = sns_client.list_executions(
    stateMachineArn=state_machine_arn,
    maxResults=12
)
executions = response['executions']
if not executions:
    # Nothing has run yet, so there is no first entry to index into.
    print("No executions found for this state machine yet.")
# Display the latest execution, or nothing when the list is empty.
executions[0] if executions else None

In [None]:
# Payload for the "retrieve" method; "k" presumably caps the number of returned
# docs — confirm against the API.
request_body = {
    "query": "what is aurora",
    "method": "retrieve",
    "k": 10,
}

# Bound the wait so an unreachable endpoint fails the cell instead of hanging the kernel.
response = requests.post(api_url, json=request_body, timeout=120)
response

In [None]:
# Decode the reply body, then take its "docs" entry (None when the key is absent).
payload = response.json()
docs = payload.get("docs")

In [None]:
# One table row per doc: id, page_content, and every metadata field as its own column.
records = [
    {"id": doc.get("id"), "page_content": doc.get("page_content"), **doc.get("metadata")}
    for doc in docs
]
pd.DataFrame(records)

In [None]:
def _split_s3_uri(uri):
    """Split ``s3://bucket/some/key`` into ``("bucket", "some/key")``."""
    without_scheme = uri[len("s3://"):] if uri.startswith("s3://") else uri
    bucket, _, key = without_scheme.partition("/")
    return bucket, key.lstrip("/")


def show_row_image_text(docs):
    """Print text docs and download + display image docs.

    Each entry of ``docs`` is a dict whose ``"metadata"`` dict carries a
    ``"content_type"`` of ``"text"`` or ``"image"``; entries with any other
    content type (or without metadata) are skipped.

    * text: prints the row's ``"chunks"`` value and its metadata.
    * image: reads the S3 location from ``metadata["source"]``, downloads it to
      ``base_path + <file name>`` via ``download_file``, and displays it.

    Args:
        docs: Iterable of doc dicts as described above.
    """
    for row in docs:
        metadata = row.get("metadata") or {}
        content_type = metadata.get("content_type")
        if content_type == "text":
            # NOTE(review): the table cells in this notebook read "page_content",
            # not "chunks" — confirm which key the API actually returns.
            print (f"text:\n{row.get('chunks')}\nmetadata:{metadata}\n")
        elif content_type == "image":
            print(row)
            sourceurl = metadata.get('source')
            print(sourceurl)
            # Split on the first "/" after the scheme rather than str.replace(),
            # which would also strip the bucket name out of the key itself.
            bucket_name, key = _split_s3_uri(sourceurl)
            filename = sourceurl.split("/")[-1]
            print("bucket_name: ",bucket_name)
            print("key: ",key)
            print("filename: ",filename)
            download_file(base_path,bucket_name, key, filename)
            img = PILImage.open(base_path+filename)
            # Caption with the source location read above; the row itself is not
            # expected to carry a top-level "sourceurl" entry.
            print (f"Image:\n{sourceurl}\nmetadata:{metadata}\n")
            display(img)

In [None]:
# Print the text docs and download/display the image docs from the "retrieve" reply.
show_row_image_text(docs)

In [None]:
# Payload for the "retrieve_generate" method; "k" presumably caps the number of
# retrieved docs — confirm against the API.
request_body = {
    "query": "what is aurora",
    "method": "retrieve_generate",
    "k": 10,
}

# Bound the wait so an unreachable endpoint fails the cell instead of hanging the kernel.
response = requests.post(api_url, json=request_body, timeout=120)
response

In [None]:
# Decode the reply body, then take its "docs" entry (None when the key is absent).
payload = response.json()
docs = payload.get("docs")

In [None]:
# Show the "response" field of the JSON reply (presumably the generated answer — confirm).
response.json().get('response')

In [None]:
# Here the "docs" field is itself JSON text, so it is decoded a second time.
docs_field = response.json()["docs"]
value = json.loads(docs_field)


In [None]:
# One table row per decoded doc: id, page_content, and every metadata field as a column.
records = [
    {"id": doc.get("id"), "page_content": doc.get("page_content"), **doc.get("metadata")}
    for doc in value['docs']
]
pd.DataFrame(records)

In [None]:
# Print the text docs and download/display the image docs decoded into `value`.
show_row_image_text(value['docs'])