# Nova Video Understanding

In this notebook we will be using Amazon Nova Pro's video understanding capability to analyze the video.

Ensure you have the latest version of boto3 installed and that Bedrock model access is enabled for the Amazon Nova models.

In [None]:
!pip install botocore --upgrade
!pip install boto3 --upgrade

In [None]:
!pip show boto3

In [None]:
import boto3 
import sagemaker

# Create a SageMaker session and ask it for its default bucket.
# NOTE(review): later cells reassign `bucket` to a "<<bucket>>" placeholder and hard-code
# "nova_video/" in their S3 URIs instead of reading `bucket` / `prefix` from here.
sess = sagemaker.Session()
bucket = sess.default_bucket() # Set a default S3 bucket
# Key prefix under which the notebook's videos are expected (matches the URIs used below).
prefix = 'nova_video'

# Experiment 1B - Combined video

In [None]:
import json
import boto3
import sagemaker


# Placeholder: replace <<bucket>> with the name of the S3 bucket that holds the video.
bucket = "<<bucket>>" # Set a default S3 bucket


# AWS Configuration
# Account ID of the credentials running this notebook (looked up through STS); it is used
# both in the inference-profile ARN and as the bucket owner in the request.
AWS_ACCOUNT = boto3.client('sts').get_caller_identity().get('Account')
AWS_REGION = "us-east-1"
INFERENCE_PROFILE = f"arn:aws:bedrock:{AWS_REGION}:{AWS_ACCOUNT}:inference-profile/us.amazon.nova-pro-v1:0"
S3_BUCKET = bucket

# Video Configuration
VIDEO_FILE = "combined_videos.mov"
# Build the URI from the values above so the bucket name is entered in exactly one place
# and the "/" between bucket and key cannot be dropped.
s3url = f"s3://{S3_BUCKET}/nova_video/{VIDEO_FILE}"

def parse_llm_response(json_string):
    """
    Decode a model response body and pull out its text and token counts.

    Args:
        json_string (str): JSON document returned by the model invocation

    Returns:
        dict: {"content": [text, ...], "tokens": {...}} on success, where "tokens" is
              empty when the response has no "usage" entry; {"error": message} when the
              input is not valid JSON or any other exception is raised while reading it
    """
    try:
        payload = json.loads(json_string)

        # Collect the "text" of every content part under output -> message -> content.
        texts = []
        if "output" in payload and "message" in payload["output"]:
            message = payload["output"]["message"]
            if "content" in message:
                texts = [part["text"] for part in message["content"] if "text" in part]

        # Token counters are optional; a missing counter is reported as 0.
        token_summary = {}
        if "usage" in payload:
            usage = payload["usage"]
            token_summary = {
                "input_tokens": usage.get("inputTokens", 0),
                "output_tokens": usage.get("outputTokens", 0),
                "total_tokens": usage.get("totalTokens", 0),
            }

        return {"content": texts, "tokens": token_summary}
    except json.JSONDecodeError:
        return {"error": "Invalid JSON string"}
    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

# Prompt Configuration
# `system_prompt` is left empty; the system text actually sent is defined with the messages.
system_prompt = ""
# Instructions sent together with the video: a four-part safety analysis followed by a
# request for a driver score and feedback.
user_prompt = """    As an expert traffic safety analyst, analyze the provided traffic video footage showing 3 simultaneous camera angles (Left, Center, Right views) and:

1. Safe Driving Behaviors (analyze across all 3 frames):

* Identify and highlight instances of proper following distance maintenance from multiple angles
* Note correct use of turn signals and lane changes as visible in different views
* Recognize appropriate speed adjustments for conditions with comprehensive perspective
* Point out defensive driving techniques visible across camera angles
* Correlate behaviors visible across multiple frames for complete assessment

2. Driver Safety Practices (focus on interior/center frame):

* Detect proper seat belt usage
* Observe correct hand positioning on steering wheel
* Note appropriate mirror checks during maneuvers
* Identify distraction-free driving behaviors. For example not using cellphone or eating
* Compare driver actions with external views for context

3. Critical Analysis (synthesizing all 3 views):

* Timestamp each identified safe driving behavior noting which frame(s) displayed the action
* Rate the effectiveness of each observed safety practice (1-5 scale) considering multi-angle visibility
* Provide specific commentary on what makes each highlighted action exemplary with reference to relevant camera angles
* Note how these actions contributed to accident prevention based on comprehensive view
* Highlight instances where multiple camera angles provided better safety assessment

4. Multi-View Integration:

* Note any safety behaviors that were only visible in specific frames
* Identify how different angles complement each other for complete safety analysis
* Compare and contrast safety observations between frames
* Highlight the value of multiple perspectives in assessing traffic safety

Generate driver score and provide recommendation/feedback for improvement
"""

# Message Configuration
# One user turn carrying the video (referenced by S3 location) followed by the text prompt.
user_message_list = [
    {
        "role": "user",
        "content": [
            {
                "video": {
                    # The declared format has to describe the object at `s3url`; take it
                    # from the URI's extension ("mov" for combined_videos.mov) instead of
                    # hard-coding "mp4".
                    "format": s3url.rsplit(".", 1)[-1].lower(),
                    "source": {
                        "s3Location": {
                            "uri": s3url,
                            "bucketOwner": AWS_ACCOUNT
                        }
                    }
                }
            },
            {
                "text": user_prompt
            }
        ]
    }
]

# System text sent with the request.
system_message_list = [
    {
        "text": "You are an expert traffic safety analyst."
    }
]

# Request body for invoke_model: response capped at 1024 new tokens, with low top_p and
# temperature values.
body = {
    "schemaVersion": "messages-v1",
    "system": system_message_list,
    "messages": user_message_list,
    "inferenceConfig": {"max_new_tokens": 1024, "top_p": 0.1, "temperature": 0.1},
}

# Invoke Bedrock model
try:
    bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
    invocation = bedrock_runtime.invoke_model(
        body=json.dumps(body),
        modelId=INFERENCE_PROFILE,
        accept="application/json",
        contentType="application/json"
    )

    # Process response
    response_body = invocation["body"].read().decode('utf-8')
    print("Raw response:", response_body)

    # Parse the response using our parser
    parsed_response = parse_llm_response(response_body)

    if "error" in parsed_response:
        # The parser reports failures as {"error": ...} with no "content"/"tokens" keys;
        # show that message instead of failing on a missing key below.
        print(f"Could not parse model response: {parsed_response['error']}")
    else:
        # Print formatted results
        print("\nParsed Output Content:")
        for content in parsed_response["content"]:
            print(content)
            print("\n---\n")

        print("Token Information:")
        for key, value in parsed_response["tokens"].items():
            print(f"{key}: {value}")

except Exception as e:
    # Any failure while creating the client, invoking the model or reading the body.
    print(f"An error occurred: {str(e)}")


In [None]:
import json
import boto3
import sagemaker


# Placeholder: replace <<bucket>> with the name of the S3 bucket that holds the video.
bucket = "<<bucket>>" # Set a default S3 bucket


# AWS Configuration
# Account ID of the credentials running this notebook, looked up through STS.
AWS_ACCOUNT = boto3.client('sts').get_caller_identity().get('Account')
AWS_REGION = "us-east-1"
# ARN of the Nova Pro inference profile in this account and region; passed as modelId below.
INFERENCE_PROFILE = f"arn:aws:bedrock:{AWS_REGION}:{AWS_ACCOUNT}:inference-profile/us.amazon.nova-pro-v1:0"
S3_BUCKET = bucket

# Video Configuration
# NOTE(review): S3_BUCKET and VIDEO_FILE are not read by the rest of this cell; `s3url`
# repeats the bucket placeholder and file name literally, so keep all three in sync.
VIDEO_FILE = "combined_videos.mov"
s3url = "s3://<<bucket>>/nova_video/combined_videos.mov"

def parse_llm_response(json_string):
    """
    Decode a model response body and pull out its text and token counts.

    Args:
        json_string (str): JSON document returned by the model invocation

    Returns:
        dict: {"content": [text, ...], "tokens": {...}} on success, where "tokens" is
              empty when the response has no "usage" entry; {"error": message} when the
              input is not valid JSON or any other exception is raised while reading it
    """
    try:
        payload = json.loads(json_string)

        # Collect the "text" of every content part under output -> message -> content.
        texts = []
        if "output" in payload and "message" in payload["output"]:
            message = payload["output"]["message"]
            if "content" in message:
                texts = [part["text"] for part in message["content"] if "text" in part]

        # Token counters are optional; a missing counter is reported as 0.
        token_summary = {}
        if "usage" in payload:
            usage = payload["usage"]
            token_summary = {
                "input_tokens": usage.get("inputTokens", 0),
                "output_tokens": usage.get("outputTokens", 0),
                "total_tokens": usage.get("totalTokens", 0),
            }

        return {"content": texts, "tokens": token_summary}
    except json.JSONDecodeError:
        return {"error": "Invalid JSON string"}
    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

# Prompt Configuration
# `system_prompt` is left empty; the system text actually sent is defined with the messages.
system_prompt = ""
# Question-style prompt: pedestrians, pedestrian risk, driving safety and driver distraction.
user_prompt = """    As an expert traffic safety analyst, analyze the provided traffic video footage showing 3 simultaneous camera angles and:

Analyze the provided video footage and answer the following:
Are there any pedestrians visible? If yes, describe their location, actions, and appearance.
Are any pedestrians at risk due to the driver's behavior? Explain why or why not.
Is the driver driving safely? Provide specific examples of unsafe or safe behaviors.
Is the driver distracted? If so, describe how and when.
"""

# Message Configuration
# One user turn carrying the video (referenced by S3 location) followed by the text prompt.
user_message_list = [
    {
        "role": "user",
        "content": [
            {
                "video": {
                    # Matches the .mov extension of the object named in `s3url`.
                    "format": "mov",
                    "source": {
                        "s3Location": {
                            "uri": s3url,
                            # Account ID obtained from STS in the configuration above.
                            "bucketOwner": AWS_ACCOUNT
                        }
                    }
                }
            },
            {
                "text": user_prompt
            }
        ]
    }
]

# System text sent with the request (the empty `system_prompt` variable is not used here).
system_message_list = [
    {
        "text": "You are an expert traffic safety analyst."
    }
]

# Request body for invoke_model: response capped at 1024 new tokens, with low top_p and
# temperature values.
body = {
    "schemaVersion": "messages-v1",
    "system": system_message_list,
    "messages": user_message_list,
    "inferenceConfig": {"max_new_tokens": 1024, "top_p": 0.1, "temperature": 0.1},
}

# Invoke Bedrock model
try:
    bedrock_runtime = boto3.client("bedrock-runtime", region_name=AWS_REGION)
    invocation = bedrock_runtime.invoke_model(
        body=json.dumps(body),
        modelId=INFERENCE_PROFILE,
        accept="application/json",
        contentType="application/json"
    )

    # Process response
    response_body = invocation["body"].read().decode('utf-8')
    print("Raw response:", response_body)

    # Parse the response using our parser
    parsed_response = parse_llm_response(response_body)

    if "error" in parsed_response:
        # The parser reports failures as {"error": ...} with no "content"/"tokens" keys;
        # show that message instead of failing on a missing key below.
        print(f"Could not parse model response: {parsed_response['error']}")
    else:
        # Print formatted results
        print("\nParsed Output Content:")
        for content in parsed_response["content"]:
            print(content)
            print("\n---\n")

        print("Token Information:")
        for key, value in parsed_response["tokens"].items():
            print(f"{key}: {value}")

except Exception as e:
    # Any failure while creating the client, invoking the model or reading the body.
    print(f"An error occurred: {str(e)}")


# Option 1A: Call Nova for each video and then do a final summary

In [None]:
import json
import boto3

def _infer_video_format(video_url, default="mp4"):
    """Return the request "format" value implied by the file extension of ``video_url``."""
    file_name = video_url.rsplit("/", 1)[-1]
    _, dot, extension = file_name.rpartition(".")
    if not dot or not extension:
        # No usable extension: keep the value this notebook used before.
        return default
    extension = extension.lower()
    # NOTE(review): the Nova request schema is understood to spell the 3GP container
    # "three_gp"; every other extension is passed through unchanged — confirm against docs.
    return {"3gp": "three_gp"}.get(extension, extension)


def analyze_traffic_video(video_url, aws_region="us-east-1", bucket_owner=None, video_format=None):
    """
    Analyzes traffic video for safety behaviors using AWS Bedrock.

    Args:
        video_url (str): S3 URL of the video to analyze
        aws_region (str): AWS region (default: "us-east-1")
        bucket_owner (str): AWS account ID of bucket owner (optional; defaults to the
            account of the calling credentials)
        video_format (str): Value for the request's video "format" field (optional;
            derived from the extension of ``video_url`` when omitted)

    Returns:
        dict: Parsed response containing analysis results and token information, or
              {"error": message} if anything fails (no exception is propagated)
    """
    try:
        # The inference profile lives in the caller's account, so its ARN is built from
        # the STS identity rather than from the bucket owner, which may be another account.
        caller_account = boto3.client('sts').get_caller_identity().get('Account')
        if not bucket_owner:
            bucket_owner = caller_account

        # Configure inference profile
        inference_profile = f"arn:aws:bedrock:{aws_region}:{caller_account}:inference-profile/us.amazon.nova-pro-v1:0"

        # Declare the container that is actually being sent (the callers pass .mov files).
        if not video_format:
            video_format = _infer_video_format(video_url)

        # Define prompts
        user_prompt = """As an expert traffic safety analyst, analyze the provided traffic video footage showing 3 simultaneous camera angles (Left, Center, Right views) and:

1. Safe Driving Behaviors (analyze across all 3 frames):
* Identify and highlight instances of proper following distance maintenance from multiple angles
* Note correct use of turn signals and lane changes as visible in different views
* Recognize appropriate speed adjustments for conditions with comprehensive perspective
* Point out defensive driving techniques visible across camera angles
* Correlate behaviors visible across multiple frames for complete assessment

2. Driver Safety Practices (focus on interior/center frame):
* Detect proper seat belt usage
* Observe correct hand positioning on steering wheel
* Note appropriate mirror checks during maneuvers
* Identify distraction-free driving behaviors
* Compare driver actions with external views for context

3. Critical Analysis (synthesizing all 3 views):
* Timestamp each identified safe driving behavior noting which frame(s) displayed the action
* Rate the effectiveness of each observed safety practice (1-5 scale)
* Provide specific commentary on what makes each highlighted action exemplary
* Note how these actions contributed to accident prevention
* Highlight instances where multiple camera angles provided better safety assessment

4. Multi-View Integration:
* Note any safety behaviors that were only visible in specific frames
* Identify how different angles complement each other
* Compare and contrast safety observations between frames
* Highlight the value of multiple perspectives

Generate driver score and provide recommendation/feedback for improvement"""

        # Configure messages
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "video": {
                            "format": video_format,
                            "source": {
                                "s3Location": {
                                    "uri": video_url,
                                    "bucketOwner": bucket_owner
                                }
                            }
                        }
                    },
                    {
                        "text": user_prompt
                    }
                ]
            }
        ]

        system_messages = [
            {
                "text": "You are an expert traffic safety analyst."
            }
        ]

        # Prepare request body
        request_body = {
            "schemaVersion": "messages-v1",
            "system": system_messages,
            "messages": messages,
            "inferenceConfig": {
                "max_new_tokens": 1024,
                "top_p": 0.1,
                "temperature": 0.1
            }
        }

        # Initialize Bedrock client and invoke model
        bedrock_runtime = boto3.client("bedrock-runtime", region_name=aws_region)
        response = bedrock_runtime.invoke_model(
            body=json.dumps(request_body),
            modelId=inference_profile,
            accept="application/json",
            contentType="application/json"
        )

        # Parse and process response
        response_body = response["body"].read().decode('utf-8')
        return parse_llm_response(response_body)

    except Exception as e:
        return {"error": f"Analysis failed: {str(e)}"}

def parse_llm_response(json_string):
    """
    Turn the model's JSON response into text content plus token counts.

    Args:
        json_string (str): JSON response from the model

    Returns:
        dict: {"content": [text, ...], "tokens": {...}} on success ("tokens" is empty
              when the response has no "usage" entry), or {"error": message} when the
              input is not valid JSON or reading it raises any other exception
    """
    try:
        payload = json.loads(json_string)

        # Keep the "text" of every content part under output -> message -> content.
        texts = []
        if "output" in payload and "message" in payload["output"]:
            message = payload["output"]["message"]
            if "content" in message:
                texts = [part["text"] for part in message["content"] if "text" in part]

        # A counter missing from "usage" is reported as 0.
        token_summary = {}
        if "usage" in payload:
            usage = payload["usage"]
            token_summary = {
                "input_tokens": usage.get("inputTokens", 0),
                "output_tokens": usage.get("outputTokens", 0),
                "total_tokens": usage.get("totalTokens", 0),
            }

        return {"content": texts, "tokens": token_summary}
    except json.JSONDecodeError:
        return {"error": "Invalid JSON response"}
    except Exception as e:
        return {"error": f"Parsing error: {str(e)}"}


In [None]:
def format_analysis_output(response_dict):
    """
    Render an analysis response as a plain-text report.

    The first entry of ``response_dict['content']`` is split on blank lines; a paragraph
    that starts with one of the known headings ("1. Safe", "2. Driver", "3. Critical",
    "4. Multi", "Driver score") is emitted under a report heading, and every other
    paragraph is left out. Token counts are appended when a 'tokens' entry is present.

    Args:
        response_dict (dict): The response dictionary containing content and tokens

    Returns:
        str: Formatted analysis text, or "No content available" when there is no content
    """
    if not ('content' in response_dict and response_dict['content']):
        return "No content available"

    underline = "=" * 20 + "\n"

    # Paragraph prefix -> heading for the sections rendered as bullet lists.
    bullet_headings = (
        ('1. Safe', "SAFE DRIVING BEHAVIORS\n"),
        ('2. Driver', "\nDRIVER SAFETY PRACTICES\n"),
        ('3. Critical', "\nCRITICAL ANALYSIS\n"),
        ('4. Multi', "\nMULTI-VIEW INTEGRATION\n"),
    )

    pieces = ["""
TRAFFIC VIDEO ANALYSIS REPORT
============================

"""]

    for paragraph in response_dict['content'][0].split('\n\n'):
        heading = next(
            (title for prefix, title in bullet_headings if paragraph.startswith(prefix)),
            None,
        )
        if heading is not None:
            pieces.append(heading + underline)
            # Everything before the first "- " is the paragraph's own title line; drop it.
            pieces.extend(f"• {entry}\n" for entry in paragraph.split('- ')[1:])
        elif paragraph.startswith('Driver score'):
            pieces.append("\nFINAL ASSESSMENT\n" + underline)
            pieces.extend(f"{line}\n" for line in paragraph.split('\n'))

    # Add token information
    if 'tokens' in response_dict:
        pieces.append("\nTOKEN USAGE\n" + underline)
        pieces.extend(
            f"{name.replace('_', ' ').title()}: {count}\n"
            for name, count in response_dict['tokens'].items()
        )

    return "".join(pieces)

In [None]:
# Analyze the inward-facing clip. analyze_traffic_video reports failures as {"error": ...},
# so stop with that reason instead of an opaque KeyError/IndexError on ['content'][0].
infacing=analyze_traffic_video("s3://<<bucket>>/nova_video/infacing_dc.mov")
if "error" in infacing or not infacing.get("content"):
    raise RuntimeError(f"Inward-facing video analysis failed: {infacing}")
infacingresponse=infacing['content'][0]
print(infacingresponse)

In [None]:
# Analyze the front-facing clip. analyze_traffic_video reports failures as {"error": ...},
# so stop with that reason instead of an opaque KeyError/IndexError on ['content'][0].
frontfacing=analyze_traffic_video("s3://<<bucket>>/nova_video/frontfacing_dc.mov")
print(frontfacing)
if "error" in frontfacing or not frontfacing.get("content"):
    raise RuntimeError(f"Front-facing video analysis failed: {frontfacing}")
frontfacingresponse=frontfacing['content'][0]
print(frontfacingresponse)

In [None]:
# Analyze the side-facing clip. analyze_traffic_video reports failures as {"error": ...},
# so stop with that reason instead of an opaque KeyError/IndexError on ['content'][0].
sidefacing=analyze_traffic_video("s3://<<bucket>>/nova_video/sidefacing_dc.mov")
print(sidefacing)
if "error" in sidefacing or not sidefacing.get("content"):
    raise RuntimeError(f"Side-facing video analysis failed: {sidefacing}")
sidefacingresponse=sidefacing['content'][0]
print(sidefacingresponse)

# Option 1C - Process individual videos with Amazon Nova Pro and then summarize with Claude Sonnet

In [None]:
import boto3

# Bedrock runtime client used for the summarization call.
# NOTE(review): no region_name is passed here, unlike the Nova cells which pass one
# explicitly; presumably the region comes from the environment's boto3 configuration — confirm.
bedrock_client = boto3.client('bedrock-runtime')
# Model identifier passed to the Converse call for the summary step.
model_id_sonnet37 ="us.anthropic.claude-3-7-sonnet-20250219-v1:0"
def summarize(client, inface, side, front, model_id, max_tokens=4096, temperature=0, top_p=0.9):
    """
    Ask a model to merge the three per-camera analyses into one final recommendation.

    Args:
        client: Object exposing ``converse(modelId=..., messages=..., inferenceConfig=...)``,
            e.g. a boto3 "bedrock-runtime" client
        inface (str): Analysis text for the inward-facing camera
        side (str): Analysis text for the side camera
        front (str): Analysis text for the front camera
        model_id (str): Model or inference-profile identifier to invoke
        max_tokens (int): Upper bound on generated tokens (default: 4096)
        temperature (float): Sampling temperature (default: 0)
        top_p (float): Nucleus sampling value (default: 0.9)

    Returns:
        str: The text of the model's reply, "Model invocation error" if the call itself
             failed, or "Output parsing error" if the reply held no readable text.
             Exceptions are printed, not raised.
    """
    prompt = (
        " You are safety expert for safe driving. Analyze the summary from individual camera feed and \n"
        "    generate the final recommendation and summary \n"
        f"    <inward_facing_camera>{inface}</inward_facing_camera>"
        f" <side_camera>{side} </side_camera>"
        f" <front_camera>{front}</front_camera>"
    )

    try:
        # Use the client and model the caller passed in, not notebook globals.
        response = client.converse(
            modelId=model_id,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "text": prompt
                        }
                    ]
                }
            ],
            inferenceConfig={
                "temperature": temperature,
                "maxTokens": max_tokens,
                "topP": top_p
            }
        )
    except Exception as e:
        # Return here: there is no response to parse, and falling through would report
        # the failure as a parsing problem.
        print(e)
        return "Model invocation error"

    try:
        # Return the first text block of the reply.
        for block in response['output']['message']['content']:
            if 'text' in block:
                return block['text']
        print("Model response contained no text block")
    except Exception as e:
        print(e)
    return "Output parsing error"

In [None]:
# Combine the three per-camera analyses into one final report. The *response variables are
# produced by the per-video cells above, so those cells must have run successfully first.
summary=summarize(bedrock_client,infacingresponse,sidefacingresponse,frontfacingresponse,model_id_sonnet37)
print(summary)