# Physical AI-Driven Action Sequence Analysis and MDP Modeling Using NVIDIA Cosmos Reason

---
# Part A: Video Recording

1) Only use one hand and manipulate one object at a time.
2) Do not try to follow the same order as other people! Each person can choose a
different sequence of actions to complete the task.
3) Try to make your hand gestures obvious when grasping or releasing an object.

---
# Part B: Object and Human Action Recognition


## Import Libraries and Suppress Warnings

In [9]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
warnings.filterwarnings("ignore", message=".*video metadata.*")

from pathlib import Path
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import glob
import json
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import json
from datetime import datetime
import cv2

In [12]:
def get_video_frame_count(video_path, max_frames=500, default_frames=60):
    """
    Work out how many frames to sample from a video, using OpenCV metadata.

    The container-reported frame count is reduced by one and capped at
    ``max_frames``. A one-line summary (frames, FPS, duration) is printed for
    every video that could be opened.

    Args:
        video_path: path to video
        max_frames: upper bound on the returned count
        default_frames: value returned when the video cannot be opened or
            reports fewer than two frames

    Returns:
        int: ``min(total_frames - 1, max_frames)``, or ``default_frames`` when
        no usable frame count is available
    """

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        print(f"Warning: could not open video {video_path}")
        return default_frames

    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
    finally:
        # Always hand the capture back, even if reading a property fails.
        cap.release()

    # Guard against containers that report an FPS of zero.
    duration = frame_count / fps if fps > 0 else 0

    print(f" Video: {Path(video_path).name} | Frames: {frame_count} | FPS: {fps:.2f} | Duration: {duration:.2f} secs")

    # A reported count of 0 or 1 would turn into -1 or 0 below, which is not
    # a valid number of frames to sample, so use the fallback instead.
    if frame_count < 2:
        print(f"Warning: no usable frame count for {video_path}; using {default_frames}")
        return default_frames

    # NOTE(review): the "- 1" presumably keeps the request below the
    # container-reported count, which may overstate decodable frames - confirm.
    return min(frame_count - 1, max_frames)

## Setup Model Function

Load the Cosmos Reason model and processor from Hugging Face.

In [2]:
def setup_model(model_name="nvidia/Cosmos-Reason2-8B"):
    """
    Fetch the Cosmos Reason checkpoint and its matching processor.

    Progress messages are printed before and after loading.

    Args:
        model_name: Hugging Face identifier of the checkpoint to load

    Returns:
        (model, processor) tuple
    """
    print(f"Loading model: {model_name}")
    print("This may take a few minutes on first run...")

    # Weights are requested in bfloat16 with automatic device placement.
    load_options = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    loaded_model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_name, **load_options
    )

    # The processor is resolved from the same identifier as the weights.
    loaded_processor = AutoProcessor.from_pretrained(model_name)

    print("Model loaded successfully!")
    return loaded_model, loaded_processor

## Analyze Video Function

Process a video file with a question, handle system prompts for reasoning, and parse the output.

In [3]:
def _split_reasoning_and_answer(output_text):
    """Split a tagged model response into (reasoning, answer) strings."""
    think_open, think_close = "<think>", "</think>"
    answer_open, answer_close = "<answer>", "</answer>"

    reasoning = ""
    if think_close in output_text:
        before_close = output_text.partition(think_close)[0]
        _, found_open, after_open = before_close.partition(think_open)
        # Keep the reasoning even when the opening tag is absent from the
        # decoded text; only the closing tag is required to delimit it.
        reasoning = (after_open if found_open else before_close).strip()

    _, found_answer, after_answer = output_text.partition(answer_open)
    if found_answer:
        # A missing closing tag (e.g. generation stopped early) should not
        # leak the "<answer>" marker into the answer text.
        answer = after_answer.partition(answer_close)[0].strip()
    elif think_close in output_text:
        # Sometimes the answer comes after </think> without tags
        answer = output_text.partition(think_close)[2].strip()
    else:
        answer = output_text

    return reasoning, answer


def analyze_video(video_path, question, model, processor, nframes=60, enable_reasoning=True):
    """
    Analyze a video using Cosmos Reason

    Args:
        video_path: Path to the video file
        question: Question to ask about the video
        model: The loaded model
        processor: The loaded processor
        nframes: Number of frames to sample (default: 60, recommended)
        enable_reasoning: Whether to enable chain-of-thought reasoning

    Returns:
        dict with 'reasoning', 'answer' and 'full_output' keys. 'reasoning'
        is an empty string when reasoning is disabled or no reasoning section
        is found; 'full_output' is the raw decoded response.
    """
    print(f"\nAnalyzing video: {video_path}")
    print(f"Question: {question}")
    print(f"Number of frames: {nframes}")
    print(f"Reasoning enabled: {enable_reasoning}")

    # Prepare the system prompt (with reasoning format if enabled)
    if enable_reasoning:
        system_prompt = """Answer the question in the following format:
<think>
your reasoning
</think>

<answer>
your answer
</answer>"""
    else:
        system_prompt = "You are a helpful assistant that analyzes videos."

    # Prepare the conversation messages
    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": str(video_path),
                    "nframes": nframes  # Explicit frame count to avoid metadata warning
                },
                {
                    "type": "text",
                    "text": question
                }
            ]
        }
    ]

    # Apply chat template
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Process the video and prepare inputs
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )

    # Move inputs to the same device as model
    inputs = inputs.to(model.device)

    print("\nGenerating response...")

    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096  # Recommended to avoid truncation
        )

    # Trim the input tokens from the generated output
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Decode the response (a single-item batch, hence the [0])
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Without reasoning there are no tags to parse: the raw text is the answer
    if not enable_reasoning:
        return {
            "reasoning": "",
            "answer": output_text,
            "full_output": output_text
        }

    reasoning, answer = _split_reasoning_and_answer(output_text)
    return {
        "reasoning": reasoning,
        "answer": answer,
        "full_output": output_text
    }

## Load the Model

Instantiate the model and processor using the Cosmos Reason 2 (8B) model.

In [4]:
# Load the checkpoint once; the analysis cells below reuse `model` and `processor`.
model, processor = setup_model(model_name="nvidia/Cosmos-Reason2-8B")

Loading model: nvidia/Cosmos-Reason2-8B
This may take a few minutes on first run...


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 41221.66it/s]
Loading weights: 100%|██████████| 750/750 [00:04<00:00, 169.45it/s, Materializing param=model.visual.pos_embed.weight]                                 


Model loaded successfully!


---

## B.1 Object Detection in All Videos

Using NVIDIA Cosmos Reason 2 (8B) to detect and identify objects in all demonstration videos. This model leverages vision-language understanding to provide detailed object detection through natural language descriptions.

In [7]:
# Collect every demonstration video from the tracked-objects folder

demonstrations_path = Path("demonstrations/objects_tracked")
video_extensions = ["*.mp4", "*.mov", "*.avi", "*.mkv"]

# One glob per pattern; sorting the combined matches gives a stable order.
video_files = sorted(
    match
    for pattern in video_extensions
    for match in demonstrations_path.glob(pattern)
)

print(f"Found {len(video_files)} videos in demonstrations folder:")
for video in video_files:
    print(f"  - {video.name}")

Found 11 videos in demonstrations folder:
  - data10_tracked.mp4
  - data1_tracked.mp4
  - data2_tracked.mp4
  - data3_tracked.mp4
  - data4_tracked.mp4
  - data5_tracked.mp4
  - data6_tracked.mp4
  - data7_tracked.mp4
  - data8_tracked.mp4
  - data9_tracked.mp4
  - demonstration1_tracked.mp4


In [13]:
# Prompt asking the model for a per-colour dice inventory and scene description
OBJECT_DETECTION_PROMPT = """You are analyzing a video with colored dice on a table. Pay EXTREMELY CLOSE ATTENTION to counting.

CRITICAL INSTRUCTIONS:
1. Count EACH die individually by color (red, green, blue)
2. Watch the ENTIRE video to track ALL dice present
3. A die may be moved, stacked, or rearranged - count it only ONCE
4. State the EXACT NUMBER of each color

Analyze this video frame-by-frame and provide:

**DICE INVENTORY (Count carefully!):**
- Total number of GREEN dice: [count each green die you see]
- Total number of RED dice: [count each red die you see]  
- Total number of BLUE dice: [count each blue die you see]
- Total number of ALL dice: [sum]

**INITIAL ARRANGEMENT:**
Describe the starting positions of ALL dice from left to right or their spatial arrangement.

**OBJECTS IN SCENE:**
- Table description
- Human hands/body parts visible
- Any other objects

Be precise with your counts. If you see a die being moved, don't count it twice."""

# Results are keyed by video file name.
object_detection_results = {}

for position, video_path in enumerate(video_files, start=1):
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"Processing video {position}/{len(video_files)}: {video_path.name}")
    print(separator)

    # Frame budget for this video comes from its own metadata.
    frame_count = get_video_frame_count(video_path)

    result = analyze_video(
        video_path=video_path,
        question=OBJECT_DETECTION_PROMPT,
        model=model,
        processor=processor,
        nframes=frame_count,
        enable_reasoning=True
    )

    # Store only the three fields the later cells read.
    object_detection_results[video_path.name] = {
        field: result[field] for field in ("reasoning", "answer", "full_output")
    }

    print(f"\n Objects in {video_path.name}:")
    print("-" * 40)
    # Preview: long answers are cut at 500 characters and marked with "...".
    answer_text = result["answer"]
    if len(answer_text) > 500:
        print(answer_text[:500] + "...")
    else:
        print(answer_text)


Processing video 1/11: data10_tracked.mp4
 Video: data10_tracked.mp4 | Frames: 555 | FPS: 29.99 | Duration: 18.51 secs

Analyzing video: demonstrations/objects_tracked/data10_tracked.mp4
Question: You are analyzing a video with colored dice on a table. Pay EXTREMELY CLOSE ATTENTION to counting.

CRITICAL INSTRUCTIONS:
1. Count EACH die individually by color (red, green, blue)
2. Watch the ENTIRE video to track ALL dice present
3. A die may be moved, stacked, or rearranged - count it only ONCE
4. State the EXACT NUMBER of each color

Analyze this video frame-by-frame and provide:

**DICE INVENTORY (Count carefully!):**
- Total number of GREEN dice: [count each green die you see]
- Total number of RED dice: [count each red die you see]  
- Total number of BLUE dice: [count each blue die you see]
- Total number of ALL dice: [sum]

**INITIAL ARRANGEMENT:**
Describe the starting positions of ALL dice from left to right or their spatial arrangement.

**OBJECTS IN SCENE:**
- Table descriptio

In [14]:
# Print the stored object detection output for every video
wide_rule = "=" * 80
narrow_rule = "=" * 60

print(f"\n{wide_rule}")
print("Results")
print(wide_rule)

for video_name, data in object_detection_results.items():
    print(f"\n{narrow_rule}")
    print(f"Video: {video_name}")
    print(narrow_rule)

    print("\nReasoning:")
    # The reasoning is shortened to 800 characters; the answer is shown in full.
    reasoning_text = data["reasoning"]
    if len(reasoning_text) > 800:
        reasoning_text = reasoning_text[:800] + "..."
    print(reasoning_text)

    print("\nObjects Detected:")
    print(data["answer"])


Results

Video: data10_tracked.mp4

Reasoning:
Okay, let's break this down. The user provided  a video involving colored dice on a table and wants me to analyze it according to specific instructions. My task is to count the number of each colored die (green, red, blue) present in the video while paying close attention to details.

First, I need to parse through the video carefully. The key points shown are that there are three dice initially: Green Dice, Red Dice, and Blue Dice. Each has distinct colors and labels. The person interacts with them by moving them around but doesn't add or remove any dice during the process. 

The actions shown involve picking up the Red Dice, placing it back, then stacking it on the Blue Dice, followed by adding the Green Dice on top of both. Throughout these interactions, the video shows that no addition...

Objects Detected:
{
  "dice_inventory": {
    "total_number_of_green_dice": 1,
    "total_number_of_red_dice": 1,
    "total_number_of_blue_dice": 

## B.2 Human Action Recognition

Using NVIDIA Cosmos Reason 2 (8B) to recognize actions performed in the videos (grasping, moving, releasing, stacking, etc.).

In [None]:
# Prompt asking for a chronological list of manipulation actions
ACTION_RECOGNITION_PROMPT = """Analyze this video and identify ALL human actions performed required to complete the pattern of dice in this video. The dice will be lined up in a specific pattern.

For each action, provide:
1. Action name (e.g., grasping, picking up, moving, placing, releasing, stacking, pushing, pulling)
2. The object involved in the action
3. The state of the cube's pattern in a line, after each action is preformed

Focus on fine-grained manipulation actions such as:
- Reaching/approaching
- Grasping/gripping
- Lifting/picking up
- Moving/transporting
- Placing/positioning
- Releasing/letting go
- Stacking/arranging
- Adjusting/fine-tuning position

Provide a chronological list of all actions observed."""

# Results are keyed by video file name.
action_recognition_results = {}

for position, video_path in enumerate(video_files, start=1):
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"Processing video {position}/{len(video_files)}: {video_path.name}")
    print(separator)

    # Every video is sampled with the same fixed budget of 60 frames.
    result = analyze_video(
        video_path=video_path,
        question=ACTION_RECOGNITION_PROMPT,
        model=model,
        processor=processor,
        nframes=60,
        enable_reasoning=True
    )

    # Store only the three fields the later cells read.
    action_recognition_results[video_path.name] = {
        field: result[field] for field in ("reasoning", "answer", "full_output")
    }

    print(f"\n Actions in {video_path.name}:")
    print("-" * 40)
    # Preview: long answers are cut at 500 characters and marked with "...".
    answer_text = result["answer"]
    if len(answer_text) > 500:
        print(answer_text[:500] + "...")
    else:
        print(answer_text)


Processing video 1/11: data1.mov

Analyzing video: demonstrations/data1.mov
Question: Analyze this video and identify ALL human actions performed.

For each action, provide:
1. Action name (e.g., grasping, picking up, moving, placing, releasing, stacking, pushing, pulling)
2. The object involved in the action
3. Approximate timing (beginning, middle, end of video)
4. Hand used (left, right, or both)

Focus on fine-grained manipulation actions such as:
- Reaching/approaching
- Grasping/gripping
- Lifting/picking up
- Moving/transporting
- Placing/positioning
- Releasing/letting go
- Stacking/arranging
- Adjusting/fine-tuning position

Provide a chronological list of all actions observed.
Number of frames: 60
Reasoning enabled: True

Generating response...

 ACTIONS RECOGNIZED in data1.mov:
----------------------------------------
The individual stacks four dice (red, blue, green, and another green) vertically on the table using their right hand.

Processing video 2/11: data10.mov

Anal

KeyboardInterrupt: 

In [None]:
# Print the stored action recognition output for every video
wide_rule = "=" * 80
narrow_rule = "=" * 60

print(f"\n{wide_rule}")
print("Results")
print(wide_rule)

for video_name, data in action_recognition_results.items():
    print(f"\n{narrow_rule}")
    print(f"Video: {video_name}")
    print(narrow_rule)

    print("\nReasoning:")
    # The reasoning is shortened to 800 characters; the answer is shown in full.
    reasoning_text = data["reasoning"]
    if len(reasoning_text) > 800:
        reasoning_text = reasoning_text[:800] + "..."
    print(reasoning_text)

    print("\nActions:")
    print(data["answer"])

---

# Part C: Automatic Generation of Sequence of Actions

## C.1 Action Sequence Generation

Using NVIDIA Cosmos Reason 2 (8B) to generate structured sequences of actions from all videos.

In [None]:
# Prompt asking for a structured, step-by-step action sequence
ACTION_SEQUENCE_PROMPT = """Analyze this video and generate a STRUCTURED SEQUENCE of actions.

Output the sequence in the following JSON-like format for each action:
{
  "step": <step_number>,
  "action": "<action_verb>",
  "object": "<object_being_manipulated>",
  "start_state": "<state_before_action>",
  "end_state": "<state_after_action>",
  "preconditions": ["<required_conditions>"],
  "effects": ["<resulting_changes>"]
}

Use ONLY these standardized action verbs:
- REACH: Moving hand toward an object
- GRASP: Closing fingers around an object
- LIFT: Raising an object from a surface
- MOVE: Transporting an object through space
- PLACE: Positioning an object at a location
- RELEASE: Opening fingers to let go of object
- ADJUST: Fine-tuning object position
- STACK: Placing object on top of another

Generate the complete action sequence from start to finish."""

# Results are keyed by video file name.
action_sequence_results = {}

for position, video_path in enumerate(video_files, start=1):
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"Processing video {position}/{len(video_files)}: {video_path.name}")
    print(separator)

    # Every video is sampled with the same fixed budget of 60 frames.
    result = analyze_video(
        video_path=video_path,
        question=ACTION_SEQUENCE_PROMPT,
        model=model,
        processor=processor,
        nframes=60,
        enable_reasoning=True
    )

    # Store only the three fields the later cells read.
    action_sequence_results[video_path.name] = {
        field: result[field] for field in ("reasoning", "answer", "full_output")
    }

    print(f"\n📋 ACTION SEQUENCE for {video_path.name}:")
    print("-" * 40)
    # Preview: long answers are cut at 600 characters and marked with "...".
    answer_text = result["answer"]
    if len(answer_text) > 600:
        print(answer_text[:600] + "...")
    else:
        print(answer_text)

In [None]:
# Print the stored action sequence for every video
wide_rule = "=" * 80
narrow_rule = "=" * 60

print(f"\n{wide_rule}")
print("Results")
print(wide_rule)

for video_name, data in action_sequence_results.items():
    print(f"\n{narrow_rule}")
    print(f"Video: {video_name}")
    print(narrow_rule)

    # Only the answer is shown here; the reasoning stays in the stored results.
    print("\nAction Sequence:")
    print(data["answer"])

## C.2 Markov Decision Process Design

Based on the observed action sequences, we design a Markov Decision Process (MDP) for the manipulation tasks.

In [None]:
# MDP components (states, actions, transitions, rewards) for the manipulation task

# States: hand/object configurations, keyed by identifier with a description
STATES = {
    "S0_IDLE": "Hand empty, objects on table (initial state)",
    "S1_REACHING": "Hand moving toward target object",
    "S2_GRASPING": "Hand closing around object",
    "S3_HOLDING": "Object grasped and held",
    "S4_MOVING": "Object being transported",
    "S5_POSITIONING": "Object at target location",
    "S6_RELEASING": "Hand opening to release object",
    "S7_STACKED": "Object placed on stack (goal state)",
    "S8_COMPLETE": "All objects stacked (terminal state)",
}

# Actions: what the agent can do, keyed by identifier with a description
ACTIONS = {
    "A0_WAIT": "Do nothing, remain in current state",
    "A1_REACH": "Extend hand toward target object",
    "A2_GRASP": "Close fingers around object",
    "A3_LIFT": "Raise object from surface",
    "A4_MOVE": "Transport object to target location",
    "A5_LOWER": "Move object down toward surface",
    "A6_RELEASE": "Open fingers to let go",
    "A7_ADJUST": "Fine-tune object position",
}

# Transition model: (current_state, action) -> {next_state: probability}
# Each inner distribution sums to 1.
TRANSITION_PROBS = {
    ("S0_IDLE", "A1_REACH"): {
        "S1_REACHING": 0.95,
        "S0_IDLE": 0.05,
    },
    ("S1_REACHING", "A2_GRASP"): {
        "S2_GRASPING": 0.90,
        "S1_REACHING": 0.10,
    },
    ("S2_GRASPING", "A3_LIFT"): {
        "S3_HOLDING": 0.95,
        "S2_GRASPING": 0.05,
    },
    ("S3_HOLDING", "A4_MOVE"): {
        "S4_MOVING": 0.90,
        "S3_HOLDING": 0.10,
    },
    ("S4_MOVING", "A5_LOWER"): {
        "S5_POSITIONING": 0.85,
        "S4_MOVING": 0.15,
    },
    ("S5_POSITIONING", "A6_RELEASE"): {
        "S6_RELEASING": 0.90,
        "S5_POSITIONING": 0.10,
    },
    ("S6_RELEASING", "A0_WAIT"): {
        "S7_STACKED": 0.95,
        "S0_IDLE": 0.05,
    },
    ("S5_POSITIONING", "A7_ADJUST"): {
        "S5_POSITIONING": 0.70,
        "S7_STACKED": 0.30,
    },
}

# Rewards: state rewards plus event penalties and the default step cost
REWARDS = {
    "S7_STACKED": 10.0,      # one object stacked successfully
    "S8_COMPLETE": 100.0,    # task complete: every object stacked
    "S0_IDLE": -0.1,         # mild penalty for idling
    "FAILED_GRASP": -5.0,    # grasp attempt failed
    "DROPPED": -10.0,        # object dropped
    "DEFAULT": -0.5,         # per-step cost to encourage efficiency
}

banner = "=" * 80
print(banner)
print("MARKOV DECISION PROCESS DEFINITION")
print(banner)

print("\nStates:")
print("\n".join(f"  {name}: {meaning}" for name, meaning in STATES.items()))

print("\nActions:")
print("\n".join(f"  {name}: {meaning}" for name, meaning in ACTIONS.items()))

# Only the first five transition entries are shown as a sample.
print("\nTransition Probabilities (sample):")
sample_transitions = list(TRANSITION_PROBS.items())[:5]
for (from_state, chosen_action), outcomes in sample_transitions:
    print(f"  P(·|{from_state}, {chosen_action}):")
    print("\n".join(
        f"    → {to_state}: {likelihood:.2f}"
        for to_state, likelihood in outcomes.items()
    ))

print("\nRewards:")
print("\n".join(f"  {label}: {amount:+.1f}" for label, amount in REWARDS.items()))

In [None]:
# Generate the MDP diagram as a Mermaid state-diagram text representation
# (paste into mermaid.live to visualize; currently commented out)

# mdp_diagram = """
# MDP State Transition Diagram (Mermaid format - paste into mermaid.live):

# ```mermaid
# stateDiagram-v2
#     [*] --> S0_IDLE
    
#     S0_IDLE --> S1_REACHING : A1_REACH (0.95)
#     S0_IDLE --> S0_IDLE : A0_WAIT (1.0)
    
#     S1_REACHING --> S2_GRASPING : A2_GRASP (0.90)
#     S1_REACHING --> S1_REACHING : fail (0.10)
    
#     S2_GRASPING --> S3_HOLDING : A3_LIFT (0.95)
#     S2_GRASPING --> S0_IDLE : fail (0.05)
    
#     S3_HOLDING --> S4_MOVING : A4_MOVE (0.90)
#     S3_HOLDING --> S3_HOLDING : hold (0.10)
    
#     S4_MOVING --> S5_POSITIONING : A5_LOWER (0.85)
#     S4_MOVING --> S4_MOVING : adjust (0.15)
    
#     S5_POSITIONING --> S6_RELEASING : A6_RELEASE (0.90)
#     S5_POSITIONING --> S5_POSITIONING : A7_ADJUST (0.70)
#     S5_POSITIONING --> S7_STACKED : A7_ADJUST (0.30)
    
#     S6_RELEASING --> S7_STACKED : success (0.95)
#     S6_RELEASING --> S0_IDLE : dropped (0.05)
    
#     S7_STACKED --> S0_IDLE : next_object
#     S7_STACKED --> S8_COMPLETE : all_done
    
#     S8_COMPLETE --> [*]
    
#     note right of S7_STACKED : Reward: +10
#     note right of S8_COMPLETE : Reward: +100
# ```

# Rewards:
# - S7_STACKED (object placed): +10
# - S8_COMPLETE (all done): +100
# - Failed transitions: -5 to -10
# - Each step: -0.5 (encourages efficiency)
# """

# print(mdp_diagram)

In [None]:
# Create a visual representation of the MDP using matplotlib:
# one circle per state, one labelled arrow per transition, saved to mdp_diagram.png


fig, ax = plt.subplots(1, 1, figsize=(16, 10))

# State positions (arranged in a flow): the main pipeline runs left to right
# on y=5, with the stacked/complete states below it on y=2
state_positions = {
    "S0_IDLE": (1, 5),
    "S1_REACHING": (3, 5),
    "S2_GRASPING": (5, 5),
    "S3_HOLDING": (7, 5),
    "S4_MOVING": (9, 5),
    "S5_POSITIONING": (11, 5),
    "S6_RELEASING": (13, 5),
    "S7_STACKED": (11, 2),
    "S8_COMPLETE": (13, 2),
}

# Draw states
for state, (x, y) in state_positions.items():
    # Colour encodes the state's role; the legend below uses the same colours
    if state == "S8_COMPLETE":
        color = 'lightgreen'
    elif state == "S7_STACKED":
        color = 'lightblue'
    elif state == "S0_IDLE":
        color = 'lightyellow'
    else:
        color = 'lightgray'

    # Fill and outline are given separately so the black border is explicit
    circle = plt.Circle((x, y), 0.5, facecolor=color, edgecolor='black', linewidth=2)
    ax.add_patch(circle)

    # State label: drop only the leading "S" prefix (count=1). Replacing every
    # "S" would also eat letters inside the name, e.g. "STACKED" -> "TACKED".
    short_name = state.replace("S", "", 1).replace("_", "\n")
    ax.text(x, y, short_name, ha='center', va='center', fontsize=8, fontweight='bold')

# Draw transitions (arrows): (start_state, end_state, label)
transitions = [
    ("S0_IDLE", "S1_REACHING", "REACH"),
    ("S1_REACHING", "S2_GRASPING", "GRASP"),
    ("S2_GRASPING", "S3_HOLDING", "LIFT"),
    ("S3_HOLDING", "S4_MOVING", "MOVE"),
    ("S4_MOVING", "S5_POSITIONING", "LOWER"),
    ("S5_POSITIONING", "S6_RELEASING", "RELEASE"),
    ("S6_RELEASING", "S7_STACKED", "0.95"),
    ("S7_STACKED", "S0_IDLE", "next"),
    ("S7_STACKED", "S8_COMPLETE", "done"),
]

for start, end, label in transitions:
    x1, y1 = state_positions[start]
    x2, y2 = state_positions[end]

    # Calculate direction
    dx, dy = x2 - x1, y2 - y1
    dist = np.sqrt(dx**2 + dy**2)

    # Offset to start/end at circle edge (slightly more than the 0.5 radius)
    offset = 0.55
    x1_adj = x1 + offset * dx / dist
    y1_adj = y1 + offset * dy / dist
    x2_adj = x2 - offset * dx / dist
    y2_adj = y2 - offset * dy / dist

    ax.annotate("", xy=(x2_adj, y2_adj), xytext=(x1_adj, y1_adj),
                arrowprops=dict(arrowstyle="->", color='darkblue', lw=1.5))

    # Label sits just above the midpoint of the straight line between centres
    mid_x, mid_y = (x1 + x2) / 2, (y1 + y2) / 2 + 0.3
    ax.text(mid_x, mid_y, label, ha='center', va='bottom', fontsize=7, color='darkred')

# Add legend
legend_elements = [
    mpatches.Patch(facecolor='lightyellow', edgecolor='black', label='Initial State'),
    mpatches.Patch(facecolor='lightgray', edgecolor='black', label='Intermediate State'),
    mpatches.Patch(facecolor='lightblue', edgecolor='black', label='Reward State (+10)'),
    mpatches.Patch(facecolor='lightgreen', edgecolor='black', label='Terminal State (+100)'),
]
ax.legend(handles=legend_elements, loc='lower left', fontsize=9)

ax.set_xlim(0, 15)
ax.set_ylim(0, 7)
ax.set_aspect('equal')
ax.axis('off')
ax.set_title('Markov Decision Process for Object Manipulation Task', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('mdp_diagram.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nSaved as 'mdp_diagram.png'")

## Limitations Discussion

### Limitations of the Action Sequence Generation Approach:

1. **Temporal Resolution**: The model sees only a fixed number of sampled frames per video (60 for action recognition and sequence generation), which may miss rapid or subtle actions.

2. **Vocabulary Constraints**: Actions are limited to predefined verbs; novel or complex manipulations may not be accurately captured.

3. **Context Dependency**: The model relies on visual cues only; it cannot infer intent, force, or tactile feedback.

4. **Generalization**: The prompts are tailored to specific manipulation scenarios; extending to other domains (e.g., cooking, assembly) may require prompt engineering or fine-tuning.

5. **Occlusion Handling**: When hands or objects are occluded, the model may make incorrect inferences.

6. **Multi-object Tracking**: With many similar objects, the model may confuse object identities across frames.

7. **Real-time Performance**: The current approach is not suitable for real-time applications due to inference latency.

In [None]:
# Persist every result set and the MDP definition as timestamped JSON files


def _write_json(payload, filename):
    """Write payload to filename as JSON indented by two spaces."""
    with open(filename, "w") as handle:
        json.dump(payload, handle, indent=2)


# A single timestamp is shared by all files written from this cell.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Object detection results
object_detection_file = f"object_detection_results_{timestamp}.json"
_write_json(object_detection_results, object_detection_file)
print(f"Object detection results saved to {object_detection_file}")

# Action recognition results
action_recognition_file = f"action_recognition_results_{timestamp}.json"
_write_json(action_recognition_results, action_recognition_file)
print(f"Action recognition results saved to {action_recognition_file}")

# Action sequence results
action_sequence_file = f"action_sequence_results_{timestamp}.json"
_write_json(action_sequence_results, action_sequence_file)
print(f"Action sequence results saved to {action_sequence_file}")

# MDP definition: JSON object keys must be strings, so each
# (state, action) tuple key is flattened to "<state>_<action>".
mdp_definition = {
    "states": STATES,
    "actions": ACTIONS,
    "transition_probabilities": {
        f"{state_id}_{action_id}": distribution
        for (state_id, action_id), distribution in TRANSITION_PROBS.items()
    },
    "rewards": REWARDS,
}
mdp_definition_file = f"mdp_definition_{timestamp}.json"
_write_json(mdp_definition, mdp_definition_file)
print(f"MDP definition saved to {mdp_definition_file}")

---

## Summary

This notebook demonstrates:

### Model Used
**NVIDIA Cosmos Reason 2 (8B)** - A vision-language model based on Qwen3-VL architecture, fine-tuned for physical world understanding and reasoning.

### Part B: Object and Human Action Recognition
- **B.1**: Object detection using natural language prompting to identify all objects in each video
- **B.2**: Action recognition to identify manipulation actions (grasp, move, place, stack, etc.)

### Part C: Automatic Generation of Sequence of Actions
- **C.1**: Structured action sequence generation in JSON format with preconditions and effects
- **C.2**: Markov Decision Process design with states, actions, transition probabilities, and rewards

### Output Files Generated
- `object_detection_results_*.json` - Object detection for all videos
- `action_recognition_results_*.json` - Action recognition for all videos
- `action_sequence_results_*.json` - Structured action sequences
- `mdp_definition_*.json` - MDP formal definition
- `mdp_diagram.png` - Visual diagram of the MDP