# Chapter 4: Vision-Language-Action (VLA) Models

This notebook explores Vision-Language-Action (VLA) models for robotics. VLA models connect visual perception, natural language understanding, and robotic action to enable robots to follow natural language commands.

In [None]:
# Configuration cell - select execution mode
# "simulation" runs against mock components; "hardware" is the placeholder
# path for a real VLA platform.
VALID_EXECUTION_MODES = ("hardware", "simulation")
EXECUTION_MODE = "simulation"  # Options: "hardware", "simulation"

# Fail fast on a typo: a later cell branches on `EXECUTION_MODE == "simulation"`
# and would otherwise silently treat any other string as hardware mode.
if EXECUTION_MODE not in VALID_EXECUTION_MODES:
    raise ValueError(
        f"Unknown EXECUTION_MODE {EXECUTION_MODE!r}; "
        f"expected one of {VALID_EXECUTION_MODES}"
    )

print(f"VLA Chapter 4: Running in {EXECUTION_MODE} mode")
print("Initializing VLA environment...")

# Import required libraries
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
from PIL import Image
import json

# Probe for a CUDA device and report the outcome of the environment setup.
# The report is assembled first and emitted once; the printed text is the
# same as printing each line individually.
cuda_available = torch.cuda.is_available()

setup_report = [f"CUDA available: {cuda_available}"]
if cuda_available:
    # Only query the device name when a device actually exists.
    setup_report.append(f"CUDA device: {torch.cuda.get_device_name(0)}")
setup_report.append("VLA environment setup complete!")

print("\n".join(setup_report))

## 1. Understanding Vision-Language-Action (VLA) Models

VLA models represent a significant advancement in robotics, enabling robots to understand natural language instructions and execute complex tasks by connecting visual perception with action. In this notebook, we'll explore the architecture and applications of VLA models.

In [None]:
# Mock VLA model used by both execution modes.
#
# The class is defined unconditionally: the hardware branch below also
# instantiates it, so defining it only inside the simulation branch would
# raise NameError as soon as EXECUTION_MODE is switched to "hardware".
class MockVLAModel:
    """Stand-in for a VLA model that maps an instruction to canned actions.

    No real inference happens: the image is only inspected for its ``shape``
    attribute (for logging) and the action is chosen by keyword matching on
    the instruction text.
    """

    def __init__(self, model_name="mock-vla"):
        """Record the model name; the model starts in the unloaded state."""
        self.model_name = model_name
        self.is_loaded = False
        print(f"Mock VLA Model '{model_name}' initialized")

    def load_model(self):
        """Pretend to load weights (short sleep) and mark the model as loaded."""
        print(f"Loading mock VLA model: {self.model_name}")
        # Simulate model loading time
        time.sleep(0.5)
        self.is_loaded = True
        print("Mock VLA model loaded successfully")

    def process_instruction(self, image, instruction):
        """Return a list of mock action dicts for ``instruction``.

        Args:
            image: Any object; only ``image.shape`` is read (if present) for
                the log line.
            instruction: Natural-language command as a string.

        Returns:
            A one-element list holding an action dict; see
            ``_generate_mock_actions`` for the possible shapes.
        """
        # Lazy-load so callers do not have to call load_model() themselves.
        if not self.is_loaded:
            self.load_model()

        print(f"Processing instruction: '{instruction}'")
        print(f"With image of shape: {getattr(image, 'shape', 'unknown')}")

        # Simulate VLA processing
        time.sleep(0.2)

        # Mock action generation
        return self._generate_mock_actions(instruction)

    def _generate_mock_actions(self, instruction):
        """Pick a canned action by case-insensitive substring match.

        Branches are tested in order (grasp, navigate, push), so an
        instruction containing keywords from several groups resolves to the
        first matching group. Anything else yields an ``'unknown'`` action
        that echoes the instruction back.
        """
        instruction_lower = instruction.lower()

        if "pick" in instruction_lower or "grasp" in instruction_lower:
            return [{
                'action_type': 'grasp',
                'object': 'object',
                'position': [0.5, 0.3, 0.1],
                'confidence': 0.9
            }]
        elif "move" in instruction_lower or "go" in instruction_lower:
            return [{
                'action_type': 'navigate',
                'target': 'location',
                'position': [1.0, 0.0, 0.0],
                'confidence': 0.85
            }]
        elif "push" in instruction_lower or "press" in instruction_lower:
            return [{
                'action_type': 'push',
                'object': 'button',
                'position': [0.2, 0.8, 0.05],
                'confidence': 0.88
            }]
        else:
            return [{
                'action_type': 'unknown',
                'description': instruction,
                'confidence': 0.6
            }]


# Create the VLA components for the selected execution mode
if EXECUTION_MODE == "simulation":
    print("Using VLA simulation mode - no physical hardware required")
    vla_model = MockVLAModel()
    print("Mock VLA components initialized")
else:
    print("Using VLA hardware mode - connecting to VLA platform")
    # In a real environment, we would import actual VLA packages
    # import vla_models
    # from vla_models import VLAModel

    # For this simulation, we'll use the mock components
    vla_model = MockVLAModel()

## 2. Creating Simulated Environment Data

Let's create simulated environment data that our VLA model can process.

In [None]:
# Generate a simulated environment image
def create_simulated_environment_image(width=640, height=480, channels=3):
    """Create a simulated RGB scene with a table and three coloured objects.

    The layout is expressed in fixed pixel coordinates (it is not rescaled to
    the requested size): a brown table on rows 300-479, a red disc of radius
    30 centred at (x=150, y=250), a blue box on rows 200-279 / columns
    400-479 and a green bottle on rows 150-249 / columns 300-319. On a canvas
    smaller than 640x480 the parts that fall outside the image are simply
    cropped by NumPy slicing.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        channels: Number of colour channels. Only 3 (RGB) is supported
            because every object is painted with an RGB triple.

    Returns:
        A ``numpy.ndarray`` of shape ``(height, width, 3)`` and dtype uint8.

    Raises:
        ValueError: If ``channels`` is not 3.
    """
    # Previously any other value failed deep inside NumPy with an opaque
    # broadcasting error; reject it up front with an actionable message.
    if channels != 3:
        raise ValueError(
            f"channels must be 3 (RGB) for the simulated scene, got {channels}"
        )

    image = np.zeros((height, width, channels), dtype=np.uint8)

    # Create a simple scene with objects
    # Table
    image[300:480, :, :] = [101, 67, 33]  # Brown table

    # Red cup: filled disc built from a broadcast distance test
    center_x, center_y = 150, 250
    radius = 30
    y, x = np.ogrid[:height, :width]
    mask = (x - center_x)**2 + (y - center_y)**2 <= radius**2
    image[mask] = [255, 0, 0]  # Red cup

    # Blue box
    image[200:280, 400:480, :] = [0, 0, 255]  # Blue box

    # Green bottle
    image[150:250, 300:320, :] = [0, 255, 0]  # Green bottle

    # Add some texture to the table: a 2-pixel darker stripe every 20 rows
    for i in range(300, 480, 20):
        image[i:i+2, :, :] = [85, 55, 28]  # Darker wood lines

    return image

# Build the scene once; later cells reuse `environment_image`
environment_image = create_simulated_environment_image()
print(f"Simulated environment image created with shape: {environment_image.shape}")

# Render the scene with the object-oriented matplotlib API
scene_fig, scene_ax = plt.subplots(figsize=(12, 8))
scene_ax.imshow(environment_image)
scene_ax.set_title('Simulated Environment for VLA Processing')
scene_ax.axis('off')
plt.show()

## 3. Processing Natural Language Instructions

Now let's test our VLA model with various natural language instructions.

In [None]:
# Natural language commands used to exercise the VLA model
instructions = [
    "Pick up the red cup",
    "Move to the blue box",
    "Push the green button",
    "Go to the table",
    "Grasp the object on the left",
]

print("Processing natural language instructions with VLA model...")

# Run every command against the same scene and keep each
# (instruction, actions) pair for the later cells.
all_results = []
for step, instruction in enumerate(instructions, start=1):
    print(f"\nInstruction {step}: '{instruction}'")

    actions = vla_model.process_instruction(environment_image, instruction)
    print(f"Generated actions: {actions}")

    all_results.append(dict(instruction=instruction, actions=actions))

# Summary line
print(f"\nProcessed {len(instructions)} instructions with VLA model")

## 4. VLA Model Architecture Visualization

Let's visualize how the VLA model connects vision, language, and action components.

In [None]:
# Draw the VLA pipeline as labelled nodes joined by arrows
fig, ax = plt.subplots(figsize=(15, 10))

# Node centres in axes data coordinates
positions = {
    'vision': (0, 0.5),
    'language': (0, 0.8),
    'fusion': (0.5, 0.65),
    'action': (1, 0.5),
    'robot': (1, 0.2)
}

# Per-node appearance:
# (radius, colour, title, subtitle, title offset, subtitle offset).
# The fusion node is drawn larger, so its labels sit further below it.
component_styles = {
    'vision': (0.05, 'blue', 'Vision', '(Image Input)', 0.08, 0.12),
    'language': (0.05, 'green', 'Language', '(Text Input)', 0.08, 0.12),
    'fusion': (0.07, 'purple', 'Fusion', '(VLA Model)', 0.1, 0.15),
    'action': (0.05, 'orange', 'Action', '(Motor Commands)', 0.08, 0.12),
    'robot': (0.05, 'red', 'Robot', '(Execution)', 0.08, 0.12),
}

# One circle plus a bold title and a small subtitle underneath per node
for component, (x, y) in positions.items():
    radius, color, title, subtitle, title_dy, subtitle_dy = component_styles[component]
    ax.add_patch(plt.Circle((x, y), radius, color=color, alpha=0.7))
    ax.text(x, y-title_dy, title, ha='center', va='center', fontweight='bold')
    ax.text(x, y-subtitle_dy, subtitle, ha='center', va='center', fontsize=8)

# Data flow: both inputs feed the fusion node, which drives action, then robot
edges = [
    ('vision', 'fusion'),
    ('language', 'fusion'),
    ('fusion', 'action'),
    ('action', 'robot'),
]
connections = [(positions[src], positions[dst]) for src, dst in edges]

arrow_style = dict(arrowstyle='->', lw=2, color='gray', alpha=0.7)
for start, end in connections:
    ax.annotate('', xy=end, xytext=start, arrowprops=arrow_style)

# Frame the drawing, then hide the axes so only the diagram remains
ax.set_xlim(-0.2, 1.2)
ax.set_ylim(-0.2, 1.2)
ax.set_aspect('equal')
ax.set_title('Vision-Language-Action (VLA) Model Architecture', fontsize=16, fontweight='bold')
ax.axis('off')

plt.tight_layout()
plt.show()

print("VLA architecture visualization complete")

## 5. Multi-Modal Integration

Let's explore how VLA models integrate visual and language information to generate appropriate actions.

In [None]:
# Simulator class for multi-modal (vision + language) integration
class VLAIntegrationSimulator:
    """Toy vision/language fusion: colour-keyed detection plus keyword planning.

    Objects are recognised purely by exact RGB value, and the instruction is
    interpreted by case-insensitive substring matching. Images are expected
    to carry three channels on their last axis, since pixels are compared
    against RGB triples.
    """

    # Objects the simulator can recognise:
    # (name, colour keyword, exact RGB value, bbox half-width, bbox half-height).
    # The order is significant: it is the priority used when an instruction
    # mentions several colours and the order of the detected-object list.
    _KNOWN_OBJECTS = (
        ('red_cup', 'red', (255, 0, 0), 30, 30),
        ('blue_box', 'blue', (0, 0, 255), 40, 40),
        ('green_bottle', 'green', (0, 255, 0), 10, 50),
    )

    def __init__(self):
        self.name = "VLA Multi-Modal Integration Simulator"
        print(f"{self.name} initialized")

    def integrate_modalities(self, visual_data, language_instruction):
        """Fuse an image and an instruction into an action plan.

        Args:
            visual_data: ``(height, width, 3)`` array of RGB pixels.
            language_instruction: Natural-language command as a string.

        Returns:
            Dict with keys ``'attention_map'`` (2-D float array),
            ``'relevant_objects'`` (list of detected-object dicts) and
            ``'action_plan'`` (dict with ``'action'``, ``'target_object'``,
            ``'position'`` and ``'sequence'``).
        """
        print(f"Integrating visual data and language instruction:")
        print(f"  Visual: Scene with objects at various positions")
        print(f"  Language: '{language_instruction}'")

        # Simulate attention mechanism
        attention_map = self._create_attention_map(visual_data, language_instruction)

        # Detect the known objects present in the scene
        relevant_objects = self._identify_objects(visual_data, language_instruction)

        # Generate action plan
        action_plan = self._generate_action_plan(relevant_objects, language_instruction)

        return {
            'attention_map': attention_map,
            'relevant_objects': relevant_objects,
            'action_plan': action_plan
        }

    @staticmethod
    def _color_mask(visual_data, rgb):
        """Boolean (height, width) mask of pixels exactly equal to ``rgb``."""
        return np.all(visual_data == np.array(rgb), axis=-1)

    def _create_attention_map(self, visual_data, instruction):
        """Return a map that is 1.0 on pixels of the first mentioned colour.

        Colours are tested in ``_KNOWN_OBJECTS`` order (red, blue, green).
        When no colour keyword appears, a fixed 200:400 x 200:400 region is
        set to 0.5 as a generic "centre" prior.
        """
        height, width = visual_data.shape[:2]
        attention_map = np.zeros((height, width))
        instruction_lower = instruction.lower()

        for _name, color, rgb, _half_w, _half_h in self._KNOWN_OBJECTS:
            if color in instruction_lower:
                attention_map[self._color_mask(visual_data, rgb)] = 1.0
                return attention_map

        # General attention
        attention_map[200:400, 200:400] = 0.5  # Center region
        return attention_map

    def _identify_objects(self, visual_data, instruction):
        """Detect every known object whose colour appears in the image.

        ``instruction`` is accepted for interface symmetry but is not used:
        the result is the full scene inventory, and target selection happens
        in ``_generate_action_plan``.

        Each entry has ``'name'``, ``'color'``, ``'position'`` (integer
        centroid as ``(x, y)``) and ``'bbox'`` (``[x0, y0, x1, y1]`` built
        from the centroid and the object's nominal half-extents, not from the
        measured mask extents).
        """
        objects = []
        for name, color, rgb, half_w, half_h in self._KNOWN_OBJECTS:
            mask = self._color_mask(visual_data, rgb)
            if not np.any(mask):
                continue
            y_coords, x_coords = np.where(mask)
            center_x = int(np.mean(x_coords))
            center_y = int(np.mean(y_coords))
            objects.append({
                'name': name,
                'color': color,
                'position': (center_x, center_y),
                'bbox': [center_x-half_w, center_y-half_h,
                         center_x+half_w, center_y+half_h]
            })
        return objects

    @staticmethod
    def _find_mentioned_object(objects, instruction_lower):
        """Return the first object referred to in the instruction, else None.

        An object counts as mentioned when its colour, its identifier
        ("red_cup") or the spoken form of that identifier ("red cup") occurs
        in the lower-cased instruction.
        """
        for obj in objects:
            spoken_name = obj['name'].replace('_', ' ')
            if (obj['color'] in instruction_lower
                    or spoken_name in instruction_lower
                    or obj['name'] in instruction_lower):
                return obj
        return None

    def _generate_action_plan(self, objects, instruction):
        """Turn the detected objects and the instruction into an action plan.

        * "pick"/"grasp": grasp the mentioned object; if none is mentioned
          the plan is ``'unknown'``.
        * "move"/"go": navigate to the mentioned object. If the instruction
          names none of the detected objects, fall back to the first detected
          object (previously the first object was always chosen, so e.g.
          "Go to the blue box" navigated to the red cup).
        * Anything else, or no usable target: an ``'unknown'`` plan asking
          for clarification.
        """
        instruction_lower = instruction.lower()
        mentioned = self._find_mentioned_object(objects, instruction_lower)

        if "pick" in instruction_lower or "grasp" in instruction_lower:
            if mentioned is not None:
                return {
                    'action': 'grasp',
                    'target_object': mentioned['name'],
                    'position': mentioned['position'],
                    'sequence': ['approach', 'grasp', 'lift']
                }
        elif "move" in instruction_lower or "go" in instruction_lower:
            target = mentioned
            if target is None and objects:
                target = objects[0]
            if target is not None:
                return {
                    'action': 'navigate',
                    'target_object': target['name'],
                    'position': target['position'],
                    'sequence': ['plan_path', 'move_to_location']
                }

        return {
            'action': 'unknown',
            'target_object': 'none',
            'position': (0, 0),
            'sequence': ['wait_for_clarification']
        }

# Instantiate the integration simulator
vla_simulator = VLAIntegrationSimulator()
print("VLA Integration Simulator created")

# Commands used to exercise the vision/language fusion
test_instructions = [
    "Pick up the red cup",
    "Go to the blue box",
    "Move near the green bottle",
]

# Fuse each command with the shared scene and summarise the outcome
for instruction in test_instructions:
    print(f"\n--- Processing: '{instruction}' ---")
    result = vla_simulator.integrate_modalities(environment_image, instruction)

    object_names = [obj['name'] for obj in result['relevant_objects']]
    plan = result['action_plan']

    print(f"Relevant objects: {object_names}")
    print(f"Action plan: {plan['action']} -> {plan['target_object']}")

## 6. VLA Action Execution Simulation

Let's simulate how the VLA model's generated actions would be executed by a robot.

In [None]:
# Create a robot simulator to execute VLA actions
class VLARobotSimulator:
    """Minimal robot state machine that "executes" VLA action plans.

    The state is the current position and the name of the held object (or
    None). Execution is simulated with log lines and short sleeps.
    """

    def __init__(self):
        self.name = "VLA Robot Simulator"
        self.position = (0.5, 0.5)  # Starting position
        self.holding_object = None
        print(f"{self.name} initialized at position {self.position}")

    @staticmethod
    def _normalize_plan(action_plan):
        """Return a copy of ``action_plan`` using the canonical key names.

        Two dict layouts reach this class in the notebook: integration plans
        keyed ``'action'`` / ``'target_object'`` and mock-model actions keyed
        ``'action_type'`` with the target under ``'object'`` or ``'target'``.
        Canonical keys win when both are present; missing values default to
        ``'unknown'`` / ``'none'``. The caller's dict is not modified.
        """
        plan = dict(action_plan)
        plan['action'] = plan.get('action', plan.get('action_type', 'unknown'))
        target = plan.get('target_object')
        if target is None:
            target = plan.get('object', plan.get('target', 'none'))
        plan['target_object'] = target
        return plan

    def execute_action(self, action_plan):
        """Execute one action plan.

        Args:
            action_plan: Dict in either supported layout (see
                ``_normalize_plan``). ``'grasp'`` and ``'navigate'`` plans
                must also carry a ``'position'`` entry.

        Returns:
            True when the action was executed, False for any action other
            than ``'grasp'`` or ``'navigate'``.

        Raises:
            KeyError: If a grasp/navigate plan has no ``'position'``.
        """
        # Normalise first: indexing 'action' directly raised KeyError for the
        # mock-model dicts, which use 'action_type' instead.
        plan = self._normalize_plan(action_plan)
        print(f"\nExecuting action: {plan['action']} -> {plan['target_object']}")

        if plan['action'] == 'grasp':
            return self._execute_grasp(plan)
        elif plan['action'] == 'navigate':
            return self._execute_navigation(plan)
        else:
            print(f"Unknown action: {plan['action']}")
            return False

    def _execute_grasp(self, action_plan):
        """Simulate approaching and grasping; records the held object."""
        target_pos = action_plan['position']
        print(f"Approaching target at {target_pos}")

        # Simulate approach
        time.sleep(0.3)
        print("Aligned with target object")

        # Simulate grasp
        print("Grasping object")
        self.holding_object = action_plan['target_object']
        print(f"Successfully grasped {self.holding_object}")

        return True

    def _execute_navigation(self, action_plan):
        """Simulate path planning and move the robot to the plan's position."""
        target_pos = action_plan['position']
        print(f"Planning path to {target_pos}")

        # Simulate path planning
        time.sleep(0.2)
        print("Path planned and executing")

        # Update position
        self.position = target_pos
        print(f"Reached position {self.position}")

        return True

    def get_state(self):
        """Return the current position and held object as a dict."""
        return {
            'position': self.position,
            'holding': self.holding_object
        }

# Instantiate the robot simulator
robot_sim = VLARobotSimulator()
print("VLA Robot Simulator created")

# Replay the results collected in `all_results` (one entry per instruction
# processed by `vla_model` in section 3) on the simulated robot.
for plan_number, result in enumerate(all_results, start=1):
    print(f"\n=== Executing Plan {plan_number}: {result['instruction']} ===")

    # Use the first generated action; substitute a no-op plan when the
    # model returned nothing for this instruction.
    if result['actions']:
        action_plan = result['actions'][0]
    else:
        action_plan = {'action': 'unknown', 'target_object': 'none', 'position': (0, 0)}

    success = robot_sim.execute_action(action_plan)

    # Report where the robot ended up and what it is holding
    state = robot_sim.get_state()
    print(f"Robot state: Position={state['position']}, Holding={state['holding']}")

print("\nAll VLA action plans executed")

## 7. VLA Model Training Concepts

Let's explore the concepts behind training VLA models with multimodal datasets.

In [None]:
# Visualize the VLA training process
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Seeded generator so the figure is identical on every Restart & Run All
TRAINING_VIZ_SEED = 42
rng = np.random.default_rng(TRAINING_VIZ_SEED)

# Plot 1: Training data distribution
num_samples = 1000
vision_features = rng.standard_normal((num_samples, 512))  # Simulated vision features
language_features = rng.standard_normal((num_samples, 512))  # Simulated language features
action_features = rng.standard_normal((num_samples, 64))  # Simulated action features

# Simulate correlation between modalities: each fused dimension is the sum of
# the matching vision and language dimensions plus a little noise.
correlated_data = vision_features[:, :100] + language_features[:, :100] + rng.standard_normal((num_samples, 100)) * 0.1

# Plot a vision feature against the fused feature built from it. Plotting two
# different fused dimensions against each other (as before) shows independent
# noise, which contradicts the "correlated" title.
ax1.scatter(vision_features[:, 0], correlated_data[:, 0], alpha=0.6)
ax1.set_title('Correlated Features in VLA Training Data', fontweight='bold')
ax1.set_xlabel('Vision Feature (dimension 1)')
ax1.set_ylabel('Fused Vision+Language Feature (dimension 1)')
ax1.grid(True, alpha=0.3)

# Plot 2: Training loss over epochs (exponential decay plus Gaussian noise;
# validation decays more slowly and plateaus slightly higher)
epochs = 50
train_loss = 2.0 * np.exp(-np.arange(epochs) * 0.1) + 0.1 + 0.1 * rng.standard_normal(epochs)
val_loss = 2.2 * np.exp(-np.arange(epochs) * 0.08) + 0.15 + 0.12 * rng.standard_normal(epochs)

ax2.plot(train_loss, label='Training Loss', linewidth=2)
ax2.plot(val_loss, label='Validation Loss', linewidth=2)
ax2.set_title('VLA Model Training: Loss Over Epochs', fontweight='bold')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Loss')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("VLA training visualization complete")
print("Note: In real VLA training, models learn to connect vision, language, and action modalities")
print("through large datasets of human demonstrations and multimodal supervision.")

## Summary

In this notebook, we've covered:
1. Vision-Language-Action (VLA) model architecture
2. Processing natural language instructions with visual input
3. Multi-modal integration of vision and language
4. Action generation and robot execution simulation
5. VLA model training concepts
6. Visualization of VLA architecture and processes

VLA models represent a significant advancement in robotics, enabling robots to understand natural language commands and execute complex tasks by connecting visual perception with action. These models are crucial for developing more intuitive and flexible robotic systems.

In [None]:
# Wrap-up: recap the chapter and report how much work the earlier cells did
print("\nChapter 4 complete! You've learned about Vision-Language-Action models.")
print("VLA concepts covered:")
for topic in (
    "VLA model architecture",
    "Natural language instruction processing",
    "Multi-modal integration",
    "Action generation and execution",
    "Training concepts",
):
    print(f"- {topic}")

print("\nVLA model simulation completed successfully!")
print(f"Processed {len(instructions)} natural language instructions")
print(f"Executed {len(all_results)} action plans")