# LIBERO Pro 评估 Jupyter Notebook

这个 notebook 允许你分步执行 LIBERO Pro 评估，支持灵活的参数配置。

## Cell 1: 导入库

In [1]:
import sys
sys.path.append("/hdd/zijianwang/openpi/third_party/LIBERO-PRO")
import collections
import dataclasses
import json
import logging
import math
import os
import pathlib
import time
from datetime import datetime

import imageio
import numpy as np
import perturbation
import tqdm
import yaml
from PIL import Image

from libero.libero import benchmark
from libero.libero import get_libero_path
from libero.libero.envs import OffScreenRenderEnv
from openpi_client import image_tools
from openpi_client import websocket_client_policy as _websocket_client_policy

from util import compute_eef_trajectory_from_actions, build_reusable_value_map, evaluate_trajectory_with_value_map

print("✓ All libraries imported successfully")

11111111111111111111111




Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
✓ All libraries imported successfully


## Cell 2: 定义常量和辅助函数

In [2]:
# Constants
LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0]
LIBERO_ENV_RESOLUTION = 256  # resolution used to render training data

# Helper functions
def _quat2axisangle(quat):
    # clip quaternion
    if quat[3] > 1.0:
        quat[3] = 1.0
    elif quat[3] < -1.0:
        quat[3] = -1.0

    den = np.sqrt(1.0 - quat[3] * quat[3])
    if math.isclose(den, 0.0):
        return np.zeros(3)

    return (quat[:3] * 2.0 * math.acos(quat[3])) / den


def _get_libero_env(task, resolution, seed):
    task_description = task.language
    task_bddl_file = pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file
    env_args = {"bddl_file_name": task_bddl_file, "camera_heights": resolution, "camera_widths": resolution}
    env = OffScreenRenderEnv(**env_args)
    env.seed(seed)
    return env, task_description

print("✓ Constants and helper functions defined")

✓ Constants and helper functions defined


## Cell 3: 定义日志和配置管理函数

In [3]:
def setup_logging(args):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_id = f"LIBERO-PRO-{args['task_suite_name']}-{timestamp}"
    if args.get('run_id_note') is not None:
        run_id += f"-{args['run_id_note']}"

    os.makedirs(args['local_log_dir'], exist_ok=True)
    log_filepath = os.path.join(args['local_log_dir'], f"{run_id}.txt")

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.StreamHandler(), logging.FileHandler(log_filepath, encoding="utf-8")],
    )

    logger = logging.getLogger(__name__)
    logger.info(f"Experiment run ID: {run_id}")
    logger.info(f"Log file path: {log_filepath}")

    return logger, run_id, log_filepath


def save_experiment_config(args, run_id, log_filepath):
    if not args.get('save_experiment_config', True):
        return

    config_data = {
        "run_id": run_id,
        "timestamp": datetime.now().isoformat(),
        "args": args,
        "log_file": log_filepath,
    }

    config_filepath = os.path.join(args['local_log_dir'], f"{run_id}_config.json")
    with open(config_filepath, "w", encoding="utf-8") as f:
        json.dump(config_data, f, indent=2, ensure_ascii=False)

    logging.info(f"Experiment configuration saved to: {config_filepath}")


def save_episode_video(replay_images, task_description, episode_idx, success, run_id, args):
    task_segment = task_description.replace(" ", "_").replace("/", "_")
    task_video_dir = os.path.join(args['video_out_path'], run_id, task_segment)
    os.makedirs(task_video_dir, exist_ok=True)

    suffix = "success" if success else "failure"
    video_filename = f"episode_{episode_idx:03d}_{suffix}.mp4"
    video_filepath = os.path.join(task_video_dir, video_filename)

    try:
        imageio.mimwrite(
            video_filepath,
            [np.asarray(x) for x in replay_images],
            fps=10,
        )
        logging.info(f"Episode video saved: {video_filepath}")
    except Exception as e:
        logging.error(f"Failed to save episode video: {e}")

print("✓ Logging and config management functions defined")

✓ Logging and config management functions defined


## Cell 4: 设置超参数

在这个 Cell 中修改超参数以调整评估配置。

In [4]:
# ============================================================================
# Model server parameters
# ============================================================================
host = "0.0.0.0"
port = 8001
resize_size = 224
replan_steps = 5
sampling_bs = 8

# ============================================================================
# LIBERO environment-specific parameters
# ============================================================================
task_suite_name = "libero_spatial_displacement"  # Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90
num_steps_wait = 10  # Number of steps to wait for objects to stabilize in sim
num_trials_per_task = 1  # Number of rollouts per task

# ============================================================================
# LIBERO Pro parameters
# ============================================================================
evaluation_config_path = "/hdd/zijianwang/openpi/third_party/LIBERO-PRO/evaluation_config.yaml"

# ============================================================================
# Logging and experiment tracking parameters
# ============================================================================
local_log_dir = "./experiments/logs"
run_id_note = None
save_experiment_config_flag = True

# ============================================================================
# Video output
# ============================================================================
video_out_path = "./experiments/videos/"

# ============================================================================
# Random seed
# ============================================================================
seed = 7

# ============================================================================
# Combine all args into a dictionary
# ============================================================================
args = {
    'host': host,
    'port': port,
    'resize_size': resize_size,
    'replan_steps': replan_steps,
    'sampling_bs': sampling_bs,
    'task_suite_name': task_suite_name,
    'num_steps_wait': num_steps_wait,
    'num_trials_per_task': num_trials_per_task,
    'evaluation_config_path': evaluation_config_path,
    'local_log_dir': local_log_dir,
    'run_id_note': run_id_note,
    'save_experiment_config': save_experiment_config_flag,
    'video_out_path': video_out_path,
    'seed': seed,
}

print("✓ Hyperparameters configured:")
for key, value in args.items():
    print(f"  {key}: {value}")

✓ Hyperparameters configured:
  host: 0.0.0.0
  port: 8001
  resize_size: 224
  replan_steps: 5
  sampling_bs: 8
  task_suite_name: libero_spatial_displacement
  num_steps_wait: 10
  num_trials_per_task: 1
  evaluation_config_path: /hdd/zijianwang/openpi/third_party/LIBERO-PRO/evaluation_config.yaml
  local_log_dir: ./experiments/logs
  run_id_note: None
  save_experiment_config: True
  video_out_path: ./experiments/videos/
  seed: 7


## Cell 5: 运行评估

这个 Cell 执行完整的评估流程。

In [6]:
# Setup logging
logging.basicConfig(level=logging.INFO)
logger, run_id, log_filepath = setup_logging(args)

# Save experiment configuration
save_experiment_config(args, run_id, log_filepath)

# Set random seed
np.random.seed(args['seed'])

# # Initialize environment perturbation for LIBERO Pro
# with open(args['evaluation_config_path']) as f:
#     evaluation_cfg = yaml.safe_load(f)

# evaluation_cfg["bddl_files_path"] = evaluation_cfg.get("bddl_files_path", "") + "/" + args['task_suite_name']
# evaluation_cfg["task_suite_name"] = args['task_suite_name']

# if not os.path.exists(evaluation_cfg.get("init_file_dir", "") + args['task_suite_name'] + "_temp/"):
#     perturbation.create_env(configs=evaluation_cfg)

# Initialize LIBERO task suite
benchmark_dict = benchmark.get_benchmark_dict()
task_suite = benchmark_dict[args['task_suite_name']]()
num_tasks_in_suite = task_suite.n_tasks
logging.info(f"Task suite: {args['task_suite_name']}")

pathlib.Path(args['video_out_path']).mkdir(parents=True, exist_ok=True)

# Determine max_steps based on task suite
if "libero_spatial" in args['task_suite_name']:
    max_steps = 220
elif "libero_object" in args['task_suite_name']:
    max_steps = 280
elif "libero_goal" in args['task_suite_name']:
    max_steps = 300
elif "libero_10" in args['task_suite_name']:
    max_steps = 520
elif "libero_90" in args['task_suite_name']:
    max_steps = 400
else:
    raise ValueError(f"Unknown task suite: {args['task_suite_name']}")

client = _websocket_client_policy.WebsocketClientPolicy(args['host'], args['port'])

# Start evaluation
total_episodes, total_successes = 0, 0

logger.info(f"Starting evaluation of task suite: {args['task_suite_name']}")
logger.info(f"Number of tasks: {num_tasks_in_suite}")
logger.info(f"Trials per task: {args['num_trials_per_task']}")

print("✓ Initialization complete, starting task loop...")

INFO:__main__:Experiment run ID: LIBERO-PRO-libero_spatial_displacement-20251030_153314
INFO:__main__:Log file path: ./experiments/logs/LIBERO-PRO-libero_spatial_displacement-20251030_153314.txt
INFO:root:Experiment configuration saved to: ./experiments/logs/LIBERO-PRO-libero_spatial_displacement-20251030_153314_config.json
INFO:root:Task suite: libero_spatial_displacement
INFO:root:Waiting for server at ws://0.0.0.0:8001...
INFO:__main__:Starting evaluation of task suite: libero_spatial_displacement
INFO:__main__:Number of tasks: 10
INFO:__main__:Trials per task: 1


[info] Using default task order for benchmark 'libero_spatial_displacement' (10 tasks).
✓ Initialization complete, starting task loop...


In [None]:
# Get current timestamp for this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

video_out_dir = pathlib.Path(args['video_out_path']) / timestamp / args['task_suite_name']
video_out_dir.mkdir(parents=True, exist_ok=True)
for task_id in tqdm.tqdm(range(num_tasks_in_suite)):
    if task_id != 8:
        continue
    
    # Get task
    task = task_suite.get_task(task_id)
    initial_states = task_suite.get_task_init_states(task_id)
    env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args['seed'])

    # Start episodes
    task_episodes, task_successes = 0, 0
    logger.info(f"\nStarting task: {task_description}")

    all_episode_costs = []
    current_episode_costs = []

    ### Build reusable valuemap before task starts
    logger.info("Building reusable ValueMap...")
    reusable_valuemap = None
    try:
        env.reset()
        obs = env.set_init_state(initial_states[0])
        reusable_valuemap = build_reusable_value_map(env, task_description)

        if "error" not in reusable_valuemap:
            logger.info(f"Reusable ValueMap built successfully:")
            logger.info(f"  Target objects: {reusable_valuemap.get('target_objects', [])}")
            logger.info(f"  Avoid objects: {reusable_valuemap.get('avoid_objects', [])}")
        else:
            logger.warning(f"Failed to build reusable ValueMap: {reusable_valuemap.get('error', 'Unknown error')}")
            reusable_valuemap = None
    except Exception as e:
        logger.warning(f"Failed to build reusable ValueMap: {e}")
        reusable_valuemap = None

    # reusable_valuemap = None   

    
    ### Entering task loop
    for episode_idx in tqdm.tqdm(range(args['num_trials_per_task'])):
        if episode_idx != 0:
            continue
        logger.info(f"\nTask: {task_description}")
        logger.info(f"Episode {episode_idx + 1}/{args['num_trials_per_task']}")

        # Reset environment
        env.reset()
        robot_instance = env.robots[0]
        action_plan = collections.deque()
        obs = env.set_init_state(initial_states[episode_idx])

        # Setup
        t = 0
        replay_images = []
        current_episode_costs = []

        logger.info(f"Starting episode {task_episodes + 1}...")
        while t < max_steps + args['num_steps_wait']:
            # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects
            if t < args['num_steps_wait']:
                obs, reward, done, info = env.step(LIBERO_DUMMY_ACTION)
                t += 1
                continue

            # Get preprocessed image - note the 180 degree rotation
            img = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1])
            wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1])
            img = image_tools.convert_to_uint8(image_tools.resize_with_pad(img, args['resize_size'], args['resize_size']))
            wrist_img = image_tools.convert_to_uint8(
                image_tools.resize_with_pad(wrist_img, args['resize_size'], args['resize_size'])
            )

            # Image.fromarray(np.uint8(img)).save("./experiments/tmp/live_image.png")
            replay_images.append(img)

            if not action_plan:
                # Finished executing previous action chunk -- compute new chunk
                element = {
                    "observation/image": img,
                    "observation/wrist_image": wrist_img,
                    "observation/state": np.concatenate(
                        (
                            obs["robot0_eef_pos"],
                                _quat2axisangle(obs["robot0_eef_quat"]),
                            obs["robot0_gripper_qpos"],
                        )
                    ),
                    "prompt": str(task_description),
                    "sampling_bs": int(args['sampling_bs']),
                }

                # Query model to get action
                action_chunk = client.infer(element)["actions"]
                assert action_chunk.shape[-2] >= args['replan_steps'], (
                    f"We want to replan every {args['replan_steps']} steps, but policy only predicts {action_chunk.shape[-2]} steps."
                )

                # Evaluate trajectory using built valuemap
                try:
                    if action_chunk.ndim == 2:
                        action_chunk = np.expand_dims(action_chunk, axis=0)

                    batch_eef_positions = []
                    for i in range(action_chunk.shape[0]):
                        eef_traj = compute_eef_trajectory_from_actions(env, action_chunk[i])
                        batch_eef_positions.append(eef_traj)
                    eef_trajs = np.stack(batch_eef_positions, axis=0)

                    if reusable_valuemap is not None:
                        evaluation_result = evaluate_trajectory_with_value_map(
                            reusable_valuemap, eef_trajs, current_env=env
                        )
                        
                        traj_cost = evaluation_result["step_info"]["traj_cost"] # shape: (num_traj, num_steps)
                        best_traj_id = evaluation_result["step_info"]["best_traj_id"]
                        best_action_chunk = action_chunk[best_traj_id]
                        best_traj_cost = traj_cost[best_traj_id, : args['replan_steps']]

                        # Record cost data for current step
                        step_cost_data = {
                            "step": t,
                            "best_traj_cost": best_traj_cost.copy(),
                            "best_traj_id": best_traj_id,
                            "replan_steps": args['replan_steps'],
                        }
                        current_episode_costs.append(step_cost_data)
                        logger.info(f"Step {t} - Best traj cost: {best_traj_cost}")
                    else:
                        logger.warning("No available valuemap, skipping trajectory evaluation")
                        best_action_chunk = action_chunk[0]

                except Exception as e:
                    logger.warning(f"Trajectory evaluation failed: {e}")
                    best_action_chunk = action_chunk[0]

                action_plan.extend(best_action_chunk[: args['replan_steps']])

            action = action_plan.popleft()

            # Execute action in environment
            obs, reward, done, info = env.step(action.tolist())
            gripper_is_closed_result = is_gripper_closed(obs)
            if done:
                task_successes += 1
                total_successes += 1
                break
            t += 1

        task_episodes += 1
        total_episodes += 1

        # Save cost data for current episode
        episode_data = {
            "episode_idx": episode_idx,
            "task_description": task_description,
            "success": done,
            "total_steps": t,
            "costs": current_episode_costs.copy(),
        }
        all_episode_costs.append(episode_data)

        logger.info(
            f"Episode {episode_idx + 1} completed - Success: {done}, Steps: {t}, Cost records: {len(current_episode_costs)}"
        )

        # Save a replay video of the episode
        # save_episode_video(replay_images, task_description, episode_idx, done, run_id, args)
        suffix = "success" if done else "failure"
        task_segment = task_description.replace(" ", "_")
        video_filename = f"task{task_id:02d}_ep{episode_idx:03d}_{task_segment}_{suffix}.mp4"
        video_path = video_out_dir / video_filename
        
        imageio.mimwrite(
            video_path,
            [np.asarray(x) for x in replay_images],
            fps=24,
        )

        # Log current results
        logger.info(f"Episode result: {'Success' if done else 'Failure'}")
        logger.info(f"Completed episodes: {total_episodes}")
        logger.info(f"Successful episodes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)")

    # Log final results
    task_success_rate = float(task_successes) / float(task_episodes) if task_episodes > 0 else 0
    total_success_rate = float(total_successes) / float(total_episodes) if total_episodes > 0 else 0

    logger.info(f"Current task success rate: {task_success_rate:.4f} ({task_success_rate * 100:.1f}%)")
    logger.info(f"Overall success rate: {total_success_rate:.4f} ({total_success_rate * 100:.1f}%)")
    logger.info(f"Current task episodes: {task_episodes}, successful: {task_successes}")
    logger.info(f"Total episodes: {total_episodes}, total successful: {total_successes}")

    # Save cost data for current task
    task_cost_file = f"./experiments/cost/{run_id}/cost_data_task_{task_id}.json"
    os.makedirs(os.path.dirname(task_cost_file), exist_ok=True)
    with open(task_cost_file, "w") as f:
        json.dump(all_episode_costs, f, indent=2, default=str)
    logger.info(f"Cost data saved to: {task_cost_file}")

    break

# Calculate final results
final_success_rate = float(total_successes) / float(total_episodes) if total_episodes > 0 else 0

# Log final results
logger.info("=" * 60)
logger.info("Experiment completed - Final results:")
logger.info(f"Total episodes: {total_episodes}")
logger.info(f"Total successful: {total_successes}")
logger.info(f"Final success rate: {final_success_rate:.4f} ({final_success_rate * 100:.1f}%)")
logger.info("=" * 60)
logger.info(f"Experiment run ID: {run_id}")
logger.info(f"Log file: {log_filepath}")
logger.info("Experiment completed!")

print("\n✓ Evaluation complete!")

  0%|          | 0/10 [00:00<?, ?it/s]



INFO:__main__:
Starting task: pick up the black bowl next to the plate and place it on the plate
INFO:__main__:Building reusable ValueMap...
INFO:root:可重复使用ValueMap构建完成:
INFO:root:  目标对象: ['akita_black_bowl_1']
INFO:root:  避免对象: ['akita_black_bowl_2', 'cookies_1', 'glazed_rim_porcelain_ramekin_1', 'plate_1', 'wooden_cabinet_1', 'flat_stove_1']
INFO:root:  地图大小: 100
INFO:root:  分辨率: [0.011  0.016  0.0105]
INFO:root:  当前末端执行器位置: [-0.20846466  0.          1.17327948]
INFO:root:已恢复原始环境状态
INFO:__main__:Reusable ValueMap built successfully:
INFO:__main__:  Target objects: ['akita_black_bowl_1']
INFO:__main__:  Avoid objects: ['akita_black_bowl_2', 'cookies_1', 'glazed_rim_porcelain_ramekin_1', 'plate_1', 'wooden_cabinet_1', 'flat_stove_1']


##################################################
## voxel resolution: [0.011  0.016  0.0105]
##################################################




INFO:__main__:
Task: pick up the black bowl next to the plate and place it on the plate
INFO:__main__:Episode 1/1
INFO:__main__:Starting episode 1...


** saving visualization to experiments/tmp/visualizations/16:9:35.html ...
** saving visualization to experiments/tmp/visualizations/latest.html ...


INFO:__main__:Step 10 - Best traj cost: [0.23895795 0.23895795 0.2362535  0.22659561 0.22659561]


** save to experiments/tmp/visualizations/16:9:35.html


  0%|          | 0/1 [05:35<?, ?it/s]
 80%|████████  | 8/10 [05:44<01:26, 43.03s/it]


KeyboardInterrupt: 

## Cell 6: 评估总结

显示评估结果和输出位置。

In [None]:
print("\n" + "="*70)
print("EVALUATION COMPLETE")
print("="*70)
print(f"\nConfiguration Summary:")
print(f"  Task Suite: {args['task_suite_name']}")
print(f"  Number of Trials per Task: {args['num_trials_per_task']}")
print(f"  Model Server: {args['host']}:{args['port']}")
print(f"  Image Resize Size: {args['resize_size']}")
print(f"  Replan Steps: {args['replan_steps']}")
print(f"\nOutput Locations:")
print(f"  Log Directory: {args['local_log_dir']}")
print(f"  Video Output: {args['video_out_path']}")
print(f"  Cost Output: ./experiments/cost/")
print(f"\nResults:")
print(f"  Final Success Rate: {final_success_rate:.4f} ({final_success_rate * 100:.1f}%)")
print(f"  Total Episodes: {total_episodes}")
print(f"  Total Successes: {total_successes}")
print("="*70)