In [1]:
import numpy as np
import gymnasium as gym
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
from imitation.data.wrappers import RolloutInfoWrapper

# Build the vectorized seals CartPole environment. The generator is seeded
# (instead of drawing OS entropy) so that a fresh "Restart & Run All"
# reproduces the same environment randomness.
env = make_vec_env(
    "seals:seals/CartPole-v0",
    rng=np.random.default_rng(0),
    post_wrappers=[
        lambda env, _: RolloutInfoWrapper(env)
    ],  # needed for computing rollouts later
)
# Keep the (observation, action) spaces together for later reference.
env_specs = env.observation_space, env.action_space
print('Observation Spec:', env.observation_space)
print('Action Spec:', env.action_space)

# Load the PPO expert policy published under the HumanCompatibleAI
# organization for this environment, bound to the vectorized env above.
expert = load_policy(
    "ppo-huggingface",
    organization="HumanCompatibleAI",
    env_name="seals/CartPole-v0",
    venv=env,
)

Observation Spec: Box([-3.4028235e+38 -3.4028235e+38 -3.1415927e+00 -3.4028235e+38], [3.4028235e+38 3.4028235e+38 3.1415927e+00 3.4028235e+38], (4,), float32)
Action Spec: Discrete(2)


In [2]:
# Reset the vectorized environment and display the initial observations.
# The recorded output is a float32 array with one row per sub-environment
# (8 rows of 4 values each).
env.reset()

array([[ 0.01369617, -0.02302133, -0.04590265, -0.04834723],
       [ 0.00118216,  0.04504637, -0.03558404,  0.04486495],
       [-0.02383879, -0.02015088,  0.03142257, -0.04080841],
       [-0.04143508, -0.02631895,  0.03012745,  0.0082162 ],
       [ 0.04430561,  0.00113276,  0.04762437, -0.0419164 ],
       [ 0.03050029,  0.03079408,  0.00153256, -0.02141986],
       [ 0.00381644, -0.01567291, -0.01309328, -0.01255032],
       [ 0.01250955,  0.03972138,  0.02756857, -0.02747928]],
      dtype=float32)

In [3]:
from stable_baselines3.common.evaluation import evaluate_policy

# Sanity-check the expert over 10 evaluation episodes. Only the first
# returned value is kept; the second one is deliberately discarded.
reward, _ = evaluate_policy(expert, env, n_eval_episodes=10)
print(reward)

500.0


In [4]:
from imitation.data import rollout

# Seeded generator (rather than OS entropy) so that re-running the notebook
# draws the same random numbers while sampling expert demonstrations.
rng = np.random.default_rng(0)

# Keep sampling until at least 50 complete episodes have been collected;
# no minimum number of timesteps is imposed.
rollouts = rollout.rollout(
    expert,
    env,
    rollout.make_sample_until(min_timesteps=None, min_episodes=50),
    rng=rng,
)

# Flatten the list of trajectories into a single Transitions object.
transitions = rollout.flatten_trajectories(rollouts)

In [5]:
# Summarize the collected data: number and type of trajectories, the type and
# size of the flattened object, and the names of the fields it stores
# (read from the instance's __dict__).
# The last line of the message previously ended with a stray double-quote
# character that was printed verbatim; it has been removed.
print(
    f"""The `rollout` function generated a list of {len(rollouts)} {type(rollouts[0])}.
After flattening, this list is turned into a {type(transitions)} object containing {len(transitions)} transitions.
The transitions object contains arrays for: {', '.join(transitions.__dict__.keys())}.
"""
)

The `rollout` function generated a list of 56 <class 'imitation.data.types.TrajectoryWithRew'>.
After flattening, this list is turned into a <class 'imitation.data.types.Transitions'> object containing 28000 transitions.
The transitions object contains arrays for: obs, acts, infos, next_obs, dones."



In [13]:
# Display the first collected trajectory. Its repr prints the full `obs` and
# `acts` arrays, so the output is long.
rollouts[0]

TrajectoryWithRew(obs=array([[-0.04716803, -0.03757167,  0.01706244,  0.01471895],
       [-0.04791947, -0.23293412,  0.01735682,  0.31273606],
       [-0.05257815, -0.03806367,  0.02361154,  0.02557709],
       ...,
       [ 0.23058708, -0.0370378 , -0.01744288,  0.00292787],
       [ 0.22984631, -0.23190531, -0.01738432,  0.2900567 ],
       [ 0.22520821, -0.03653984, -0.01158319, -0.00805794]],
      dtype=float32), acts=array([0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 

In [7]:
# Display the flattened dataset; the repr shows the obs, acts, infos,
# next_obs and dones arrays in abbreviated form.
transitions

Transitions(obs=array([[-0.04716803, -0.03757167,  0.01706244,  0.01471895],
       [-0.04791947, -0.23293412,  0.01735682,  0.31273606],
       [-0.05257815, -0.03806367,  0.02361154,  0.02557709],
       ...,
       [ 0.1094296 , -0.18791756, -0.00134319,  0.25827387],
       [ 0.10567125,  0.00722354,  0.00382229, -0.0348324 ],
       [ 0.10581572, -0.18795301,  0.00312564,  0.25905403]],
      dtype=float32), acts=array([0, 1, 0, ..., 1, 0, 1], dtype=int64), infos=array([{}, {}, {}, ..., {}, {}, {}], dtype=object), next_obs=array([[-0.04791947, -0.23293412,  0.01735682,  0.31273606],
       [-0.05257815, -0.03806367,  0.02361154,  0.02557709],
       [-0.05333942, -0.23351614,  0.02412308,  0.3256152 ],
       ...,
       [ 0.10567125,  0.00722354,  0.00382229, -0.0348324 ],
       [ 0.10581572, -0.18795301,  0.00312564,  0.25905403],
       [ 0.10205666,  0.00712418,  0.00830672, -0.03264138]],
      dtype=float32), dones=array([False, False, False, ..., False, False,  True]))

In [8]:
# Change the kernel's working directory to the `acme` sub-directory, using the
# explicit line magic instead of relying on automagic.
%cd acme

C:\Users\Adhula\Downloads\acme


In [9]:
import acme

In [10]:
class BehaviorCloningAgent(acme.Actor):
  """Actor that returns expert actions and performs no learning.

  Actions come from `expert_policy` when one is supplied. Otherwise the agent
  falls back to a module-level function named
  `get_expert_action_from_rollout`, which the surrounding code must define.
  The observation and update hooks are intentionally no-ops.
  """

  def __init__(self, env_specs=None, expert_policy=None):
    """Creates the agent.

    Args:
      env_specs: Optional environment specs. Stored for reference only; no
        method of this class reads it.
      expert_policy: Optional callable that maps a transformed state (the
        tuple produced by `transform_state`) to an action. When omitted,
        `select_action` uses the module-level fallback described on the class.
    """
    # No Q-value table needed for behavior cloning
    self._env_specs = env_specs
    self._expert_policy = expert_policy

  def transform_state(self, state):
    """Converts an iterable observation into a tuple of ints.

    Args:
      state: Iterable whose items can each be passed to `int()`.

    Returns:
      A tuple with `int(x)` applied to every item of `state`.
    """
    # NOTE(review): int() drops the fractional part, so continuous
    # observations with small magnitudes all map to 0 -- confirm this
    # discretization is really wanted for the target environment.
    return tuple(map(int, state))

  def select_action(self, observation):
    """Returns the expert action for `observation`.

    Args:
      observation: Raw observation; it is passed through `transform_state`
        before being handed to the expert.

    Returns:
      Whatever the expert callable returns for the transformed state.

    Raises:
      NotImplementedError: If no `expert_policy` was given and no
        module-level `get_expert_action_from_rollout` exists.
    """
    state = self.transform_state(observation)
    if self._expert_policy is not None:
      return self._expert_policy(state)
    # Backward-compatible path: look the hook up at call time so it may be
    # defined after this class, and fail with an actionable message (rather
    # than a bare NameError) when it is missing.
    lookup = globals().get("get_expert_action_from_rollout")
    if lookup is None:
      raise NotImplementedError(
          "No expert available: pass `expert_policy` to BehaviorCloningAgent "
          "or define `get_expert_action_from_rollout(state)`."
      )
    return lookup(state)

  def observe_first(self, timestep):
    """No-op: the agent keeps no per-episode state."""
    pass  # No action needed here

  def observe(self, action, next_timestep):
    """No-op: the agent does not record transitions."""
    pass  # No action needed here

  def update(self, wait=False):
    """No-op: there are no parameters to update.

    Args:
      wait: Accepted and ignored; defaults to False so that calling
        `update()` with no arguments keeps working.
    """
    pass  # No update needed in behavior cloning

# No expert callable is supplied here, so `select_action` depends on the
# module-level `get_expert_action_from_rollout` hook being defined.
bc = BehaviorCloningAgent()

In [11]:
# Imports for running the agent inside an Acme environment loop.
#from acme import types
from acme.types import Batches
from acme.wrappers import gym_wrapper
from acme.environment_loop import EnvironmentLoop
from acme.utils.loggers import TerminalLogger, InMemoryLogger

# environments
# NOTE(review): this rebinds the name `gym`, which the first cell bound to
# `gymnasium` (`import gymnasium as gym`); cells run after this one see the
# `gym` package under that name instead.
import gym
#import dm_env

# other
# numpy is already imported as `np` in the first cell; this repeat is harmless.
import numpy as np

In [12]:
# Run a single episode of the agent in an Acme environment loop, logging to
# an in-memory logger.
# FIXME: as recorded in the output below, this cell fails with
# "AttributeError: 'DummyVecEnv' object has no attribute 'reward_spec'".
# `env` is the vectorized environment built by make_vec_env and does not
# expose `reward_spec`.
# NOTE(review): presumably EnvironmentLoop needs a single, non-vectorized
# environment adapted to the interface it expects (the imported but unused
# `gym_wrapper` looks intended for that) -- TODO confirm.
loop = EnvironmentLoop(env, bc, logger=InMemoryLogger())
loop.run_episode()

AttributeError: 'DummyVecEnv' object has no attribute 'reward_spec'