In [None]:
import gymnasium as gym
import numpy as np
import torch

from collections import defaultdict
from gym_wrapper import GymWrapperRecorder

from datasets import Dataset
from sb3_contrib import TRPO

In [None]:
# Choose the torch backend in order of preference: Apple MPS, then the first CUDA GPU, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Environment id; its prefix (before the '-') is reused later to name the saved agent.
env_name = "HalfCheetah-v4"

# Create the Gymnasium task, then wrap it in the project's recorder wrapper. The wrapped
# `env` is what the agent interacts with, and its episodes are read back later through
# `env.get_all_episodes()`.
raw_env = gym.make(env_name)
env = GymWrapperRecorder(raw_env)

In [None]:
# File stem for the agent, built from the env id prefix: 'HalfCheetah-v4' -> 'trpo_halfcheetah'.
env_family = env_name.split('-')[0]
model_name = f"trpo_{env_family.lower()}"

# Train a TRPO agent with an MLP policy on the wrapped env (silent logging, pinned to CPU),
# then store it under ./agents/.
model = TRPO("MlpPolicy", env, verbose=0, device="cpu")
model.learn(total_timesteps=1_000_000)
model.save(f"./agents/{model_name}")

# To reuse a previously saved agent instead of retraining:
# model = TRPO.load(f"./agents/{model_name}")
# NOTE(review): the next cell exports the episodes held by `env`, presumably those gathered
# during learn(); loading a saved agent would skip that data collection - confirm before use.

In [None]:
# Pull every episode held by the recorder wrapper and convert it into plain nested lists,
# one dataset row per episode.
td = env.get_all_episodes()
d = defaultdict(list)
returns = []

for episode in td:
    # Per-step vectors: each step's array becomes its own list.
    for key in ('observations', 'actions'):
        d[key].append([list(step_values) for step_values in episode[key]])
    # Per-step scalars: one flat list per episode.
    for key in ('rewards', 'dones'):
        d[key].append(list(episode[key]))
    # Undiscounted episode return, used only for the summary printed below.
    returns.append(sum(episode['rewards']))

ds = Dataset.from_dict(d)

# One-line summary of the episode returns, every statistic rounded to 4 decimals.
return_stats = {"Avg": np.mean, "Std": np.std, "Min": np.min, "Median": np.median, "Max": np.max}
stats_text = " | ".join(f"{label}: {np.round(reducer(returns), 4)}" for label, reducer in return_stats.items())
print(f"Episode returns | {stats_text}")
print(ds)

In [None]:
# Write the dataset built in the previous cell to ./datasets/, keyed by the agent name.
train_dataset_path = f'./datasets/{model_name}_train'
ds.save_to_disk(train_dataset_path)

In [None]:
TOTAL_TRAJECTORIES = 1000  # number of rollouts to run
MAX_EP_LEN = 1000          # hard cap on the number of steps per rollout
RETURNS_SCALE = 1000.0     # not used in this cell; kept in case other cells rely on it


def _print_summary(label, values):
    """Print avg/std/min/median/max of `values` on one line, each rounded to 4 decimals."""
    reducers = (("Avg", np.mean), ("Std", np.std), ("Min", np.min), ("Median", np.median), ("Max", np.max))
    print(f"{label} | " + " | ".join(f"{name}: {np.round(fn(values), 4)}" for name, fn in reducers))


# Roll out the current policy for TOTAL_TRAJECTORIES episodes, tracking the length and
# undiscounted return of each one.
eval_dict = defaultdict(list)

# NOTE(review): `restart()` is a method of the project's recorder wrapper; presumably it
# clears previously recorded episodes so only these rollouts are exported - confirm.
# Its return value is not needed because every rollout begins with `env.reset()`.
env.restart()

for i in range(TOTAL_TRAJECTORIES):
    ep_return = 0.0
    ep_len = 0
    obs, _ = env.reset()

    while True:
        # Uses predict()'s default sampling mode, as in the original cell.
        action, _ = model.predict(obs)

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The previous code discarded `truncated`, so time-limit truncation was never seen.
        obs, reward, terminated, truncated, _ = env.step(action)

        ep_return += reward
        ep_len += 1

        # Stop on either end-of-episode signal, or once exactly MAX_EP_LEN steps have been
        # taken. The previous `ep_len == MAX_EP_LEN - 1` check cut rollouts one step short.
        if terminated or truncated or ep_len >= MAX_EP_LEN:
            break

    eval_dict['iter'].append(i)
    eval_dict['ep_length'].append(ep_len)
    eval_dict['ep_return'].append(ep_return)

_print_summary("Episode lengths", eval_dict['ep_length'])
_print_summary("Episode returns", eval_dict['ep_return'])
print("\n")



In [None]:
# Read back the episodes currently held by the recorder wrapper and flatten them into
# plain nested lists, one dataset row per episode.
td = env.get_all_episodes()
d = defaultdict(list)

for episode in td:
    # Per-step vectors: each step's array becomes its own list.
    for key in ('observations', 'actions'):
        d[key].append([list(step_values) for step_values in episode[key]])
    # Per-step scalars: one flat list per episode.
    for key in ('rewards', 'dones'):
        d[key].append(list(episode[key]))

ds = Dataset.from_dict(d)
print(ds)

In [None]:
# Write the dataset built in the previous cell to ./datasets/, keyed by the agent name.
eval_dataset_path = f'./datasets/{model_name}_eval'
ds.save_to_disk(eval_dataset_path)