In [None]:
import gymnasium as gym
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3 import PPO
import os
import random
import numpy as np
from torch import nn
import torch
from torch.nn import LogSoftmax
import torch.optim as optim

In [None]:
# Build the HalfCheetah-v2 environment through the Gymnasium registry.
env = gym.make('HalfCheetah-v2')

In [None]:
# Environment and PPO policy. While training, a checkpoint is requested every
# 4000 callback steps and written under ./logs/ with the "rl_model" prefix.
env = gym.make('HalfCheetah-v2')

checkpoint_callback = CheckpointCallback(
    name_prefix="rl_model",
    save_path="./logs/",
    save_freq=4000,
    save_vecnormalize=True,
    save_replay_buffer=True,
)

model = PPO(policy="MlpPolicy", env=env, verbose=1)
# Kept as the last expression so the cell still displays the trained model.
model.learn(callback=checkpoint_callback, total_timesteps=64000)

  logger.deprecation(
  logger.deprecation(


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -404     |
| time/              |          |
|    fps             | 1609     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -423        |
| time/                   |             |
|    fps                  | 1113        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010157889 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x7f8a71dd07f0>

In [None]:
class Dataset():
    """Collect policy rollouts and turn them into ranked pairs of segments.

    Attributes:
        dir: Directory path given at construction; stored but not read by any
            method of this class.
        size: Number of ranked pairs produced by ``generate_ranked_pairs``.
        env: Environment used for rollouts. It must expose ``reset()``
            returning ``(observation, info)`` and ``step(action)`` returning
            ``(observation, reward, terminated, truncated, info)``.
        trajs: Empty list until ``generate_dataset`` has run; afterwards a
            tuple ``(obs, actions, rewards)`` of arrays with equal length
            along axis 0.
    """

    def __init__(self, env, dir="./logs", size=1000):
        self.dir = dir
        self.size = size
        self.env = env
        self.trajs = []

    def generate_trajectory(self, model, min_len=100, max_steps=1000, render=False):
        """Roll ``model`` out in ``self.env`` and record aligned transitions.

        Index ``t`` of every returned sequence describes the same transition:
        ``obs[t]`` is the state the policy saw, ``actions[t]`` the action it
        chose and ``rewards[t]`` the reward received for it. Observations
        produced by the final step of an episode are not recorded.

        Args:
            model: Object whose ``predict(state)`` returns ``(action, extra)``.
            min_len: Minimum number of transitions to collect. If an episode
                ends earlier the environment is reset and recording continues
                in the same buffers.
            max_steps: Hard cap on the number of recorded transitions.
            render: When True, call ``env.render()`` after every step. Off by
                default because rendering is not needed to record data.

        Returns:
            Tuple ``(obs, actions, rewards)``: two stacked arrays and a list of
            per-step rewards, all of the same length.
        """
        env = self.env  # use the injected environment, not a notebook global
        state = env.reset()[0]
        obs, actions, rewards = [], [], []
        for _ in range(max_steps):
            action, _extra = model.predict(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            if render:
                env.render()
            # Store the pre-step state so observations stay aligned with the
            # action and reward of the same index.
            obs.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state

            # An episode is over on either flag; time-limit truncation must
            # not be stepped through.
            if terminated or truncated:
                if len(obs) >= min_len:
                    break
                state = env.reset()[0]
        return (np.stack(obs, axis=0), np.stack(actions, axis=0), rewards)

    def generate_dataset(self, models):
        """Roll out every model once and store the concatenated transitions.

        Replaces any previously stored data, so the method can be called more
        than once on the same instance.

        Args:
            models: Iterable of policies accepted by ``generate_trajectory``.

        Raises:
            ValueError: If ``models`` is empty.
        """
        trajectories = [self.generate_trajectory(model) for model in models]
        if not trajectories:
            raise ValueError("generate_dataset needs at least one model")
        obs, actions, rewards = zip(*trajectories)
        self.trajs = (
            np.concatenate(obs, axis=0),
            np.concatenate(actions, axis=0),
            np.concatenate(rewards, axis=0),
        )

    def generate_ranked_pairs(self, segment_length=50):
        """Sample ``self.size`` random segment pairs labelled by true return.

        Args:
            segment_length: Number of consecutive transitions per segment.

        Returns:
            List of ``(segment_1_obs, segment_2_obs, label)`` where ``label`` is
            0 when segment 1 has the strictly larger summed reward and 1
            otherwise (ties are labelled 1).

        Raises:
            ValueError: If no dataset has been generated yet or it holds fewer
                than ``segment_length`` transitions.
        """
        if not isinstance(self.trajs, tuple):
            raise ValueError("call generate_dataset() before generate_ranked_pairs()")
        obs, actions, rewards = self.trajs

        # NOTE(review): segments are drawn from the concatenated buffer, so a
        # segment may straddle the boundary between two rollouts.
        last_start = min(len(obs), len(rewards)) - segment_length
        if last_start < 0:
            raise ValueError(
                "dataset holds fewer than segment_length={} transitions".format(segment_length)
            )

        ranked_dataset = []
        for _ in range(self.size):
            # random.randint is inclusive on both ends.
            t1_start = random.randint(0, last_start)
            t2_start = random.randint(0, last_start)
            t1_end = t1_start + segment_length
            t2_end = t2_start + segment_length
            t1_return = float(np.sum(rewards[t1_start:t1_end]))
            t2_return = float(np.sum(rewards[t2_start:t2_end]))
            label = 0 if t1_return > t2_return else 1
            ranked_dataset.append((obs[t1_start:t1_end], obs[t2_start:t2_end], label))

        return ranked_dataset

In [None]:
# create ranked pairs of trajectories
def create_ranked_pairs(env=env, log_dir="./logs"):
    """Load every PPO checkpoint in ``log_dir`` and build ranked segment pairs.

    Args:
        env: Environment passed both to ``Dataset`` and to ``PPO.load``.
        log_dir: Directory holding the ``.zip`` checkpoints.

    Returns:
        The list produced by ``Dataset.generate_ranked_pairs``.

    Raises:
        ValueError: If ``log_dir`` contains no ``.zip`` checkpoint.
    """
    dataset = Dataset(env=env)

    # Only .zip entries are checkpoints; anything else in the directory is
    # skipped instead of having its last four characters stripped blindly.
    # Sorting makes the load order reproducible (os.listdir order is arbitrary).
    checkpoint_names = sorted(
        name for name in os.listdir(log_dir) if name.endswith(".zip")
    )
    if not checkpoint_names:
        raise ValueError("no .zip checkpoints found in {}".format(log_dir))

    models = []
    for name in checkpoint_names:
        # The path is passed without its ".zip" suffix, as before.
        model_path = os.path.join(log_dir, name[:-4])
        models.append(PPO.load(model_path, env=env))
    dataset.generate_dataset(models=models)

    return dataset.generate_ranked_pairs()

ranked_pair_dataset = create_ranked_pairs(env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

In [None]:
class RewardModel(nn.Module):
    """MLP that scores single observations for preference-based reward learning.

    A segment's predicted return is the sum of the per-observation scores. The
    model is fitted on ``(segment_1, segment_2, label)`` triples so that the
    segment named by ``label`` (0 or 1) receives the higher log-softmax
    probability over the two predicted returns.
    """

    def __init__(self, in_dims=17):
        """Build the network.

        Args:
            in_dims: Size of one observation vector (presumably 17 to match
                the HalfCheetah observation; confirm for other environments).
        """
        super().__init__()
        self.in_dims = in_dims
        # Log-softmax over the two predicted returns; its negative at the
        # label index is the per-pair loss.
        self.loss_fn = LogSoftmax(dim=1)

        self.linear_relu_stack = nn.Sequential(
            nn.Linear(self.in_dims, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        """Return one scalar score per row of ``x`` (last dimension of size 1)."""
        logits = self.linear_relu_stack(x)
        return logits

    def train(self, x=True, optimizer=None, epochs=20):
        """Fit on ranked pairs, or switch train/eval mode when given a bool.

        This name collides with ``nn.Module.train(mode)``, which
        ``nn.Module.eval()`` calls internally. A boolean first argument is
        therefore forwarded to the base class so ``model.eval()`` and
        ``model.train()`` keep working; any other value is treated as the
        ranked-pair dataset.

        Args:
            x: Either a bool (module mode) or a re-iterable collection of
                ``(segment_1, segment_2, label)`` triples with NumPy segments.
            optimizer: Optimizer over this module's parameters; required when
                ``x`` is a dataset.
            epochs: Number of passes over the dataset.

        Returns:
            ``self`` when switching mode, ``None`` after fitting.

        Raises:
            ValueError: If a dataset is given without an optimizer.
        """
        if isinstance(x, bool):
            return super().train(x)
        if optimizer is None:
            raise ValueError("an optimizer is required to fit the reward model")
        self.fit(x, optimizer, epochs=epochs)

    def fit(self, x, optimizer, epochs=20, batch_size=64):
        """Run mini-batch training and return the mean batch loss per epoch.

        Every epoch starts with an empty batch and the final, possibly
        smaller, batch is also applied, so no pair is skipped or carried over
        into the next epoch.

        Args:
            x: Re-iterable collection of ``(segment_1, segment_2, label)``.
            optimizer: Optimizer over this module's parameters.
            epochs: Number of passes over ``x``.
            batch_size: Number of pairs averaged into one optimizer step.

        Returns:
            List with one float per epoch (NaN for an epoch with no data).
        """
        history = []
        for e in range(1, epochs + 1):
            batch_losses = []
            pending = []
            for t1, t2, y in x:
                pending.append(self._pair_loss(t1, t2, y))
                if len(pending) == batch_size:
                    batch_losses.append(self._apply_batch(pending, optimizer))
                    pending = []
            if pending:
                # Flush the remainder so the tail of the dataset is trained on.
                batch_losses.append(self._apply_batch(pending, optimizer))
            epoch_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float("nan")
            print("loss after {} epoch: {}".format(e, epoch_loss))
            history.append(epoch_loss)
        return history

    def _pair_loss(self, t1, t2, y):
        """Return the negative log-probability of label ``y`` for one pair."""
        t1_return_predicted = torch.sum(self(torch.from_numpy(t1).to(torch.float32)))
        t2_return_predicted = torch.sum(self(torch.from_numpy(t2).to(torch.float32)))
        # Shape [1, 2] so that LogSoftmax(dim=1) normalises across the pair.
        t_stack = torch.stack((t1_return_predicted, t2_return_predicted), dim=0).reshape((1, 2))
        return -self.loss_fn(t_stack)[0][y]

    def _apply_batch(self, pending, optimizer):
        """Average the pending pair losses, take one optimizer step, return the loss."""
        loss = torch.stack(pending).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()


In [None]:
# Fresh reward network and the Adam optimiser that updates its weights.
# Note: this rebinds the name `model`, previously bound to the PPO policy.
model = RewardModel()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Fit the reward model on the ranked segment pairs with the optimizer created
# above; the number of epochs is left at its default.
model.train(ranked_pair_dataset, optimizer=optimizer)

loss after 1 epoch: 0.6357890963554382
loss after 2 epoch: 0.4068799614906311
loss after 3 epoch: 0.34277665615081787
loss after 4 epoch: 0.2707604169845581
loss after 5 epoch: 0.26000961661338806
loss after 6 epoch: 0.22391971945762634
loss after 7 epoch: 0.19832585752010345
loss after 8 epoch: 0.17795917391777039
loss after 9 epoch: 0.15314528346061707
loss after 10 epoch: 0.13230834901332855
loss after 11 epoch: 0.129897341132164
loss after 12 epoch: 0.12240181118249893
loss after 13 epoch: 0.1351267397403717
loss after 14 epoch: 0.17856083810329437
loss after 15 epoch: 0.1812516152858734
loss after 16 epoch: 0.16456995904445648
loss after 17 epoch: 0.13276347517967224
loss after 18 epoch: 0.12200073897838593
loss after 19 epoch: 0.12759491801261902
loss after 20 epoch: 0.13036884367465973


In [None]:
# Call create_ranked_pairs a second time to obtain a separate set of ranked
# pairs, used below as evaluation data.
test_ranked_pairs = create_ranked_pairs(env=env)
def eval(x, reward_model=None):
    """Return the fraction of ranked pairs the reward model orders correctly.

    For each ``(segment_1, segment_2, label)`` triple the per-observation
    outputs of the model are summed per segment. The prediction is 0 when
    segment 1 has the strictly larger sum and 1 otherwise, and it counts as
    correct when it equals ``label``.

    Note: this function keeps its original name, which shadows the builtin
    ``eval`` in the notebook namespace.

    Args:
        x: Iterable of ``(segment_1, segment_2, label)`` with NumPy segments.
        reward_model: Callable mapping a float32 tensor of observations to
            per-observation scores. Defaults to the notebook-level ``model``.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``x`` is empty.
    """
    scorer = model if reward_model is None else reward_model
    correct = 0
    total = 0
    # Evaluation needs no gradients; skipping them avoids building a graph
    # for every segment.
    with torch.no_grad():
        for t1, t2, y in x:
            ret_t1 = torch.sum(scorer(torch.from_numpy(t1).to(torch.float32))).item()
            ret_t2 = torch.sum(scorer(torch.from_numpy(t2).to(torch.float32))).item()
            predicted = 0 if ret_t1 > ret_t2 else 1
            correct += int(predicted == y)
            total += 1
    if total == 0:
        raise ValueError("cannot compute accuracy on an empty set of pairs")
    return correct / total
# Print the fraction of evaluation pairs whose ordering the reward model
# predicts correctly (`eval` here is the notebook function, not the builtin).
print(eval(test_ranked_pairs))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

tensor([11.0971], grad_fn=<AddBackward0>)