In [None]:
import numpy as np
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from src.envs.environment_factory import EnvironmentFactory
import random
from dataclasses import dataclass
import torch
from transformers import DecisionTransformerConfig, DecisionTransformerModel, Trainer, TrainingArguments
import pandas as pd
import datasets
import pyarrow as pa


#### OBTAIN TRAJECTORIES

env_name = "CustomMyoBaodingBallsP1"

# Path to normalized Vectorized environment (if not first task)
PATH_TO_NORMALIZED_ENV = "output/training/2022-10-02/21-36-47/training_env.pkl"  # "trained_models/normalized_env_original"

# Path to pretrained network (if not first task)
PATH_TO_PRETRAINED_NET = "output/training/2022-10-02/21-36-47/best_model.zip"  # "trained_models/best_model.zip"

# Reward structure and task parameters:
config = {
    "weighted_reward_keys": {
        "pos_dist_1": 0,
        "pos_dist_2": 0,
        "act_reg": 0,
        "alive": 0,
        "solved": 5,
        "done": 0,
        "sparse": 0,
    },
    "goal_time_period": [4, 6],
    "task": "ccw",
    "enable_rhi": False,
    "enable_rsi": False
}


# Function that creates and monitors vectorized environments:
def make_parallel_envs(env_name, env_config, num_env, start_index=0):
    def make_env(rank):
        def _thunk():
            env = EnvironmentFactory.create(env_name, **env_config)
            return env

        return _thunk

    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])



###### TRANSFORMER

@dataclass
class DecisionTransformerGymDataCollator:
    return_tensors: str = "pt"
    max_len: int = 20 #subsets of the episode we use for training
    state_dim: int = 86  # size of state space
    act_dim: int = 39  # size of action space
    max_ep_len: int = 200 # max episode length in the dataset
    scale: float = 1000.0  # normalization of rewards/returns
    state_mean: np.array = None  # to store state means
    state_std: np.array = None  # to store state stds
    p_sample: np.array = None  # a distribution to take account trajectory lengths
    n_traj: int = 0 # to store the number of trajectories in the dataset

    def __init__(self, dataset) -> None:
        self.act_dim = len(dataset[0]["actions"][0])
        self.state_dim = len(dataset[0]["observations"][0])
        self.dataset = dataset
        # calculate dataset stats for normalization of states
        states = []
        traj_lens = []
        for obs in dataset["observations"]:
            states.extend(obs)
            traj_lens.append(len(obs))
        self.n_traj = len(traj_lens)
        states = np.vstack(states)
        self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6
        
        traj_lens = np.array(traj_lens)
        self.p_sample = traj_lens / sum(traj_lens)

    def _discount_cumsum(self, x, gamma):
        discount_cumsum = np.zeros_like(x)
        discount_cumsum[-1] = x[-1]
        for t in reversed(range(x.shape[0] - 1)):
            discount_cumsum[t] = x[t] + gamma * discount_cumsum[t + 1]
        return discount_cumsum

    def __call__(self, features):
        batch_size = len(features)
        # this is a bit of a hack to be able to sample of a non-uniform distribution
        batch_inds = np.random.choice(
            np.arange(self.n_traj),
            size=batch_size,
            replace=True,
            p=self.p_sample,  # reweights so we sample according to timesteps
        )
        # a batch of dataset features
        s, a, r, d, rtg, timesteps, mask = [], [], [], [], [], [], []
        
        for ind in batch_inds:
            # for feature in features:
            feature = self.dataset[int(ind)]
            si = random.randint(0, len(feature["rewards"]) - 1)

            # get sequences from dataset
            s.append(np.array(feature["observations"][si : si + self.max_len]).reshape(1, -1, self.state_dim))
            a.append(np.array(feature["actions"][si : si + self.max_len]).reshape(1, -1, self.act_dim))
            r.append(np.array(feature["rewards"][si : si + self.max_len]).reshape(1, -1, 1))

            d.append(np.array(feature["dones"][si : si + self.max_len]).reshape(1, -1))
            timesteps.append(np.arange(si, si + s[-1].shape[1]).reshape(1, -1))
            timesteps[-1][timesteps[-1] >= self.max_ep_len] = self.max_ep_len - 1  # padding cutoff
            rtg.append(
                self._discount_cumsum(np.array(feature["rewards"][si:]), gamma=1.0)[
                    : s[-1].shape[1]   # TODO check the +1 removed here
                ].reshape(1, -1, 1)
            )
            if rtg[-1].shape[1] < s[-1].shape[1]:
                print("if true")
                rtg[-1] = np.concatenate([rtg[-1], np.zeros((1, 1, 1))], axis=1)

            # padding and state + reward normalization
            tlen = s[-1].shape[1]
            s[-1] = np.concatenate([np.zeros((1, self.max_len - tlen, self.state_dim)), s[-1]], axis=1)
            s[-1] = (s[-1] - self.state_mean) / self.state_std
            a[-1] = np.concatenate(
                [np.ones((1, self.max_len - tlen, self.act_dim)) * -10.0, a[-1]],
                axis=1,
            )
            r[-1] = np.concatenate([np.zeros((1, self.max_len - tlen, 1)), r[-1]], axis=1)
            d[-1] = np.concatenate([np.ones((1, self.max_len - tlen)) * 2, d[-1]], axis=1)
            rtg[-1] = np.concatenate([np.zeros((1, self.max_len - tlen, 1)), rtg[-1]], axis=1) / self.scale
            timesteps[-1] = np.concatenate([np.zeros((1, self.max_len - tlen)), timesteps[-1]], axis=1)
            mask.append(np.concatenate([np.zeros((1, self.max_len - tlen)), np.ones((1, tlen))], axis=1))

        s = torch.from_numpy(np.concatenate(s, axis=0)).float()
        a = torch.from_numpy(np.concatenate(a, axis=0)).float()
        r = torch.from_numpy(np.concatenate(r, axis=0)).float()
        d = torch.from_numpy(np.concatenate(d, axis=0))
        rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).float()
        timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).long()
        mask = torch.from_numpy(np.concatenate(mask, axis=0)).float()

        return {
            "states": s,
            "actions": a,
            "rewards": r,
            "returns_to_go": rtg,
            "timesteps": timesteps,
            "attention_mask": mask,
        }

class TrainableDT(DecisionTransformerModel):
    def __init__(self, config):
        super().__init__(config)

    def forward(self, **kwargs):
        output = super().forward(**kwargs)
        # add the DT loss
        action_preds = output[1]
        action_targets = kwargs["actions"]
        attention_mask = kwargs["attention_mask"]
        act_dim = action_preds.shape[2]
        action_preds = action_preds.reshape(-1, act_dim)[attention_mask.reshape(-1) > 0]
        action_targets = action_targets.reshape(-1, act_dim)[attention_mask.reshape(-1) > 0]
        
        loss = torch.mean((action_preds - action_targets) ** 2)

        return {"loss": loss}

    def original_forward(self, **kwargs):
        return super().forward(**kwargs)


if __name__ == "__main__":
    # Create vectorized environments:
    envs = make_parallel_envs(env_name, config, num_env=16)

    # Normalize environment:
    envs = VecNormalize.load(PATH_TO_NORMALIZED_ENV, envs)

    # Create model (hyperparameters from RL Zoo HalfCheetak)
    model = RecurrentPPO.load(PATH_TO_PRETRAINED_NET, env=envs)

    # EVALUATE
    eval_model = model
    eval_env = EnvironmentFactory.create(env_name, **config)

    # Enjoy trained agent
    num_episodes = 5000
    perfs = []
    all_observations, all_actions, all_rewards, all_dones = [] , [] , [] , []
    for i in range(num_episodes):
        observations, actions, rewards, dones = [] , [] , [] , []
        lstm_states = None
        cum_rew = 0
        obs = eval_env.reset()
        episode_starts = np.ones((1,), dtype=bool)
        done = False
        while not done:
            action, lstm_states = eval_model.predict(
                envs.normalize_obs(obs),
                state=lstm_states,
                episode_start=episode_starts,
                deterministic=True,
            )
            obs, rew, done, _ = eval_env.step(action)

            observations.append(obs.tolist())
            actions.append(action.tolist())
            rewards.append(rew.tolist())
            dones.append(done)
            episode_starts = done
            cum_rew += rew
        perfs.append(cum_rew)
        all_observations.append(observations)
        all_actions.append(actions)
        all_rewards.append(rewards)
        all_dones.append(dones)
        print("Episode", i, ", cum rew: ", cum_rew)
    print(("Average rew:", np.mean(perfs)))

In [119]:
all_observations2, all_actions2, all_rewards2, all_dones2 = [] , [] , [] , []
for i in range(len(all_dones)):
    blank_dones = 200*[False]
    blank_rewards = 200*[0.0]
    blank_actions = np.zeros((200,39)).tolist()
    blank_obs = np.zeros((200,86)).tolist()

    blank_dones[:(len(all_dones[i])-1)] = all_dones[i][:-1]
    blank_rewards[:(len(all_rewards[i])-1)] = all_rewards[i][:-1]
    blank_actions[:(len(all_actions[i])-1)] = all_actions[i][:-1]
    blank_obs[:(len(all_observations[i])-1)] = all_observations[i][:-1]

    all_observations2.append(blank_obs)
    all_actions2.append(blank_actions)
    all_rewards2.append(blank_rewards)
    all_dones2.append(blank_dones)
    

    

In [122]:



##### CREATE DATASET OBJECT WITH TRAJECTORIES ####

experience_dict = {'observations': all_observations2, 'actions': all_actions2, 'rewards': all_rewards2, 'dones': all_dones2}
df = pd.DataFrame.from_dict(experience_dict)
da = datasets.Dataset(pa.Table.from_pandas(df))
dataset = datasets.DatasetDict({'train': da})


#### create trainable
collator = DecisionTransformerGymDataCollator(dataset["train"])
config = DecisionTransformerConfig(state_dim=collator.state_dim, act_dim=collator.act_dim)
model = TrainableDT(config)


#### train
training_args = TrainingArguments(
output_dir="output/",
remove_unused_columns=False,
num_train_epochs=120,
per_device_train_batch_size=64,
learning_rate=1e-4,
weight_decay=1e-4,
warmup_ratio=0.1,
optim="adamw_torch",
max_grad_norm=0.25,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1000
  Num Epochs = 120
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1920
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
 26%|██▌       | 500/1920 [12:23<35:29,  1.50s/it]Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin


{'loss': 0.1983, 'learning_rate': 8.217592592592593e-05, 'epoch': 31.25}


 52%|█████▏    | 1000/1920 [24:46<23:45,  1.55s/it]Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin


{'loss': 0.0371, 'learning_rate': 5.3240740740740744e-05, 'epoch': 62.5}


 78%|███████▊  | 1500/1920 [37:10<10:06,  1.44s/it]Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin


{'loss': 0.0314, 'learning_rate': 2.4305555555555558e-05, 'epoch': 93.75}


100%|██████████| 1920/1920 [47:36<00:00,  1.35s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1920/1920 [47:36<00:00,  1.49s/it]

{'train_runtime': 2856.5965, 'train_samples_per_second': 42.008, 'train_steps_per_second': 0.672, 'train_loss': 0.07601597805817922, 'epoch': 120.0}





TrainOutput(global_step=1920, training_loss=0.07601597805817922, metrics={'train_runtime': 2856.5965, 'train_samples_per_second': 42.008, 'train_steps_per_second': 0.672, 'train_loss': 0.07601597805817922, 'epoch': 120.0})

In [123]:
# Function that gets an action from the model using autoregressive prediction with a window of the previous 20 timesteps.
def get_action(model, states, actions, rewards, returns_to_go, timesteps):
    # This implementation does not condition on past rewards

    states = states.reshape(1, -1, model.config.state_dim)
    actions = actions.reshape(1, -1, model.config.act_dim)
    returns_to_go = returns_to_go.reshape(1, -1, 1)
    timesteps = timesteps.reshape(1, -1)

    states = states[:, -model.config.max_length :]
    actions = actions[:, -model.config.max_length :]
    returns_to_go = returns_to_go[:, -model.config.max_length :]
    timesteps = timesteps[:, -model.config.max_length :]
    padding = model.config.max_length - states.shape[1]
    # pad all tokens to sequence length
    attention_mask = torch.cat([torch.zeros(padding), torch.ones(states.shape[1])])
    attention_mask = attention_mask.to(dtype=torch.long).reshape(1, -1)
    states = torch.cat([torch.zeros((1, padding, model.config.state_dim)), states], dim=1).float()
    actions = torch.cat([torch.zeros((1, padding, model.config.act_dim)), actions], dim=1).float()
    returns_to_go = torch.cat([torch.zeros((1, padding, 1)), returns_to_go], dim=1).float()
    timesteps = torch.cat([torch.zeros((1, padding), dtype=torch.long), timesteps], dim=1)

    state_preds, action_preds, return_preds = model.original_forward(
        states=states,
        actions=actions,
        rewards=rewards,
        returns_to_go=returns_to_go,
        timesteps=timesteps,
        attention_mask=attention_mask,
        return_dict=False,
    )

    return action_preds[0, -1]

In [128]:
# build the environment
model = model.to("cpu")
env = eval_env
max_ep_len = 200
device = "cpu"
scale = 1000.0  # normalization for rewards/returns
TARGET_RETURN = 1000 / scale  # evaluation is conditioned on a return of 12000, scaled accordingly

state_mean = collator.state_mean.astype(np.float32)
state_std = collator.state_std.astype(np.float32)
print(state_mean)

state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
# Create the decision transformer model

state_mean = torch.from_numpy(state_mean).to(device=device)
state_std = torch.from_numpy(state_std).to(device=device)

[-1.2371e+00  4.3647e-01 -3.3807e-01 -1.0207e-01 -6.0633e-01  6.5817e-01
  2.4875e-01  8.5003e-01  2.3608e-01 -3.1889e-02 -7.1316e-03 -1.7548e-03
  2.6401e-01 -2.0834e-02 -1.1196e-02  5.7523e-01  2.2938e-01 -2.8528e-03
 -3.2417e-03  1.2535e-01 -2.1219e-01  3.2079e-02 -7.2300e-03 -2.6391e-01
 -5.0825e-01  1.4052e+00 -9.6182e-05  2.0225e-05 -8.5985e-05 -2.6490e-01
 -5.0962e-01  1.4025e+00 -1.4433e-04  1.0323e-04 -1.8574e-05 -2.6355e-01
 -5.1052e-01  1.4016e+00 -2.6319e-01 -5.0821e-01  1.4016e+00  3.6019e-04
 -2.2744e-03 -3.6508e-03  1.7167e-03  1.4139e-03 -8.8827e-04  7.4164e-01
  3.8710e-02  1.4413e-03  1.4277e-03  9.2832e-04  6.0979e-01  9.8122e-03
  5.8737e-03  5.8451e-02  6.1799e-02  1.4269e-03  1.5639e-02  3.4160e-03
  6.6882e-04  1.7018e-03  2.3003e-02  3.4791e-03  6.6964e-02  4.8909e-01
  7.6082e-01  3.9991e-01  1.0915e-02  1.4705e-01  1.5584e-01  1.4539e-02
  4.9274e-01  2.3028e-01  2.4338e-01  8.9980e-01  3.2195e-01  4.2762e-02
  2.4291e-01  6.7887e-02  3.2464e-02  6.8648e-01  1

In [129]:
# Interact with the environment and create a video

for _ in range(10):
    done = False
    episode_return, episode_length = 0, 0
    state = env.reset()
    target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
    states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
    actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
    rewards = torch.zeros(0, device=device, dtype=torch.float32)
    timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)

    while not done:
        actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
        rewards = torch.cat([rewards, torch.zeros(1, device=device)])

        action = get_action(
            model,
            (states - state_mean) / state_std,
            actions,
            rewards,
            target_return,
            timesteps,
        )
        actions[-1] = action
        action = action.detach().cpu().numpy()

        state, reward, done, _ = env.step(action)
        cur_state = torch.from_numpy(state).to(device=device).reshape(1, state_dim)
        states = torch.cat([states, cur_state], dim=0)
        rewards[-1] = reward

        pred_return = target_return[0, -1] - (reward / scale)
        target_return = torch.cat([target_return, pred_return.reshape(1, 1)], dim=1)
        timesteps = torch.cat([timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

        episode_return += reward
        episode_length += 1
        if done:
            print((episode_length,episode_return))

(28, 15.0)
(27, 15.0)
(30, 20.0)
(35, 15.0)
(30, 10.0)
(30, 15.0)
(29, 20.0)
(29, 15.0)
(30, 15.0)
(30, 15.0)


In [66]:
collator = DecisionTransformerGymDataCollator(dataset["train"])

In [73]:
collator

DecisionTransformerGymDataCollator(return_tensors='pt', max_len=20, state_dim=86, act_dim=39, max_ep_len=200, scale=1000.0, state_mean=array([-1.2633e+00,  4.4615e-01, -3.4541e-01, -1.0592e-01, -6.1978e-01,
        6.7275e-01,  2.5072e-01,  8.7233e-01,  2.4052e-01, -3.2339e-02,
       -6.9329e-03, -1.7060e-03,  2.6980e-01, -2.1272e-02, -1.1475e-02,
        5.8714e-01,  2.3457e-01, -2.8897e-03, -3.3776e-03,  1.2863e-01,
       -2.1567e-01,  3.2636e-02, -7.3457e-03, -2.6959e-01, -5.1934e-01,
        1.4361e+00, -9.4730e-05,  2.0428e-05, -9.0390e-05, -2.7079e-01,
       -5.2087e-01,  1.4333e+00, -1.4577e-04,  1.0605e-04, -3.0901e-05,
       -2.6922e-01, -5.2170e-01,  1.4324e+00, -2.6902e-01, -5.1940e-01,
        1.4324e+00,  3.6528e-04, -2.3533e-03, -3.7123e-03,  1.7673e-03,
        1.4718e-03, -8.5813e-04,  7.5854e-01,  3.9588e-02,  1.5634e-03,
        1.4589e-03,  9.6145e-04,  6.2333e-01,  1.0152e-02,  6.1522e-03,
        5.9578e-02,  6.5014e-02,  1.5636e-03,  1.6392e-02,  3.5839e-03,
 

In [76]:
for i in range(1000):
    if np.sum(dataset_cheetah['train'][i]['dones'])>0:
        print('fdsf')

In [29]:
torch.as_tensor(envs.normalize_obs(np.array((states - state_mean) / state_std)))

tensor([[-10.0000, -10.0000,  10.0000,   1.3274,  10.0000, -10.0000,  -1.8194,
          -6.2619, -10.0000,  10.0000,   3.2320,  -0.0863, -10.0000,  10.0000,
          10.0000, -10.0000, -10.0000,   5.6166,  10.0000,  -5.9207,  10.0000,
          -2.1693,  10.0000,  10.0000,  10.0000,  10.0000,  10.0000, -10.0000,
          10.0000,  10.0000, -10.0000, -10.0000,  10.0000, -10.0000,   3.9839,
          10.0000, -10.0000,  10.0000,  10.0000,  10.0000,   5.1911,  10.0000,
         -10.0000,   8.5148, -10.0000,  10.0000,  10.0000, -10.0000,  -3.9352,
          -2.0958,  -4.3004,  -3.2520, -10.0000,  -2.2302,  -3.1617,  -3.0870,
          -2.2083,  -1.8401,  -2.4877,  -4.2620,  -1.2069,  -2.8776,  -4.4549,
          -3.8955,  -3.5442,  -6.6043, -10.0000,  -4.6399,  -5.4344,  -4.4733,
          -5.0089,  -4.2003,  -5.5503,  -3.3655,  -2.7025, -10.0000,  -5.4791,
          -2.3303,  -4.3623,  -3.1463,  -1.9821, -10.0000,  -4.2417, -10.0000,
          -2.3016,  -4.6624]])

In [36]:
envs.norm_reward(5.0)

TypeError: 'bool' object is not callable

In [34]:
dataset['train'][0]['rewards']

[5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0

In [21]:
np.sum(dataset_cheetah['train'][0]['rewards'])

10760.844891987741