In [22]:
# Import required packages
import argparse
import os.path as osp
import pickle
import os

import gym
import numpy as np
import h5py
import torch as th
import torch.nn as nn
from gym.wrappers import TimeLimit
from tqdm.notebook import tqdm

import mani_skill2.envs
from mani_skill2.utils.wrappers import RecordEpisode
from torch.utils.data import Dataset, DataLoader


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Environment id; a later cell uses it to name the TensorBoard log directory.
env_id = "StackCube-v1"

In [27]:
# loads h5 data into memory for faster access

def tensor_to_numpy(x):
    """Convert a torch tensor to a NumPy array; pass anything else through unchanged.

    Tensors are moved to the CPU before conversion. The original note says this
    exists for SB3, which does not optimize for observations stored on the GPU.
    """
    return x.cpu().numpy() if th.is_tensor(x) else x
def convert_observation(observation):
    """Flatten a raw observation into ``dict(rgbd=..., state=...)``.

    ``rgbd`` is the channel-wise (last axis) concatenation of base-camera rgb,
    base-camera depth, hand-camera rgb and hand-camera depth, in that order.
    ``state`` is the flattened ``agent`` dictionary followed by
    ``extra["tcp_pose"]``. Image values are not rescaled here.
    """
    cameras = observation["image"]
    base_cam = cameras["base_camera"]
    hand_cam = cameras["hand_camera"]
    image_parts = [base_cam["rgb"], base_cam["depth"], hand_cam["rgb"], hand_cam["depth"]]

    # helper provided by ManiSkill2 to flatten dictionaries holding state data
    from mani_skill2.utils.common import flatten_state_dict

    # only the tcp pose is taken from "extra"; the rest of that dict is ignored
    tcp_pose = observation["extra"]["tcp_pose"]
    state_parts = [flatten_state_dict(observation["agent"]), tcp_pose]

    return dict(
        rgbd=np.concatenate(image_parts, axis=-1),
        state=np.hstack(state_parts),
    )
def rescale_rgbd(rgbd):
    """Rescale stacked two-camera RGBD data to floats.

    The last axis is read as ``[rgb(3), depth(1), rgb(3), depth(1)]``: colour
    channels are divided by 255 and depth channels by 2**10. The channel order
    of the result matches the input.
    """
    depth_scale = 2**10
    scaled = []
    # each camera occupies four consecutive channels: three colour + one depth
    for first in (0, 4):
        scaled.append(rgbd[..., first:first + 3] / 255.0)
        scaled.append(rgbd[..., first + 3:first + 4] / depth_scale)
    return np.concatenate(scaled, axis=-1)
class ManiSkill2Dataset(Dataset):
    """In-memory dataset of (observation, action) pairs read from a pickled demo file.

    The pickle is expected to hold a mapping with keys ``traj_0``, ``traj_1``, ...
    whose values contain ``"obs"`` and ``"actions"`` entries. All selected
    trajectories are converted with ``convert_observation`` and concatenated
    along the first axis, so one dataset index corresponds to one time step.
    """

    def __init__(self, dataset_file: str, load_count=-1) -> None:
        """Load ``load_count`` trajectories (all of them when ``load_count == -1``).

        Args:
            dataset_file: path of the pickle file to read.
            load_count: number of trajectories to load, starting at ``traj_0``.
        """
        self.dataset_file = dataset_file
        # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle, which a bare open() call would leave open.
        with open(self.dataset_file, 'rb') as dataset_handle:
            self.data = pickle.load(dataset_handle)
        print('load in again')
        self.episodes = len(self.data.keys())

        self.obs_state = []
        self.obs_rgbd = []
        self.actions = []
        self.total_frames = 0
        if load_count == -1:
            load_count = self.episodes
        for eps_id in tqdm(range(load_count)):
            trajectory = self.data[f"traj_{eps_id}"]

            # convert the original raw observation with our batch-aware function
            obs = convert_observation(trajectory["obs"])
            # we use :-1 to ignore the last obs as terminal observations are included
            # and they don't have actions
            self.obs_rgbd.append(obs['rgbd'][:-1])
            self.obs_state.append(obs['state'][:-1])
            self.actions.append(trajectory["actions"])

        # merge the per-trajectory arrays into one array per field
        self.obs_rgbd = np.concatenate(self.obs_rgbd, axis=0)
        self.obs_state = np.concatenate(self.obs_state, axis=0)
        self.actions = np.concatenate(self.actions, axis=0)

    def __len__(self):
        """Number of stored time steps."""
        return len(self.obs_rgbd)

    def __getitem__(self, idx):
        """Return ``(dict(rgbd=..., state=...), action)`` as float tensors for step ``idx``."""
        action = th.from_numpy(self.actions[idx]).float()
        rgbd = self.obs_rgbd[idx]
        rgbd = rescale_rgbd(rgbd)
        # permute data so that channels are the first dimension as PyTorch expects this
        rgbd = th.from_numpy(rgbd).float().permute((2, 0, 1))
        state = th.from_numpy(self.obs_state[idx]).float()
        return dict(rgbd=rgbd, state=state), action
    

In [28]:
# Pickled demonstrations; hard-coded Google Drive location.
demopath = '/content/drive/MyDrive/Colab_Notebooks/final/stack1g.pkl'
# load_count is left at its default (-1), so every trajectory in the file is loaded into memory.
dataset = ManiSkill2Dataset(demopath)
print('Finish initialize')

load in again


  0%|          | 0/200 [00:00<?, ?it/s]

Finish initialize


In [29]:


# Shuffled mini-batches of 100 samples; drop_last=True discards a final incomplete batch.
dataloader = DataLoader(dataset, batch_size=100, num_workers=2, pin_memory=True, drop_last=True, shuffle=True)
# Inspect a single sample to confirm the tensor shapes the networks will receive.
obs, action = dataset[0]
print("RGBD:", obs['rgbd'].shape)
print("State:", obs['state'].shape)
print("Action:", action.shape)

RGBD: torch.Size([8, 128, 128])
State: torch.Size([32])
Action: torch.Size([7])


The next two cells define the networks that use state information during training.

In [44]:
class NatureCNN(nn.Module):
    """Feature extractor combining a NatureCNN image encoder with a linear state encoder.

    ``forward`` expects a mapping with ``"rgbd"`` and ``"state"`` entries and
    returns the concatenation of both encodings; its width is ``out_features``
    (256 image features + 64 state features).
    """

    def __init__(self, image_size=(128, 128), in_channels=8, state_size=32):
        super().__init__()

        feature_size = 256
        state_feature_size = 64

        # NatureCNN convolution stack: (in_channels, out_channels, kernel, stride)
        conv_specs = [
            (in_channels, 32, 8, 4),
            (32, 64, 4, 2),
            (64, 64, 3, 1),
        ]
        conv_layers = []
        for c_in, c_out, kernel, stride in conv_specs:
            conv_layers.append(
                nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=kernel, stride=stride, padding=0)
            )
            conv_layers.append(nn.ReLU())
        conv_layers.append(nn.Flatten())
        cnn = nn.Sequential(*conv_layers)

        # run a dummy batch of one image through the stack to learn the flattened width
        with th.no_grad():
            probe = th.zeros([1, in_channels, image_size[0], image_size[1]])
            n_flatten = cnn(probe).shape[1]
        fc = nn.Sequential(nn.Linear(n_flatten, feature_size), nn.ReLU())

        # image branch first, then the single linear layer used for state data
        self.extractors = nn.ModuleDict(
            {
                "rgbd": nn.Sequential(cnn, fc),
                "state": nn.Linear(state_size, state_feature_size),
            }
        )
        self.out_features = feature_size + state_feature_size

    def forward(self, observations) -> th.Tensor:
        # every extractor encodes the observation entry stored under its own key
        encoded = [extractor(observations[key]) for key, extractor in self.extractors.items()]
        return th.cat(encoded, dim=1)

In [45]:
class Policy(nn.Module):
    """Policy that feeds NatureCNN features (image + state) through an MLP head.

    The head has one ``Linear`` + ``activation`` pair per entry of
    ``hidden_units`` and ends with ``Linear`` + ``Tanh``, so every predicted
    action component lies in [-1, 1].
    """

    def __init__(
        self,
        image_size=(128, 128),
        in_channels=8,
        state_size=42,
        hidden_units=[128, 128],
        act_dims=8,
        activation=nn.ReLU,
    ):
        super().__init__()
        self.feature_extractor = NatureCNN(image_size, in_channels, state_size)

        # layer widths from the extractor output through every hidden layer
        widths = [self.feature_extractor.out_features, *hidden_units]
        head = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            head.append(nn.Linear(n_in, n_out))
            head.append(activation())
        head.append(nn.Linear(widths[-1], act_dims))
        head.append(nn.Tanh())
        self.mlp = nn.Sequential(*head)

    def forward(self, observations) -> th.Tensor:
        return self.mlp(self.feature_extractor(observations))

# create our policy
# one sample is enough to read off the input/output sizes for the network
obs, action = dataset[0]
rgbd_shape = obs['rgbd'].shape
print(obs['rgbd'].shape)
# fixed seed so the weight initialisation is repeatable
th.manual_seed(0)
# rgbd_shape is channel-first: index 0 is the channel count, the rest is the image size
policy = Policy(image_size=rgbd_shape[1:], in_channels=rgbd_shape[0], state_size=obs['state'].shape[0], 
                act_dims=action.shape[0], hidden_units=[256, 256, 256])
# move model to gpu if possible
device = "cuda" if th.cuda.is_available() else "cpu"
policy = policy.to(device)
print(policy)

torch.Size([8, 128, 128])
Policy(
  (feature_extractor): NatureCNN(
    (extractors): ModuleDict(
      (rgbd): Sequential(
        (0): Sequential(
          (0): Conv2d(8, 32, kernel_size=(8, 8), stride=(4, 4))
          (1): ReLU()
          (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
          (3): ReLU()
          (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
          (5): ReLU()
          (6): Flatten(start_dim=1, end_dim=-1)
        )
        (1): Sequential(
          (0): Linear(in_features=9216, out_features=256, bias=True)
          (1): ReLU()
        )
      )
      (state): Linear(in_features=32, out_features=64, bias=True)
    )
  )
  (mlp): Sequential(
    (0): Linear(in_features=320, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=7, bias=True)
    (7):

The next network uses no state information, only the RGBD data.

In [46]:
class PureCNN(nn.Module):
    """Image-only feature extractor: a NatureCNN encoder with no state branch.

    ``forward`` reads only the ``"rgbd"`` entry of the observation mapping and
    returns ``out_features`` (256) features per sample.
    """

    def __init__(self, image_size=(128, 128), in_channels=8):
        super().__init__()

        feature_size = 256

        # NatureCNN convolution stack: (in_channels, out_channels, kernel, stride)
        conv_specs = [
            (in_channels, 32, 8, 4),
            (32, 64, 4, 2),
            (64, 64, 3, 1),
        ]
        conv_layers = []
        for c_in, c_out, kernel, stride in conv_specs:
            conv_layers.append(
                nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=kernel, stride=stride, padding=0)
            )
            conv_layers.append(nn.ReLU())
        conv_layers.append(nn.Flatten())
        cnn = nn.Sequential(*conv_layers)

        # run a dummy batch of one image through the stack to learn the flattened width
        with th.no_grad():
            probe = th.zeros([1, in_channels, image_size[0], image_size[1]])
            n_flatten = cnn(probe).shape[1]
        fc = nn.Sequential(nn.Linear(n_flatten, feature_size), nn.ReLU())

        # the image branch is the only extractor; state data is deliberately not encoded
        self.extractors = nn.ModuleDict({"rgbd": nn.Sequential(cnn, fc)})
        self.out_features = feature_size

    def forward(self, observations) -> th.Tensor:
        # every extractor encodes the observation entry stored under its own key
        encoded = [extractor(observations[key]) for key, extractor in self.extractors.items()]
        return th.cat(encoded, dim=1)

In [47]:
class VisPolicy(nn.Module):
    """Vision-only policy: PureCNN image features followed by an MLP head.

    The head has one ``Linear`` + ``activation`` pair per entry of
    ``hidden_units`` and ends with ``Linear`` + ``Tanh``, so every predicted
    action component lies in [-1, 1].
    """

    def __init__(
        self,
        image_size=(128, 128),
        in_channels=8,
        hidden_units=[128, 128],
        act_dims=8,
        activation=nn.ReLU,
    ):
        super().__init__()
        self.feature_extractor = PureCNN(image_size, in_channels)

        # layer widths from the extractor output through every hidden layer
        widths = [self.feature_extractor.out_features, *hidden_units]
        head = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            head.append(nn.Linear(n_in, n_out))
            head.append(activation())
        head.append(nn.Linear(widths[-1], act_dims))
        head.append(nn.Tanh())
        self.mlp = nn.Sequential(*head)

    def forward(self, observations) -> th.Tensor:
        return self.mlp(self.feature_extractor(observations))

# create our policy
# NOTE: this rebinds the global `policy`, replacing the state-based Policy built in the earlier cell
obs, action = dataset[0]
rgbd_shape = obs['rgbd'].shape
print(obs['state'].shape)
# fixed seed so the weight initialisation is repeatable
th.manual_seed(0)
# rgbd_shape is channel-first: index 0 is the channel count, the rest is the image size
policy = VisPolicy(image_size=rgbd_shape[1:], in_channels=rgbd_shape[0],
                act_dims=action.shape[0], hidden_units=[256, 256, 256])
# move model to gpu if possible
device = "cuda" if th.cuda.is_available() else "cpu"
policy = policy.to(device)
print(policy)

torch.Size([32])
VisPolicy(
  (feature_extractor): PureCNN(
    (extractors): ModuleDict(
      (rgbd): Sequential(
        (0): Sequential(
          (0): Conv2d(8, 32, kernel_size=(8, 8), stride=(4, 4))
          (1): ReLU()
          (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
          (3): ReLU()
          (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
          (5): ReLU()
          (6): Flatten(start_dim=1, end_dim=-1)
        )
        (1): Sequential(
          (0): Linear(in_features=9216, out_features=256, bias=True)
          (1): ReLU()
        )
      )
    )
  )
  (mlp): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=7, bias=True)
    (7): Tanh()
  )
)


### 2.4 Setting up Training, Dataloader, and Logging

With a policy and dataset, we can now write some utility functions to perform a training step, load data in batches, and log results to tensorboard.

In [48]:
# behaviour-cloning objective: mean squared error between predicted and demonstrated actions
loss_fn = nn.MSELoss()

# a short save function to save our model
def save_model(policy, path, base_dir='/content/drive/MyDrive/Colab_Notebooks/final/'):
    """Save ``policy.state_dict()`` under the key ``"policy"`` to ``base_dir/path``.

    Args:
        policy: module whose ``state_dict()`` is stored.
        path: file name, optionally with sub-directories, relative to ``base_dir``.
            An absolute ``path`` replaces ``base_dir`` entirely (``os.path.join`` rule).
        base_dir: directory checkpoints are written to; defaults to the Drive
            folder this notebook uses.
    """
    save_data = dict(
        policy=policy.state_dict(),
    )
    target = os.path.join(base_dir, path)
    # th.save does not create missing folders, so paths such as "stack/ckpt_best.pt"
    # would fail on the first run without this
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    th.save(save_data, target)

def train_step(policy, obs, actions, optim, loss_fn):
    """Run one optimisation step and return the scalar loss.

    Args:
        policy: module mapping an observation dict to predicted actions.
        obs: dict of tensors forming one batch of observations.
        actions: tensor of target (demonstrated) actions for the batch.
        optim: optimizer holding ``policy``'s parameters.
        loss_fn: callable invoked as ``loss_fn(prediction, target)``.

    Returns:
        The loss of this batch as a Python float.
    """
    optim.zero_grad()
    # move data to the device the policy's weights live on, rather than relying
    # on a notebook-level `device` variable
    first_param = next(policy.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
        obs = {key: value.to(target_device) for key, value in obs.items()}
        actions = actions.to(target_device)

    pred_actions = policy(obs)

    # compute loss and optimize; prediction first, target second, which is the
    # order torch losses expect (the value is unchanged for symmetric losses such as MSE)
    loss = loss_fn(pred_actions, actions)
    loss.backward()
    optim.step()
    return loss.item()

The cell below sets up the logging tools; the logs can be viewed with `tensorboard --logdir logs`. You can also open TensorBoard directly in this notebook.

In [54]:
from torch.utils.tensorboard import SummaryWriter

#writer = SummaryWriter(os.path.join('/content/drive/MyDrive/Colab_Notebooks/final/liftcube',f"logs/rgbd_{env_id}"))
# TensorBoard event files go to a per-environment folder on Google Drive
writer = SummaryWriter(f"/content/drive/MyDrive/Colab_Notebooks/final/stackcube/logs/rgbd_{env_id}")

### 2.5 Training

We can now create an optimizer and a training loop and begin training. The code below will optimize for `iterations = 5000` gradient steps at a learning rate of `1e-3`. These parameters were tuned for training on the LiftCube environment, where they train a successful policy that doesn't overfit too much to the dataset; this notebook reuses them for `StackCube-v1`. Training takes around 5-25 minutes depending on hardware.

Note that this is a simple tutorial with a barebones training setup. It doesn't include using a validation dataset, computing success rate during training, regularization or normalization etc.

See the final thoughts in section 3 for some simple suggestions on how to improve the IL approach.

In [56]:
# total number of gradient steps to run
iterations = 5000
optim = th.optim.Adam(policy.parameters(), lr=1e-3)
best_epoch_loss = np.inf
# manual progress bar counting gradient steps (it is advanced with pbar.update below)
pbar = tqdm(total=iterations)
epoch = 0
steps = 0
while steps < iterations:
    epoch_loss = 0
    epoch_batches = 0
    for batch in dataloader:
        steps += 1
        epoch_batches += 1
        obs, actions = batch
        loss_val = train_step(policy, obs, actions, optim, loss_fn)

        # track the loss and print it
        writer.add_scalar("train/mse_loss", loss_val, steps)
        epoch_loss += loss_val
        pbar.set_postfix(dict(loss=loss_val))
        pbar.update(1)

        # periodically save the policy
        if steps % 1000 == 0: save_model(policy, f"ckpt_{steps}_stack.pt")
        if steps >= iterations: break

    if epoch_batches == 0:
        # without this guard an empty dataloader would loop forever / divide by zero
        raise RuntimeError("dataloader yielded no batches; is the dataset smaller than batch_size with drop_last=True?")

    # average over the batches actually processed: the last epoch is usually cut
    # short by the step limit, and dividing by len(dataloader) would understate
    # its loss and could wrongly overwrite the best checkpoint
    epoch_loss = epoch_loss / epoch_batches

    # save a new model if the average MSE loss in an epoch has improved
    if epoch_loss < best_epoch_loss:
        best_epoch_loss = epoch_loss
        save_model(policy, "ckpt_best1_stack.pt")

    writer.add_scalar("train/mse_loss_epoch", epoch_loss, epoch)
    epoch += 1
pbar.close()
save_model(policy, "ckpt_latest_stack.pt")

  0%|          | 0/5000 [00:00<?, ?it/s]

### 2.6 Evaluation

With a trained policy on our hands, we can now create an evaluation environment to compute the success rate and watch the videos. The default settings should train a policy that achieves around 30% success rate

In [57]:
# optionally load a checkpoint
path = os.path.join('/content/drive/MyDrive/Colab_Notebooks/final/',"ckpt_best1_stack.pt")
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one
policy.load_state_dict(th.load(path, map_location=device)["policy"])

<All keys matched successfully>

In [1]:
from mani_skill2.utils.wrappers import RecordEpisode
env_id="StackCube-v1"
obs_mode = "rgbd"
control_mode = "pd_ee_delta_pose"
env = gym.make(env_id, obs_mode=obs_mode, control_mode=control_mode)
# RecordEpisode wrapper auto records a new video once an episode is completed
env = RecordEpisode(env, output_dir=f"logs/rgbd_{env_id}/videos")
obs = env.reset(seed=42)

successes = []
num_episodes = 100
i = 0
pbar = tqdm(total=num_episodes)
try:
    while i < num_episodes:
        # convert observation to our desired shape and move to appropriate device
        obs = convert_observation(obs)
        obs_device = dict()
        obs['rgbd'] = rescale_rgbd(obs['rgbd'])
        # unsqueeze adds an extra batch dimension and we permute rgbd since PyTorch expects the channel dimension to be first
        obs_device['rgbd'] = th.from_numpy(obs['rgbd']).float().permute(2,0,1).unsqueeze(0).to(device)
        obs_device['state'] = th.from_numpy(obs['state']).float().unsqueeze(0).to(device)
        with th.no_grad():
            action = policy(obs_device).cpu().numpy()[0]
        obs, reward, done, info = env.step(action)

        if done:
            successes.append(info['success'])
            obs = env.reset()
            i += 1
            pbar.update(1)
finally:
    # release the progress bar and the environment/recorder even if a rollout fails
    pbar.close()
    env.close()
print("Success Rate:", np.mean(successes))
print(successes)
writer.add_scalar("Success Rate",  np.mean(successes), num_episodes)

NameError: ignored

In [None]:
class DAggerdataset(Dataset):
    """Dataset of (observation, action) pairs built from in-memory trajectories.

    Each trajectory is a dict with ``"obs"`` and ``"actions"`` entries; the
    observations are converted with ``convert_observation`` and all
    trajectories are concatenated along the first axis.
    """

    def __init__(self,data):
        """Build the dataset.

        Args:
            data: either a mapping ``name -> trajectory`` (e.g. ``{"traj_1": {...}}``)
                or an iterable of ``(name, trajectory)`` pairs.
        """
        self.obs_state = []
        self.obs_rgbd = []
        self.actions = []
        # iterating a dict directly yields only its keys, so unpacking `_, v`
        # from it fails; go through .items() when a mapping is passed
        pairs = data.items() if hasattr(data, "items") else data
        for _, v in pairs:
            obs = convert_observation(v["obs"])
            # drop the last observation: it has no matching action
            self.obs_rgbd.append(obs['rgbd'][:-1])
            self.obs_state.append(obs['state'][:-1])
            self.actions.append(v["actions"])

        self.obs_rgbd = np.concatenate(self.obs_rgbd, axis=0)
        self.obs_state = np.concatenate(self.obs_state, axis=0)
        self.actions = np.concatenate(self.actions, axis=0)

    def __len__(self):
        """Number of stored time steps."""
        return len(self.obs_rgbd)

    def __getitem__(self,idx):
        """Return ``(dict(rgbd=..., state=...), action)`` as float tensors for step ``idx``."""
        action = th.from_numpy(self.actions[idx]).float()
        rgbd = self.obs_rgbd[idx]
        rgbd = rescale_rgbd(rgbd)
        # permute data so that channels are the first dimension as PyTorch expects this
        rgbd = th.from_numpy(rgbd).float().permute((2, 0, 1))
        state = th.from_numpy(self.obs_state[idx]).float()
        return dict(rgbd=rgbd, state=state), action


In [None]:
def traiBC(dataloader,iterations,policy,loss_fn,writer):
    """Train ``policy`` by behaviour cloning for ``iterations`` gradient steps.

    Args:
        dataloader: yields ``(obs, actions)`` batches.
        iterations: total number of gradient steps to run.
        policy: network to optimise (updated in place).
        loss_fn: loss passed through to ``train_step``.
        writer: TensorBoard writer receiving the per-step and per-epoch losses.

    Returns:
        The trained ``policy``. Checkpoints are written under ``stack/``
        through ``save_model``.
    """
    optim = th.optim.Adam(policy.parameters(), lr=1e-3)
    best_epoch_loss = np.inf
    # manual progress bar counting gradient steps
    pbar = tqdm(total=iterations)
    epoch = 0
    steps = 0
    while steps < iterations:
        epoch_loss = 0
        epoch_batches = 0
        for batch in dataloader:
            steps += 1
            epoch_batches += 1
            obs, actions = batch
            loss_val = train_step(policy, obs, actions, optim, loss_fn)

            # track the loss and print it
            writer.add_scalar("train/mse_loss", loss_val, steps)
            epoch_loss += loss_val
            pbar.set_postfix(dict(loss=loss_val))
            pbar.update(1)

            # periodically save the policy; the path must stay relative, because
            # os.path.join inside save_model discards its base directory when
            # given an absolute path such as "/stack/..."
            if steps % 1000 == 0: save_model(policy, f"stack/ckpt_{steps}.pt")
            if steps >= iterations: break

        if epoch_batches == 0:
            # without this guard an empty dataloader would loop forever / divide by zero
            raise RuntimeError("dataloader yielded no batches")

        # average over the batches actually processed so that a final, truncated
        # epoch is not reported with an artificially low loss
        epoch_loss = epoch_loss / epoch_batches

        # save a new model if the average MSE loss in an epoch has improved
        if epoch_loss < best_epoch_loss:
            best_epoch_loss = epoch_loss
            save_model(policy, "stack/ckpt_best.pt")

        writer.add_scalar("train/mse_loss_epoch", epoch_loss, epoch)
        epoch += 1
    pbar.close()
    save_model(policy, "stack/ckpt_latest.pt")
    return policy

In [None]:
# environment used for DAgger rollouts (no RecordEpisode wrapper here)
env_id="StackCube-v1"
obs_mode = "rgbd"
control_mode = "pd_ee_delta_pose"
env = gym.make(env_id, obs_mode=obs_mode, control_mode=control_mode)

def _stack_nested_obs(obs_list):
    """Stack a list of (possibly nested) observation dicts along a new leading axis."""
    head = obs_list[0]
    if isinstance(head, dict):
        return {key: _stack_nested_obs([o[key] for o in obs_list]) for key in head}
    return np.stack([np.asarray(o) for o in obs_list], axis=0)


def dagger(traj_dataset,expert,env,policy,writer):
    """DAgger loop: fit the student with BC, roll it out, relabel with the expert, aggregate.

    Args:
        traj_dataset: assumed to be a dict-like ``"traj_<k>" -> {"obs", "actions", ...}``
            in the layout ``DAggerdataset`` consumes (observations include the
            terminal one) -- TODO confirm against the caller. Updated in place.
        expert: policy queried for the action label at every visited observation.
        env: environment the student is rolled out in.
        policy: student network; trained in place through ``traiBC``.
        writer: TensorBoard writer forwarded to ``traiBC``.

    Returns:
        The aggregated trajectory mapping (the same object as ``traj_dataset``).
    """
    #expert,student would be two policy
    new_traj = traj_dataset
    num_rounds = 100     # number of DAgger rounds
    num_rollout = 20     # student rollouts collected per round
    max_steps = 5000     # BC iterations per round and step cap per rollout
    loss_fn =  nn.MSELoss()

    for round_id in range(num_rounds):
        print(f'Number of total traj: {len(new_traj.keys())}')

        # use BC to train the student on everything aggregated so far; this is done
        # once per round because the data only changes between rounds
        loader = DataLoader(DAggerdataset(new_traj.items()), batch_size=100, num_workers=16,
                            pin_memory=True, drop_last=True, shuffle=True)
        student = traiBC(loader, max_steps, policy, loss_fn, writer)

        reward_per_round = []
        for _ in range(num_rollout):
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            visited_obs = []
            expert_actions = []
            info = {}

            while not done:
                # same preprocessing as the evaluation cell: flatten, rescale, add a
                # batch dimension and put channels first
                flat = convert_observation(obs)
                obs_device = dict()
                obs_device['rgbd'] = th.from_numpy(rescale_rgbd(flat['rgbd'])).float().permute(2,0,1).unsqueeze(0).to(device)
                obs_device['state'] = th.from_numpy(flat['state']).float().unsqueeze(0).to(device)

                with th.no_grad():
                    expert_action = expert(obs_device).cpu().numpy()[0]   #expert's label for this observation
                    pred_action = student(obs_device).cpu().numpy()[0]    #action the student actually executes
                visited_obs.append(obs)
                expert_actions.append(expert_action)

                obs, r, done, info = env.step(pred_action)

                totalr += r
                steps += 1

                if steps > max_steps:
                    break
            reward_per_round.append(totalr)

            # keep the final observation too: DAggerdataset drops the last one per trajectory
            visited_obs.append(obs)
            # pick an unused key so existing demonstrations are never overwritten
            next_id = len(new_traj)
            while f'traj_{next_id}' in new_traj:
                next_id += 1
            new_traj[f'traj_{next_id}'] = {'obs': _stack_nested_obs(visited_obs),
                                           'actions': np.array(expert_actions),
                                           'success': info.get('success', False)}

        print('returns', reward_per_round)
        print('mean return', np.mean(reward_per_round))
        print('std of return', np.std(reward_per_round))

    return new_traj
        



In [None]:
from IPython.display import Video
# the path matches the output_dir given to RecordEpisode in the evaluation cell; 7.mp4 is one hard-coded episode
Video(f"logs/rgbd_{env_id}/videos/7.mp4", embed=True) # Watch one of the replays

## 3 Final Thoughts

Imitation Learning (IL) as well as the general class of Learning from Demonstrations (LfD) methods show promise in being able to learn from pre-collected data without having to go through costly environment interactions. However, achieving good results, state-based or visual, requires additional tricks from supervised-learning research as well as things specific to robotics that can help improve performance.


Reinforcement Learning (RL) is another technique to tackle these environments by training an agent to maximize return. For a tutorial on RL with ManiSkill2, see our [RL Colab Tutorial](https://colab.research.google.com/github/haosulab/ManiSkill2/blob/tutorials/examples/tutorials/2_reinforcement_learning.ipynb)

While our environments and code enable much faster visual-based RL and IL compared to other robotics environments, there are still a number of approaches that can enhance visual-based policies. Many of these approaches have been consolidated into our own library called [ManiSkill2-Learn](https://github.com/haosulab/ManiSkill2-Learn) which has code to leverage RGBD and PointClouds, transformers, and more. It also includes various approaches such as [DAPG](https://arxiv.org/pdf/1709.10087.pdf) which leverage both learning from demonstrations as well as online interaction.