In [51]:
import os
from os.path import join, dirname, abspath
import sys
CURRENT_DIR = os.getcwd()
sys.path.insert(0, join(CURRENT_DIR, '../..'))  # Import local models

from cliport.environments.environment import Environment
import torch
from models.PickModel import PickModel
from models.PlaceModel import PlaceModel
from cliport import tasks
from cliport.dataset import RavensDataset
import numpy as np
from cliport.utils import utils


In [1]:
# Video-recording options handed to the simulation environment via `record_cfg=`.
record_cfg = dict(
    save_video=False,
    save_video_path="/home/ubuntu/VLM/videos/",
    add_text=True,
    fps=20,
    video_height=640,
    video_width=720,
)

In [2]:
# Directory passed to Environment as its assets root.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
assets_root = "/home/ubuntu/cliport/cliport/environments/assets/"

In [5]:
# Build the simulation environment without a display window, using the
# recording options defined in `record_cfg`.
env_options = dict(
    disp=False,
    shared_memory=False,
    hz=480,
    record_cfg=record_cfg,
)
env = Environment(assets_root, **env_options)

text argument:/home/ubuntu/cliport/cliport/environments/assets/
int args: [

In [9]:
# Instantiate the pick network (single rotation, batchnorm disabled) and move it to the GPU.
pick_model = PickModel(num_rotations=1, batchnorm = False).to('cuda')

In [10]:
# Restore the pick weights; the checkpoint file stores them under the 'state_dict' key.
pick_model.load_state_dict(torch.load("/home/ubuntu/VLM/checkpoint/checkpoint_model_best_pick.pth")['state_dict'])

<All keys matched successfully>

In [11]:
# Instantiate the place network (12 rotations, crop_size 64, batchnorm disabled) and move it to the GPU.
place_model = PlaceModel(num_rotations=12, crop_size=64, batchnorm = False).to('cuda')

In [None]:
# Restore the place weights; the checkpoint file stores them under the 'state_dict' key.
place_model.load_state_dict(torch.load("/home/ubuntu/VLM/checkpoint/checkpoint_model_best_place.pth")['state_dict'])

In [None]:
# Options for the "dataset" section of the config handed to RavensDataset.
dataset_options = {
    "type": "single",
    "images": True,
    "cache": False,
    "augment": {"theta_sigma": 60},
    "cache_size": 350,
}
train_dataset_cfg = {"dataset": dataset_options}

# load data
# NOTE(review): the variable is called `train_dataset` but the path points at a "-val" directory.
train_dataset = RavensDataset(
    '/home/ubuntu/cliport/data/stack-block-pyramid-seq-seen-colors-val',
    train_dataset_cfg,
    n_demos=100,
    augment=False,
)

In [38]:
def run_pick(inp):
    """Predict a pick location for one observation using the global ``pick_model``.

    Args:
        inp: dict with ``'inp_img'`` (image array converted to a tensor and
            moved to the GPU) and ``'lang_goal'`` (language instruction,
            forwarded to the model unchanged).

    Returns:
        tuple ``(preds, location)`` where ``preds`` is the softmax-normalised
        affordance map as a CPU tensor viewed as ``(320, 160)`` and
        ``location`` is the argmax position in that map as returned by
        ``np.unravel_index``.
    """
    pick_model.eval()

    with torch.no_grad():
        device = 'cuda'
        img_cuda = torch.Tensor(inp['inp_img']).to(device)
        language_cuda = inp['lang_goal']

        # Run the network exactly once; a second forward pass whose result is
        # discarded only doubles the inference cost.
        affordances = pick_model(img_cuda, language_cuda)
        # Flatten per batch element so softmax normalises over every pixel.
        affordances = affordances.view(affordances.shape[0], -1)
        preds = torch.nn.functional.softmax(affordances, dim=1)
        preds = preds.cpu()
        # The view below only succeeds for a single-element batch of 320*160 values.
        preds = preds.view(320,160)
        location = np.unravel_index(torch.argmax(preds).numpy(), (320,160))

    return preds, location

In [39]:
def run_place(inp, p0):
    """Predict a place pose for one observation using the global ``place_model``.

    Args:
        inp: dict with ``'inp_img'`` (image array) and ``'lang_goal'``
            (language instruction, forwarded to the model unchanged).
        p0: pick location forwarded to the model as its third argument.

    Returns:
        tuple ``(preds, location)`` where ``preds`` is the softmax-normalised
        affordance volume as a CPU tensor viewed as ``(12, 320, 160)`` and
        ``location`` is the argmax index triple from ``np.unravel_index``.
    """
    place_model.eval()
    volume_shape = (12, 320, 160)

    with torch.no_grad():
        image = torch.Tensor(inp['inp_img']).to('cuda')
        goal_text = inp['lang_goal']

        logits = place_model(image, goal_text, p0)
        # Softmax over all entries of each batch element, then back to a volume.
        flat_logits = logits.view(logits.shape[0], -1)
        preds = torch.nn.functional.softmax(flat_logits, dim=1).cpu().view(*volume_shape)
        best_flat = torch.argmax(preds).numpy()
        location = np.unravel_index(best_flat, volume_shape)

    return preds, location

In [93]:
# Roll out the pick/place models on the first 10 dataset episodes, recording
# one video per episode.

# Pixels to end effector poses: workspace bounds and per-pixel size handed to
# utils.pix_to_xyz (presumably metres — confirm). Loop-invariant, so set once.
bounds = np.array([[0.25, 0.75], [-0.5, 0.5], [0, 0.28]])
pix_size = 0.003125

for i in range(10):
    env.start_rec("video" + str(i+10))

    # try/finally guarantees the recording is closed even when the rollout is
    # interrupted or raises part-way through an episode.
    try:
        episode, seed = train_dataset.load(i)
        task = tasks.names["stack-block-pyramid-seq-seen-colors"]()
        task.mode = "train"
        env.seed(seed)
        env.set_task(task)
        obs = env.reset()
        info = env.info

        for _ in range(task.max_steps):
            img = train_dataset.get_image(obs)
            lang_goal = info['lang_goal']

            inp = {'inp_img': img, 'lang_goal': lang_goal}

            preds, p0_pix = run_pick(inp)
            p0_theta = 0

            # run_place returns an index triple into a (12, 320, 160) volume:
            # element 0 is the rotation bin, elements 1-2 are the pixel.
            preds, p1 = run_place(inp, p0_pix)
            p1_pix = p1[1:3]
            # The angle comes from the rotation bin (p1[0]), not the pixel row.
            p1_theta = p1[0] * 2 * np.pi / preds.shape[0]

            # Fourth image channel is used as the height map.
            hmap = img[:, :, 3]
            p0_xyz = utils.pix_to_xyz(p0_pix, hmap, bounds, pix_size)
            p1_xyz = utils.pix_to_xyz(p1_pix, hmap, bounds, pix_size)
            p0_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p0_theta))
            p1_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p1_theta))

            act = {
                'pose0': (np.asarray(p0_xyz), np.asarray(p0_xyzw)),
                'pose1': (np.asarray(p1_xyz), np.asarray(p1_xyzw)),
                'pick': [p0_pix[0], p0_pix[1], p0_theta],
                'place': [p1_pix[0], p1_pix[1], p1_theta],
            }

            print(f'Lang Goal: {lang_goal}')

            obs, reward, done, info = env.step(act)
            if done:
                break
    finally:
        env.end_rec()


Lang Goal: put the brown block on the lightest brown block


[libx264 @ 0x64bb000] -qscale is ignored, -crf is recommended.


Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the brown block on the lightest brown block
Lang Goal: put the red block on the lightest brown block


[libx264 @ 0x638a000] -qscale is ignored, -crf is recommended.


Lang Goal: put the red block on the lightest brown block
Lang Goal: put the red block on the lightest brown block
Lang Goal: put the red block on the lightest brown block


KeyboardInterrupt: 

In [94]:
# Close the current recording manually (this cell follows a rollout that ended in KeyboardInterrupt).
env.end_rec()


In [59]:

# Execute a single pick-and-place step on a freshly reset environment, using
# whatever task is currently set on `env`.
obs = env.reset()
info = env.info
reward = 0

img = train_dataset.get_image(obs)
lang_goal = info['lang_goal']

inp = {'inp_img': img, 'lang_goal': lang_goal}

preds, p0_pix = run_pick(inp)
p0_theta = 0

# run_place returns an index triple into a (12, 320, 160) volume:
# element 0 is the rotation bin, elements 1-2 are the pixel.
preds, p1 = run_place(inp, p0_pix)
p1_pix = p1[1:3]
# The angle comes from the rotation bin (p1[0]), not the pixel row.
p1_theta = p1[0] * 2 * np.pi / preds.shape[0]

# Pixels to end effector poses.
bounds = np.array([[0.25, 0.75], [-0.5, 0.5], [0, 0.28]])
pix_size = 0.003125

# Fourth image channel is used as the height map.
hmap = img[:, :, 3]
p0_xyz = utils.pix_to_xyz(p0_pix, hmap, bounds, pix_size)
p1_xyz = utils.pix_to_xyz(p1_pix, hmap, bounds, pix_size)
p0_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p0_theta))
p1_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p1_theta))

act = {
    'pose0': (np.asarray(p0_xyz), np.asarray(p0_xyzw)),
    'pose1': (np.asarray(p1_xyz), np.asarray(p1_xyzw)),
    'pick': [p0_pix[0], p0_pix[1], p0_theta],
    'place': [p1_pix[0], p1_pix[1], p1_theta],
}

obs, reward, done, info = env.step(act)

# NOTE(review): no start_rec() call appears in this cell; end_rec() closes
# whatever recording is already open.
env.end_rec()

[libx264 @ 0x7492000] -qscale is ignored, -crf is recommended.


In [96]:
# Run one pick-and-place step with a hand-written language goal instead of the
# goal supplied by the environment, and record it as "video41".
env.start_rec("video" + str(41))

# NOTE(review): `i` is not defined in this cell; it is whatever value an
# earlier loop left behind, so the episode loaded here depends on kernel state.
episode, seed = train_dataset.load(i)
task = tasks.names["stack-block-pyramid-seq-seen-colors"]()
task.mode = "train"
env.seed(seed)
env.set_task(task)
obs = env.reset()
info = env.info

img = train_dataset.get_image(obs)
lang_goal = info['lang_goal']

# The models receive this custom instruction, not the environment's lang_goal.
inp = {'inp_img': img, 'lang_goal': "put the brown block on the red block"}

preds, p0_pix = run_pick(inp)
p0_theta = 0

# run_place returns an index triple into a (12, 320, 160) volume:
# element 0 is the rotation bin, elements 1-2 are the pixel.
preds, p1 = run_place(inp, p0_pix)
p1_pix = p1[1:3]
# The angle comes from the rotation bin (p1[0]), not the pixel row.
p1_theta = p1[0] * 2 * np.pi / preds.shape[0]

# Pixels to end effector poses.
bounds = np.array([[0.25, 0.75], [-0.5, 0.5], [0, 0.28]])
pix_size = 0.003125

# Fourth image channel is used as the height map.
hmap = img[:, :, 3]
p0_xyz = utils.pix_to_xyz(p0_pix, hmap, bounds, pix_size)
p1_xyz = utils.pix_to_xyz(p1_pix, hmap, bounds, pix_size)
p0_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p0_theta))
p1_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p1_theta))

act = {
    'pose0': (np.asarray(p0_xyz), np.asarray(p0_xyzw)),
    'pose1': (np.asarray(p1_xyz), np.asarray(p1_xyzw)),
    'pick': [p0_pix[0], p0_pix[1], p0_theta],
    'place': [p1_pix[0], p1_pix[1], p1_theta],
}

# Prints the environment's goal, which differs from the custom goal fed to the models.
print(f'Lang Goal: {lang_goal}')

obs, reward, done, info = env.step(act)

env.end_rec()


Lang Goal: put the green block on the lightest brown block


[libx264 @ 0x66bb000] -qscale is ignored, -crf is recommended.
