## Import dependencies

In [1]:
from osg.utils.general_utils import load_robot_data, create_robot_observation_graph, get_spatial_referents
from osg.vlm_library import vlm_library

  from .autonotebook import tqdm as notebook_tqdm


## Setup

In [2]:
# --- Setup ---
# Folder passed as `tmp_fldr` to the VLM wrapper and to the data helpers below.
tmp_fldr = "results/"

# VLM wrapper configured with OWL-ViT for detection, Mobile SAM for segmentation,
# and "robot" as the data source.
vlm_instance = vlm_library(
    vl_model="owl_vit",
    data_src="robot",
    seg_model="mobile_sam",
    tmp_fldr=tmp_fldr,
)

# --- Load data ---
data_path = "../data/sample_robot_data"
observation_data, edge_connectivity, env_pointcloud = load_robot_data(data_path, tmp_fldr)

# The helper returns four values; only the first (the observation graph) is kept here.
observations_graph, _, _, _ = create_robot_observation_graph(
    observation_data,
    edge_connectivity,
    tmp_fldr=tmp_fldr,
)

-------------------------------------------------
Visual language model: owl_vit
-------------------------------------------------
Segmentation model: Mobile SAM
-------------------------------------------------
Data source: robot
-------------------------------------------------
0 out of 12 || Getting cardinal images for waypoint:balky-cuckoo-DSOhlw+wDw5TKMQhzKNg1Q==
1 out of 12 || Getting cardinal images for waypoint:bended-drum-ZK.nOK2ROH99jbDMGbrG3Q==
2 out of 12 || Getting cardinal images for waypoint:brainy-eel-acD1gEPuOhQgqad45uHuXw==
3 out of 12 || Getting cardinal images for waypoint:frayed-emu-M5JSw0IiYB8xYBwfswR41Q==
4 out of 12 || Getting cardinal images for waypoint:garish-rat-RgSDxBIafXZwgLb2o+h3TA==
5 out of 12 || Getting cardinal images for waypoint:leafy-mayfly-8hSmtSRh9YMd0eyKxJ6Tjw==
6 out of 12 || Getting cardinal images for waypoint:lemony-gibbon-44n7dioWNlkPvLqiIcb2gg==
7 out of 12 || Getting cardinal images for waypoint:lethal-fish-xlr1rhR2yqmfn88mEo7FTw==
8 out 

In [3]:
# Display the attributes stored on node 0 of the observation graph.
observations_graph.nodes[0]

{'rgb': {0: <PIL.Image.Image image mode=RGB size=640x480>,
  1: <PIL.Image.Image image mode=RGB size=640x480>,
  2: <PIL.Image.Image image mode=RGB size=640x480>,
  3: <PIL.Image.Image image mode=RGB size=640x480>},
 'pose': {0: {'position': [2.668748959000604,
    -2.7842405328179436,
    0.03886261251422272],
   'quaternion(wxyz)': [0.9742608555283065,
    1.5039994283409839e-06,
    0.00015538128483319542,
    -0.22542351527657162],
   'rotation_matrix': array([[ 8.98368429e-01,  4.39242614e-01,  3.02085733e-04],
          [-4.39242613e-01,  8.98368478e-01, -7.29837664e-05],
          [-3.03441881e-04, -6.71226153e-05,  9.99999952e-01]])},
  1: {'position': [2.654588264705733, -2.795095249821269, 0.04214847160365154],
   'quaternion(wxyz)': [0.847823810652734,
    0.0024072553041238,
    -0.0017644481240037956,
    0.5302696275810973],
   'rotation_matrix': array([[ 4.37622018e-01, -8.99158928e-01, -4.38893517e-04],
          [ 8.99141938e-01,  4.37616654e-01, -5.95312323e-03],
    

## Composible Referent Descriptors

In [4]:
## Composible Referent Descriptor (CRD)
    # CRDs are propositional expressions that represent specific referent instances by chaining
    # comparators that encode descriptive spatial information.
    # For more details see: https://arxiv.org/abs/2402.11498

## CRD Syntax
    # referent_1::isbetween(referent_2,referent_3)  : denotes that referent_1 is between referent_2 and referent_3.
    # referent_1::isabove(referent_2)               : denotes that referent_1 is above referent_2.
    # referent_1::isbelow(referent_2)               : denotes that referent_1 is below referent_2.
    # referent_1::isleftof(referent_2)              : denotes that referent_1 is left of referent_2.
    # referent_1::isrightof(referent_2)             : denotes that referent_1 is right of referent_2.
    # referent_1::isnextto(referent_2)              : denotes that referent_1 is close to referent_2.
    # referent_1::isinfrontof(referent_2)           : denotes that referent_1 is in front of referent_2.
    # referent_1::isbehind(referent_2)              : denotes that referent_1 is behind referent_2.
    #
    # As the examples below show, multi-word referent names are written with underscores
    # (e.g. green_laptop), and a comparator argument may itself be a CRD (nesting).

## Examples
    # Desired referent:   table behind the fridge
    # CRD representation: table::isbehind(fridge)

    # Desired referent:    chair between the green laptop and the yellow box below the sofa
    # CRD representation:  chair::isbetween(green_laptop,yellow_box::isbelow(sofa))

    # Desired referent:    brown bag between the television and the kettle on the left of the green seat
    # CRD representation:  brown_bag::isbetween(television, kettle::isleftof(green_seat))

## Ground referents and filter instances via spatial constraints

In [5]:
# Referents to ground: a comma-separated list in which each entry is either a plain
# referent name or a composible referent descriptor (CRD).
referents_to_ground = [
    "laptop",
    "whiteboard::isinfrontof(green_plush_toy)",
]

## Extract spatial information
# For this input the printed result maps every referent name (including the one that
# appears only inside a descriptor) to its list of spatial descriptors.
referent_spatial_details = get_spatial_referents(referents_to_ground)
print("referent_spatial_details: ", referent_spatial_details, "\n")

referent_spatial_details:  {'laptop': [], 'whiteboard': ['isinfrontof(green_plush_toy)'], 'green_plush_toy': []} 



In [6]:
## Spatial grounding
# Ground the referents in `referent_spatial_details` against the observation graph,
# with visualization and segmentation switched on and multiprocessing over 3 workers.
relevant_element_details = vlm_instance.spatial_grounding(
    observations_graph,
    referent_spatial_details,
    visualize=True,
    use_segmentation=True,
    multiprocessing=True,
    workers=3,
)

Propositions to ground: ['laptop', 'whiteboard', 'green_plush_toy']
--------------------------------------------------------------------------
 Running: VLM Detections || Datanode count: 12 || workers: 3 || workers_after_chunking: 3
--------------------------------------------------------------------------


*************************************************
Begin Spatial Grounding
-------------------------------------------------

Current Element whiteboard_0a_0 || Type: whiteboard || Spatial Details: ['isinfrontof(green_plush_toy)']
    Checking Descriptor: isinfrontof(green_plush_toy)  || Comparative referents: ['green_plush_toy']
        'whiteboard_0a_0' is in front of 'green_plush_toy_2c_1': False
    Element whiteboard_0a_0 || Completed spatial checks: {'isinfrontof(green_plush_toy)': False}

Current Element whiteboard_0b_0 || Type: whiteboard || Spatial Details: ['isinfrontof(green_plush_toy)']
    Checking Descriptor: isinfrontof(green_plush_toy)  || Comparative referents: ['gr

In [7]:
# Report how many elements remain after spatial constraint filtering.
print(
    "\nReferents after spatial constraint filtering:",
    len(relevant_element_details),
)

# List the mask id of every remaining element.
print(
    "Filtered elements \n",
    [detail["mask_id"] for detail in relevant_element_details],
)


Referents after spatial constraint filtering: 18
Filtered elements 
 ['whiteboard_2a_0', 'green_plush_toy_2c_1', 'whiteboard_3c_0', 'whiteboard_3c_1', 'whiteboard_4b_0', 'whiteboard_4b_1', 'laptop_5b_1', 'whiteboard_5d_0', 'whiteboard_6c_0', 'whiteboard_7c_0', 'whiteboard_7c_1', 'whiteboard_9a_0', 'whiteboard_9a_1', 'laptop_10a_0', 'whiteboard_10d_0', 'whiteboard_10d_1', 'laptop_11b_0', 'whiteboard_11c_0']
