# Data preparation

(Consider splitting into train/val/test based on the affordances of the objects rather than randomly, so that e.g. pen, telescope and laptop are in the training set while pencil, microscope and desktop computer are in the test set.)


(Also, consider adding multiple images of each object. That way, the model can learn the mapping from each object to its affordances from several examples.)

In [1]:
from transformers import VisualBertModel, BertModel, BertTokenizer
import torch
import pandas as pd

In [2]:
def clean_up(object_name):
    """Turn a raw annotation label into a readable object name.

    Everything from the first ``'.'`` onwards is discarded and every
    underscore in the remaining stem becomes a space, e.g.
    ``'coffee_cup.jpg'`` -> ``'coffee cup'``.

    Args:
        object_name: Raw label string.

    Returns:
        The cleaned label; an empty string if the input is empty or starts
        with ``'.'``.
    """
    # partition() always returns a 3-tuple, so labels without a '.' are
    # kept whole.
    stem, _, _ = object_name.partition('.')
    return stem.replace('_', ' ')

In [3]:
# Load the affordance annotations. pandas labels a header-less first column
# 'Unnamed: 0'; it is relabelled 'Object' and its entries are cleaned up.
file = '../data/affordance_annotations.txt'
df = pd.read_csv(file).rename(columns={'Unnamed: 0': 'Object'})
df['Object'] = df['Object'].map(clean_up)

In [4]:
df

Unnamed: 0,Object,ImageNet synset,Grasp,Lift,Throw,Push,Fix,Ride,Play,Watch,SitOn,Feed,Row,PourFrom,LookThrough,WriteWith,TypeOn
0,automobile engine,n02761557,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,axe,n02764044,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
2,bicycle,n02834778,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0
3,bottle,n02876657,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0
4,camera,n02942699,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,sickle,n04213353,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
58,spoon,n04284002,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
59,stool,n04326896,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
60,typewriter,n04505036,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1


In [5]:
# Object names, in table order.
unique_objects = df['Object'].tolist()
# Lower-cased affordance names: every column header after the first two columns.
# NOTE(review): lower-casing fuses multi-word headers such as 'SitOn' into
# 'siton'; confirm this is the intended text form for the language models.
unique_affordances = list(map(str.lower, df.columns[2:]))

In [6]:
unique_objects

['automobile engine',
 'axe',
 'bicycle',
 'bottle',
 'camera',
 'can',
 'car tire',
 'carving knife',
 'chair',
 'chalk',
 'cleaver',
 'desktop computer',
 'dish',
 'dog',
 'dustcloth',
 'fishing pole',
 'food turner',
 'frisbee',
 'guitar',
 'hand saw',
 'handset',
 'horse',
 'laptop',
 'microscope',
 'mobile phone',
 'mop',
 'pen',
 'pitcher',
 'power saw',
 'shopping cart',
 'small boat',
 'sofa',
 'teapot',
 'telescope',
 'television',
 'toothbrush',
 'umbrella',
 'vacuum cleaner',
 'violin',
 'wheelbarrow',
 'banjo',
 'bench',
 'bowl',
 'broom',
 'camel',
 'cat',
 'coffee cup',
 'donkey',
 'flagon',
 'hammer',
 'hand truck',
 'kayak',
 'monitor',
 'motorcycle',
 'pencil',
 'rhinoceros',
 'serving cart',
 'sickle',
 'spoon',
 'stool',
 'typewriter',
 'walkie-talkie']

In [7]:
unique_affordances

['grasp',
 'lift',
 'throw',
 'push',
 'fix',
 'ride',
 'play',
 'watch',
 'siton',
 'feed',
 'row',
 'pourfrom',
 'lookthrough',
 'writewith',
 'typeon']

## Approach 1 - pairs of objects and their affordances

In [8]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
df1 = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Positional split of the shuffled objects: first 42 rows for training,
# the next 10 for validation, and the remainder for testing.
train1, val1, test1 = df1.iloc[:42], df1.iloc[42:52], df1.iloc[52:]

In [10]:
def get_gold_data_1(table):
    """Flatten an object-by-affordance table into labelled pairs.

    Args:
        table: DataFrame with one row per object. The first column holds the
            object name; every non-string cell in a row is treated as a label
            for that row's object and the cell's column.

    Returns:
        A list of ``(object_name, affordance_name, value)`` tuples in
        row-major order, where ``affordance_name`` is the lower-cased column
        header. An empty table yields an empty list.
    """
    gold_data_pairs = []
    for _, row in table.iterrows():
        # Positional access has to go through .iloc: a plain row[0] on a
        # label-indexed Series is deprecated in pandas.
        object_name = row.iloc[0]
        for column, value in zip(table.columns, row):
            # String cells (e.g. the object name itself) are metadata, not labels.
            if isinstance(value, str):
                continue
            gold_data_pairs.append((object_name, column.lower(), value))
    return gold_data_pairs

In [11]:
# Build the (object, affordance, label) triples for each approach-1 split.
train1_pairs, val1_pairs, test1_pairs = (
    get_gold_data_1(split) for split in (train1, val1, test1)
)

In [12]:
train1_pairs

[('hand truck', 'grasp', 0),
 ('hand truck', 'lift', 1),
 ('hand truck', 'throw', 0),
 ('hand truck', 'push', 1),
 ('hand truck', 'fix', 1),
 ('hand truck', 'ride', 1),
 ('hand truck', 'play', 0),
 ('hand truck', 'watch', 0),
 ('hand truck', 'siton', 1),
 ('hand truck', 'feed', 0),
 ('hand truck', 'row', 0),
 ('hand truck', 'pourfrom', 0),
 ('hand truck', 'lookthrough', 0),
 ('hand truck', 'writewith', 0),
 ('hand truck', 'typeon', 0),
 ('serving cart', 'grasp', 0),
 ('serving cart', 'lift', 0),
 ('serving cart', 'throw', 0),
 ('serving cart', 'push', 1),
 ('serving cart', 'fix', 1),
 ('serving cart', 'ride', 1),
 ('serving cart', 'play', 0),
 ('serving cart', 'watch', 0),
 ('serving cart', 'siton', 1),
 ('serving cart', 'feed', 0),
 ('serving cart', 'row', 0),
 ('serving cart', 'pourfrom', 0),
 ('serving cart', 'lookthrough', 0),
 ('serving cart', 'writewith', 0),
 ('serving cart', 'typeon', 0),
 ('automobile engine', 'grasp', 0),
 ('automobile engine', 'lift', 0),
 ('automobile engin

In [13]:
val1_pairs

[('coffee cup', 'grasp', 1),
 ('coffee cup', 'lift', 1),
 ('coffee cup', 'throw', 1),
 ('coffee cup', 'push', 1),
 ('coffee cup', 'fix', 0),
 ('coffee cup', 'ride', 0),
 ('coffee cup', 'play', 0),
 ('coffee cup', 'watch', 0),
 ('coffee cup', 'siton', 0),
 ('coffee cup', 'feed', 0),
 ('coffee cup', 'row', 0),
 ('coffee cup', 'pourfrom', 1),
 ('coffee cup', 'lookthrough', 0),
 ('coffee cup', 'writewith', 0),
 ('coffee cup', 'typeon', 0),
 ('axe', 'grasp', 1),
 ('axe', 'lift', 1),
 ('axe', 'throw', 1),
 ('axe', 'push', 1),
 ('axe', 'fix', 0),
 ('axe', 'ride', 0),
 ('axe', 'play', 0),
 ('axe', 'watch', 0),
 ('axe', 'siton', 0),
 ('axe', 'feed', 0),
 ('axe', 'row', 0),
 ('axe', 'pourfrom', 0),
 ('axe', 'lookthrough', 0),
 ('axe', 'writewith', 0),
 ('axe', 'typeon', 0),
 ('horse', 'grasp', 0),
 ('horse', 'lift', 0),
 ('horse', 'throw', 0),
 ('horse', 'push', 0),
 ('horse', 'fix', 0),
 ('horse', 'ride', 1),
 ('horse', 'play', 0),
 ('horse', 'watch', 0),
 ('horse', 'siton', 1),
 ('horse', 'fee

## Approach 2 - pairs of affordances and their objects

In [14]:
def get_gold_data_2(table):
    """Flatten an affordance-by-object table into labelled pairs.

    Args:
        table: DataFrame whose index labels are affordance names and whose
            column labels are object names. Every non-string cell is treated
            as the label for that (affordance, object) combination.

    Returns:
        A list of ``(affordance_name, object_name, value)`` tuples in
        row-major order; both names are lower-cased. An empty table yields
        an empty list.
    """
    gold_data_pairs = []
    for index, row in table.iterrows():
        # Use the label of the row being processed; taking table.index[0]
        # would tag every pair with the first row's affordance.
        affordance = index.lower()
        for column, value in zip(table.columns, row):
            # String cells are metadata, not labels.
            if isinstance(value, str):
                continue
            gold_data_pairs.append((affordance, column.lower(), value))
    return gold_data_pairs

In [15]:
# Transpose so that every former column becomes a row and each object a column,
# then shuffle the rows with a fixed seed.
df2 = df.transpose()
df2 = df2.sample(frac=1, random_state=42)
# Take the column labels from the 'Object' row by name: after shuffling, the
# row in first position is only the object-name row for particular seeds.
# NOTE(review): the 'Object' and 'ImageNet synset' rows are shuffled together
# with the affordance rows, so positional slices of df2 may or may not contain
# them - confirm the split boundaries used downstream.
df2.columns = df2.loc['Object']

In [16]:
# Positional split of the shuffled rows: positions 2-10 for training,
# 11-13 for validation and 14-16 for testing.
train2, val2, test2 = df2.iloc[2:11], df2.iloc[11:14], df2.iloc[14:17]

In [17]:
train2

Object,automobile engine,axe,bicycle,bottle,camera,can,car tire,carving knife,chair,chalk,...,monitor,motorcycle,pencil,rhinoceros,serving cart,sickle,spoon,stool,typewriter,walkie-talkie
Push,0,1,1,1,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,1,1
WriteWith,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
Feed,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
LookThrough,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Play,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PourFrom,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Grasp,0,1,0,1,1,1,0,1,0,1,...,0,0,1,0,0,1,1,0,0,1
Watch,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
TypeOn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
# Build the (affordance, object, label) triples for each approach-2 split.
train2_pairs, val2_pairs, test2_pairs = (
    get_gold_data_2(split) for split in (train2, val2, test2)
)

In [19]:
train2_pairs

[('push', 'automobile engine', 0),
 ('push', 'axe', 1),
 ('push', 'bicycle', 1),
 ('push', 'bottle', 1),
 ('push', 'camera', 1),
 ('push', 'can', 1),
 ('push', 'car tire', 1),
 ('push', 'carving knife', 1),
 ('push', 'chair', 1),
 ('push', 'chalk', 1),
 ('push', 'cleaver', 1),
 ('push', 'desktop computer', 1),
 ('push', 'dish', 1),
 ('push', 'dog', 1),
 ('push', 'dustcloth', 1),
 ('push', 'fishing pole', 1),
 ('push', 'food turner', 1),
 ('push', 'frisbee', 1),
 ('push', 'guitar', 1),
 ('push', 'hand saw', 1),
 ('push', 'handset', 1),
 ('push', 'horse', 0),
 ('push', 'laptop', 1),
 ('push', 'microscope', 1),
 ('push', 'mobile phone', 1),
 ('push', 'mop', 1),
 ('push', 'pen', 1),
 ('push', 'pitcher', 1),
 ('push', 'power saw', 1),
 ('push', 'shopping cart', 1),
 ('push', 'small boat', 1),
 ('push', 'sofa', 0),
 ('push', 'teapot', 1),
 ('push', 'telescope', 1),
 ('push', 'television', 1),
 ('push', 'toothbrush', 1),
 ('push', 'umbrella', 1),
 ('push', 'vacuum cleaner', 1),
 ('push', 'vio

# BERT Embeddings

In [20]:
# Tokenizer for the 'bert-base-uncased' checkpoint; tokenize_string() and both
# embedding loops below rely on this module-level object.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [21]:
def tokenize_string(text):
    """Tokenize ``text`` wrapped in the ``[CLS]`` / ``[SEP]`` markers.

    The marked-up string is handed to the module-level ``tokenizer`` and the
    result of its ``tokenize`` method is returned unchanged.
    """
    return tokenizer.tokenize("".join(("[CLS] ", text, " [SEP]")))

In [22]:
# Load the pretrained BERT encoder used for the text embeddings below.
bert_model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Return the hidden states of every layer, not only the last one.
                                  )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# Put the model in evaluation mode before extracting embeddings (the module
# contains Dropout layers, which should be inactive at inference time).
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [24]:
# Maps every object / affordance string to its BERT embedding so that words
# can be looked up directly later on.
bert_embeddings_dict = {}

with torch.no_grad():  # embeddings are only read out, so no gradients are needed

    for obj, affordance, _truth_val in train1_pairs + val1_pairs + test1_pairs:

        # Objects and affordances are embedded identically and share one dict.
        for text in (obj, affordance):

            if text in bert_embeddings_dict:
                continue  # each distinct string is embedded only once

            token_ids = tokenizer.convert_tokens_to_ids(tokenize_string(text))
            tokens_tensor = torch.tensor([token_ids])
            # The second positional argument of the model's forward() is the
            # attention mask, so the all-ones tensor is passed under that name
            # (every token, including [CLS] and [SEP], is attended to).
            attention_mask = torch.ones_like(tokens_tensor)

            outputs = bert_model(tokens_tensor, attention_mask=attention_mask)
            hidden_states = outputs[2]
            token_vecs = hidden_states[-2][0]  # penultimate layer, only item in the batch
            # Mean over all token vectors gives one vector for the whole input.
            bert_embeddings_dict[text] = torch.mean(token_vecs, dim=0)


In [25]:
bert_embeddings_dict['coffee cup']

tensor([ 1.1761e-02,  1.6537e-01, -5.8541e-02, -4.0929e-01,  2.7759e-01,
        -2.2142e-01, -4.5131e-01,  5.0575e-01, -4.5594e-01, -3.2246e-01,
         2.8799e-02,  8.9958e-02,  6.7854e-02,  2.2720e-01,  2.7228e-03,
        -7.2006e-02,  4.8871e-02,  5.7241e-01,  5.1877e-01,  6.0365e-02,
        -5.5937e-02, -1.7037e-01, -1.2329e-01, -6.2228e-02,  1.4209e-01,
         7.7891e-02, -2.8361e-01,  8.5463e-02, -5.3574e-02,  2.7535e-01,
         3.9820e-01,  1.2675e-02,  3.7213e-01, -2.3714e-01, -6.3678e-04,
        -5.0599e-01,  3.2774e-01,  4.8251e-01, -4.8535e-01,  1.5368e-01,
        -1.6745e-01, -2.1683e-02,  9.7068e-01, -1.2071e-01,  1.5644e-01,
         2.5727e-01, -7.3715e-01, -2.2904e-01, -7.8507e-01, -3.9459e-01,
         1.8688e-01,  5.4118e-02, -3.6044e-01,  1.0047e-01,  1.5384e-01,
         3.1605e-01,  2.5628e-01, -1.2792e-01,  1.2189e-01,  2.4846e-02,
        -3.4284e-01, -2.0443e-01,  2.6048e-04, -1.7712e-01,  6.2992e-02,
        -6.8668e-02,  5.2262e-01,  1.2636e-01, -4.3

In [26]:
bert_embeddings_dict['grasp'].size()

torch.Size([768])

In [27]:
len(bert_embeddings_dict)

77

# Visual Bert Embeddings

In [28]:
# Load pretrained VisualBERT; output_hidden_states=True exposes the per-layer
# hidden states that the embedding loop below reads.
visual_bert_model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre",output_hidden_states=True)

Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# Maps every object / affordance string to its VisualBERT embedding so that
# words can be looked up directly later on.
visual_bert_embeddings_dict = {}

with torch.no_grad():  # embeddings are only read out, so no gradients are needed

    for obj, affordance, _truth_val in train1_pairs + val1_pairs + test1_pairs:

        # Objects and affordances are embedded identically and share one dict.
        for text in (obj, affordance):

            if text in visual_bert_embeddings_dict:
                continue  # each distinct string is embedded only once

            token_ids = tokenizer.convert_tokens_to_ids(tokenize_string(text))
            tokens_tensor = torch.tensor([token_ids])
            # The second positional argument of the model's forward() is the
            # attention mask, so the all-ones tensor is passed under that name
            # (every token, including [CLS] and [SEP], is attended to).
            attention_mask = torch.ones_like(tokens_tensor)

            # Only text inputs are supplied here; no visual embeddings are passed.
            outputs = visual_bert_model(tokens_tensor, attention_mask=attention_mask)
            hidden_states = outputs[2]
            token_vecs = hidden_states[-2][0]  # penultimate layer, only item in the batch
            # Mean over all token vectors gives one vector for the whole input.
            visual_bert_embeddings_dict[text] = torch.mean(token_vecs, dim=0)

In [32]:
visual_bert_embeddings_dict['coffee cup']

tensor([-1.1488e-01,  2.8898e-01,  3.3470e-01, -2.6124e-01,  9.6840e-02,
        -2.2301e-01,  1.4204e-01, -9.8500e-02, -2.9749e-01, -3.4153e-01,
        -1.5345e-01,  8.5644e-03, -2.0052e-01,  2.6325e-01,  9.6830e-02,
         1.4308e-01,  3.0279e-02,  8.8182e-02,  3.7079e-01, -4.2185e-01,
        -2.2610e-01, -2.7353e-02, -1.6980e-01,  4.9023e-02, -1.0711e-01,
        -2.2149e-01, -1.0475e-01, -1.4782e-01,  1.3282e-01, -1.5429e-01,
         3.5487e-01, -8.5045e-02,  3.0713e-01,  3.0890e-03,  2.9201e-01,
        -1.2735e-01,  1.7551e-01,  8.1743e-02, -1.2363e-01,  4.9050e-02,
        -2.6848e-01,  1.2090e-01,  3.1243e-01,  3.7343e-01,  1.5022e-01,
         5.1618e-03, -3.3208e-01,  6.6585e-02, -4.4918e-01, -5.1149e-01,
         2.9058e-01,  5.0829e-02,  1.3263e-01, -2.6644e-01, -5.6575e-02,
         2.1564e-01,  2.3412e-01,  1.9256e-01,  3.5003e-01,  1.3354e-02,
         2.8870e-02, -8.3321e-02,  7.0731e-02, -2.0450e-01, -2.7815e-01,
        -4.3389e-02,  3.5928e-01, -2.8819e-01, -3.3