# Data preparation

(Consider splitting into train/val/test by object affordances rather than randomly, so that, e.g., pen, telescope and laptop are in the train set while pencil, microscope and desktop computer are in the test set.)


(Also, consider adding multiple images of each object. This way, the model can train on mapping each object to its affordances multiple times.)

In [None]:
from transformers import VisualBertModel, BertModel, BertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import torch.optim as optim

In [None]:
def clean_up(object_name):
    """Turn a raw object identifier into a human-readable name.

    Everything from the first ``'.'`` onwards is discarded and every
    underscore in the remaining text is replaced by a space.

    Args:
        object_name: Raw identifier string, e.g. ``'coffee_cup.jpg'``.

    Returns:
        The cleaned name, e.g. ``'coffee cup'``.
    """
    # partition() returns the whole string as the stem when there is no '.'.
    stem, _, _ = object_name.partition('.')
    return stem.replace('_', ' ')

In [None]:
# Load the affordance annotation table and relabel the 'Unnamed: 0' column
# (presumably the header-less first CSV column) as the object-name column.
file = '../data/affordance_annotations.txt'
df = pd.read_csv(file).rename(columns={'Unnamed: 0': 'Object'})
# Replace the raw identifiers with readable names.
df['Object'] = df['Object'].apply(clean_up)

In [None]:
# Notebook display: show the loaded and cleaned annotation table.
df

In [None]:
# Object names in table order (one entry per annotation row).
unique_objects = df['Object'].tolist()
# Every column from the third one onwards is treated as an affordance label.
unique_affordances = list(map(str.lower, df.columns[2:]))

In [None]:
# Notebook display: list of object names.
unique_objects

In [None]:
# Notebook display: list of lower-cased affordance names.
unique_affordances

## Approach 1 - pairs of objects and their affordances

In [None]:
# Shuffle all rows reproducibly (fixed seed) ...
df1 = df.sample(frac=1, random_state=42)
# ... then discard the old row labels so the rows are numbered 0..n-1 again.
df1 = df1.reset_index(drop=True)

In [None]:
# Positional split of the shuffled objects: first 42 rows for training,
# the next 10 for validation, everything after that for testing.
train1 = df1.iloc[:42]
val1 = df1.iloc[42:52]
test1 = df1.iloc[52:]

In [None]:
def get_gold_data_1(table):
    """Flatten an object-per-row table into (object, affordance, value) triples.

    The first cell of each row is taken as the object name. Every cell whose
    value is a string is skipped; every other cell yields one triple.

    Args:
        table: DataFrame with one row per object. Column names must be
            strings because they are lower-cased for the output.

    Returns:
        A list of ``(object_name, column_name_lowercased, value)`` tuples in
        row-major order.
    """
    gold_data_pairs = []
    for _, row in table.iterrows():
        for column, value in zip(table.columns, row):
            # String cells carry names/metadata rather than labels.
            if isinstance(value, str):
                continue
            # .iloc[0] is explicit positional access; plain row[0] on a
            # label-indexed Series is deprecated in pandas.
            gold_data_pairs.append((row.iloc[0], column.lower(), value))
    return gold_data_pairs

In [None]:
# Build the (object, affordance, value) triples for each split of approach 1.
train1_pairs, val1_pairs, test1_pairs = (
    get_gold_data_1(split) for split in (train1, val1, test1)
)

In [None]:
# Notebook display: training triples for approach 1.
train1_pairs

In [None]:
# Notebook display: validation triples for approach 1.
val1_pairs

## Approach 2 - pairs of affordances and their objects

In [None]:
def get_gold_data_2(table):
    """Flatten an affordance-per-row table into (affordance, object, value) triples.

    Each row's index label is used as the affordance name and each column
    name as the object name. Cells holding strings are skipped.

    Args:
        table: DataFrame indexed by affordance name with one column per
            object. Index labels and column names must be strings because
            both are lower-cased for the output.

    Returns:
        A list of ``(row_label_lowercased, column_name_lowercased, value)``
        tuples in row-major order.
    """
    gold_data_pairs = []
    for index, row in table.iterrows():
        # Use the label of the row being visited; table.index[0] would stamp
        # every triple with the first row's affordance.
        row_label = index.lower()
        for column, value in zip(table.columns, row):
            if isinstance(value, str):
                continue
            gold_data_pairs.append((row_label, column.lower(), value))
    return gold_data_pairs

In [None]:
# Transpose so that every row is one original column (affordances become rows)
# and every column is one object.
df2 = df.transpose()
# Name the columns after the objects by label. Taking the header by position
# after shuffling would pick whichever row the shuffle happened to put first.
df2.columns = df2.loc['Object']
# The first two rows are the non-affordance columns of df (affordances start at
# df.columns[2:]). Keep them on top so the positional [2:...] split slices skip
# exactly those rows, and shuffle only the affordance rows (fixed seed).
df2 = pd.concat([df2.iloc[:2], df2.iloc[2:].sample(frac=1, random_state=42)])

In [None]:
# Positional split of the transposed table. The first two rows are skipped
# (presumably the non-affordance rows); then 9 rows for training, 3 for
# validation and 3 for testing.
train2 = df2.iloc[2:11]
val2 = df2.iloc[11:14]
test2 = df2.iloc[14:17]

In [None]:
# Notebook display: training rows for approach 2.
train2

In [None]:
# Build the (affordance, object, value) triples for each split of approach 2.
train2_pairs, val2_pairs, test2_pairs = (
    get_gold_data_2(split) for split in (train2, val2, test2)
)

In [None]:
# Notebook display: training triples for approach 2.
train2_pairs

# BERT Embeddings

In [None]:
# Load the pretrained tokenizer of the 'bert-base-uncased' checkpoint; it is
# used below to prepare inputs for both BERT and VisualBERT.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_string(text):
    """Wrap *text* in BERT's [CLS]/[SEP] markers and split it into tokens.

    Args:
        text: The string to tokenize (an object or affordance name).

    Returns:
        The token list produced by the module-level ``tokenizer`` for the
        marked-up string.
    """
    return tokenizer.tokenize(f"[CLS] {text} [SEP]")

In [None]:
# Load pretrained BERT and ask it to return the hidden states of every layer,
# because the embeddings below are read from an intermediate layer.
bert_model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

In [None]:
# Put BERT in evaluation mode: it is only used for feature extraction here.
bert_model.eval()

In [None]:
# Lookup table from a name (object or affordance) to its BERT embedding, so
# the dataset triples can later be converted to tensors by plain dict access.
bert_embeddings_dict = {}

with torch.no_grad():  # pure feature extraction: no autograd graph needed

    for obj, affordance, _label in train1_pairs + val1_pairs + test1_pairs:

        # Objects and affordances are embedded identically and share one dict.
        for text in (obj, affordance):

            if text in bert_embeddings_dict:
                continue  # every distinct string is embedded only once

            tokens = tokenize_string(text)
            tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
            # The second positional argument of the Hugging Face forward() is
            # the attention mask, so this all-ones tensor has always acted as
            # "attend to every token". Passing it by keyword makes that explicit.
            attention_mask = torch.tensor([[1] * len(tokens)])

            outputs = bert_model(tokens_tensor, attention_mask=attention_mask)
            hidden_states = outputs[2]
            token_vecs = hidden_states[-2][0]  # penultimate layer, single sequence in the batch

            # Mean over the token vectors gives one vector for the whole input.
            bert_embeddings_dict[text] = torch.mean(token_vecs, dim=0)


In [None]:
# Notebook display: spot-check the embedding stored for 'coffee cup'.
bert_embeddings_dict['coffee cup']

In [None]:
# Notebook display: shape of a single embedding vector (here for 'grasp').
bert_embeddings_dict['grasp'].size()

In [None]:
# Notebook display: number of distinct strings that were embedded.
len(bert_embeddings_dict)

# Visual Bert Embeddings

In [None]:
# Load pretrained VisualBERT with all hidden states exposed. It is only fed
# text in this notebook (no visual embeddings are passed to it below).
visual_bert_model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre",output_hidden_states=True)

In [None]:
# Lookup table from a name (object or affordance) to its VisualBERT embedding,
# so the dataset triples can later be converted to tensors by plain dict access.
visual_bert_embeddings_dict = {}

with torch.no_grad():  # pure feature extraction: no autograd graph needed

    for obj, affordance, _label in train1_pairs + val1_pairs + test1_pairs:

        # Objects and affordances are embedded identically and share one dict.
        for text in (obj, affordance):

            if text in visual_bert_embeddings_dict:
                continue  # every distinct string is embedded only once

            tokens = tokenize_string(text)
            tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
            # The second positional argument of the Hugging Face forward() is
            # the attention mask, so this all-ones tensor has always acted as
            # "attend to every token". Passing it by keyword makes that explicit.
            attention_mask = torch.tensor([[1] * len(tokens)])

            # Text-only input: no visual embeddings are supplied.
            outputs = visual_bert_model(tokens_tensor, attention_mask=attention_mask)
            hidden_states = outputs[2]
            token_vecs = hidden_states[-2][0]  # penultimate layer, single sequence in the batch

            # Mean over the token vectors gives one vector for the whole input.
            visual_bert_embeddings_dict[text] = torch.mean(token_vecs, dim=0)

In [None]:
# Notebook display: spot-check the VisualBERT embedding stored for 'coffee cup'.
visual_bert_embeddings_dict['coffee cup']

In [None]:
# Notebook display: number of distinct strings that were embedded.
len(visual_bert_embeddings_dict)

# The model

In [None]:
# Training configuration shared by the data loaders and the training loop.
hyperparameters = dict(
    epochs=2000,
    batch_size=64,
    learning_rate=0.005,
)

In [None]:
class Probe(nn.Module):
    """Linear probe that classifies an (object, affordance) embedding pair.

    The two embeddings are multiplied element-wise, squashed with a sigmoid,
    projected by a single linear layer and turned into log-probabilities.

    Args:
        embedding_dim: Size of each input embedding (default 768).
        num_classes: Number of output classes (default 2).
    """

    def __init__(self, embedding_dim=768, num_classes=2):
        super().__init__()
        self.sigmoid = nn.Sigmoid()
        self.fc = nn.Linear(embedding_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, obj, affordance):
        """Return class log-probabilities for a batch of embedding pairs.

        Args:
            obj: Tensor of object embeddings; the last dimension must equal
                ``embedding_dim``.
            affordance: Tensor of affordance embeddings, multiplied
                element-wise with ``obj``.

        Returns:
            Tensor of log-probabilities, normalised over dimension 1.
        """
        combined_vector = obj * affordance
        squashed = self.sigmoid(combined_vector)
        logits = self.fc(squashed)
        return self.softmax(logits)

## Dataloader BERT

In [None]:
# Replace the two strings of every (object, affordance, label) triple with
# their BERT embeddings; the label is carried over unchanged.
embeddings_train_data = [
    (bert_embeddings_dict[obj_name], bert_embeddings_dict[aff_name], label)
    for obj_name, aff_name, label in train1_pairs
]
embeddings_val_data = [
    (bert_embeddings_dict[obj_name], bert_embeddings_dict[aff_name], label)
    for obj_name, aff_name, label in val1_pairs
]
embeddings_test_data = [
    (bert_embeddings_dict[obj_name], bert_embeddings_dict[aff_name], label)
    for obj_name, aff_name, label in test1_pairs
]

# One shuffling loader per split, all with the configured batch size.
bert_train_dataloader = DataLoader(
    dataset=embeddings_train_data,
    batch_size=hyperparameters["batch_size"],
    shuffle=True,
)
bert_val_dataloader = DataLoader(
    dataset=embeddings_val_data,
    batch_size=hyperparameters["batch_size"],
    shuffle=True,
)
bert_test_dataloader = DataLoader(
    dataset=embeddings_test_data,
    batch_size=hyperparameters["batch_size"],
    shuffle=True,
)

## Dataloader VisualBERT

In [None]:
# Replace the two strings of every (object, affordance, label) triple with
# their VisualBERT embeddings; the label is carried over unchanged.
embeddings_train_data = [
    (visual_bert_embeddings_dict[obj_name], visual_bert_embeddings_dict[aff_name], label)
    for obj_name, aff_name, label in train1_pairs
]
embeddings_val_data = [
    (visual_bert_embeddings_dict[obj_name], visual_bert_embeddings_dict[aff_name], label)
    for obj_name, aff_name, label in val1_pairs
]
embeddings_test_data = [
    (visual_bert_embeddings_dict[obj_name], visual_bert_embeddings_dict[aff_name], label)
    for obj_name, aff_name, label in test1_pairs
]

# One shuffling loader per split, all with the configured batch size.
visual_bert_train_dataloader = DataLoader(
    dataset=embeddings_train_data,
    batch_size=hyperparameters["batch_size"],
    shuffle=True,
)
visual_bert_val_dataloader = DataLoader(
    dataset=embeddings_val_data,
    batch_size=hyperparameters["batch_size"],
    shuffle=True,
)
visual_bert_test_dataloader = DataLoader(
    dataset=embeddings_test_data,
    batch_size=hyperparameters["batch_size"],
    shuffle=True,
)

# Training

In [None]:
def plot_accuracy(epochs, train_acc, val_acc):
    """Draw training and validation accuracy against the epoch number.

    Args:
        epochs: X-axis values (epoch numbers).
        train_acc: Training accuracy per epoch.
        val_acc: Validation accuracy per epoch.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    plt.title("Training and Validation Accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    curves = (
        (train_acc, "Training Accuracy"),
        (val_acc, "Validation Accuracy"),
    )
    for values, curve_label in curves:
        plt.plot(epochs, values, label=curve_label)
    plt.legend()
    plt.show()

In [None]:
def plot_loss(epochs, train_loss, val_loss):
    """Draw training and validation loss against the epoch number.

    Args:
        epochs: X-axis values (epoch numbers).
        train_loss: Training loss per epoch.
        val_loss: Validation loss per epoch.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    plt.title("Training and Validation Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    curves = (
        (train_loss, "Training Loss"),
        (val_loss, "Validation Loss"),
    )
    for values, curve_label in curves:
        plt.plot(epochs, values, label=curve_label)
    plt.legend()
    plt.show()

In [None]:
# Train the probe on the BERT embeddings and track loss/accuracy per epoch.
probe = Probe()
print(probe)

# The probe emits log-probabilities, which is the input NLLLoss works on.
criterion = nn.NLLLoss()
optimizer = optim.Adam(
    probe.parameters(),
    lr=hyperparameters["learning_rate"]
)

epoch_list = []
val_loss_list = []
train_loss_list = []
total_loss = 0  # running sum of every training batch loss over all epochs

train_accuracy_list = []
val_accuracy_list = []

for epoch in range(hyperparameters["epochs"]):

    # TRAIN LOOP
    probe.train()
    training_loss = 0.0
    train_correct = 0
    train_seen = 0

    for obj, affordance, truth_value in bert_train_dataloader:

        output = probe(obj, affordance)
        loss = criterion(output, truth_value)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        n_samples = truth_value.size(0)
        total_loss += loss.item()
        # The criterion returns the batch mean. Weight it by the batch size so
        # the epoch figure is a per-sample average and a short final batch is
        # not over-weighted.
        training_loss += loss.item() * n_samples

        # Count correct predictions; accuracy is computed once per epoch.
        train_correct += (output.argmax(dim=1) == truth_value).sum().item()
        train_seen += n_samples

    # VALIDATION LOOP
    probe.eval()
    validation_loss = 0.0
    val_correct = 0
    val_seen = 0

    # No parameter updates happen here, so skip building the autograd graph.
    with torch.no_grad():
        for obj, affordance, truth_value in bert_val_dataloader:

            output = probe(obj, affordance)
            loss = criterion(output, truth_value)

            n_samples = truth_value.size(0)
            validation_loss += loss.item() * n_samples
            val_correct += (output.argmax(dim=1) == truth_value).sum().item()
            val_seen += n_samples

    epoch_list.append(epoch + 1)
    training_loss_avg = training_loss / train_seen
    train_loss_list.append(training_loss_avg)
    validation_loss_avg = validation_loss / val_seen
    val_loss_list.append(validation_loss_avg)

    train_accuracy = train_correct / train_seen
    val_accuracy = val_correct / val_seen
    train_accuracy_list.append(train_accuracy)
    val_accuracy_list.append(val_accuracy)

    print("Epoch: {}".format(epoch + 1))
    print("Training loss: {}".format(training_loss_avg))
    print("Validation loss: {}".format(validation_loss_avg))
    print("Training accuracy: {}".format(train_accuracy))
    print("Validation accuracy: {}".format(val_accuracy))

plot_loss(epoch_list, train_loss_list, val_loss_list)
plot_accuracy(epoch_list, train_accuracy_list, val_accuracy_list)