# Testing and comparing the probes for BERT and VisualBERT

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from models import Probe
from dataset import create_df, get_gold_data, get_bert_embedding_dict, get_visual_bert_embedding_dict, get_lists_and_dicts

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.seq_relationship.weight', 'c

In [2]:
device = 'cuda:1'

In [3]:
df = create_df('../data/affordance_annotations.txt')
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
unique_objects, unique_affordances, word_to_index, index_to_word = get_lists_and_dicts(df)
train_pairs = get_gold_data(shuffled_df[:42])
val_pairs = get_gold_data(shuffled_df[42:52])
test_pairs = get_gold_data(shuffled_df[52:])
bert_word_to_embedding = get_bert_embedding_dict([train_pairs + val_pairs + test_pairs])
visual_bert_word_to_embedding = get_visual_bert_embedding_dict([train_pairs + val_pairs + test_pairs])

In [4]:
baseline_dict_objects = dict.fromkeys(unique_objects, 0)
for index, row in df.iterrows():
        for i, value in enumerate(row):
            if type(value) == str:
                pass
            else:
                baseline_dict_objects[row[0]] += value
                
baseline_total_objects = 0
for k,v in baseline_dict_objects.items():
    baseline_dict_objects[k] = np.round((v * 100)/15, 2)
    baseline_total_objects += v

baseline_total_objects = np.round((baseline_total_objects/(15*62))*100,2)
print(f'{100-baseline_total_objects} %')

72.26 %


In [5]:
baseline_dict_affordances = dict.fromkeys(unique_affordances, 0)

for index, row in df.iterrows():
    for k in baseline_dict_affordances.keys():
        baseline_dict_affordances[k] += row[k]
        
baseline_total_affordances = 0
for k,v in baseline_dict_affordances.items():
    baseline_dict_affordances[k] = np.round((v * 100)/62, 2)
    baseline_total_affordances += v

baseline_total_affordances = np.round((baseline_total_affordances/(15*62))*100,2)
print(f'{100-baseline_total_objects} %')

72.26 %


## Testing the BERT Probe on test data

In [6]:
bert_probe = Probe().to(device)
bert_probe.load_state_dict(torch.load("../model_bert_probe|epochs_2000|batch_size_64|learning_rate_0.005"))

<All keys matched successfully>

In [7]:
criterion = nn.NLLLoss()
test_data = [(bert_word_to_embedding[x], bert_word_to_embedding[y], z, word_to_index[x], word_to_index[y]) for x,y,z in test_pairs]
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [8]:
test_loss = 0
bert_probe.eval()

total = 0
correct = 0

per_word_total = dict.fromkeys(bert_word_to_embedding, 0)
per_word_correct = dict.fromkeys(bert_word_to_embedding, 0)

tp_bert = 0
fp_bert = 0
tn_bert = 0
fn_bert = 0

for i, batch in enumerate(test_dataloader):
    
    obj = batch[0].to(device)
    affordance = batch[1].to(device)
    target = batch[2].to(device)

    with torch.no_grad(): 
        
        output = bert_probe(obj, affordance)
        
        bert_loss = criterion(output, target)
        test_loss += bert_loss.item()

        # Calculate total accuracy
        total += len(batch[0])
        
        prediction = torch.argmax(output, dim=1)
        correct_predictions = torch.eq(prediction,target).long()
        correct += float(sum(correct_predictions))

        # Calculate per-object and per-affordance accuracy
        object_indices = batch[3].tolist()
        objects = [index_to_word[i] for i in object_indices]
        affordance_indices = batch[4].tolist()
        affordances = [index_to_word[i] for i in affordance_indices]
        
        for n,word in enumerate(objects):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        for n,word in enumerate(affordances):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        # Calculate tp,fp,tn,fn
        for i, value in enumerate(prediction.tolist()):
            if target.tolist()[i] == 1 and prediction.tolist()[i] == 1:
                tp_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 1:
                fp_bert += 1
            elif target.tolist()[i] == 1 and prediction.tolist()[i] == 0:
                fn_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 0:
                tn_bert += 1
        

        print('>', np.round(test_loss/(i+1), 4))

accuracy_bert_probe = correct / total
per_object_accuracy_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_objects if per_word_total[word] > 0}
per_affordance_accuracy_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_affordances if per_word_total[word] > 0}

print()
print(f'Total accuracy BERT probe: {np.round(accuracy_bert_probe * 100, 2)} %')
print()

print('Per-object accuracy BERT probe:')
for k,v in per_object_accuracy_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')
print()
    
print('Per-affordance accuracy BERT probe:')
for k,v in per_affordance_accuracy_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')

> 0.0056
> 0.0113
> 0.0543

Total accuracy BERT probe: 86.67 %

Per-object accuracy BERT probe:
carving knife : 86.67 %
dustcloth : 93.33 %
guitar : 80.0 %
handset : 100.0 %
laptop : 80.0 %
power saw : 93.33 %
violin : 93.33 %
bowl : 100.0 %
kayak : 53.33 %
walkie-talkie : 86.67 %

Per-affordance accuracy BERT probe:
grasp : 90.0 %
lift : 90.0 %
throw : 70.0 %
push : 100.0 %
fix : 70.0 %
ride : 80.0 %
play : 80.0 %
watch : 60.0 %
sit on : 80.0 %
feed : 100.0 %
row : 90.0 %
pour from : 100.0 %
look through : 100.0 %
write with : 100.0 %
type on : 90.0 %


In [9]:
accuracy_bert = (tp_bert + tn_bert) / (tp_bert + fp_bert + tn_bert + fn_bert)
print(f'{np.round(accuracy_bert * 100, 2)}%')

86.67%


In [10]:
precision_bert = tp_bert / (tp_bert + fp_bert)
print(f'{np.round(precision_bert * 100, 2)}%')

76.36%


In [11]:
recall_bert = tp_bert / (tp_bert + fn_bert)
print(f'{np.round(recall_bert * 100, 2)}%')

85.71%


In [12]:
f1_bert = (2 * recall_bert * precision_bert) / (recall_bert + precision_bert)
print(f'{np.round(f1_bert * 100, 2)}%')

80.77%


## Testing the VisualBERT Probe on test data

In [13]:
visual_bert_probe = Probe().to(device)
visual_bert_probe.load_state_dict(torch.load("../model_visual_bert_probe|epochs_2000|batch_size_64|learning_rate_0.005"))

<All keys matched successfully>

In [14]:
criterion = nn.NLLLoss()
test_data = [(visual_bert_word_to_embedding[x], visual_bert_word_to_embedding[y], z, word_to_index[x], word_to_index[y]) for x,y,z in test_pairs]
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [15]:
test_loss = 0
criterion = nn.NLLLoss()
visual_bert_probe.eval()

total = 0
correct = 0

per_word_total = dict.fromkeys(visual_bert_word_to_embedding, 0)
per_word_correct = dict.fromkeys(visual_bert_word_to_embedding, 0)

tp_visual_bert = 0
fp_visual_bert = 0
tn_visual_bert = 0
fn_visual_bert = 0

for i, batch in enumerate(test_dataloader):
    
    obj = batch[0].to(device)
    affordance = batch[1].to(device)
    target = batch[2].to(device)

    with torch.no_grad(): 
        
        output = visual_bert_probe(obj, affordance)
        
        visual_bert_loss = criterion(output, target)
        test_loss += visual_bert_loss.item()

        # Calculate total accuracy
        total += len(batch[0])
        
        prediction = torch.argmax(output, dim=1)
        correct_predictions = torch.eq(prediction,target).long()
        correct += float(sum(correct_predictions))

        # Calculate per word accuracy
        object_indices = batch[3].tolist()
        objects = [index_to_word[i] for i in object_indices]
        affordance_indices = batch[4].tolist()
        affordances = [index_to_word[i] for i in affordance_indices]
        
        for n,word in enumerate(objects):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        for n,word in enumerate(affordances):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        # Calculate tp,fp,tn,fn
        for i, value in enumerate(prediction.tolist()):
            if target.tolist()[i] == 1 and prediction.tolist()[i] == 1:
                tp_visual_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 1:
                fp_visual_bert += 1
            elif target.tolist()[i] == 1 and prediction.tolist()[i] == 0:
                fn_visual_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 0:
                tn_visual_bert += 1

        print('>', np.round(test_loss/(i+1), 4))

accuracy_visual_bert_probe = correct / total
per_object_accuracy_visual_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_objects if per_word_total[word] > 0}
per_affordance_accuracy_visual_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_affordances if per_word_total[word] > 0}

print(f'Total accuracy VisualBERT probe: {np.round(accuracy_visual_bert_probe * 100, 2)} %')
print()

print('Per-object accuracy VisualBERT probe:')
for k,v in per_object_accuracy_visual_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')
print()

print('Per-affordance accuracy VisualBERT probe:')
for k,v in per_affordance_accuracy_visual_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')

> 0.0071
> 0.0131
> 0.0548
Total accuracy VisualBERT probe: 87.33 %

Per-object accuracy VisualBERT probe:
carving knife : 93.33 %
dustcloth : 93.33 %
guitar : 93.33 %
handset : 100.0 %
laptop : 86.67 %
power saw : 93.33 %
violin : 86.67 %
bowl : 93.33 %
kayak : 60.0 %
walkie-talkie : 73.33 %

Per-affordance accuracy VisualBERT probe:
grasp : 80.0 %
lift : 80.0 %
throw : 70.0 %
push : 100.0 %
fix : 50.0 %
ride : 90.0 %
play : 80.0 %
watch : 90.0 %
sit on : 90.0 %
feed : 100.0 %
row : 90.0 %
pour from : 100.0 %
look through : 100.0 %
write with : 100.0 %
type on : 90.0 %


In [16]:
accuracy_visual_bert = (tp_visual_bert + tn_visual_bert) / (tp_visual_bert + fp_visual_bert + tn_visual_bert + fn_visual_bert)
print(f'{np.round(accuracy_visual_bert * 100, 2)}%')

87.33%


In [17]:
precision_visual_bert = tp_visual_bert / (tp_visual_bert + fp_visual_bert)
print(f'{np.round(precision_visual_bert * 100, 2)}%')

84.09%


In [18]:
recall_visual_bert = tp_visual_bert / (tp_visual_bert + fn_visual_bert)
print(f'{np.round(recall_visual_bert * 100, 2)}%')

75.51%


In [19]:
f1_visual_bert = (2 * recall_visual_bert * precision_visual_bert) / (recall_visual_bert + precision_visual_bert)
print(f'{np.round(f1_visual_bert * 100, 2)}%')

79.57%


# Comparison of the results

### i) Per-affordance-accuracy for BERT and VisualBERT Probe compared with baseline

The table below shows the per-affordance-accuracy on the testset for BERT and VisualBERT probes. To compare, the baseline accuracy on the total dataset and the testset are there as well. The baseline accuracy is the accuracy obtained when the model guesses the most common value (0 or 1) for the particular affordance.

In [20]:
baseline_dict_affordances_test = dict.fromkeys(unique_affordances, 0)

for index, row in df.iterrows():
    if row['Object'] in per_object_accuracy_bert_probe.keys():
        for k in baseline_dict_affordances_test.keys():
            baseline_dict_affordances_test[k] += row[k]

for k,v in baseline_dict_affordances_test.items():
    baseline_dict_affordances_test[k] = np.round((v * 100)/10, 2)

results = {'BERT':list(per_affordance_accuracy_bert_probe.values()), 'VisualBERT':list(per_affordance_accuracy_visual_bert_probe.values()), 'Baseline':list(baseline_dict_affordances.values()), 'Baseline testset':baseline_dict_affordances_test}
df_results = pd.DataFrame(results, index=list(per_affordance_accuracy_bert_probe.keys()))

def show_percentage(val):
    return f'{np.round(val * 100, 2)} %'

def baseline_percentage(val):
    if val < 50:
        val = 100-val
    return f'{np.round(val, 2)} %'

df_results['BERT'] = df_results['BERT'].apply(show_percentage)
df_results['VisualBERT'] = df_results['VisualBERT'].apply(show_percentage)
df_results['Baseline'] = df_results['Baseline'].apply(baseline_percentage)
df_results['Baseline testset'] = df_results['Baseline testset'].apply(baseline_percentage)

df_results

Unnamed: 0,BERT,VisualBERT,Baseline,Baseline testset
grasp,90.0 %,80.0 %,59.68 %,90.0 %
lift,90.0 %,80.0 %,82.26 %,90.0 %
throw,70.0 %,70.0 %,50.0 %,80.0 %
push,100.0 %,100.0 %,90.32 %,100.0 %
fix,70.0 %,50.0 %,59.68 %,60.0 %
ride,80.0 %,90.0 %,80.65 %,90.0 %
play,80.0 %,80.0 %,95.16 %,80.0 %
watch,60.0 %,90.0 %,93.55 %,90.0 %
sit on,80.0 %,90.0 %,74.19 %,90.0 %
feed,100.0 %,100.0 %,90.32 %,100.0 %


### ii) Per-object-accuracy for BERT and VisualBERT Probe compared with baseline

The table below shows the per-object-accuracy on the testset for BERT and VisualBERT probes. To compare, the baseline accuracy on the total dataset and the testset are there as well. The baseline accuracy is the accuracy obtained when the model guesses the most common value (0 or 1) for the particular object.

In [21]:
baseline_dict_objects_test = dict.fromkeys([x for x in unique_objects if x in per_object_accuracy_bert_probe.keys()], 0)

for index, row in df.iterrows():
    if row['Object'] in per_object_accuracy_bert_probe.keys():
        for affordance in unique_affordances:
            baseline_dict_objects_test[row['Object']] += row[affordance]

for k,v in baseline_dict_objects_test.items():
    baseline_dict_objects_test[k] = np.round((v * 100)/10, 2)
    
results = {'BERT':list(per_object_accuracy_bert_probe.values()), 'VisualBERT':list(per_object_accuracy_visual_bert_probe.values()), 'Baseline':list(baseline_dict_objects_test.values()), 'Baseline testset':baseline_dict_objects_test}
df_results = pd.DataFrame(results, index=list(per_object_accuracy_bert_probe.keys()))

def show_percentage(val):
    return f'{np.round(val * 100, 2)} %'

def baseline_percentage(val):
    if val < 50:
        val = 100-val
    return f'{np.round(val, 2)} %'

df_results['BERT'] = df_results['BERT'].apply(show_percentage)
df_results['VisualBERT'] = df_results['VisualBERT'].apply(show_percentage)
df_results['Baseline'] = df_results['Baseline'].apply(baseline_percentage)
df_results['Baseline testset'] = df_results['Baseline testset'].apply(baseline_percentage)

df_results

Unnamed: 0,BERT,VisualBERT,Baseline,Baseline testset
carving knife,86.67 %,93.33 %,60.0 %,60.0 %
dustcloth,93.33 %,93.33 %,60.0 %,60.0 %
guitar,80.0 %,93.33 %,60.0 %,60.0 %
handset,100.0 %,100.0 %,50.0 %,50.0 %
laptop,80.0 %,86.67 %,70.0 %,70.0 %
power saw,93.33 %,93.33 %,60.0 %,60.0 %
violin,93.33 %,86.67 %,60.0 %,60.0 %
bowl,100.0 %,93.33 %,60.0 %,60.0 %
kayak,53.33 %,60.0 %,60.0 %,60.0 %
walkie-talkie,86.67 %,73.33 %,50.0 %,50.0 %


### iii) Comparing pairs of seen and unseen objects with identical affordances

In [83]:
with torch.no_grad():
    
    bert_sickle = []
    bert_carving_knife = []
    bert_banjo = []
    bert_guitar = []
    bert_small_boat = []
    bert_kayak = []
    
    carving_knife = df.iloc[7]
    sickle = df.iloc[57]
    banjo = df.iloc[40]
    guitar = df.iloc[18]
    small_boat = df.iloc[30]
    kayak = df.iloc[51]
    
    for affordance in unique_affordances:
        output = bert_probe(bert_word_to_embedding['sickle'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        bert_sickle.append(torch.argmax(output).item())
        output = bert_probe(bert_word_to_embedding['carving knife'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        bert_carving_knife.append(torch.argmax(output).item())
    
    for affordance in unique_affordances:
        output = bert_probe(bert_word_to_embedding['banjo'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        bert_banjo.append(torch.argmax(output).item())
        output = bert_probe(bert_word_to_embedding['guitar'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        bert_guitar.append(torch.argmax(output).item())
        
    for affordance in unique_affordances:
        output = bert_probe(bert_word_to_embedding['small boat'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        bert_small_boat.append(torch.argmax(output).item())
        output = bert_probe(bert_word_to_embedding['kayak'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        bert_kayak.append(torch.argmax(output).item())
        
bert_results1 = pd.DataFrame({'affordance':unique_affordances,'sickle':bert_sickle, 'carving knife':bert_carving_knife, 'target':list(sickle)[2:]})
bert_results2 = pd.DataFrame({'affordance':unique_affordances,'banjo':bert_banjo, 'guitar':bert_guitar, 'target':list(guitar)[2:]})
bert_results3 = pd.DataFrame({'affordance':unique_affordances,'small boat':bert_small_boat, 'kayak':bert_kayak, 'target':list(kayak)[2:]})

In [84]:
with torch.no_grad():
    
    visual_bert_sickle = []
    visual_bert_carving_knife = []
    visual_bert_banjo = []
    visual_bert_guitar = []
    visual_bert_small_boat = []
    visual_bert_kayak = []
    
    carving_knife = df.iloc[7]
    sickle = df.iloc[57]
    banjo = df.iloc[40]
    guitar = df.iloc[18]
    small_boat = df.iloc[30]
    kayak = df.iloc[51]
    
    for affordance in unique_affordances:
        output = visual_bert_probe(visual_bert_word_to_embedding['sickle'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        visual_bert_sickle.append(torch.argmax(output).item())
        output = visual_bert_probe(visual_bert_word_to_embedding['carving knife'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        visual_bert_carving_knife.append(torch.argmax(output).item())
        
    for affordance in unique_affordances:
        output = visual_bert_probe(visual_bert_word_to_embedding['banjo'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        visual_bert_banjo.append(torch.argmax(output).item())
        output = visual_bert_probe(visual_bert_word_to_embedding['guitar'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        visual_bert_guitar.append(torch.argmax(output).item())
        
    for affordance in unique_affordances:
        output = visual_bert_probe(visual_bert_word_to_embedding['small boat'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        visual_bert_small_boat.append(torch.argmax(output).item())
        output = visual_bert_probe(visual_bert_word_to_embedding['kayak'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        visual_bert_kayak.append(torch.argmax(output).item())
        
vb_results1 = pd.DataFrame({'affordance':unique_affordances,'sickle':visual_bert_sickle, 'carving knife':visual_bert_carving_knife, 'target':list(sickle)[2:]})
vb_results2 = pd.DataFrame({'affordance':unique_affordances,'banjo':visual_bert_banjo, 'guitar':visual_bert_guitar, 'target':list(guitar)[2:]})
vb_results3 = pd.DataFrame({'affordance':unique_affordances,'small boat':visual_bert_small_boat, 'kayak':visual_bert_kayak, 'target':list(kayak)[2:]})

In [86]:
bert_results1

Unnamed: 0,affordance,sickle,carving knife,target
0,grasp,1,1,1
1,lift,1,1,1
2,throw,1,0,1
3,push,1,1,1
4,fix,0,1,0
5,ride,0,0,0
6,play,0,0,0
7,watch,0,0,0
8,sit on,0,0,0
9,feed,0,0,0


In [87]:
vb_results1

Unnamed: 0,affordance,sickle,carving knife,target
0,grasp,1,1,1
1,lift,1,1,1
2,throw,1,1,1
3,push,1,1,1
4,fix,1,1,0
5,ride,0,0,0
6,play,0,0,0
7,watch,0,0,0
8,sit on,0,0,0
9,feed,0,0,0


In [88]:
bert_results2

Unnamed: 0,affordance,banjo,guitar,target
0,grasp,1,1,1
1,lift,1,1,1
2,throw,1,1,1
3,push,1,1,1
4,fix,1,1,1
5,ride,0,1,0
6,play,0,0,1
7,watch,0,1,0
8,sit on,0,0,0
9,feed,0,0,0


In [89]:
vb_results2

Unnamed: 0,affordance,banjo,guitar,target
0,grasp,1,1,1
1,lift,1,1,1
2,throw,1,1,1
3,push,1,1,1
4,fix,1,1,1
5,ride,0,0,0
6,play,0,0,1
7,watch,0,0,0
8,sit on,0,0,0
9,feed,0,0,0


In [90]:
bert_results3

Unnamed: 0,affordance,small boat,kayak,target
0,grasp,0,1,0
1,lift,1,1,0
2,throw,0,1,0
3,push,1,1,1
4,fix,0,1,0
5,ride,0,1,1
6,play,0,0,0
7,watch,0,1,0
8,sit on,0,0,1
9,feed,0,0,0


In [91]:
vb_results3

Unnamed: 0,affordance,small boat,kayak,target
0,grasp,1,1,0
1,lift,1,1,0
2,throw,1,0,0
3,push,1,1,1
4,fix,1,1,0
5,ride,0,0,1
6,play,0,0,0
7,watch,0,0,0
8,sit on,0,0,1
9,feed,0,0,0


In [26]:
# CHECK Visa per affordance accuracy BERT, VisualBERT, Baseline
# CHECK Visa per object accuracy BERT, VisualBERT, Baseline
# CHECK Visa seen och unseen objects och jämför med facit
# 4 Ta med graf som visar alla 10 modellers accuracy

### iv) Graph showing accuracy, precision, recall and F1 for BERT and VisualBERT