# Testing and comparing the probes for BERT and VisualBERT

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from models import Probe
from dataset import create_df, get_gold_data, get_bert_embedding_dict, get_visual_bert_embedding_dict, get_lists_and_dicts

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.predictions.decoder.weight',

In [2]:
device = 'cuda:1'

In [3]:
df = create_df('../data/affordance_annotations.txt')
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
unique_objects, unique_affordances, word_to_index, index_to_word = get_lists_and_dicts(df)
train_pairs = get_gold_data(shuffled_df[:42])
val_pairs = get_gold_data(shuffled_df[42:52])
test_pairs = get_gold_data(shuffled_df[52:])
bert_word_to_embedding = get_bert_embedding_dict([train_pairs + val_pairs + test_pairs])
visual_bert_word_to_embedding = get_visual_bert_embedding_dict([train_pairs + val_pairs + test_pairs])

In [4]:
baseline_dict_objects = dict.fromkeys(unique_objects, 0)
for index, row in df.iterrows():
        for i, value in enumerate(row):
            if type(value) == str:
                pass
            else:
                baseline_dict_objects[row[0]] += value
                
baseline_total_objects = 0
for k,v in baseline_dict_objects.items():
    baseline_dict_objects[k] = np.round((v * 100)/15, 2)
    baseline_total_objects += v

baseline_total_objects = np.round((baseline_total_objects/(15*62))*100,2)
print(f'{100-baseline_total_objects} %')

72.26 %


In [5]:
baseline_dict_affordances = dict.fromkeys(unique_affordances, 0)

for index, row in df.iterrows():
    for k in baseline_dict_affordances.keys():
        baseline_dict_affordances[k] += row[k]
        
baseline_total_affordances = 0
for k,v in baseline_dict_affordances.items():
    baseline_dict_affordances[k] = np.round((v * 100)/62, 2)
    baseline_total_affordances += v

baseline_total_affordances = np.round((baseline_total_affordances/(15*62))*100,2)
print(f'{100-baseline_total_objects} %')

72.26 %


## Testing the BERT Probe on test data

In [6]:
bert_probe = Probe().to(device)
bert_probe.load_state_dict(torch.load("../model_bert_probe|epochs_2000|batch_size_64|learning_rate_0.005"))

<All keys matched successfully>

In [7]:
criterion = nn.NLLLoss()
test_data = [(bert_word_to_embedding[x], bert_word_to_embedding[y], z, word_to_index[x], word_to_index[y]) for x,y,z in test_pairs]
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [8]:
test_loss = 0
bert_probe.eval()

total = 0
correct = 0

per_word_total = dict.fromkeys(bert_word_to_embedding, 0)
per_word_correct = dict.fromkeys(bert_word_to_embedding, 0)

tp_bert = 0
fp_bert = 0
tn_bert = 0
fn_bert = 0

for i, batch in enumerate(test_dataloader):
    
    obj = batch[0].to(device)
    affordance = batch[1].to(device)
    target = batch[2].to(device)

    with torch.no_grad(): 
        
        output = bert_probe(obj, affordance)
        
        bert_loss = criterion(output, target)
        test_loss += bert_loss.item()

        # Calculate total accuracy
        total += len(batch[0])
        
        prediction = torch.argmax(output, dim=1)
        correct_predictions = torch.eq(prediction,target).long()
        correct += float(sum(correct_predictions))

        # Calculate per-object and per-affordance accuracy
        object_indices = batch[3].tolist()
        objects = [index_to_word[i] for i in object_indices]
        affordance_indices = batch[4].tolist()
        affordances = [index_to_word[i] for i in affordance_indices]
        
        for n,word in enumerate(objects):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        for n,word in enumerate(affordances):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        # Calculate tp,fp,tn,fn
        for i, value in enumerate(prediction.tolist()):
            if target.tolist()[i] == 1 and prediction.tolist()[i] == 1:
                tp_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 1:
                fp_bert += 1
            elif target.tolist()[i] == 1 and prediction.tolist()[i] == 0:
                fn_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 0:
                tn_bert += 1
        

        print('>', np.round(test_loss/(i+1), 4))

accuracy_bert_probe = correct / total
per_object_accuracy_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_objects if per_word_total[word] > 0}
per_affordance_accuracy_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_affordances if per_word_total[word] > 0}

print()
print(f'Total accuracy BERT probe: {np.round(accuracy_bert_probe * 100, 2)} %')
print()

print('Per-object accuracy BERT probe:')
for k,v in per_object_accuracy_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')
print()
    
print('Per-affordance accuracy BERT probe:')
for k,v in per_affordance_accuracy_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')

> 0.0068
> 0.0122
> 0.049

Total accuracy BERT probe: 86.67 %

Per-object accuracy BERT probe:
carving knife : 86.67 %
dustcloth : 93.33 %
guitar : 80.0 %
handset : 100.0 %
laptop : 80.0 %
power saw : 93.33 %
violin : 93.33 %
bowl : 100.0 %
kayak : 53.33 %
walkie-talkie : 86.67 %

Per-affordance accuracy BERT probe:
grasp : 90.0 %
lift : 90.0 %
throw : 70.0 %
push : 100.0 %
fix : 70.0 %
ride : 80.0 %
play : 80.0 %
watch : 60.0 %
sit on : 80.0 %
feed : 100.0 %
row : 90.0 %
pour from : 100.0 %
look through : 100.0 %
write with : 100.0 %
type on : 90.0 %


In [9]:
accuracy_bert = (tp_bert + tn_bert) / (tp_bert + fp_bert + tn_bert + fn_bert)
print(f'{np.round(accuracy_bert * 100, 2)}%')

86.67%


In [10]:
precision_bert = tp_bert / (tp_bert + fp_bert)
print(f'{np.round(precision_bert * 100, 2)}%')

76.36%


In [11]:
recall_bert = tp_bert / (tp_bert + fn_bert)
print(f'{np.round(recall_bert * 100, 2)}%')

85.71%


In [12]:
f1_bert = (2 * recall_bert * precision_bert) / (recall_bert + precision_bert)
print(f'{np.round(f1_bert * 100, 2)}%')

80.77%


## Testing the BERT Probe on seen objects

In [13]:
with torch.no_grad():
    for affordance in unique_affordances:
        output = bert_probe(bert_word_to_embedding['sickle'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print('sickle, carving knife')
        print(f'{affordance}: {torch.argmax(output)}')
        output = bert_probe(bert_word_to_embedding['carving knife'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print(f'{affordance}: {torch.argmax(output)}')
        print()
    
    for affordance in unique_affordances:
        output = bert_probe(bert_word_to_embedding['banjo'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print('banjo, guitar')
        print(f'{affordance}: {torch.argmax(output)}')
        output = bert_probe(bert_word_to_embedding['guitar'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print(f'{affordance}: {torch.argmax(output)}')
        print()
        
    for affordance in unique_affordances:
        output = bert_probe(bert_word_to_embedding['small boat'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print('small boat, kayak')
        print(f'{affordance}: {torch.argmax(output)}')
        output = bert_probe(bert_word_to_embedding['kayak'].unsqueeze(0).to(device), bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print(f'{affordance}: {torch.argmax(output)}')
        print()

sickle, carving knife
grasp: 1
grasp: 1

sickle, carving knife
lift: 1
lift: 1

sickle, carving knife
throw: 1
throw: 0

sickle, carving knife
push: 1
push: 1

sickle, carving knife
fix: 0
fix: 1

sickle, carving knife
ride: 0
ride: 0

sickle, carving knife
play: 0
play: 0

sickle, carving knife
watch: 0
watch: 0

sickle, carving knife
sit on: 0
sit on: 0

sickle, carving knife
feed: 0
feed: 0

sickle, carving knife
row: 0
row: 0

sickle, carving knife
pour from: 0
pour from: 0

sickle, carving knife
look through: 0
look through: 0

sickle, carving knife
write with: 0
write with: 0

sickle, carving knife
type on: 0
type on: 0

banjo, guitar
grasp: 1
grasp: 1

banjo, guitar
lift: 1
lift: 1

banjo, guitar
throw: 1
throw: 1

banjo, guitar
push: 1
push: 1

banjo, guitar
fix: 1
fix: 1

banjo, guitar
ride: 0
ride: 1

banjo, guitar
play: 0
play: 0

banjo, guitar
watch: 0
watch: 1

banjo, guitar
sit on: 0
sit on: 0

banjo, guitar
feed: 0
feed: 0

banjo, guitar
row: 0
row: 0

banjo, guitar
pour

## Testing the VisualBERT Probe on test data

In [14]:
visual_bert_probe = Probe().to(device)
visual_bert_probe.load_state_dict(torch.load("../model_visual_bert_probe|epochs_2000|batch_size_64|learning_rate_0.005"))

<All keys matched successfully>

In [15]:
criterion = nn.NLLLoss()
test_data = [(visual_bert_word_to_embedding[x], visual_bert_word_to_embedding[y], z, word_to_index[x], word_to_index[y]) for x,y,z in test_pairs]
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [16]:
test_loss = 0
criterion = nn.NLLLoss()
visual_bert_probe.eval()

total = 0
correct = 0

per_word_total = dict.fromkeys(visual_bert_word_to_embedding, 0)
per_word_correct = dict.fromkeys(visual_bert_word_to_embedding, 0)

tp_visual_bert = 0
fp_visual_bert = 0
tn_visual_bert = 0
fn_visual_bert = 0

for i, batch in enumerate(test_dataloader):
    
    obj = batch[0].to(device)
    affordance = batch[1].to(device)
    target = batch[2].to(device)

    with torch.no_grad(): 
        
        output = visual_bert_probe(obj, affordance)
        
        visual_bert_loss = criterion(output, target)
        test_loss += visual_bert_loss.item()

        # Calculate total accuracy
        total += len(batch[0])
        
        prediction = torch.argmax(output, dim=1)
        correct_predictions = torch.eq(prediction,target).long()
        correct += float(sum(correct_predictions))

        # Calculate per word accuracy
        object_indices = batch[3].tolist()
        objects = [index_to_word[i] for i in object_indices]
        affordance_indices = batch[4].tolist()
        affordances = [index_to_word[i] for i in affordance_indices]
        
        for n,word in enumerate(objects):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        for n,word in enumerate(affordances):
            if prediction[n] == target[n]:
                per_word_correct[word] += 1
            per_word_total[word] += 1
            
        # Calculate tp,fp,tn,fn
        for i, value in enumerate(prediction.tolist()):
            if target.tolist()[i] == 1 and prediction.tolist()[i] == 1:
                tp_visual_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 1:
                fp_visual_bert += 1
            elif target.tolist()[i] == 1 and prediction.tolist()[i] == 0:
                fn_visual_bert += 1
            elif target.tolist()[i] == 0 and prediction.tolist()[i] == 0:
                tn_visual_bert += 1

        print('>', np.round(test_loss/(i+1), 4))

accuracy_visual_bert_probe = correct / total
per_object_accuracy_visual_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_objects if per_word_total[word] > 0}
per_affordance_accuracy_visual_bert_probe = {word : (per_word_correct[word] / per_word_total[word]) for word in unique_affordances if per_word_total[word] > 0}

print(f'Total accuracy VisualBERT probe: {np.round(accuracy_visual_bert_probe * 100, 2)} %')
print()

print('Per-object accuracy VisualBERT probe:')
for k,v in per_object_accuracy_visual_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')
print()

print('Per-affordance accuracy VisualBERT probe:')
for k,v in per_affordance_accuracy_visual_bert_probe.items():
    print(f'{k} : {np.round(v * 100, 2)} %')

> 0.0053
> 0.0128
> 0.0562
Total accuracy VisualBERT probe: 87.33 %

Per-object accuracy VisualBERT probe:
carving knife : 93.33 %
dustcloth : 93.33 %
guitar : 93.33 %
handset : 100.0 %
laptop : 86.67 %
power saw : 93.33 %
violin : 86.67 %
bowl : 93.33 %
kayak : 60.0 %
walkie-talkie : 73.33 %

Per-affordance accuracy VisualBERT probe:
grasp : 80.0 %
lift : 80.0 %
throw : 70.0 %
push : 100.0 %
fix : 50.0 %
ride : 90.0 %
play : 80.0 %
watch : 90.0 %
sit on : 90.0 %
feed : 100.0 %
row : 90.0 %
pour from : 100.0 %
look through : 100.0 %
write with : 100.0 %
type on : 90.0 %


In [17]:
accuracy_visual_bert = (tp_visual_bert + tn_visual_bert) / (tp_visual_bert + fp_visual_bert + tn_visual_bert + fn_visual_bert)
print(f'{np.round(accuracy_visual_bert * 100, 2)}%')

87.33%


In [18]:
precision_visual_bert = tp_visual_bert / (tp_visual_bert + fp_visual_bert)
print(f'{np.round(precision_visual_bert * 100, 2)}%')

84.09%


In [19]:
recall_visual_bert = tp_visual_bert / (tp_visual_bert + fn_visual_bert)
print(f'{np.round(recall_visual_bert * 100, 2)}%')

75.51%


In [20]:
f1_visual_bert = (2 * recall_visual_bert * precision_visual_bert) / (recall_visual_bert + precision_visual_bert)
print(f'{np.round(f1_visual_bert * 100, 2)}%')

79.57%


## Testing the VisualBERT Probe on seen data

In [21]:
with torch.no_grad():
    for affordance in unique_affordances:
        output = visual_bert_probe(visual_bert_word_to_embedding['sickle'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print('sickle, carving knife')
        print(f'{affordance}: {torch.argmax(output)}')
        output = visual_bert_probe(visual_bert_word_to_embedding['carving knife'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print(f'{affordance}: {torch.argmax(output)}')
        print()
    
    for affordance in unique_affordances:
        output = visual_bert_probe(visual_bert_word_to_embedding['banjo'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print('banjo, guitar')
        print(f'{affordance}: {torch.argmax(output)}')
        output = visual_bert_probe(visual_bert_word_to_embedding['guitar'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print(f'{affordance}: {torch.argmax(output)}')
        print()
        
    for affordance in unique_affordances:
        output = visual_bert_probe(visual_bert_word_to_embedding['small boat'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print('small boat, kayak')
        print(f'{affordance}: {torch.argmax(output)}')
        output = visual_bert_probe(visual_bert_word_to_embedding['kayak'].unsqueeze(0).to(device), visual_bert_word_to_embedding[affordance].unsqueeze(0).to(device))
        print(f'{affordance}: {torch.argmax(output)}')
        print()

sickle, carving knife
grasp: 1
grasp: 1

sickle, carving knife
lift: 1
lift: 1

sickle, carving knife
throw: 1
throw: 1

sickle, carving knife
push: 1
push: 1

sickle, carving knife
fix: 1
fix: 1

sickle, carving knife
ride: 0
ride: 0

sickle, carving knife
play: 0
play: 0

sickle, carving knife
watch: 0
watch: 0

sickle, carving knife
sit on: 0
sit on: 0

sickle, carving knife
feed: 0
feed: 0

sickle, carving knife
row: 0
row: 0

sickle, carving knife
pour from: 0
pour from: 0

sickle, carving knife
look through: 0
look through: 0

sickle, carving knife
write with: 0
write with: 0

sickle, carving knife
type on: 0
type on: 0

banjo, guitar
grasp: 1
grasp: 1

banjo, guitar
lift: 1
lift: 1

banjo, guitar
throw: 1
throw: 1

banjo, guitar
push: 1
push: 1

banjo, guitar
fix: 1
fix: 1

banjo, guitar
ride: 0
ride: 0

banjo, guitar
play: 0
play: 0

banjo, guitar
watch: 0
watch: 0

banjo, guitar
sit on: 0
sit on: 0

banjo, guitar
feed: 0
feed: 0

banjo, guitar
row: 0
row: 0

banjo, guitar
pour

# Comparison of the results

In [22]:
df

Unnamed: 0,Object,ImageNet synset,grasp,lift,throw,push,fix,ride,play,watch,sit on,feed,row,pour from,look through,write with,type on
0,automobile engine,n02761557,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,axe,n02764044,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
2,bicycle,n02834778,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0
3,bottle,n02876657,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0
4,camera,n02942699,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,sickle,n04213353,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
58,spoon,n04284002,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
59,stool,n04326896,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
60,typewriter,n04505036,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1


In [34]:
baseline_dict_affordances_test = dict.fromkeys(unique_affordances, 0)

for index, row in df.iterrows():
    if row['Object'] in per_object_accuracy_bert_probe.keys():
        for k in baseline_dict_affordances_test.keys():
            baseline_dict_affordances_test[k] += row[k]
        
baseline_total_affordances = 0
for k,v in baseline_dict_affordances_test.items():
    baseline_dict_affordances_test[k] = np.round((v * 100)/62, 2)
    baseline_total_affordances += v

baseline_total_affordances = np.round((baseline_total_affordances/(15*62))*100,2)
print(f'{100-baseline_total_objects} %')

72.26 %


In [35]:
baseline_dict_affordances_test

{'grasp': 14.52,
 'lift': 14.52,
 'throw': 12.9,
 'push': 16.13,
 'fix': 9.68,
 'ride': 1.61,
 'play': 3.23,
 'watch': 1.61,
 'sit on': 1.61,
 'feed': 0.0,
 'row': 1.61,
 'pour from': 0.0,
 'look through': 0.0,
 'write with': 0.0,
 'type on': 1.61}

In [36]:
results = {'BERT':list(per_affordance_accuracy_bert_probe.values()), 'VisualBERT':list(per_affordance_accuracy_visual_bert_probe.values()), 'Baseline':list(baseline_dict_affordances.values()), 'Baseline testset':baseline_dict_affordances_test}

In [37]:
results

{'BERT': [0.9,
  0.9,
  0.7,
  1.0,
  0.7,
  0.8,
  0.8,
  0.6,
  0.8,
  1.0,
  0.9,
  1.0,
  1.0,
  1.0,
  0.9],
 'VisualBERT': [0.8,
  0.8,
  0.7,
  1.0,
  0.5,
  0.9,
  0.8,
  0.9,
  0.9,
  1.0,
  0.9,
  1.0,
  1.0,
  1.0,
  0.9],
 'Baseline': [59.68,
  82.26,
  50.0,
  90.32,
  40.32,
  19.35,
  4.84,
  6.45,
  25.81,
  9.68,
  3.23,
  9.68,
  4.84,
  4.84,
  4.84],
 'Baseline testset': {'grasp': 14.52,
  'lift': 14.52,
  'throw': 12.9,
  'push': 16.13,
  'fix': 9.68,
  'ride': 1.61,
  'play': 3.23,
  'watch': 1.61,
  'sit on': 1.61,
  'feed': 0.0,
  'row': 1.61,
  'pour from': 0.0,
  'look through': 0.0,
  'write with': 0.0,
  'type on': 1.61}}

In [38]:
df_results = pd.DataFrame(results, index=list(per_affordance_accuracy_bert_probe.keys()))

In [39]:
df_results

Unnamed: 0,BERT,VisualBERT,Baseline,Baseline testset
grasp,0.9,0.8,59.68,14.52
lift,0.9,0.8,82.26,14.52
throw,0.7,0.7,50.0,12.9
push,1.0,1.0,90.32,16.13
fix,0.7,0.5,40.32,9.68
ride,0.8,0.9,19.35,1.61
play,0.8,0.8,4.84,3.23
watch,0.6,0.9,6.45,1.61
sit on,0.8,0.9,25.81,1.61
feed,1.0,1.0,9.68,0.0


In [None]:
# 1 Visa per affordance accuracy BERT, VisualBERT, Baseline
# 2 Visa per object accuracy BERT, VisualBERT, Baseline
# 3 Visa seen och unseen objects och jämför med facit
# 4 Ta med graf som visar alla 10 modellers accuracy