# Parsing VLM outputs

In [1]:
from utils.paths import ROOT
from utils.utils import *
from itertools import permutations
import json
import pandas as pd
import numpy as np


In [2]:
# Each image label and the direction it stands for; insertion order fixes the
# order of the two derived lists below.
label_to_direction = {
    'SHTL': 'left',
    'XHWK': 'right',
    'AKRC': 'up',
    'ZHRN': 'down',
}
img_labels = list(label_to_direction)
directions = list(label_to_direction.values())

## Molmo

In [18]:
# Load the raw Molmo answers into `outputs`; later cells index it as
# outputs[event][permutation]. The commented-out line is the alternative
# (non-72b) input file, kept for switching between runs.
# outputs = open_json(ROOT / 'src/fullpaper/results/vlms/molmo_human_oneshot.json')
outputs = open_json(ROOT / 'src/fullpaper/results/vlms/molmo-72b_human_oneshot.json')

In [20]:
# Tally, per event, which direction the model chose in each permutation and
# convert the counts to percentages.
#
# An answer is valid only when it mentions exactly ONE of the known image
# labels. The labels are collected explicitly because OR-ing the membership
# tests into a single bool cannot distinguish "one label" from "several
# labels", which would silently credit ambiguous answers to the first label.
out_dict = {}
inv_ans = 0
inv_concepts = set()  # events that produced at least one invalid answer
for event in outputs:
    out_dict[event] = {'up': 0, 'down': 0, 'left': 0, 'right': 0}
    for permutation in outputs[event]:
        ans = outputs[event][permutation]
        matched_labels = [label for label in img_labels if label in ans]
        if len(matched_labels) == 1:
            out_dict[event][label_to_direction[matched_labels[0]]] += 1
        else:
            # Zero labels or more than one label: the answer is ambiguous.
            inv_ans += 1
            inv_concepts.add(event)
for event in out_dict:
    for concept in out_dict[event]:
        # 24 is assumed to be the number of permutations per event
        # (4! orderings of the four labels) -- TODO confirm against the data.
        out_dict[event][concept] = round(out_dict[event][concept] / 24 * 100, 1)
print(f'There are {inv_ans} invalid answers')
print(f'There are invalid answers for {len(inv_concepts)} concepts')




There are 0 invalid answers
There are invalid answers for 0 concepts


In [74]:
# Write the per-event direction percentages held in `out_dict` to the
# parsed-results JSON file (dict_to_json comes from the utils star import).
dict_to_json(out_dict, ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo-72b_human_oneshot.json')

Adding the variant label `XHVK` to the valid answers (mapped to 'right', the same direction as `XHWK`), since it seems to be the only kind of invalid answer output by the model.

In [4]:
# Accept 'XHVK' as an additional answer label that counts as 'right'.
label_to_direction.update({'XHVK': 'right'})

In [5]:
# For every verb, count how often each direction was chosen across its
# permutations (answers must match a known label exactly after stripping
# spaces), then express the counts as percentages out of 24.
parsed_outputs = {}
invalid_answers = 0

for verb, responses in outputs.items():
    tally = dict.fromkeys(('up', 'down', 'left', 'right'), 0)
    for raw_answer in responses.values():
        label = raw_answer.strip(' ')
        if label not in label_to_direction:
            # Unknown label: show it and count it as invalid.
            print(label)
            invalid_answers += 1
            continue
        tally[label_to_direction[label]] += 1
    parsed_outputs[verb] = {
        name: round(hits / 24 * 100, 1) for name, hits in tally.items()
    }

print(f'Number of invalid answers: {invalid_answers}')

Number of invalid answers: 0


In [None]:
# Write the percentages in `parsed_outputs` to the parsed-results folder.
# NOTE(review): the target name `molmo_reg.json` does not match the input file
# loaded earlier in this section (`molmo-72b_human_oneshot.json`), and this
# cell has no execution count -- confirm the intended output file.
dict_to_json(parsed_outputs, ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo_reg.json')

## Qwen-VL - Regular

In [23]:
# Load the raw Qwen-VL answers into `outputs` (this rebinds the name used for
# the Molmo data above). The commented-out line is the alternative (non-72b)
# input file, kept for switching between runs.
# outputs = open_json(ROOT / 'src/fullpaper/results/vlms/qwen-vl_human_oneshot.json')
outputs = open_json(ROOT / 'src/fullpaper/results/vlms/qwen-vl-72b_human_oneshot.json')

In [24]:
# For every verb, count how often each direction was chosen across its
# permutations and express the counts as percentages out of 24. Each stored
# response is indexed with [0]; the answer is that element with surrounding
# spaces/newlines and any 'Image: ' text removed.
parsed_outputs = {}
invalid_answers = 0

for verb, responses in outputs.items():
    tally = dict.fromkeys(('up', 'down', 'left', 'right'), 0)
    for candidates in responses.values():
        label = candidates[0].strip(' \n').replace('Image: ', '')
        if label not in label_to_direction:
            # Unknown label: show it and count it as invalid.
            print(label)
            invalid_answers += 1
            continue
        tally[label_to_direction[label]] += 1
    parsed_outputs[verb] = {
        name: round(hits / 24 * 100, 1) for name, hits in tally.items()
    }

print(f'Number of invalid answers: {invalid_answers}')

Number of invalid answers: 0


In [11]:
# Write the per-verb direction percentages held in `parsed_outputs` to the
# parsed-results JSON file for the Qwen-VL 72b one-shot run.
dict_to_json(parsed_outputs, ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-72b_human_oneshot.json')

## Molmo - Analogy

In [25]:
# Load the raw Molmo analogy-prompt responses into `outputs` (rebinds the name
# used by the earlier sections); indexed below as outputs[event][permutation].
outputs = open_json(ROOT / 'src/fullpaper/results/vlms/molmo_human_oneshot_analogy.json')


In [28]:
def _split_molmo_analogy_response(response):
    """Split one raw Molmo analogy response into ``(analogy, concept)``.

    Three formats are tried in order. A format matches only when its marker
    splits the text into exactly two parts (i.e. the marker occurs once):

    1. ``<analogy>Image: <concept>`` -- the concept is the first four
       characters after the marker.
    2. ``<analogy>Image choice: <concept>`` -- the concept is everything
       after the marker.
    3. ``...2.<analogy>Image choice:<concept>`` -- only the text following
       the first ``"2."`` is considered; newlines in the analogy become
       spaces and all spaces are removed from the concept.

    Returns:
        An ``(analogy, concept)`` tuple, or ``None`` when ``response`` is not
        a string or matches none of the formats.
    """
    if not isinstance(response, str):
        return None

    parts = response.split("Image: ")
    if len(parts) == 2:
        return parts[0].strip('\n'), parts[1][:4]

    parts = response.split("Image choice: ")
    if len(parts) == 2:
        return parts[0].strip('\n'), parts[1]

    numbered = response.split("2.")
    if len(numbered) > 1:
        parts = numbered[1].split("Image choice:")
        if len(parts) == 2:
            return parts[0].strip('\n ').replace('\n', ' '), parts[1].replace(' ', '')

    return None


# Separate every response into its free-text analogy and its chosen concept.
# Explicit format checks replace nested bare `except:` clauses so that real
# errors (and Ctrl-C) are no longer swallowed.
invalid_responses = 0
parsed_analogies = {}
parsed_concepts = {}
for event in outputs:
    parsed_analogies[event] = {}
    parsed_concepts[event] = {}
    for permutation in outputs[event]:
        response = outputs[event][permutation]
        parsed = _split_molmo_analogy_response(response)
        if parsed is None:
            # Unparseable responses are reported and left out of both dicts.
            invalid_responses += 1
            print(permutation, response)
            continue
        parsed_analogies[event][permutation], parsed_concepts[event][permutation] = parsed


In [66]:
# Write the extracted analogy texts (`parsed_analogies`) to the parsed-results
# folder.
dict_to_json(parsed_analogies, ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo-human-analogies.json')

In [29]:
# Tally, per event, which direction the parsed concept points to in each
# permutation and convert the counts to percentages.
#
# A concept is valid only when it mentions exactly ONE of the known image
# labels. The labels are collected explicitly because OR-ing the membership
# tests into a single bool cannot distinguish "one label" from "several
# labels", which would silently credit ambiguous answers to the first label.
out_dict = {}
inv_ans = 0
for event in parsed_concepts:
    out_dict[event] = {'up': 0, 'down': 0, 'left': 0, 'right': 0}
    for permutation in parsed_concepts[event]:
        ans = parsed_concepts[event][permutation]
        matched_labels = [label for label in img_labels if label in ans]
        if len(matched_labels) == 1:
            out_dict[event][label_to_direction[matched_labels[0]]] += 1
        else:
            # Zero labels or more than one label: the answer is ambiguous.
            inv_ans += 1
for event in out_dict:
    for concept in out_dict[event]:
        # 24 is assumed to be the number of permutations per event
        # (4! orderings of the four labels) -- TODO confirm against the data.
        out_dict[event][concept] = round(out_dict[event][concept] / 24 * 100, 1)
print(f'There are {inv_ans} invalid answers')




There are 0 invalid answers


In [68]:
# Write the per-event direction percentages (`out_dict`) for the Molmo analogy
# run to the parsed-results folder.
dict_to_json(out_dict, ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo-human-concepts.json')


In [30]:
# Load the raw Molmo-72b analogy-prompt responses into `outputs`, replacing
# the data loaded at the start of this section.
outputs = open_json(ROOT / 'src/fullpaper/results/vlms/molmo-72b_human_oneshot_analogy.json')

In [31]:
def _split_molmo72b_analogy_response(response):
    """Split one raw Molmo-72b analogy response into ``(analogy, concept)``.

    Two markers are tried in order: ``"Image: "`` and
    ``"Looking at the images, we can see that the "``. A marker matches only
    when it splits the text into exactly two parts (i.e. it occurs once). The
    analogy is the text before the marker with surrounding newlines removed;
    the concept is the first four characters after the marker.

    Returns:
        An ``(analogy, concept)`` tuple, or ``None`` when ``response`` is not
        a string or neither marker matches.
    """
    if not isinstance(response, str):
        return None
    for marker in ("Image: ", "Looking at the images, we can see that the "):
        parts = response.split(marker)
        if len(parts) == 2:
            return parts[0].strip('\n'), parts[1][:4]
    return None


# Separate every response into its free-text analogy and its chosen concept.
# Explicit format checks replace nested bare `except:` clauses so that real
# errors (and Ctrl-C) are no longer swallowed.
invalid_responses = 0
parsed_analogies = {}
parsed_concepts = {}
for event in outputs:
    parsed_analogies[event] = {}
    parsed_concepts[event] = {}
    for permutation in outputs[event]:
        response = outputs[event][permutation]
        parsed = _split_molmo72b_analogy_response(response)
        if parsed is None:
            # Unparseable responses are reported and left out of both dicts.
            invalid_responses += 1
            print(permutation, response)
            continue
        parsed_analogies[event][permutation], parsed_concepts[event][permutation] = parsed

print(invalid_responses)

0


In [15]:
# Where an analogy text contains the 'Analogy: ' marker, keep only the piece
# that follows its first occurrence (up to the next occurrence, if any).
# The entries of `parsed_analogies` are updated in place.
marker = 'Analogy: '
for analogies_for_event in parsed_analogies.values():
    for permutation, text in analogies_for_event.items():
        if marker in text:
            analogies_for_event[permutation] = text.split(marker)[1]


In [21]:
# Write the cleaned analogy texts (`parsed_analogies`) for the Molmo-72b run
# to the parsed-results folder.
dict_to_json(parsed_analogies, ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo72b-human-analogies.json')

In [32]:
# Tally, per event, which direction the parsed concept points to in each
# permutation and convert the counts to percentages.
#
# A concept is valid only when it mentions exactly ONE of the known image
# labels. The labels are collected explicitly because OR-ing the membership
# tests into a single bool cannot distinguish "one label" from "several
# labels", which would silently credit ambiguous answers to the first label.
out_dict = {}
inv_ans = 0
for event in parsed_concepts:
    out_dict[event] = {'up': 0, 'down': 0, 'left': 0, 'right': 0}
    for permutation in parsed_concepts[event]:
        ans = parsed_concepts[event][permutation]
        matched_labels = [label for label in img_labels if label in ans]
        if len(matched_labels) == 1:
            out_dict[event][label_to_direction[matched_labels[0]]] += 1
        else:
            # Zero labels or more than one label: the answer is ambiguous.
            inv_ans += 1
for event in out_dict:
    for concept in out_dict[event]:
        # 24 is assumed to be the number of permutations per event
        # (4! orderings of the four labels) -- TODO confirm against the data.
        out_dict[event][concept] = round(out_dict[event][concept] / 24 * 100, 1)
print(f'There are {inv_ans} invalid answers')




There are 0 invalid answers


In [20]:
# Write the per-event direction percentages (`out_dict`) for the Molmo-72b
# analogy run to the parsed-results folder.
dict_to_json(out_dict, ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo72b-human-concepts.json')

## Qwen-VL - Analogy

In [36]:
# Load the Qwen-VL 72b analogy responses. Both save cells in this section
# write to `qwen-vl-72b-*` files, so loading the non-72b file here would
# overwrite the 72b results with another run's data on Restart & Run All.
# To process the non-72b run instead, swap the two lines below AND point the
# save cells of this section at non-72b file names.
# outputs = open_json(ROOT / 'src/fullpaper/results/vlms/qwen-vl_human_oneshot_analogy.json')
outputs = open_json(ROOT / 'src/fullpaper/results/vlms/qwen-vl-72b_human_oneshot_analogy.json')

In [37]:
# Split every Qwen-VL response (element [0] of the stored value) at the
# 'Image: ' marker: the text before it is the analogy, the first four
# characters after it are the concept. A response that does not split into
# exactly two parts raises ValueError on unpacking.
parsed_analogies = {}
parsed_concepts = {}
for event in outputs:
    event_analogies = {}
    event_concepts = {}
    parsed_analogies[event] = event_analogies
    parsed_concepts[event] = event_concepts
    for permutation, candidates in outputs[event].items():
        analogy, concept = candidates[0].split("Image: ")
        event_analogies[permutation] = analogy.strip('\n')
        event_concepts[permutation] = concept[:4]


In [7]:
# Write the extracted analogy texts (`parsed_analogies`) to the Qwen-VL 72b
# parsed-results file.
# NOTE(review): the target is a 72b file -- make sure the load cell of this
# section points at the 72b input before running this.
dict_to_json(parsed_analogies, ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-72b-human-analogies.json')

In [38]:
# For every verb, count how often each direction was chosen across its
# permutations (the parsed concept must match a known label exactly) and
# express the counts as percentages out of 24.
parsed_outputs = {}
invalid_answers = 0

for verb, concepts in parsed_concepts.items():
    tally = dict.fromkeys(('up', 'down', 'left', 'right'), 0)
    for label in concepts.values():
        if label not in label_to_direction:
            # Unknown label: show it together with its verb and count it.
            print(label)
            print(verb)
            invalid_answers += 1
            continue
        tally[label_to_direction[label]] += 1
    parsed_outputs[verb] = {
        name: round(hits / 24 * 100, 1) for name, hits in tally.items()
    }

print(f'Number of invalid answers: {invalid_answers}')

Number of invalid answers: 0


In [9]:
# Write the per-verb direction percentages (`parsed_outputs`) to the Qwen-VL
# 72b parsed-results file. The commented-out line is the alternative target
# for the non-72b run.
# NOTE(review): keep the active target in sync with the input file chosen in
# the load cell of this section.
# dict_to_json(parsed_outputs, ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-human-concepts.json')

dict_to_json(parsed_outputs, ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-72b-human-concepts.json')
