In [1]:
from utils.paths import ROOT
from utils.utils import *
import json
from scipy.stats import spearmanr, pearsonr
import pandas as pd
from scipy.special import kl_div
import numpy as np
from pprint import pprint
from sklearn.metrics import f1_score

In [2]:
# Load the human reference data once for the whole notebook.
# load_richardson_data() is unpacked into exactly three values:
#   - the first is discarded,
#   - `richardson_data` is the reference passed to every correlation / F1 helper in the cells below,
#   - `richardson_normed` is bound here but not used by any cell visible in this notebook.
# NOTE(review): what each returned object contains is defined in utils — confirm there.
# NOTE(review): this binds `_` at notebook top level, where IPython normally keeps the last
# cell output — confirm nothing relies on `_` afterwards.
_, richardson_data, richardson_normed = load_richardson_data()

## Qwen-VL - Regular

In [3]:
# Parsed results file for this run (qwen-vl_human_oneshot.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl_human_oneshot.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs)

{'down': 4.59,
 'left': 16.936666666666667,
 'right': 5.413333333333333,
 'up': 73.05999999999999}
Up:	corr=0.22	p_value=0.2336
Down:	corr=0.45	p_value=0.0124
Left:	corr=0.05	p_value=0.7951
Right:	corr=0.19	p_value=0.3231
Horizontal:	corr=0.66	p_value=0.0001
Vertical:	corr=0.66	p_value=0.0001
The f1-score with human data is 0.18
The binary f1-score with human data is 0.6


In [4]:
# Parsed results file for this run (qwen-vl-72b_human_oneshot.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-72b_human_oneshot.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs)

{'down': 8.333333333333334,
 'left': 41.53333333333333,
 'right': 40.830000000000005,
 'up': 9.303333333333335}
Up:	corr=0.53	p_value=0.0024
Down:	corr=0.5	p_value=0.0053
Left:	corr=0.31	p_value=0.0967
Right:	corr=0.44	p_value=0.0162
Horizontal:	corr=0.71	p_value=0.0
Vertical:	corr=0.71	p_value=0.0
The f1-score with human data is 0.41
The binary f1-score with human data is 0.6


## Molmo - Regular

In [5]:
# Events left out of every metric for this run. The same five names are echoed at the top of
# this cell's recorded output — presumably events without a usable model answer; confirm in utils.
to_exclude = ['hunted', 'offended', 'warned', 'wanted', 'respected']

# Parsed results file for this run (molmo_human_oneshot.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo_human_oneshot.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs, events_to_exclude=to_exclude)

hunted
offended
warned
wanted
respected
{'down': 30.0, 'left': 0.0, 'right': 3.3333333333333335, 'up': 50.0}
Up:	corr=0.11	p_value=0.6081
Down:	corr=0.36	p_value=0.0733
Left:	corr=nan	p_value=nan
Right:	corr=0.34	p_value=0.0965
Horizontal:	corr=0.33	p_value=0.1122
Vertical:	corr=0.33	p_value=0.1121
The f1-score with human data is 0.3
The binary f1-score with human data is 0.39


  corr, p_value = spearmanr(model_values, richardson_values)


In [6]:
# Parsed results file for this run (molmo-72b_human_oneshot.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo-72b_human_oneshot.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs)

{'down': 0.0, 'left': 3.3333333333333335, 'right': 0.0, 'up': 96.66666666666667}
Up:	corr=0.19	p_value=0.306
Down:	corr=nan	p_value=nan
Left:	corr=-0.27	p_value=0.1517
Right:	corr=nan	p_value=nan
Horizontal:	corr=0.3	p_value=0.1068
Vertical:	corr=0.3	p_value=0.1067
The f1-score with human data is 0.05
The binary f1-score with human data is 0.33


  corr, p_value = spearmanr(model_values, richardson_values)


## Qwen-VL - Analogy

In [9]:
# Parsed results file for this run (qwen-vl-human-concepts.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-human-concepts.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs)

{'down': 43.20000000000002,
 'left': 17.223333333333336,
 'right': 7.503333333333332,
 'up': 32.07333333333333}
Up:	corr=0.56	p_value=0.0013
Down:	corr=0.52	p_value=0.0034
Left:	corr=0.11	p_value=0.5772
Right:	corr=0.22	p_value=0.2408
Horizontal:	corr=0.79	p_value=0.0
Vertical:	corr=0.79	p_value=0.0
The f1-score with human data is 0.34
The binary f1-score with human data is 0.55


In [10]:
# Parsed results file for this run (qwen-vl-72b-human-concepts.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-72b-human-concepts.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs)

{'down': 8.479999999999999,
 'left': 23.466666666666665,
 'right': 39.58666666666667,
 'up': 28.466666666666672}
Up:	corr=0.37	p_value=0.045
Down:	corr=0.42	p_value=0.0219
Left:	corr=0.36	p_value=0.0483
Right:	corr=0.52	p_value=0.0033
Horizontal:	corr=0.67	p_value=0.0
Vertical:	corr=0.67	p_value=0.0
The f1-score with human data is 0.51
The binary f1-score with human data is 0.9


## Molmo - Analogy

In [7]:
# Parsed results file for this run (molmo-human-concepts.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo-human-concepts.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs)

{'down': 53.333333333333336,
 'left': 0.0,
 'right': 3.3333333333333335,
 'up': 43.333333333333336}
Up:	corr=0.29	p_value=0.1177
Down:	corr=-0.17	p_value=0.3583
Left:	corr=nan	p_value=nan
Right:	corr=-0.26	p_value=0.1694
Horizontal:	corr=-0.25	p_value=0.1887
Vertical:	corr=-0.25	p_value=0.1886
The f1-score with human data is 0.15
The binary f1-score with human data is 0.25


  corr, p_value = spearmanr(model_values, richardson_values)


In [8]:

# Parsed results file for this run (molmo72b-human-concepts.json), resolved against the project ROOT.
results_path = ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo72b-human-concepts.json'
outputs = fix_label_distributions(open_json(results_path))

# Show the overall share of each direction label in the model's answers.
answers = get_label_distribution(outputs)
pprint(answers)

# Print the agreement reports against the human data, in a fixed order:
# per-direction correlation, horizontal/vertical correlation, F1, binary F1.
for report_metric in (
    get_corr_with_human_data,
    get_corr_with_human_data_binary,
    get_f1_score,
    get_f1_score_binary,
):
    report_metric(richardson_data, outputs)

{'down': 10.0,
 'left': 20.0,
 'right': 6.666666666666667,
 'up': 63.333333333333336}
Up:	corr=0.32	p_value=0.0847
Down:	corr=-0.04	p_value=0.8398
Left:	corr=-0.07	p_value=0.7044
Right:	corr=0.15	p_value=0.439
Horizontal:	corr=0.52	p_value=0.0031
Vertical:	corr=0.52	p_value=0.0033
The f1-score with human data is 0.15
The binary f1-score with human data is 0.68
