# Correlations with our data

This notebook computes the correlations and F1 scores between the concepts chosen by participants in our human experiment and the concepts chosen by the models.

In [1]:
from utils.paths import ROOT
from utils.utils import *
import json
from scipy.stats import spearmanr, pearsonr
import pandas as pd
from scipy.special import kl_div
import numpy as np
from pprint import pprint
from sklearn.metrics import f1_score

In [2]:
# load_our_data() returns three values; only the middle one is kept here.
# NOTE(review): presumably the human-experiment data mentioned in the intro; confirm in utils.
_, our_data, _ = load_our_data()

## Qwen-2-VL - Regular

In [3]:
# Qwen-VL, one-shot results: read the parsed answers and run them through fix_label_distributions.
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl_human_oneshot.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data: correlations first, then F1 scores,
# each on the four labels and in its binary variant.
get_corr_with_human_data(our_data, outputs)
get_corr_with_human_data_binary(our_data, outputs)
get_f1_score(our_data, outputs)
get_f1_score_binary(our_data, outputs)

{'down': 4.59,
 'left': 16.936666666666667,
 'right': 5.413333333333333,
 'up': 73.05999999999999}
Up:	corr=0.3	p_value=0.1019
Down:	corr=0.26	p_value=0.1611
Left:	corr=0.25	p_value=0.1898
Right:	corr=0.06	p_value=0.7372
Horizontal:	corr=0.61	p_value=0.0003
Vertical:	corr=0.61	p_value=0.0003
The f1-score with human data is 0.23
The binary f1-score with human data is 0.6


In [4]:
# Qwen-VL 72B, one-shot results: read the parsed answers and run them through fix_label_distributions.
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-72b_human_oneshot.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data: correlations first, then F1 scores,
# each on the four labels and in its binary variant.
get_corr_with_human_data(our_data, outputs)
get_corr_with_human_data_binary(our_data, outputs)
get_f1_score(our_data, outputs)
get_f1_score_binary(our_data, outputs)

{'down': 8.333333333333334,
 'left': 41.53333333333333,
 'right': 40.830000000000005,
 'up': 9.303333333333335}
Up:	corr=0.44	p_value=0.0153
Down:	corr=0.31	p_value=0.0959
Left:	corr=0.41	p_value=0.0249
Right:	corr=0.3	p_value=0.11
Horizontal:	corr=0.59	p_value=0.0006
Vertical:	corr=0.59	p_value=0.0006
The f1-score with human data is 0.35
The binary f1-score with human data is 0.52


## Molmo - Regular

In [5]:
# Events left out of every comparison in this cell.
# NOTE(review): these are the same five events that get printed while this file is
# loaded; presumably the ones without a usable Molmo answer. Confirm in utils.
to_exclude = ['hunted', 'offended', 'warned', 'wanted', 'respected']
# Molmo, one-shot results: read the parsed answers and run them through fix_label_distributions.
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo_human_oneshot.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data on the remaining events: correlations first, then F1 scores.
# A label the model never picks has constant values, so its correlation is reported as nan.
get_corr_with_human_data(our_data, outputs, events_to_exclude=to_exclude)
get_corr_with_human_data_binary(our_data, outputs, events_to_exclude=to_exclude)
get_f1_score(our_data, outputs, events_to_exclude=to_exclude)
get_f1_score_binary(our_data, outputs, events_to_exclude=to_exclude)

hunted
offended
warned
wanted
respected
{'down': 30.0, 'left': 0.0, 'right': 3.3333333333333335, 'up': 50.0}
Up:	corr=-0.05	p_value=0.8187
Down:	corr=0.11	p_value=0.5916
Left:	corr=nan	p_value=nan
Right:	corr=0.3	p_value=0.1405
Horizontal:	corr=0.23	p_value=0.2727
Vertical:	corr=0.23	p_value=0.2727
The f1-score with human data is 0.2
The binary f1-score with human data is 0.44


  corr, p_value = spearmanr(model_values, richardson_values)


In [6]:
# Molmo 72B, one-shot results: read the parsed answers and run them through fix_label_distributions.
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo-72b_human_oneshot.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data: correlations first, then F1 scores.
# A label the model never picks has constant values, so its correlation is reported as nan.
get_corr_with_human_data(our_data, outputs)
get_corr_with_human_data_binary(our_data, outputs)
get_f1_score(our_data, outputs)
get_f1_score_binary(our_data, outputs)

{'down': 0.0, 'left': 3.3333333333333335, 'right': 0.0, 'up': 96.66666666666667}
Up:	corr=0.17	p_value=0.3613
Down:	corr=nan	p_value=nan
Left:	corr=-0.15	p_value=0.4152
Right:	corr=nan	p_value=nan
Horizontal:	corr=0.28	p_value=0.1319
Vertical:	corr=0.28	p_value=0.1319
The f1-score with human data is 0.05
The binary f1-score with human data is 0.4


  corr, p_value = spearmanr(model_values, richardson_values)


## Qwen-VL - Analogy

In [7]:
# Qwen-VL, human-concepts results: read the parsed answers and run them through fix_label_distributions.
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-human-concepts.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data: correlations first, then F1 scores,
# each on the four labels and in its binary variant.
get_corr_with_human_data(our_data, outputs)
get_corr_with_human_data_binary(our_data, outputs)
get_f1_score(our_data, outputs)
get_f1_score_binary(our_data, outputs)

{'down': 43.20000000000002,
 'left': 17.223333333333336,
 'right': 7.503333333333332,
 'up': 32.07333333333333}
Up:	corr=0.46	p_value=0.0098
Down:	corr=0.44	p_value=0.0158
Left:	corr=0.13	p_value=0.4807
Right:	corr=0.06	p_value=0.768
Horizontal:	corr=0.73	p_value=0.0
Vertical:	corr=0.73	p_value=0.0
The f1-score with human data is 0.22
The binary f1-score with human data is 0.62


In [8]:
# Qwen-VL 72B, human-concepts results: read the parsed answers and run them through fix_label_distributions.
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/qwen-vl-72b-human-concepts.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data: correlations first, then F1 scores,
# each on the four labels and in its binary variant.
get_corr_with_human_data(our_data, outputs)
get_corr_with_human_data_binary(our_data, outputs)
get_f1_score(our_data, outputs)
get_f1_score_binary(our_data, outputs)

{'down': 8.479999999999999,
 'left': 23.466666666666665,
 'right': 39.58666666666667,
 'up': 28.466666666666672}
Up:	corr=0.28	p_value=0.1293
Down:	corr=0.37	p_value=0.045
Left:	corr=0.37	p_value=0.0417
Right:	corr=0.33	p_value=0.0759
Horizontal:	corr=0.56	p_value=0.0013
Vertical:	corr=0.56	p_value=0.0013
The f1-score with human data is 0.38
The binary f1-score with human data is 0.69


## Molmo - Analogy

In [9]:
# Molmo, human-concepts results: read the parsed answers and run them through fix_label_distributions.
# (The 72B variant of this run is evaluated in its own cell, so no alternative path is kept here.)
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo-human-concepts.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data: correlations first, then F1 scores.
# A label the model never picks has constant values, so its correlation is reported as nan.
get_corr_with_human_data(our_data, outputs)
get_corr_with_human_data_binary(our_data, outputs)
get_f1_score(our_data, outputs)
get_f1_score_binary(our_data, outputs)

{'down': 53.333333333333336,
 'left': 0.0,
 'right': 3.3333333333333335,
 'up': 43.333333333333336}
Up:	corr=0.03	p_value=0.861
Down:	corr=-0.08	p_value=0.6772
Left:	corr=nan	p_value=nan
Right:	corr=-0.24	p_value=0.2029
Horizontal:	corr=-0.22	p_value=0.2505
Vertical:	corr=-0.22	p_value=0.2505
The f1-score with human data is 0.12
The binary f1-score with human data is 0.32


  corr, p_value = spearmanr(model_values, richardson_values)


In [10]:
# Molmo 72B, human-concepts results: read the parsed answers and run them through fix_label_distributions.
# (The smaller Molmo variant of this run is evaluated in its own cell, so no alternative path is kept here.)
outputs = fix_label_distributions(
    open_json(ROOT / 'src/fullpaper/results/vlms/parsed_results/molmo72b-human-concepts.json')
)
# Distribution of the model's answers over the four labels, shown as a quick sanity check.
answers = get_label_distribution(outputs)
pprint(answers)
# Agreement with the human data: correlations first, then F1 scores,
# each on the four labels and in its binary variant.
get_corr_with_human_data(our_data, outputs)
get_corr_with_human_data_binary(our_data, outputs)
get_f1_score(our_data, outputs)
get_f1_score_binary(our_data, outputs)

{'down': 10.0,
 'left': 20.0,
 'right': 6.666666666666667,
 'up': 63.333333333333336}
Up:	corr=0.3	p_value=0.1069
Down:	corr=-0.11	p_value=0.5793
Left:	corr=-0.1	p_value=0.614
Right:	corr=0.05	p_value=0.8054
Horizontal:	corr=0.37	p_value=0.0447
Vertical:	corr=0.37	p_value=0.0447
The f1-score with human data is 0.16
The binary f1-score with human data is 0.61
