#### Imports and dataset loading

In [None]:
import numpy as np
import pickle
from scipy.special import softmax
from utils import utils_vectorize as uv

def _load_pickled_pair(path):
    """Read one pickled ``(samples, scores)`` pair from ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted dataset files.
    with open(path, 'rb') as handle:
        samples, scores = pickle.load(handle)
    return samples, scores


par_samples, par_scores = _load_pickled_pair('datasets/msrpar_samples.pkl')
vid_samples, vid_scores = _load_pickled_pair('datasets/msrvid_samples.pkl')
answer_samples, answer_scores = _load_pickled_pair('datasets/msranswer_samples.pkl')
# For this purpose, only the adjusted DEF2DEF dataset is used
def2def_adjusted_samples, def2def_adjusted_scores = _load_pickled_pair('datasets/def2def_adjusted_samples.pkl')

# Unified dataset: the four sample collections concatenated in a fixed order, with scores in the same order.
# DEF2DEF scores are divided by 10 (presumably to match the 0-5 range of the other datasets - confirm).
unified_samples = par_samples + vid_samples + answer_samples + def2def_adjusted_samples
rescaled_def2def_scores = [raw_score/10 for raw_score in def2def_adjusted_scores]
unified_scores = par_scores + vid_scores + answer_scores + rescaled_def2def_scores

#### Model functions

The first value returned by `fact_to_vector()` (`v_fact`) can be swapped for the embedding of only the subject, only the relation or only the object, or for a pairwise combination such as `v_subj_obj`.

In [179]:
# Module-level statistics updated on every fact_to_vector() call:
#   *_norm_accum : running sum of the subject / relation / object vector norms
#   *_norm_count : how many times that element was counted as having the largest norm
#   experiments  : number of fact_to_vector() calls since the last reset
s_norm_accum = r_norm_accum = o_norm_accum = 0.
s_norm_count = r_norm_count = o_norm_count = 0
experiments = 0
def fact_to_vector(fact:tuple, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 1.6)):
    """Embed a (subject, relation, object) triplet and compose it into one fact vector.

    Each element of ``fact`` is vectorized with ``uv.icds_vectorize``, scaled by the
    matching entry of ``weights``, and the three vectors are combined with
    ``uv.icds_composition`` as ``subj . (rel . obj)``.

    Side effect: updates the module-level norm statistics (``s/r/o_norm_accum``,
    ``s/r/o_norm_count`` and ``experiments``) using the norms of the *unweighted*
    element vectors.

    Args:
        fact: triplet; items 0, 1 and 2 are used as subject, relation and object.
        stop_words, punct_marks, embed_model, mu: forwarded unchanged to
            ``uv.icds_vectorize``.
        weights: multipliers applied to the (subject, relation, object) vectors.

    Returns:
        ``(v_fact, v_subj, v_rel, v_obj)``; the last three are the weighted
        element vectors.
    """
    global s_norm_accum, r_norm_accum, o_norm_accum, s_norm_count, r_norm_count, o_norm_count, experiments
    # 1. A sequential composition into each element of triplet
    v_subj, v_rel, v_obj = (
        uv.icds_vectorize(element, stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
        for element in (fact[0], fact[1], fact[2])
    )
    # Gathering some information about norm sizes (each norm is computed only once)
    s_norm = np.linalg.norm(v_subj)
    r_norm = np.linalg.norm(v_rel)
    o_norm = np.linalg.norm(v_obj)
    s_norm_accum += s_norm
    r_norm_accum += r_norm
    o_norm_accum += o_norm
    # Comparisons are strict: the object is counted only when strictly largest,
    # and a subject/relation tie is counted for the relation.
    if o_norm > s_norm and o_norm > r_norm:
        o_norm_count += 1
    elif s_norm > r_norm:
        s_norm_count += 1
    else:
        r_norm_count += 1
    experiments += 1
    # 2. Weighting elements
    v_subj = v_subj * weights[0]
    v_rel = v_rel * weights[1]
    v_obj = v_obj * weights[2]
    # Alternative (disabled): weight each element by its own norm
    #v_subj = v_subj * np.linalg.norm(v_subj)
    #v_rel = v_rel * np.linalg.norm(v_rel)
    #v_obj = v_obj * np.linalg.norm(v_obj) * 1.2
    # 3. A full composition on whole triplet
    # The pairwise compositions below are not all used by v_fact; they are kept
    # so that any of them can be swapped into the return value for experiments.
    v_subj_rel = uv.icds_composition(v_subj, v_rel)
    v_subj_obj = uv.icds_composition(v_subj, v_obj)
    v_rel_obj = uv.icds_composition(v_rel, v_obj)
    # Alternatives (disabled):
    #v_fact = v_subj + v_rel + v_obj
    #v_fact = uv.icds_composition(v_subj_rel, v_obj)
    #v_fact = uv.icds_composition(v_subj_obj, v_rel)
    v_fact = uv.icds_composition(v_subj, v_rel_obj)
    # In return: change v_fact to, for instance, v_obj; or v_subj_obj...
    return v_fact, v_subj, v_rel, v_obj

# Self-attention
def kgtxt_to_selfatt_vectors(txt_kg, stop_words=False, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 1.6)):
    """Build context vectors for the facts of one KG via a self-attention step.

    Every fact is embedded with ``fact_to_vector``. The attention score between
    facts i and j is ``(min_norm / max_norm) ** 2 * uv.cos_sim(v_i, v_j)``, and
    the context vector of a fact is the score-weighted sum of all fact vectors.

    Args:
        txt_kg: sequence of fact triplets, each passed to ``fact_to_vector``.
        stop_words, punct_marks, embed_model, mu, weights: forwarded unchanged
            to ``fact_to_vector``.
        beta: not used by the current implementation; kept so existing calls
            keep working.

    Returns:
        ``(self_att_mtrx, v_mtrx)``: the context vectors (one row per fact) and
        the plain fact vectors as an array.
    """
    v_kg = [
        fact_to_vector(txt_fact, stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu, weights=weights)[0]
        for txt_fact in txt_kg
    ]
    n_facts = len(v_kg)
    # Norms depend only on the fact, so compute them once instead of inside the
    # O(n^2) loop. They are floored at 1e-125 to avoid dividing by zero below.
    norms = [np.max([1.e-125, np.linalg.norm(v_fact)]) for v_fact in v_kg]
    # 1. Compute fact-wise similarity (self-attention scores)
    weight_mtrx = np.empty((n_facts, n_facts))
    for idx in range(n_facts):
        for jdx in range(n_facts):
            smaller, larger = np.min([norms[idx], norms[jdx]]), np.max([norms[idx], norms[jdx]])
            norms_ratio = max(1.e-125, smaller/larger)
            weight_mtrx[idx, jdx] = (norms_ratio ** 2) * uv.cos_sim(v_kg[idx], v_kg[jdx])
    # 2. Softmax/normalizing (self-att weights): currently disabled, raw scores are used as weights
    norm_weight_mtrx = weight_mtrx
    # 3. Context vectors (new contextual embeddings)
    v_mtrx = np.array(v_kg)
    self_att_mtrx = norm_weight_mtrx @ v_mtrx
    return self_att_mtrx, v_mtrx

In [176]:
# Functions to get similarity between two KGs
# Combinations: composition = (sum | ICDS); sim = (cos | ICM | euclid | dot prod); sents_sim: (mean | median | bidir | bertscore)
def pair_sim(kg_pair, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 1.6)):  # List of tuples of 3 strings
    """Similarity between the two knowledge graphs of ``kg_pair``.

    A fact-by-fact matrix of cosine similarities (negative values clipped to 0)
    is built between ``kg_pair[0]`` and ``kg_pair[1]`` and then aggregated with
    ``uv.bidir_avgmax_sim(..., stdst='mean')``.

    Args:
        kg_pair: indexable whose items 0 and 1 are the two KGs (sequences of
            fact triplets).
        self_att: if True, compare the context vectors returned by
            ``kgtxt_to_selfatt_vectors``; otherwise compare the plain
            ``fact_to_vector`` embeddings.
        stop_words, punct_marks, embed_model, mu, weights: forwarded unchanged
            to the vectorization functions.
        beta: forwarded to ``kgtxt_to_selfatt_vectors`` (self-attention only).

    Returns:
        The aggregated similarity returned by ``uv.bidir_avgmax_sim``.
    """
    kg0 = kg_pair[0]
    kg1 = kg_pair[1]
    vec_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu, weights=weights)
    sim_mtrx = np.empty((len(kg0), len(kg1)))
    if self_att:
        ctx_vectors0, _ = kgtxt_to_selfatt_vectors(kg0, beta=beta, **vec_opts)
        ctx_vectors1, _ = kgtxt_to_selfatt_vectors(kg1, beta=beta, **vec_opts)
        for idx in range(len(kg0)):
            for jdx in range(len(kg1)):
                sim_mtrx[idx, jdx] = max(0, uv.cos_sim(ctx_vectors0[idx], ctx_vectors1[jdx]))
    else:           # No self-attention
        for idx, fact0 in enumerate(kg0):
            fact0_vector = fact_to_vector(fact0, **vec_opts)[0]
            for jdx, fact1 in enumerate(kg1):
                # NOTE(review): each kg1 fact is re-vectorized for every kg0 fact, so it is
                # also counted len(kg0) times in the global norm statistics. Left as is so
                # that those counters stay comparable with previously recorded runs.
                fact1_vector = fact_to_vector(fact1, **vec_opts)[0]
                sim_mtrx[idx, jdx] = max(0, uv.cos_sim(fact0_vector, fact1_vector))
    # Alternative aggregations (disabled):
    #sents_sim = (np.mean(sim_mtrx))
    #sents_sim = uv.bertscore(sim_mtrx)
    sents_sim = uv.bidir_avgmax_sim(sim_mtrx, stdst='mean')
    return sents_sim

# Correlation between the computed similarities and the true (gold) scores over a dataset of KG pairs
def ds_sents_sim(ds, true_scores, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 1.6)):
    """Score every KG pair of ``ds`` with ``pair_sim`` and correlate with ``true_scores``.

    All keyword options are forwarded unchanged to ``pair_sim``.

    Returns:
        ``(correlation, sims)``: the off-diagonal entry of ``np.corrcoef`` between
        the computed similarities and ``true_scores``, and the similarities as an array.
    """
    pair_options = dict(self_att=self_att, stop_words=stop_words, punct_marks=punct_marks,
                        beta=beta, embed_model=embed_model, mu=mu, weights=weights)
    predicted = [pair_sim(kg_pair, **pair_options) for kg_pair in ds]
    corr_mtrx = np.corrcoef(predicted, np.array(true_scores))
    return corr_mtrx[0][1], np.array(predicted)

### Tests

Individual datasets (each one with its best set of parameters)

In [221]:
# MSRpar dataset | Best weights: no sa (0.9, 0.4, 2.9); sa (0.6, 0.3, 3.2)
# no_self_att =   [.626]  # s_words no | sim_mtrx=bertscore | SRO ws: 0.9 0.4 2.9 | mu=0 | v_fact=subj·rel_obj |
# self_att =      [.619]  # s_words no | sim_mtrx=bertscore | SRO ws: 0.6 0.3 3.2 | mu=0 | v_fact=subj·rel_obj |

try:
    corr, sims = ds_sents_sim(par_samples, par_scores, self_att=False,
                              stop_words=False, punct_marks=False, beta=1.5, embed_model='w2v', mu=0, weights=(.9, .4, 2.9))

    print('MSRpar dataset:')
    # Optional score summaries (disabled); they refer to this dataset's own scores.
    #print('True scores min, max, mean and std:',np.min(par_scores), np.max(par_scores), np.mean(par_scores), np.std(par_scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for MSRpar:', corr, '\n')
    # Mean norm per triplet element, "largest norm" counts and number of fact_to_vector() calls
    print(s_norm_accum/experiments, r_norm_accum/experiments, o_norm_accum/experiments, s_norm_count, r_norm_count, o_norm_count, experiments)
    print('Percentages of bigger norms: s:', s_norm_count/experiments, 'r:', r_norm_count/experiments, 'o:', o_norm_count/experiments)
finally:
    # Always reset the global statistics, even if the run fails or is interrupted,
    # so that they cannot leak into the next experiment cell.
    s_norm_accum = r_norm_accum = o_norm_accum = 0.
    s_norm_count = r_norm_count = o_norm_count = 0
    experiments = 0

MSRpar dataset:
Corr. for MSRpar: 0.6130285647577763 

4.4839947820369055 2.872382723745743 5.53839483817318 3124 846 3555 7525
Percentages of bigger norms: s: 0.41514950166112957 r: 0.11242524916943522 o: 0.47242524916943524


In [216]:
# MSRvid dataset | Best weights: no sa (1., 1., 1.8); sa (1., 1., 1.75)
# NOTE(review): this run uses self_att=True with the weights listed above for the
# no-self-attention setting - confirm this is intended.
try:
    corr, sims = ds_sents_sim(vid_samples, vid_scores, self_att=True,
                              stop_words=True, punct_marks=False, beta=1.5, embed_model='w2v', mu=0, weights=(1., 1., 1.8))

    print('MSRvid dataset:')
    # Optional score summaries (disabled); they refer to this dataset's own scores.
    #print('True scores min, max, mean and std:',np.min(vid_scores), np.max(vid_scores), np.mean(vid_scores), np.std(vid_scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for MSRvid:', corr, '\n')
    # Mean norm per triplet element, "largest norm" counts and number of fact_to_vector() calls
    print(s_norm_accum/experiments, r_norm_accum/experiments, o_norm_accum/experiments, s_norm_count, r_norm_count, o_norm_count, experiments)
    print('Percentages of bigger norms: s:', s_norm_count/experiments, 'r:', r_norm_count/experiments, 'o:', o_norm_count/experiments)
finally:
    # Always reset the global statistics, even if the run fails or is interrupted,
    # so that they cannot leak into the next experiment cell.
    s_norm_accum = r_norm_accum = o_norm_accum = 0.
    s_norm_count = r_norm_count = o_norm_count = 0
    experiments = 0

MSRvid dataset:
Corr. for MSRvid: 0.8250672482659904 

2.679514014013282 2.610415814419858 2.9301497516322805 110 150 397 657
Percentages of bigger norms: s: 0.167427701674277 r: 0.228310502283105 o: 0.604261796042618


In [185]:
# answer dataset | Best weights: no sa (.7, 1.6, 1.6); sa (1., 1.4, 1.2)
# no_self_att =   [.593]  # s_words false | sim_mtrx=bidir mean   | SRO ws: 0.7 1.6 1.6 | mu=1 | v_fact=subj_obj·rel
# self_att =      [.517]  # s_words false | sim_mtrx=mean         | SRO ws: 1.0 1.4 1.2 | mu=1 | v_fact=subj_rel·obj | sa=no_norm

try:
    corr, sims = ds_sents_sim(answer_samples, answer_scores, self_att=False,
                              stop_words=False, punct_marks=False, beta=1.5, embed_model='w2v', mu=1, weights=(.7, 1.6, 1.6))

    print('answer dataset:')
    # Optional score summaries (disabled); they refer to this dataset's own scores.
    #print('True scores min, max, mean and std:',np.min(answer_scores), np.max(answer_scores), np.mean(answer_scores), np.std(answer_scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for answer:', corr, '\n')
    # Mean norm per triplet element, "largest norm" counts and number of fact_to_vector() calls
    print(s_norm_accum/experiments, r_norm_accum/experiments, o_norm_accum/experiments, s_norm_count, r_norm_count, o_norm_count, experiments)
    print('Percentages of bigger norms: s:', s_norm_count/experiments, 'r:', r_norm_count/experiments, 'o:', o_norm_count/experiments)
finally:
    # Always reset the global statistics, even if the run fails or is interrupted,
    # so that they cannot leak into the next experiment cell.
    s_norm_accum = r_norm_accum = o_norm_accum = 0.
    s_norm_count = r_norm_count = o_norm_count = 0
    experiments = 0

answer dataset:
Corr. for answer: 0.5868373773233031 

2.485584754968897 2.6671725611027073 2.8431897694669557 381 532 722 1635
Percentages of bigger norms: s: 0.23302752293577983 r: 0.3253822629969419 o: 0.4415902140672783


In [223]:
# DEF2DEF adj. dataset | Best weights: no sa (1.1, 0.6, 1.3); sa (1.2, 0.7, 1.65)
# no_self_att =   [0.581]  # s_words ok | sim_mtrx=bidir mean | SRO ws: 1.1 0.6 1.3 | mu=1 | v_fact=subj_rel·obj
# self_att =      [0.584]  # s_words ok | sim_mtrx=bidir mean | SRO ws: 1.2 0.7 1.65 | mu=1 | v_fact=subj_obj·rel
# NOTE(review): this run uses self_att=True with the weights listed above for the
# no-self-attention setting - confirm this is intended.
try:
    corr, sims = ds_sents_sim(def2def_adjusted_samples, def2def_adjusted_scores, self_att=True,
                              stop_words=True, punct_marks=False, beta=1.5, embed_model='w2v', mu=1, weights=(1.1, 0.6, 1.3))

    print('DEF2DEF adj. dataset:')
    # Optional score summaries (disabled); they refer to this dataset's own scores.
    #print('True scores min, max, mean and std:',np.min(def2def_adjusted_scores), np.max(def2def_adjusted_scores), np.mean(def2def_adjusted_scores), np.std(def2def_adjusted_scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for DEF2DEF adj.:', corr, '\n')
    # Mean norm per triplet element, "largest norm" counts and number of fact_to_vector() calls
    print(s_norm_accum/experiments, r_norm_accum/experiments, o_norm_accum/experiments, s_norm_count, r_norm_count, o_norm_count, experiments)
    print('Percentages of bigger norms: s:', s_norm_count/experiments, 'r:', r_norm_count/experiments, 'o:', o_norm_count/experiments)
finally:
    # Always reset the global statistics, even if the run fails or is interrupted,
    # so that they cannot leak into the next experiment cell.
    s_norm_accum = r_norm_accum = o_norm_accum = 0.
    s_norm_count = r_norm_count = o_norm_count = 0
    experiments = 0

DEF2DEF adj. dataset:
Corr. for DEF2DEF adj.: 0.5840108619084013 

2.8419606527590853 1.0367040057166153 3.168220801892817 5783 1099 8312 15194
Percentages of bigger norms: s: 0.38061076740818744 r: 0.07233118336185336 o: 0.5470580492299592


Joint tests (individual datasets with the same parameters)

In [212]:
# Main: launches computation of similarities correlation from a set of labeled datasets and gives additional info
# This file with DEF2DEF adjusted: ok 20250331
# Recorded reference results (not used by the code below):
ALL_no_self_att = [.630]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.0 1.0 1.6 | mu=1 | v_fact=subj·rel_obj
ALL_self_att    = [.611]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.1 1.0 2.0 | mu=1 | v_fact=subj·rel_obj

names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def_adj']
sets = [par_samples, vid_samples, answer_samples, def2def_adjusted_samples]
true_scores = [par_scores, vid_scores, answer_scores, def2def_adjusted_scores]
correlations = []

try:
    # Every dataset is evaluated with the same parameter set.
    for name, elem, scores in zip(names, sets, true_scores):
        corr, sims = ds_sents_sim(elem, scores, self_att=False,
                                  stop_words=False, punct_marks=False, beta=1.5, embed_model='w2v', mu=1, weights=(1, 1, 1.6))
        # Counters are not reset between datasets, so these values are cumulative.
        print(s_norm_count, r_norm_count, o_norm_count, experiments)
        correlations.append(corr)
        print('Dataset:', name)
        print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
        print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
        print('Corr. for', name+':', corr, '\n')
    print('Correlations mean:', np.mean(np.array(correlations)))

    # Totals over all datasets
    print(s_norm_count, r_norm_count, o_norm_count, experiments)
    print('Percentages of bigger norms: s:', s_norm_count/experiments, 'r:', r_norm_count/experiments, 'o:', o_norm_count/experiments)
finally:
    # Always reset the global statistics, even if the run fails or is interrupted,
    # so that they cannot leak into the next experiment cell.
    s_norm_accum = r_norm_accum = o_norm_accum = 0.
    s_norm_count = r_norm_count = o_norm_count = 0
    experiments = 0

3199 763 3563 7525
Dataset: MSRpar
True scores min, max, mean and std: 0.5 5.0 3.21282 0.8924830192222147
Sim scores min, max, mean and std: 0.41859224415024243 4.969872247862995 3.7316982898976923 0.7772476149079477
Corr. for MSRpar: 0.5903491889627053 

3277 1146 3890 8313
Dataset: MSRvid
True scores min, max, mean and std: 0.0 5.0 2.105848 1.6065765144853823
Sim scores min, max, mean and std: 0.9680423140525818 5.000000149011612 3.3478207177669637 0.9679625538404487
Corr. for MSRvid: 0.8067548003910293 

3658 1678 4612 9948
Dataset: MSRanswer
True scores min, max, mean and std: 0.0 5.0 2.4921259842519685 1.7473086751273263
Sim scores min, max, mean and std: 1.905647440660122 5.000000596046448 4.035044073575274 0.6596854151975906
Corr. for MSRanswer: 0.5586727542550838 

15717 5868 21641 43226
Dataset: def2def_adj
True scores min, max, mean and std: 0 50 24.631877729257642 12.341513586651518
Sim scores min, max, mean and std: 0.13162139531477904 5.000000298023224 1.8305093865144875 0

Unified dataset

In [170]:
# Unified dataset: all four datasets evaluated as a single collection
try:
    corr, sims = ds_sents_sim(unified_samples, unified_scores, self_att=True,
                              stop_words=True, punct_marks=False, beta=1.5, embed_model='w2v', mu=1, weights=(1., 1.1, 2.4))

    print('Unified dataset:')
    print('True scores min, max, mean and std:',np.min(unified_scores), np.max(unified_scores), np.mean(unified_scores), np.std(unified_scores))
    # Similarities are multiplied by 5 for display next to the true scores
    print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for unified dataset:', corr, '\n')
    # Mean norm per triplet element, "largest norm" counts and number of fact_to_vector() calls
    print(s_norm_accum/experiments, r_norm_accum/experiments, o_norm_accum/experiments, s_norm_count, r_norm_count, o_norm_count, experiments)
    print('Percentages of bigger norms: s:', s_norm_count/experiments, 'r:', r_norm_count/experiments, 'o:', o_norm_count/experiments)
finally:
    # Always reset the global statistics, even if the run fails or is interrupted,
    # so that they cannot leak into the next experiment cell.
    s_norm_accum = r_norm_accum = o_norm_accum = 0.
    s_norm_count = r_norm_count = o_norm_count = 0
    experiments = 0

Unified dataset:
True scores min, max, mean and std: 0.0 5.0 2.4978209592641263 1.3166309708785562
Sim scores min, max, mean and std: 0.0 5.000000000000001 2.0994930630866966 1.1991706987956299
Corr. for unified dataset: 0.49258080059048603 

2.9688061627383324 1.1816499513123866 3.391188019426801 7300 1624 10327 19251
Percentages of bigger norms: s: 0.37920108046335255 r: 0.08435925406472392 o: 0.5364396654719236
