#### Imports and dataset loading

In [1]:
import numpy as np
import pickle
from scipy.special import softmax
from utils import utils_vectorize as uv

# Load the pickled dataset. The file is expected to hold a 2-item sequence that
# unpacks into the samples (iterated as KG pairs further down) and their gold scores.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open dataset files that come from a trusted source.
with open('datasets/def2def_samples.pkl', 'rb') as f: 
    def2def_samples, def2def_gold_scores = pickle.load(f)

#### Model functions

In [None]:
def fact_to_vector(fact:tuple, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio',
                   weights=(1.1, .45, 1.6)):
    """Embed a (subject, relation, object) triplet as a single fact vector.

    Each element of the triplet is vectorized with ``uv.icds_vectorize`` and
    scaled by its weight; the scaled vectors are then combined with
    ``uv.icds_composition`` as ``(subj ∘ obj) ∘ rel``.

    Args:
        fact: Triplet indexed as ``fact[0]`` (subject), ``fact[1]`` (relation)
            and ``fact[2]`` (object).
        stop_words: Forwarded unchanged to ``uv.icds_vectorize``.
        punct_marks: Forwarded unchanged to ``uv.icds_vectorize``.
        embed_model: Forwarded unchanged to ``uv.icds_vectorize``.
        mu: Forwarded unchanged to ``uv.icds_vectorize``.
        weights: Scaling factors ``(w_subj, w_rel, w_obj)`` applied to the
            three element vectors before composition. The defaults are the
            values that used to be hard-coded here.

    Returns:
        Tuple ``(v_fact, v_subj, v_rel, v_obj)``: the composed fact vector
        followed by the three weighted element vectors.
    """
    w_subj, w_rel, w_obj = weights
    vectorize_opts = dict(stop_words=stop_words, punct_marks=punct_marks,
                          embed_model=embed_model, mu=mu)

    # 1. A sequential composition into each element of the triplet, then weighting
    v_subj = uv.icds_vectorize(fact[0], **vectorize_opts) * w_subj
    v_rel = uv.icds_vectorize(fact[1], **vectorize_opts) * w_rel
    v_obj = uv.icds_vectorize(fact[2], **vectorize_opts) * w_obj

    # 2. A full composition on the whole triplet.
    # Only the (subj ∘ obj) ∘ rel order is computed; the alternatives tried
    # previously (plain sum, (subj ∘ rel) ∘ obj, subj ∘ (rel ∘ obj)) are not
    # evaluated any more because their results were never used.
    v_subj_obj = uv.icds_composition(v_subj, v_obj)
    v_fact = uv.icds_composition(v_subj_obj, v_rel)
    return (v_fact, v_subj, v_rel, v_obj)

# Self-attention
def kgtxt_to_selfatt_vectors(txt_kg, stop_words=False, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio'):
    """Build self-attention (context) vectors for every fact of a textual KG.

    Every fact is embedded with ``fact_to_vector``; a fact-by-fact weight
    matrix is then filled with ``(norm ratio)**2 * cos_sim`` and multiplied by
    the matrix of fact vectors.

    Args:
        txt_kg: Iterable of facts; each one is passed to ``fact_to_vector``.
        stop_words: Forwarded unchanged to ``fact_to_vector``.
        punct_marks: Forwarded unchanged to ``fact_to_vector``.
        beta: Accepted for signature compatibility; not used by this function.
        embed_model: Forwarded unchanged to ``fact_to_vector``.
        mu: Forwarded unchanged to ``fact_to_vector``.

    Returns:
        Tuple ``(self_att_mtrx, kg_mtrx)``: the weight matrix times the fact
        vectors, and the fact vectors themselves stacked with ``np.array``.
    """
    v_kg = [
        fact_to_vector(txt_fact, stop_words=stop_words, punct_marks=punct_marks,
                       embed_model=embed_model, mu=mu)[0]
        for txt_fact in txt_kg
    ]
    n_facts = len(v_kg)

    # Norms are needed for every (idx, jdx) pair, so compute each one only once.
    # The 1e-125 floor keeps the ratio below free of divisions by zero.
    norms = [np.max([1.e-125, np.linalg.norm(v_fact)]) for v_fact in v_kg]

    # 1. Compute fact-wise similarity (self-attention scores)
    weight_mtrx = np.empty((n_facts, n_facts))
    for idx, v_row in enumerate(v_kg):
        for jdx, v_col in enumerate(v_kg):
            smaller = min(norms[idx], norms[jdx])
            larger = max(norms[idx], norms[jdx])
            norms_ratio = max(1.e-125, smaller / larger)
            weight_mtrx[idx][jdx] = (norms_ratio ** 2) * uv.cos_sim(v_row, v_col)

    # 2. Softmax/normalizing (self-att weights): currently the raw scores are
    #    used as they are (softmax, tanh and sigmoid were tried and discarded).
    norm_weight_mtrx = weight_mtrx

    # 3. Context vectors (new contextual embeddings)
    kg_mtrx = np.array(v_kg)
    self_att_mtrx = norm_weight_mtrx @ kg_mtrx
    return (self_att_mtrx, kg_mtrx)

In [None]:
# Functions to get similarity between two KGs
# Combinations: composition = (sum | ICDS); sim = (cos | ICM | euclid | dot prod); sents_sim: (mean | median | bidir | bertscore)
def pair_sim(kg_pair, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio'):  # List of tuples of 3 strings
    """Compute the similarity between the two KGs of a pair.

    A fact-by-fact matrix of non-negative cosine similarities is built between
    the facts of ``kg_pair[0]`` and those of ``kg_pair[1]`` and then reduced to
    a single score with ``uv.bidir_avgmax_sim(..., stdst='mean')``.

    Args:
        kg_pair: Indexable holding the two KGs at positions 0 and 1; each KG
            is a sequence of facts accepted by ``fact_to_vector``.
        self_att: When true, facts are represented by the context vectors
            returned by ``kgtxt_to_selfatt_vectors``; otherwise by the plain
            ``fact_to_vector`` embeddings.
        stop_words: Forwarded unchanged to the vectorization functions.
        punct_marks: Forwarded unchanged to the vectorization functions.
        beta: Forwarded to ``kgtxt_to_selfatt_vectors`` when ``self_att`` is true.
        embed_model: Forwarded unchanged to the vectorization functions.
        mu: Forwarded unchanged to the vectorization functions.

    Returns:
        The value returned by ``uv.bidir_avgmax_sim`` for the similarity matrix.
    """
    kg0, kg1 = kg_pair[0], kg_pair[1]
    vector_opts = dict(stop_words=stop_words, punct_marks=punct_marks,
                       embed_model=embed_model, mu=mu)

    if self_att:
        vectors0, _ = kgtxt_to_selfatt_vectors(kg0, beta=beta, **vector_opts)
        vectors1, _ = kgtxt_to_selfatt_vectors(kg1, beta=beta, **vector_opts)
    else:           # No self-attention
        # Embed every fact exactly once instead of re-embedding the facts of
        # kg1 for each fact of kg0.
        vectors0 = [fact_to_vector(fact, **vector_opts)[0] for fact in kg0]
        vectors1 = [fact_to_vector(fact, **vector_opts)[0] for fact in kg1]

    sim_mtrx = np.empty((len(kg0), len(kg1)))
    for idx, vector0 in enumerate(vectors0):
        for jdx, vector1 in enumerate(vectors1):
            # Negative cosines are clipped to 0 so they count as "no similarity".
            sim_mtrx[idx][jdx] = max(0, uv.cos_sim(vector0, vector1))

    sents_sim = uv.bidir_avgmax_sim(sim_mtrx, stdst='mean')
    return sents_sim

# Correlation between the predicted similarities and the true (gold) scores over a dataset of KG pairs
def ds_sents_sim(ds, true_scores, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio'):
    """Score every KG pair of a dataset and correlate the scores with the gold ones.

    Args:
        ds: Iterable of KG pairs; each pair is passed to ``pair_sim``.
        true_scores: Gold scores, one per pair, in the same order as ``ds``.
        self_att, stop_words, punct_marks, beta, embed_model, mu: Forwarded
            unchanged to ``pair_sim``.

    Returns:
        Tuple ``(correlation, sims)``: the off-diagonal entry ``[0][1]`` of
        ``np.corrcoef`` between predicted and gold scores, and the predicted
        scores as a NumPy array.
    """
    predicted = [
        pair_sim(kg_pair, self_att=self_att, stop_words=stop_words,
                 punct_marks=punct_marks, beta=beta, embed_model=embed_model, mu=mu)
        for kg_pair in ds
    ]
    gold = np.array(true_scores)
    corr_mtrx = np.corrcoef(predicted, gold)
    return (corr_mtrx[0][1], np.array(predicted))

### Tests

In [89]:
# Test combinations: embedding = ('w2v', 'glove'); mu = (0, 1, 'ratio')
# Runs the whole pipeline on the def2def dataset and prints summary statistics
# of the gold scores, of the predicted similarities (multiplied by 5) and the
# correlation between both.
samples, scores = def2def_samples, def2def_gold_scores
corr, sims = ds_sents_sim(
    samples, scores,
    self_att=True, stop_words=True, punct_marks=True,
    beta=1.5, embed_model='w2v', mu=1,
)
print('True scores min, max, mean and std:',
      *(stat(scores) for stat in (np.min, np.max, np.mean, np.std)))
print('Sim scores min, max, mean and std:',
      *(stat(sims*5) for stat in (np.min, np.max, np.mean, np.std)))
# NOTE(review): the label below says MSRpar although the data evaluated here is
# def2def -- confirm and rename if it is a leftover from another experiment.
print('Corr. for MSRpar:', corr, '\n')

True scores min, max, mean and std: 0 50 25.029067824924823 12.310826756521202
Sim scores min, max, mean and std: 0.0 5.000000000000001 1.6063525950298876 0.8426754600232405
Corr. for MSRpar: 0.5140641569324236 



In [None]:
# Best results | This file: ok 20250318
# Bookkeeping of the best scores obtained so far. Each list holds one score and
# the trailing comment records the configuration that produced it
# (presumably the correlation printed by the test cell -- confirm).
# Abbreviations used below: "SRO ws" = subject/relation/object weights applied in
# fact_to_vector; "v_fact" = composition order of the triplet.
paper_ref =     [0.53]  # W2V + BEST STR + F.INF + COS
no_self_att =   [0.5114]  # s_words ok | sim_mtrx=bidir mean | SRO ws: 1.2 0.6 1.5 | mu=1 | v_fact=subj_rel·obj
# NOTE(review): the note below lists the relation weight as 0.4 while
# fact_to_vector multiplies the relation vector by .45 -- confirm which value
# produced this score.
self_att =      [0.5141]  # s_words ok | sim_mtrx=bidir mean | SRO ws: 1.1 0.4 1.6 | mu=1 | v_fact=subj_obj·rel | sa=no_sa