#### Imports and dataset loading

In [None]:
import numpy as np
import pickle
from scipy.special import softmax
from utils import utils_vectorize as uv

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file,
    so only point this at dataset files you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Each pickle is unpacked into a (samples, scores) pair.
par_samples, par_scores = _load_pickle('datasets/msrpar_samples.pkl')
vid_samples, vid_scores = _load_pickle('datasets/msrvid_samples.pkl')
answer_samples, answer_scores = _load_pickle('datasets/msranswer_samples.pkl')

# DEF2DEF flavors
def2def_samples, def2def_scores = _load_pickle('datasets/def2def_samples.pkl')
def2def_adjusted_samples, def2def_adjusted_scores = _load_pickle('datasets/def2def_adjusted_samples.pkl')
# The 250 variant is read from the "adjusted" file even though the variable names do not say so.
def2def250_samples, def2def250_scores = _load_pickle('datasets/def2def250_adjusted_samples.pkl')

#### Model functions

In [147]:
# Gives the embedding of a single fact (subject, relation, object triplet) of a knowledge graph
def fact_to_vector(fact:tuple, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 2.0)):
    """Embed one knowledge-graph fact together with its three elements.

    Args:
        fact: Indexable with at least three items, read as (subject, relation,
            object); each item is passed to ``uv.icds_vectorize``.
        stop_words, punct_marks, embed_model, mu: Forwarded unchanged to
            ``uv.icds_vectorize`` for every element of the triplet.
        weights: Three multipliers applied to the subject, relation and object
            vectors (in that order) before they are composed.

    Returns:
        Tuple ``(v_fact, v_subj, v_rel, v_obj)``: the composed fact vector
        followed by the three weighted element vectors.
    """
    vectorize_kwargs = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    # 1. A sequential composition into each element of the triplet, then scaled by its weight
    v_subj = uv.icds_vectorize(fact[0], **vectorize_kwargs) * weights[0]
    v_rel = uv.icds_vectorize(fact[1], **vectorize_kwargs) * weights[1]
    v_obj = uv.icds_vectorize(fact[2], **vectorize_kwargs) * weights[2]
    # 2. A full composition on the whole triplet: (subject . relation) . object.
    #    Only the composition that is returned is computed. Alternatives that were
    #    tried and are currently disabled:
    #      v_subj + v_rel + v_obj
    #      uv.icds_composition(v_subj, uv.icds_composition(v_rel, v_obj))
    #      uv.icds_composition(uv.icds_composition(v_subj, v_obj), v_rel)
    v_subj_rel = uv.icds_composition(v_subj, v_rel)
    v_fact = uv.icds_composition(v_subj_rel, v_obj)
    return v_fact, v_subj, v_rel, v_obj

# Returns new, context embeddings of the facts of a knowledge graph, computed with self-attention
def kgtxt_to_selfatt_vectors(txt_kg, stop_words=False, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 2.0)):
    """Build self-attention context vectors for every fact of a knowledge graph.

    Args:
        txt_kg: Sequence of facts; each one is passed to ``fact_to_vector``.
        stop_words, punct_marks, embed_model, mu, weights: Forwarded unchanged
            to ``fact_to_vector``.
        beta: Accepted for interface compatibility; not used by the current
            implementation.

    Returns:
        Tuple ``(self_att_mtrx, v_kg)``. ``v_kg`` is the array of fact vectors
        (one row per fact) and ``self_att_mtrx`` is the attention-weight matrix
        multiplied by ``v_kg``, i.e. one context vector per fact.
    """
    n_facts = len(txt_kg)
    v_kg = [
        fact_to_vector(txt_fact, stop_words=stop_words, punct_marks=punct_marks,
                       embed_model=embed_model, mu=mu, weights=weights)[0]
        for txt_fact in txt_kg
    ]
    # Norms depend on a single fact only, so compute them once instead of inside
    # the double loop. The 1e-125 floor keeps the ratio below from dividing by zero.
    norms = [np.max([1.e-125, np.linalg.norm(v_fact)]) for v_fact in v_kg]
    # 1. Compute fact-wise similarity (self-attention scores)
    weight_mtrx = np.empty((n_facts, n_facts))
    for idx in range(n_facts):
        norm0 = norms[idx]
        for jdx in range(n_facts):
            norm1 = norms[jdx]
            # Ratio of the smaller to the larger norm, floored at 1e-125
            norms_ratio = max(1.e-125, np.min([norm0, norm1]) / np.max([norm0, norm1]))
            # Score = squared norm ratio * cosine similarity.
            # Disabled alternative (joint information content, IC(x,y) = IC(x) + IC(y) - <x,y>):
            #   norm0**2 + norm1**2 - v_kg[idx] @ v_kg[jdx]
            weight_mtrx[idx][jdx] = (norms_ratio**2) * uv.cos_sim(v_kg[idx], v_kg[jdx])
    # 2. Softmax/normalizing (self-att weights): currently the raw scores are used as is.
    #    Disabled alternatives: softmax(weight_mtrx, axis=1), np.tanh, uv.sigmoid
    norm_weight_mtrx = weight_mtrx
    # 3. Context vectors (new contextual embeddings)
    v_kg_mtrx = np.array(v_kg)
    self_att_mtrx = norm_weight_mtrx @ v_kg_mtrx
    return self_att_mtrx, np.array(v_kg)

In [86]:
# Computes similarity between two sentences expressed as knowledge graphs; uses self-attention if requested
def pair_sim(kg_pair, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 2.0)):
    """Score the similarity of two knowledge graphs.

    Args:
        kg_pair: Indexable whose items 0 and 1 are the two knowledge graphs.
            Each graph is a sequence of facts (presumably 3-string tuples, as
            consumed by ``fact_to_vector``).
        self_att: If true, compare the context vectors returned by
            ``kgtxt_to_selfatt_vectors``; otherwise compare the plain fact
            vectors from ``fact_to_vector``.
        stop_words, punct_marks, embed_model, mu, weights: Forwarded to the
            vectorizing function for BOTH graphs.
        beta: Forwarded to ``kgtxt_to_selfatt_vectors`` (self-attention only).

    Returns:
        Whatever ``uv.bidir_avgmax_sim(sim_mtrx, stdst='mean')`` returns for the
        matrix of pairwise fact similarities (negative cosines clipped to 0).
    """
    kg0 = kg_pair[0]
    kg1 = kg_pair[1]
    shared_kwargs = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu, weights=weights)
    if self_att:
        vectors0, _ = kgtxt_to_selfatt_vectors(kg0, beta=beta, **shared_kwargs)
        vectors1, _ = kgtxt_to_selfatt_vectors(kg1, beta=beta, **shared_kwargs)
    else:
        # Both graphs must be embedded with the same weights. Previously the second
        # graph silently fell back to the default weights. Each fact is vectorized
        # once, not once per pair.
        vectors0 = [fact_to_vector(fact0, **shared_kwargs)[0] for fact0 in kg0]
        vectors1 = [fact_to_vector(fact1, **shared_kwargs)[0] for fact1 in kg1]
    sim_mtrx = np.empty((len(kg0), len(kg1)))
    for idx in range(len(kg0)):
        for jdx in range(len(kg1)):
            # Negative cosine similarities are clipped to 0
            sim_mtrx[idx][jdx] = max(0, uv.cos_sim(vectors0[idx], vectors1[jdx]))
    # Disabled alternatives for the aggregation: np.mean(sim_mtrx), uv.bertscore(sim_mtrx)
    sents_sim = uv.bidir_avgmax_sim(sim_mtrx, stdst='mean')
    return sents_sim

# Scores every pair of a dataset and correlates the predicted similarities with the true ones
def ds_sents_sim(ds, true_scores, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1.0, 1.0, 2.0)):
    """Evaluate ``pair_sim`` over a whole labeled dataset.

    Args:
        ds: Iterable of items, each passed as-is to ``pair_sim``.
        true_scores: Reference scores, one per item of ``ds``.
        self_att, stop_words, punct_marks, beta, embed_model, mu, weights:
            Forwarded unchanged to ``pair_sim``.

    Returns:
        Tuple ``(correlation, sims)``: the off-diagonal entry of
        ``np.corrcoef`` between predicted and true scores, and the predicted
        similarities as a numpy array.
    """
    predicted = [
        pair_sim(
            sample,
            self_att=self_att,
            stop_words=stop_words,
            punct_marks=punct_marks,
            beta=beta,
            embed_model=embed_model,
            mu=mu,
            weights=weights,
        )
        for sample in ds
    ]
    corr_mtrx = np.corrcoef(predicted, np.array(true_scores))
    return corr_mtrx[0][1], np.array(predicted)

#### Tests

With original DEF2DEF

In [64]:
# New unified datasets: 80% of samples and 83% of triplets are from DEF2DEF
# Merge the MSRpar, MSRvid and MSRanswer data with the original DEF2DEF data into one evaluation set.
# Assumes every *_samples / *_scores object is a plain list, so `+` concatenates — TODO confirm against the pickles.
samples = par_samples + vid_samples + answer_samples + def2def_samples
# DEF2DEF scores are divided by 10 so they share the scale of the other score lists.
scores = par_scores + vid_scores + answer_scores + [score/10 for score in def2def_scores]   # def2def scores 0-50 -> 0-5

In [65]:
# Main: runs the similarity model over the merged dataset and reports the correlation plus summary statistics
corr, sims = ds_sents_sim(
    samples,
    scores,
    self_att=True,
    stop_words=True,
    punct_marks=False,
    beta=1.5,
    embed_model='w2v',
    mu=1,
    weights=(.8, .8, 2.1),
)

# Predicted similarities are multiplied by 5 for display next to the true scores
scaled_sims = sims * 5
print('True scores min, max, mean and std:', np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
print('Sim scores min, max, mean and std:', np.min(scaled_sims), np.max(scaled_sims), np.mean(scaled_sims), np.std(scaled_sims))
print('Correlation:', corr)

# This file: ok 20250615
# All datasets correlations
#   0.423 No self-att 1.0 1.0 2.1
#   0.449 Self-att    0.8 0.8 2.1 s_words=True mu=ratio
# NOTE(review): the log line says mu=ratio but the call above passes mu=1 — confirm which setting produced 0.449.

True scores min, max, mean and std: 0.0 5.0 2.5230496397117688 1.2987121599071274
Sim scores min, max, mean and std: 0.0 5.000000000000001 1.9410112279557226 1.1643903815883945
Correlation: 0.4486059682012903


With adjusted DEF2DEF

In [67]:
# Merge the MSRpar, MSRvid and MSRanswer data with the adjusted DEF2DEF data into one evaluation set.
# Assumes every *_samples / *_scores object is a plain list, so `+` concatenates — TODO confirm against the pickles.
# Rebinds `samples` and `scores`, replacing whatever an earlier cell stored in them.
samples = par_samples + vid_samples + answer_samples + def2def_adjusted_samples
# DEF2DEF scores are divided by 10 so they share the scale of the other score lists.
scores = par_scores + vid_scores + answer_scores + [score/10 for score in def2def_adjusted_scores]   # def2def scores 0-50 -> 0-5

In [69]:
# Main: runs the similarity model over the merged dataset and reports the correlation plus summary statistics
corr, sims = ds_sents_sim(
    samples,
    scores,
    self_att=True,
    stop_words=True,
    punct_marks=False,
    beta=1.5,
    embed_model='w2v',
    mu=1,
    weights=(.8, .8, 2.1),
)

# Predicted similarities are multiplied by 5 for display next to the true scores
scaled_sims = sims * 5
print('True scores min, max, mean and std:', np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
print('Sim scores min, max, mean and std:', np.min(scaled_sims), np.max(scaled_sims), np.mean(scaled_sims), np.std(scaled_sims))
print('Correlation:', corr)

# This file: ok 20250615
# All datasets correlations
#   0.468 No self-att 1.0 1.0 2.1
#   0.496 Self-att    0.8 0.8 2.1 s_words=True mu=ratio
# NOTE(review): the log line says mu=ratio but the call above passes mu=1 — confirm which setting produced 0.496.

True scores min, max, mean and std: 0.0 5.0 2.4978209592641263 1.3166309708785562
Sim scores min, max, mean and std: 0.0 5.000000000000001 2.0787000873651333 1.2043549524052133
Correlation: 0.49572963089479344


With DEF2DEF_250

In [148]:
# Merge the MSRpar, MSRvid and MSRanswer data with the DEF2DEF_250 data into one evaluation set.
# Assumes every *_samples / *_scores object is a plain list, so `+` concatenates — TODO confirm against the pickles.
# Rebinds `samples` and `scores`, replacing whatever an earlier cell stored in them.
samples = par_samples + vid_samples + answer_samples + def2def250_samples
# DEF2DEF scores are divided by 10 so they share the scale of the other score lists.
scores = par_scores + vid_scores + answer_scores + [score/10 for score in def2def250_scores]   # def2def scores 0-50 -> 0-5

In [151]:
# Main: runs the similarity model over the merged dataset and reports the correlation plus summary statistics
corr, sims = ds_sents_sim(
    samples,
    scores,
    self_att=True,
    stop_words=True,
    punct_marks=False,
    beta=1.5,
    embed_model='w2v',
    mu=1,
    weights=(.7, 1.5, 3.7),
)

# Predicted similarities are multiplied by 5 for display next to the true scores
scaled_sims = sims * 5
print('True scores min, max, mean and std:', np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
print('Sim scores min, max, mean and std:', np.min(scaled_sims), np.max(scaled_sims), np.mean(scaled_sims), np.std(scaled_sims))
print('Correlation:', corr)

# This file: ok 20250615
# All datasets correlations
#   0.559 No self-att 0.4 1.6 3.9
#   0.568 Self-att    0.7 1.5 3.7 s_words=True mu=ratio
# NOTE(review): the log line says mu=ratio but the call above passes mu=1 — confirm which setting produced 0.568.

True scores min, max, mean and std: 0.0 5.0 2.550066733067729 1.4702609203245356
Sim scores min, max, mean and std: 0.0 5.000000000000001 2.8312302216626666 1.4010997353108443
Correlation: 0.5679214681146413


#### Tests summary

In [None]:
# Record of the best correlations obtained for each dataset mix, with the settings noted inline.
# NOTE: every group below rebinds the same two names (`no_self_att`, `self_att`), so after this
# cell runs only the last group's values remain; the earlier ones survive only as text here.
# This file: ok 20250331    
# NOTE(review): the test cells in this notebook are stamped 20250615 — confirm which date is current.
# All datasets with original DEF2DEF
no_self_att = [0.423]   # sw=True | mu=1 | sents_sim=bidir | SROweights=1.0 1.0 2.1 | v_fact=s_r·o
self_att    = [0.449]   # sw=True | mu=1 | sents_sim=bidir | SROweights=0.8 0.8 2.1 | v_fact=s_r·o 

# All datasets with DEF2DEF adjusted
no_self_att = [0.468]   # sw=True | mu=1 | sents_sim=bidir | SROweights=1.0 1.0 2.1 | v_fact=s_r·o
self_att    = [0.496]   # sw=True | mu=1 | sents_sim=bidir | SROweights=0.8 0.8 2.1 | v_fact=s_r·o 

# All datasets with DEF2DEF_250 adjusted 
# NOTE(review): these two rows say sents_sim=bertscore, but pair_sim as written in this notebook
# calls uv.bidir_avgmax_sim (the bertscore line is commented out) — confirm how they were produced.
no_self_att = [0.559]   # sw=True | mu=0 | sents_sim=bertscore | SROweights=0.4 1.6 3.9 | v_fact=s_r·o
self_att    = [0.568]   # sw=True | mu=1 | sents_sim=bertscore | SROweights=0.7 1.5 3.7 | v_fact=s_r·o