#### Imports and dataset loading

In [7]:
import numpy as np
import pickle
from scipy.special import softmax
from utils import utils_vectorize as uv

# Load the labeled datasets. Every pickle is unpacked into two objects: the samples and their gold scores.
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
# presumably each sample is a pair of sentences, each sentence a list of (subj, rel, obj) string triplets — TODO confirm
with open('datasets/msrpar_samples.pkl', 'rb') as f: 
    msrpar_samples, msrpar_gold_scores = pickle.load(f)
with open('datasets/msrvid_samples.pkl', 'rb') as f: 
    msrvid_samples, msrvid_gold_scores = pickle.load(f)
with open('datasets/msranswer_samples.pkl', 'rb') as f: 
    msranswer_samples, msranswer_gold_scores = pickle.load(f)
# DEF2DEF flavors: original, adjusted, and the 250-sample one
with open('datasets/def2def_samples.pkl', 'rb') as f: 
    def2def_samples, def2def_gold_scores = pickle.load(f)
with open('datasets/def2def_adjusted_samples.pkl', 'rb') as f: 
    def2def_adjusted_samples, def2def_adjusted_scores = pickle.load(f)
# Note: the file name carries "_adjusted" but the variable names below do not.
with open('datasets/def2def250_adjusted_samples.pkl', 'rb') as f: 
    def2def250_samples, def2def250_scores = pickle.load(f)

#### Model functions

In [None]:
# Returns the embedding of a single fact (triplet) together with the weighted embeddings of its three elements
def fact_to_vector(fact:tuple, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio', weights=(1., 1., 2.)):
    """Embed one fact and its three elements.

    Each of the first three items of `fact` is vectorized with `uv.icds_vectorize`
    and multiplied by the matching entry of `weights`. The fact embedding is then
    built as subj · (rel · obj) with `uv.icds_composition`.

    Args:
        fact: indexable with at least three items; positions 0, 1 and 2 are read
            (presumably subject, relation and object strings — TODO confirm).
        stop_words, punct_marks, embed_model, mu: forwarded unchanged to
            `uv.icds_vectorize`.
        weights: three multipliers applied to the subject, relation and object
            vectors, in that order.

    Returns:
        Tuple `(v_fact, v_subj, v_rel, v_obj)`; the element vectors are returned
        after weighting.
    """
    vectorize_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    # 1. A sequential composition into each element of triplet
    v_subj = uv.icds_vectorize(fact[0], **vectorize_opts) * weights[0]
    v_rel = uv.icds_vectorize(fact[1], **vectorize_opts) * weights[1]
    v_obj = uv.icds_vectorize(fact[2], **vectorize_opts) * weights[2]
    # 2. A full composition on whole triplet: subj · (rel · obj).
    #    Only the rel·obj intermediate is needed for this scheme. Alternatives that were tried:
    #    plain sum, (subj · rel) · obj and (subj · obj) · rel.
    v_rel_obj = uv.icds_composition(v_rel, v_obj)
    v_fact = uv.icds_composition(v_subj, v_rel_obj)
    return (v_fact, v_subj, v_rel, v_obj)

# Returns the self-attention context embeddings of a knowledge graph together with its plain fact embeddings
def kgtxt_to_selfatt_vectors(txt_kg, stop_words=False, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1., 1., 2.)):
    """Build context embeddings for every fact of a knowledge graph.

    Args:
        txt_kg: sequence of facts; each one is passed to `fact_to_vector`.
        stop_words, punct_marks, embed_model, mu, weights: forwarded unchanged to
            `fact_to_vector`.
        beta: kept for interface compatibility; it is not used by this function.

    Returns:
        Tuple `(self_att_mtrx, v_kg)`: `v_kg` is the array of fact embeddings and
        `self_att_mtrx` is `weight_mtrx @ v_kg`, where
        `weight_mtrx[i][j] = norms_ratio(i, j)**2 * uv.cos_sim(v_kg[i], v_kg[j])`.
        No softmax or other normalization is applied to the weights.
    """
    n_facts = len(txt_kg)
    v_kg = [
        fact_to_vector(txt_fact, stop_words=stop_words, punct_marks=punct_marks,
                       embed_model=embed_model, mu=mu, weights=weights)[0]
        for txt_fact in txt_kg
    ]
    # Norms are computed once per fact (not once per pair) and floored to avoid a division by zero.
    norms = [np.max([1.e-125, np.linalg.norm(v_fact)]) for v_fact in v_kg]
    # 1. Compute fact-wise similarity (self-attention scores)
    weight_mtrx = np.empty((n_facts, n_facts))
    for idx in range(n_facts):
        norm0 = norms[idx]
        for jdx in range(n_facts):
            norm1 = norms[jdx]
            # Ratio of the smaller to the larger norm, floored so it is never zero.
            norms_ratio = max(1.e-125, min(norm0, norm1)/max(norm0, norm1))
            weight_mtrx[idx][jdx] = (norms_ratio**2) * uv.cos_sim(v_kg[idx], v_kg[jdx])
    # 2. The raw scores are used as attention weights (softmax / tanh / sigmoid variants were tried and dropped)
    # 3. Context vectors (new contextual embeddings)
    v_kg_arr = np.array(v_kg)
    self_att_mtrx = weight_mtrx @ v_kg_arr
    return (self_att_mtrx, v_kg_arr)

In [3]:
# Computes similarity between two sentences expressed as knowledge graphs; uses self-attention if requested
def pair_sim(kg_pair, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio', weights=(1., 1., 2.)):      # List of tuples of 3 strings
    """Similarity of the two knowledge graphs in `kg_pair`.

    Every fact of both graphs is embedded once. The fact-by-fact matrix of
    `max(0, uv.cos_sim(...))` values is then reduced with
    `uv.bidir_avgmax_sim(sim_mtrx, stdst='mean')`.

    Args:
        kg_pair: indexable whose items 0 and 1 are the two knowledge graphs
            (sequences of facts).
        self_att: when true, facts are represented by the context vectors from
            `kgtxt_to_selfatt_vectors`; otherwise by the plain fact vectors from
            `fact_to_vector`.
        stop_words, punct_marks, embed_model, mu, weights: forwarded unchanged to
            the embedding function in use.
        beta: only forwarded to `kgtxt_to_selfatt_vectors`.

    Returns:
        Whatever `uv.bidir_avgmax_sim` returns for the similarity matrix.
    """
    kg0 = kg_pair[0]
    kg1 = kg_pair[1]
    embed_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu, weights=weights)
    if self_att:
        vectors0, _ = kgtxt_to_selfatt_vectors(kg0, beta=beta, **embed_opts)
        vectors1, _ = kgtxt_to_selfatt_vectors(kg1, beta=beta, **embed_opts)
    else:           # No self-attention
        # Each fact is vectorized exactly once, instead of re-embedding all of kg1 for every fact of kg0.
        vectors0 = [fact_to_vector(fact0, **embed_opts)[0] for fact0 in kg0]
        vectors1 = [fact_to_vector(fact1, **embed_opts)[0] for fact1 in kg1]
    n_facts_kg0 = len(kg0)
    n_facts_kg1 = len(kg1)
    sim_mtrx = np.empty((n_facts_kg0, n_facts_kg1))
    for idx in range(n_facts_kg0):
        for jdx in range(n_facts_kg1):
            # Negative cosine values are clamped to 0.
            sim_mtrx[idx][jdx] = max(0, uv.cos_sim(vectors0[idx], vectors1[jdx]))
    sents_sim = uv.bidir_avgmax_sim(sim_mtrx, stdst='mean')
    return sents_sim

# Scores every pair of a dataset and returns the correlation between predicted and true similarities
def ds_sents_sim(ds, true_scores, self_att=True, stop_words=True, punct_marks=False, 
                 beta=1.2, embed_model='w2v', mu='ratio', fact_elems_weights=(1., 1., 2.)):
    """Evaluate the model on one labeled dataset.

    Args:
        ds: iterable of pairs; each pair is passed to `pair_sim`.
        true_scores: gold scores, one per pair of `ds`.
        self_att, stop_words, punct_marks, beta, embed_model, mu: forwarded
            unchanged to `pair_sim`.
        fact_elems_weights: forwarded to `pair_sim` as its `weights` argument.

    Returns:
        Tuple `(correlation, sims)`: the off-diagonal entry of
        `np.corrcoef(predicted, true_scores)` and the predicted similarities as
        an array.
    """
    pair_sim_opts = dict(self_att=self_att, stop_words=stop_words, punct_marks=punct_marks,
                         beta=beta, embed_model=embed_model, mu=mu, weights=fact_elems_weights)
    predicted = [pair_sim(kg_pair, **pair_sim_opts) for kg_pair in ds]
    corr_mtrx = np.corrcoef(predicted, np.array(true_scores))
    return (corr_mtrx[0][1], np.array(predicted))

#### Tests

With original DEF2DEF

In [None]:
# Main: evaluates the model on every labeled dataset (original DEF2DEF) and prints, per dataset,
# the statistics of the gold and predicted scores plus their correlation, then the mean correlation
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def']
sets = [msrpar_samples, msrvid_samples, msranswer_samples, def2def_samples]
true_scores = [msrpar_gold_scores, msrvid_gold_scores, msranswer_gold_scores, def2def_gold_scores]
correlations = []

for idx, (elem, scores) in enumerate(zip(sets, true_scores)):
    corr, sims = ds_sents_sim(
        elem, scores,
        self_att=True, stop_words=False, punct_marks=False,
        beta=1.5, embed_model='w2v', mu=1, fact_elems_weights=(1.0, 1.0, 1.9),
    )
    correlations.append(corr)
    # Predicted similarities are multiplied by 5 for display only; the correlation uses the raw values.
    scaled_sims = sims*5
    print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    print('Sim scores min, max, mean and std:', np.min(scaled_sims), np.max(scaled_sims), np.mean(scaled_sims), np.std(scaled_sims))
    print('Corr. for', names[idx]+':', corr, '\n')
print('Correlations mean:', np.mean(np.array(correlations)))

True scores min, max, mean and std: 0.5 5.0 3.21282 0.8924830192222147
Sim scores min, max, mean and std: 0.4078647181677021 4.986520309295598 3.9676966419359867 0.8177732215403202
Corr. for MSRpar: 0.5824865187312448 

True scores min, max, mean and std: 0.0 5.0 2.105848 1.6065765144853823
Sim scores min, max, mean and std: 0.8359113421734456 5.0 3.341124561591591 1.0528415365016415
Corr. for MSRvid: 0.8152805306709233 

True scores min, max, mean and std: 0.0 5.0 2.4921259842519685 1.7473086751273263
Sim scores min, max, mean and std: 1.4490575243077282 5.000000000000001 4.100845519618307 0.7114536349511864
Corr. for MSRanswer: 0.49090158401663037 

True scores min, max, mean and std: 0 50 25.029067824924823 12.310826756521202
Sim scores min, max, mean and std: 0.0 5.0 1.9746216313983886 0.7557850736923761
Corr. for def2def: 0.48004830664207027 

Correlations mean: 0.5921792350152172


In [None]:
# This file: ok 20250317
# Hand-recorded results (not computed here). Each ALL_* list holds one mean correlation over the four datasets.
# The 4-item lists are presumably ordered as MSRpar, MSRvid, MSRanswer, def2def — TODO confirm.
ALL_no_self_att = [.613]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.0 1.0 1.9 | mu=1 | v_fact=subj·rel_obj
# NOTE(review): this note says SRO weights 1.0 1.0 2.2, while the run cell above passes (1.0, 1.0, 1.9) — confirm which applies.
ALL_self_att    = [.593]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.0 1.0 2.2 | mu=1 | v_fact=subj·rel_obj      

# Reference and baseline correlations to compare against
paper_refs =       [0.42, 0.82, 0.52, 0.53] # -> mean = 0.57 | W2V + BEST STR + F.INF + COS
bline_sents =      [0.44, 0.77, 0.46, 0.48] # -> mean = 0.54
bline_facts =      [0.51, 0.82, 0.42, 0.49] # -> mean = 0.56
dsets_best_corrs = [0.63, 0.83, 0.59, 0.51] # -> mean = 0.64  
# This file: best results (no self-att) resume for datasets:
model_mean_corrs = [0.61, 0.81, 0.55, 0.48] # -> mean = 0.61

With adjusted DEF2DEF

In [9]:
# Main: evaluates the model on every labeled dataset (adjusted DEF2DEF) and prints, per dataset,
# the statistics of the gold and predicted scores plus their correlation, then the mean correlation
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def']
sets = [msrpar_samples, msrvid_samples, msranswer_samples, def2def_adjusted_samples]
true_scores = [msrpar_gold_scores, msrvid_gold_scores, msranswer_gold_scores, def2def_adjusted_scores]
correlations = []

for idx, (elem, scores) in enumerate(zip(sets, true_scores)):
    corr, sims = ds_sents_sim(
        elem, scores,
        self_att=True, stop_words=False, punct_marks=False,
        beta=1.5, embed_model='w2v', mu=1, fact_elems_weights=(1.0, 1.0, 2.0),
    )
    correlations.append(corr)
    # Predicted similarities are multiplied by 5 for display only; the correlation uses the raw values.
    scaled_sims = sims*5
    print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    print('Sim scores min, max, mean and std:', np.min(scaled_sims), np.max(scaled_sims), np.mean(scaled_sims), np.std(scaled_sims))
    print('Corr. for', names[idx]+':', corr, '\n')
print('Correlations mean:', np.mean(np.array(correlations)))

True scores min, max, mean and std: 0.5 5.0 3.21282 0.8924830192222147
Sim scores min, max, mean and std: 0.4263387526907665 4.987039938268274 3.9623629223121317 0.818220326385093
Corr. for MSRpar: 0.5856849078833871 

True scores min, max, mean and std: 0.0 5.0 2.105848 1.6065765144853823
Sim scores min, max, mean and std: 0.7821638518421423 5.0 3.319480636813137 1.0752708342034265
Corr. for MSRvid: 0.8145607140596144 

True scores min, max, mean and std: 0.0 5.0 2.4921259842519685 1.7473086751273263
Sim scores min, max, mean and std: 1.4180754665802828 5.0 4.083602520840214 0.7291004683030337
Corr. for MSRanswer: 0.48891924050713587 

True scores min, max, mean and std: 0 50 24.631877729257642 12.341513586651518
Sim scores min, max, mean and std: 0.08239538384240767 5.000000000000001 2.0318795664987066 0.7659579782203524
Corr. for def2def: 0.5554186043373748 

Correlations mean: 0.6111458666968781


In [None]:
# This file with new DEF2DEF adjusted subset: ok 20250331
# Hand-recorded results (not computed here). `corrs` is assigned twice below, so only the
# self-attention values remain bound after this cell runs.
ALL_no_self_att = [.630]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.0 1.0 1.6 | mu=1 | v_fact=subj·rel_obj
corrs = [0.60, 0.81, 0.56, 0.55] # -> mean = 0.63

# NOTE(review): this note says SRO weights 1.1 1.0 2.0, while the run cell above passes (1.0, 1.0, 2.0) — confirm which applies.
ALL_self_att    = [.611]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.1 1.0 2.0 | mu=1 | v_fact=subj·rel_obj 
corrs = [0.59, 0.81, 0.49, 0.55] # -> mean = 0.61

With DEF2DEF_250

In [14]:
# Main: evaluates the model on every labeled dataset (DEF2DEF_250) and prints, per dataset,
# the statistics of the gold and predicted scores plus their correlation, then the mean correlation
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def']
sets = [msrpar_samples, msrvid_samples, msranswer_samples, def2def250_samples]
true_scores = [msrpar_gold_scores, msrvid_gold_scores, msranswer_gold_scores, def2def250_scores]
correlations = []

for idx, (elem, scores) in enumerate(zip(sets, true_scores)):
    corr, sims = ds_sents_sim(
        elem, scores,
        self_att=True, stop_words=False, punct_marks=False,
        beta=1.5, embed_model='w2v', mu=1, fact_elems_weights=(1.1, 1.0, 2.1),
    )
    correlations.append(corr)
    # Predicted similarities are multiplied by 5 for display only; the correlation uses the raw values.
    scaled_sims = sims*5
    print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    print('Sim scores min, max, mean and std:', np.min(scaled_sims), np.max(scaled_sims), np.mean(scaled_sims), np.std(scaled_sims))
    print('Corr. for', names[idx]+':', corr, '\n')
print('Correlations mean:', np.mean(np.array(correlations)))

True scores min, max, mean and std: 0.5 5.0 3.21282 0.8924830192222147
Sim scores min, max, mean and std: 0.3953972235004608 4.986428620129196 3.9610215694670896 0.8213104102539583
Corr. for MSRpar: 0.5840683476095786 

True scores min, max, mean and std: 0.0 5.0 2.105848 1.6065765144853823
Sim scores min, max, mean and std: 0.8162183826021784 5.000000000000001 3.329364357979125 1.0738371942397933
Corr. for MSRvid: 0.8135236249961558 

True scores min, max, mean and std: 0.0 5.0 2.4921259842519685 1.7473086751273263
Sim scores min, max, mean and std: 1.3849142672543637 5.0 4.080324144879368 0.7333532304696779
Corr. for MSRanswer: 0.4873022171067613 

True scores min, max, mean and std: 1 49 23.904 12.394465861827205
Sim scores min, max, mean and std: 0.40222865687403736 5.0 1.983367053435215 0.761786920221834
Corr. for def2def: 0.5775524074900298 

Correlations mean: 0.6156116493006314


In [None]:
# This file with new DEF2DEF adjusted subset and _250_ samples: ok 20250331
# Hand-recorded results (not computed here). `corrs` is assigned twice below, so only the
# self-attention values remain bound after this cell runs.
ALL_no_self_att = [.638]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 0.9 1.0 1.8 | mu=1 | v_fact=subj·rel_obj
corrs = [0.61, 0.81, 0.56, 0.57] # -> mean = 0.64

ALL_self_att    = [.616]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.1 1.0 2.1 | mu=1 | v_fact=subj·rel_obj 
corrs = [0.58, 0.81, 0.49, 0.58] # -> mean = 0.62

#### Tests summary

In [None]:
# Summary of the hand-recorded results of the three experiments (nothing is computed here).
# ALL_no_self_att, ALL_self_att and the *_corrs names are re-assigned in each section, so only
# the values of the last section remain bound after this cell runs.
# The 4-item lists are presumably ordered as MSRpar, MSRvid, MSRanswer, def2def — TODO confirm.

# With original DEF2DEF     | This file: ok 20250616
ALL_no_self_att = [.613]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.0 1.0 1.9 | mu=1 | v_fact=subj·rel_obj
model_mean_corrs = [0.61, 0.81, 0.55, 0.48] # -> mean = 0.61
ALL_self_att    = [.593]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.0 1.0 2.2 | mu=1 | v_fact=subj·rel_obj      
self_att_corrs = [0.58, 0.82, 0.49, 0.48] # -> mean = 0.59

# With DEF2DEF adjusted     | ok 20250616
ALL_no_self_att = [.630]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.0 1.0 1.6 | mu=1 | v_fact=subj·rel_obj
no_self_att_corrs = [0.60, 0.81, 0.56, 0.55] # -> mean = 0.63
ALL_self_att    = [.611]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.1 1.0 2.0 | mu=1 | v_fact=subj·rel_obj 
self_att_corrs = [0.59, 0.81, 0.49, 0.55] # -> mean = 0.61

# With new DEF2DEF _250     | ok 20250616
ALL_no_self_att = [.638]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 0.9 1.0 1.8 | mu=1 | v_fact=subj·rel_obj
no_self_att_corrs = [0.61, 0.81, 0.56, 0.57] # -> mean = 0.64
ALL_self_att    = [.616]  # s_words=false | sim_mtrx=bidir mean | SRO weights: 1.1 1.0 2.1 | mu=1 | v_fact=subj·rel_obj 
self_att_corrs = [0.58, 0.81, 0.49, 0.58] # -> mean = 0.62

#### Some statistics from original datasets

In [6]:
# Per-dataset statistics: number of sentence pairs, triplets, words, distinct words and
# average triplets per sentence.
# The datasets are listed explicitly so the result does not depend on which "Main" cell last
# assigned the global `sets` (after a Restart & Run All that would be the DEF2DEF_250 run).
# NOTE(review): the section title says "original datasets", so the original DEF2DEF is used here;
# swap in def2def_adjusted_samples / def2def250_samples if another flavor was intended.
stats_sets = [msrpar_samples, msrvid_samples, msranswer_samples, def2def_samples]

pairs = []
triplets = []
words = []
vocab = []
triplets_x_sent = []

for elem in stats_sets:
    n_pairs = 0
    n_triplets = 0
    n_words = 0
    unique_words = set()
    for pair in elem:
        n_pairs += 1
        for sentence in pair:
            for triplet in sentence:
                # Words are the whitespace-separated tokens of the joined triplet elements.
                utterance = ' '.join(triplet).split()
                n_words += len(utterance)
                unique_words.update(utterance)
                n_triplets += 1
    pairs.append(n_pairs)
    triplets.append(n_triplets)
    words.append(n_words)
    vocab.append(len(unique_words))
    # Two sentences per pair; an empty dataset reports 0.0 instead of dividing by zero.
    n_sentences = n_pairs*2
    triplets_x_sent.append(float('%.2f' %(n_triplets / n_sentences)) if n_sentences else 0.0)

print('Pairs of sentences:\n\t', pairs)
print('Total pairs:\n\t', sum(pairs))
print('Triplets:\n\t', triplets)
print('Total triplets:\n\t', sum(triplets))
print('Triplets by sent:\n\t', triplets_x_sent)
print('Words:\n\t', words)
print('Vocab (unique words):\n\t', vocab)

Pairs of sentences:
	 [250, 250, 254, 2993]
Total pairs:
	 3747
Triplets:
	 [2402, 657, 998, 19770]
Total triplets:
	 23827
Triplets by sent:
	 [4.8, 1.31, 1.96, 3.3]
Words:
	 [13486, 2872, 5263, 81556]
Vocab (unique words):
	 [2724, 565, 618, 2393]
