#### Some imports and datasets loading

In [1]:
import numpy as np
import pickle
from utils import utils_vectorize as uv

def _load_pickled_pair(path):
    """Open `path` in binary mode and return the object unpickled from it.

    Every dataset file read below is unpacked into two names (samples, scores).
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# MSR datasets
par_samples, par_scores = _load_pickled_pair('datasets/msrpar_samples.pkl')
vid_samples, vid_scores = _load_pickled_pair('datasets/msrvid_samples.pkl')
answer_samples, answer_scores = _load_pickled_pair('datasets/msranswer_samples.pkl')
# DEF2DEF in three flavors: original, adjusted, and the 250-sample adjusted subset
def2def_samples, def2def_scores = _load_pickled_pair('datasets/def2def_samples.pkl')
def2def_adj_samples, def2def_adj_scores = _load_pickled_pair('datasets/def2def_adjusted_samples.pkl')
def2def250_samples, def2def250_scores = _load_pickled_pair('datasets/def2def250_adjusted_samples.pkl')

#### Functions

In [6]:
# Combinations: composition = (sum | ICDS); sim = (cos | ICM | euclid | dot prod)
# Receives a fact (subject, relation, object triplet) and returns the embedding of the whole fact together with the subject, relation and object embeddings
def txtfact_to_vector(fact:tuple, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio',
                      weights=(1, 1, 3)):
    """Embed a text fact (subject, relation, object).

    Each element of the triplet is vectorized separately with
    ``uv.icds_vectorize`` and scaled by its weight; the three vectors are then
    merged with ``uv.icds_composition``: relation with object first, and that
    result with the subject.

    Parameters
    ----------
    fact : tuple
        Indexed as fact[0] (subject), fact[1] (relation), fact[2] (object).
    stop_words, punct_marks, embed_model, mu
        Forwarded unchanged to ``uv.icds_vectorize``.
    weights : sequence of three numbers, optional
        Multipliers applied to the subject, relation and object vectors, in
        that order. The default (1, 1, 3) reproduces the previously hard-coded
        scaling (only the object vector multiplied by 3). A weight equal to 1
        leaves the vector untouched.

    Returns
    -------
    tuple
        (v_fact, v_subj, v_rel, v_obj), where the element vectors are the
        weighted ones that went into the composition.

    Raises
    ------
    ValueError
        If `weights` does not unpack into exactly three values.
    """
    w_subj, w_rel, w_obj = weights

    def _embed(text, weight):
        # Vectorize one element of the triplet and apply its weight.
        vector = uv.icds_vectorize(text, stop_words=stop_words, punct_marks=punct_marks,
                                   embed_model=embed_model, mu=mu)
        # Skip the multiplication for unit weights so the vector is returned exactly as produced.
        return vector if weight == 1 else vector * weight

    # 1. One embedding per element of the triplet
    v_subj = _embed(fact[0], w_subj)
    v_rel = _embed(fact[1], w_rel)
    v_obj = _embed(fact[2], w_obj)
    # 2. Composition of the whole fact; this order was marked as the best one by the author
    v_fact = uv.icds_composition(uv.icds_composition(v_rel, v_obj), v_subj)
    return v_fact, v_subj, v_rel, v_obj

# Receives a complete knowledge graph (representative of a sentence) and returns a single vector
def txtkg_to_vector(txt_kg, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio', dim=300):
    """Collapse a knowledge graph (an iterable of text facts) into one vector.

    Every fact is embedded with ``txtfact_to_vector`` and the resulting fact
    vectors are added up, starting from a zero vector.

    Parameters
    ----------
    txt_kg : iterable
        The facts of the graph; each item is passed to ``txtfact_to_vector``.
    stop_words, punct_marks, embed_model, mu
        Forwarded unchanged to ``txtfact_to_vector``.
    dim : int, optional
        Length of the zero vector the sum starts from; it has to be compatible
        with the fact vectors. Defaults to 300, the value that used to be
        hard-coded.

    Returns
    -------
    numpy.ndarray
        Sum of the fact vectors; ``np.zeros(dim)`` when `txt_kg` is empty.
    """
    kg_vector = np.zeros(dim)
    for txt_fact in txt_kg:
        v_fact, _, _, _ = txtfact_to_vector(txt_fact, stop_words=stop_words, punct_marks=punct_marks,
                                            embed_model=embed_model, mu=mu)
        # Plain sum of fact vectors (alternative tried by the author: uv.icds_composition(kg_vector, v_fact))
        kg_vector = kg_vector + v_fact
    return kg_vector

# Receives a pair of knowledge graphs (two sentences) and returns a similarity measure between the two sentences
def txtpair_sim(kg_pair, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Similarity between the two knowledge graphs of a pair.

    Both graphs are turned into a single vector with ``txtkg_to_vector`` and
    compared with ``uv.cos_sim``; results below 0 are clipped to 0.

    Parameters
    ----------
    kg_pair : sequence
        kg_pair[0] and kg_pair[1] are the two knowledge graphs to compare.
    stop_words, punct_marks, embed_model, mu
        Forwarded unchanged to ``txtkg_to_vector`` for both graphs.

    Returns
    -------
    number
        ``max(0, uv.cos_sim(v0, v1))``.
    """
    kg0, kg1 = kg_pair[0], kg_pair[1]
    options = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    kg0_vector = txtkg_to_vector(kg0, **options)
    kg1_vector = txtkg_to_vector(kg1, **options)
    # Alternatives tried by the author instead of the clipped cos_sim:
    #   uv.icm_sim(kg0_vector, kg1_vector, beta=1.2)     # icm sim
    #   kg0_vector @ kg1_vector                          # dot sim
    #   np.linalg.norm(kg0_vector - kg1_vector)          # eucl sim
    return max(0, uv.cos_sim(kg0_vector, kg1_vector))

# Correlation between the computed similarities and the true scores over a dataset of KG pairs
def txt_kgs_sim(ds_txt_pairs, true_scores, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Score every KG pair of a dataset and correlate the scores with the gold ones.

    Parameters
    ----------
    ds_txt_pairs : iterable
        KG pairs; each one is passed to ``txtpair_sim``.
    true_scores : sequence of numbers
        Reference scores, aligned with `ds_txt_pairs`.
    stop_words, punct_marks, embed_model, mu
        Forwarded unchanged to ``txtpair_sim``.

    Returns
    -------
    tuple
        (correlation, sims): entry [0][1] of ``np.corrcoef`` between the
        computed similarities and `true_scores`, and the similarities as a
        NumPy array.
    """
    predicted = np.array([
        txtpair_sim(pair, stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
        for pair in ds_txt_pairs
    ])
    corr_matrix = np.corrcoef(predicted, np.array(true_scores))
    return corr_matrix[0][1], predicted

#### Individual baselines by facts (weights = (1., 1., 2.))

In [4]:
# Evaluation data: display names, KG-pair samples and gold scores, aligned by position
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def', 'def2def_adj', 'def2def250']
sets = [par_samples, vid_samples, answer_samples, def2def_samples, def2def_adj_samples, def2def250_samples]
true_scores = [par_scores, vid_scores, answer_scores, def2def_scores, def2def_adj_scores, def2def250_scores]
correlations = []

# Settings that can be varied here: embed_model in ('w2v', 'glove'); mu in (0, 1, 'ratio')
# NOTE(review): the section heading states an object weight of 2, whereas txtfact_to_vector
# scales the object vector by 3 unless changed - confirm the weight before re-running this cell.
for idx, (elem, scores) in enumerate(zip(sets, true_scores)):
    corr, sims = txt_kgs_sim(
        elem, scores,
        stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio',
    )
    correlations.append(corr)
    # Optional diagnostics (similarities are multiplied by 5 to compare them with the gold range):
    #print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for', names[idx]+':', corr, '\n')
print('\tCorrelations mean:', np.mean(np.array(correlations)))

#paper_refs = [0.42, 0.82, 0.52, 0.52] # -> mean = .57 | W2V + BEST STR + F.INF + COS
# This file: ok 20250318

Corr. for MSRpar: 0.4889874832385299 

Corr. for MSRvid: 0.8138064009323206 

Corr. for MSRanswer: 0.428785711078538 

Corr. for def2def: 0.49104606918558713 

Corr. for def2def_adj: 0.5534544019433109 

Corr. for def2def250: 0.5880439602268333 

	Correlations mean: 0.56068733776752


#### Unified baseline by facts: with original DEF2DEF (weights = (1., 1., 3.))

In [33]:
# Unified dataset: the three MSR sets followed by the original DEF2DEF.
# With the original DEF2DEF, 80% of the samples and 83% of the triplets come from DEF2DEF.
samples = (
    par_samples
    + vid_samples
    + answer_samples
    + def2def_samples
)
scores = (
    par_scores
    + vid_scores
    + answer_scores
    + [raw / 10 for raw in def2def_scores]   # def2def scores 0-50 -> 0-5
)
print('Unified dataset size:', len(samples))

Unified dataset size: 3747


In [34]:
# Correlation on the unified dataset, plus range statistics of gold and predicted scores.
# NOTE(review): this run uses mu=1 while the other unified runs in this notebook use mu='ratio' - confirm it is intended.
corr, sims = txt_kgs_sim(
    samples, scores,
    stop_words=True, punct_marks=True, embed_model='w2v', mu=1,
)

# Similarities are multiplied by 5 so they can be read against the 0-5 gold scores
print('True scores min, max, mean and std:', *(stat(scores) for stat in (np.min, np.max, np.mean, np.std)))
print('Sim scores min, max, mean and std:', *(stat(sims*5) for stat in (np.min, np.max, np.mean, np.std)))
print('Correlation:', corr)

True scores min, max, mean and std: 0.0 5.0 2.5230496397117688 1.2987121599071274
Sim scores min, max, mean and std: 0.0 5.0000000000000036 1.8913147753614241 1.1798341943056625
Correlation: 0.4363632343040948


#### Unified baseline by facts: with adjusted DEF2DEF

In [7]:
# Unified dataset: the three MSR sets followed by the adjusted DEF2DEF
samples = (
    par_samples
    + vid_samples
    + answer_samples
    + def2def_adj_samples
)
scores = (
    par_scores
    + vid_scores
    + answer_scores
    + [raw / 10 for raw in def2def_adj_scores]   # def2def scores 0-50 -> 0-5
)
print('Unified dataset size:', len(samples))

Unified dataset size: 3044


In [8]:
# Correlation on the unified dataset, plus range statistics of gold and predicted scores
corr, sims = txt_kgs_sim(
    samples, scores,
    stop_words=True, punct_marks=True, embed_model='w2v', mu='ratio',
)

# Similarities are multiplied by 5 so they can be read against the 0-5 gold scores
print('True scores min, max, mean and std:', *(stat(scores) for stat in (np.min, np.max, np.mean, np.std)))
print('Sim scores min, max, mean and std:', *(stat(sims*5) for stat in (np.min, np.max, np.mean, np.std)))
print('Correlation:', corr)

True scores min, max, mean and std: 0.0 5.0 2.4978209592641263 1.3166309708785562
Sim scores min, max, mean and std: 0.0 5.0000000000000036 2.025512038318277 1.224489238164242
Correlation: 0.47984201478817323


#### Unified baseline by facts: with adjusted DEF2DEF_250

In [37]:
# Unified dataset: the three MSR sets followed by the 250-sample adjusted DEF2DEF
samples = (
    par_samples
    + vid_samples
    + answer_samples
    + def2def250_samples
)
scores = (
    par_scores
    + vid_scores
    + answer_scores
    + [raw / 10 for raw in def2def250_scores]   # def2def scores 0-50 -> 0-5
)
print('Unified dataset size:', len(samples))

Unified dataset size: 1004


In [40]:
# Correlation on the unified dataset, plus range statistics of gold and predicted scores
corr, sims = txt_kgs_sim(
    samples, scores,
    stop_words=True, punct_marks=True, embed_model='w2v', mu='ratio',
)

# Similarities are multiplied by 5 so they can be read against the 0-5 gold scores
print('True scores min, max, mean and std:', *(stat(scores) for stat in (np.min, np.max, np.mean, np.std)))
print('Sim scores min, max, mean and std:', *(stat(sims*5) for stat in (np.min, np.max, np.mean, np.std)))
print('Correlation:', corr)

True scores min, max, mean and std: 0.0 5.0 2.550066733067729 1.4702609203245356
Sim scores min, max, mean and std: 0.0 5.0000000000000036 2.8584363034534332 1.4156776341649313
Correlation: 0.5405930536972031


#### Unified correlations summary

In [None]:
# This code: ok 20250615

# Individual baselines      # weights = 1., 1., 2. | sw = True
# Corr. for MSRpar: 0.4889874832385299 
# Corr. for MSRvid: 0.8138064009323206 
# Corr. for MSRanswer: 0.428785711078538 
# Corr. for def2def: 0.49104606918558713 
#       Mean four original datasets: 0.556
# Corr. for def2def_adj: 0.5534544019433109 
# Corr. for def2def250: 0.5880439602268333 

# Unified datasets          # weights = 1., 1., 3. | sw = True
# With original DEF2DEF = 0.436     
# With DEF2DEF adjusted = 0.480    
# With DEF2DEF_250 adj. = 0.541     