#### Some imports and datasets loading

In [1]:
import numpy as np
import pickle
from utils import utils_vectorize as uv

# Each pickle file holds a two-element payload that is unpacked as (samples, gold_scores).
# NOTE: pickle can execute arbitrary code while loading; only read trusted dataset files.
def _read_samples(path):
    """Unpickle the (samples, gold_scores) pair stored at `path`."""
    with open(path, 'rb') as handle:
        samples, gold_scores = pickle.load(handle)
    return samples, gold_scores

par_samples, par_gold_scores = _read_samples('datasets/msrpar_samples.pkl')
vid_samples, vid_gold_scores = _read_samples('datasets/msrvid_samples.pkl')
answer_samples, answer_gold_scores = _read_samples('datasets/msranswer_samples.pkl')
def2def_samples, def2def_gold_scores = _read_samples('datasets/def2def_samples.pkl')

#### Functions

In [None]:
# Combinations: composition = (sum | ICDS); sim = (cos | ICM | euclid | dot prod)
# Receives a fact/triplet and returns a representative embedding, including subject, relation and object embeddings
def txtfact_to_vector(fact:tuple, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio'):
    # 1. A sequential composition into each element of triplet
    #v_subj = uv.sum_vectorize(fact[0], stop_words=stop_words, punct_marks=punct_marks) * .9 
    #v_rel = uv.sum_vectorize(fact[1], stop_words=stop_words, punct_marks=punct_marks) 
    #v_obj = uv.sum_vectorize(fact[2], stop_words=stop_words, punct_marks=punct_marks) * 2. 
    v_subj = uv.icds_vectorize(fact[0], stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu) #* 1.
    v_rel = uv.icds_vectorize(fact[1], stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu) #* 1
    v_obj = uv.icds_vectorize(fact[2], stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu) * 2.  
    # 2. A last composition (see other composition functions) of the whole fact
    #v_fact = v_subj + v_rel + v_obj
    v_fact = uv.icds_composition(v_rel, v_obj)
    v_fact = uv.icds_composition(v_fact, v_subj)   # Best composition order
    return(v_fact, v_subj, v_rel, v_obj)

# Receives a complete knowledge graph (representative of a sentence) and returns a single vector
def txtkg_to_vector(txt_kg, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Collapse a textual knowledge graph into a single vector.

    Each fact is embedded with ``txtfact_to_vector`` (only the whole-fact
    vector is used) and the fact vectors are summed element-wise.

    Args:
        txt_kg: Iterable of facts, each accepted by ``txtfact_to_vector``.
        stop_words: Forwarded unchanged to ``txtfact_to_vector``.
        punct_marks: Forwarded unchanged to ``txtfact_to_vector``.
        embed_model: Forwarded unchanged to ``txtfact_to_vector``.
        mu: Forwarded unchanged to ``txtfact_to_vector``.

    Returns:
        numpy array with the sum of all fact vectors. For an empty graph a
        300-dimensional zero vector is returned (the historical behaviour).
    """
    fact_vectors = [
        txtfact_to_vector(txt_fact, stop_words=stop_words, punct_marks=punct_marks,
                          embed_model=embed_model, mu=mu)[0]
        for txt_fact in txt_kg
    ]
    if not fact_vectors:
        # No facts to sum: keep the original 300-d zero vector for compatibility.
        return np.zeros(300)

    # Size the accumulator from the data instead of assuming 300 dimensions, so
    # embedding models with another dimensionality do not hit a broadcast error.
    kg_vector = np.zeros(np.shape(fact_vectors[0]))
    for vector in fact_vectors:
        kg_vector = kg_vector + vector
        # Alternative composition: kg_vector = uv.icds_composition(kg_vector, vector)
    return kg_vector

# Receives a pair of knowledge graphs (two sentences) and returns a similarity measure between the two sentences
def txtpair_sim(kg_pair, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Similarity between the two knowledge graphs of a pair.

    Both graphs are turned into vectors with ``txtkg_to_vector`` and compared
    with ``uv.cos_sim``; the result is floored at 0.

    Args:
        kg_pair: Indexable pair; kg_pair[0] and kg_pair[1] are the two graphs.
        stop_words: Forwarded unchanged to ``txtkg_to_vector``.
        punct_marks: Forwarded unchanged to ``txtkg_to_vector``.
        embed_model: Forwarded unchanged to ``txtkg_to_vector``.
        mu: Forwarded unchanged to ``txtkg_to_vector``.

    Returns:
        ``max(0, uv.cos_sim(v0, v1))``: the similarity when it is greater
        than 0, otherwise the integer 0.
    """
    vector_opts = dict(stop_words=stop_words, punct_marks=punct_marks,
                       embed_model=embed_model, mu=mu)
    kg0_vector = txtkg_to_vector(kg_pair[0], **vector_opts)
    kg1_vector = txtkg_to_vector(kg_pair[1], **vector_opts)

    # Alternative similarity measures tried previously:
    #   uv.icm_sim(kg0_vector, kg1_vector, beta=1.2)    # icm sim
    #   kg0_vector @ kg1_vector                         # dot sim
    #   np.linalg.norm(kg0_vector - kg1_vector)         # eucl sim
    return max(0, uv.cos_sim(kg0_vector, kg1_vector))

# Correlation between the computed similarities and the gold (true) scores over a dataset of KG pairs
def txt_kgs_sim(ds_txt_pairs, true_scores, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Score every KG pair of a dataset and correlate the scores with the gold ones.

    Args:
        ds_txt_pairs: Iterable of KG pairs, each accepted by ``txtpair_sim``.
        true_scores: Gold scores, one per pair, in the same order.
        stop_words: Forwarded unchanged to ``txtpair_sim``.
        punct_marks: Forwarded unchanged to ``txtpair_sim``.
        embed_model: Forwarded unchanged to ``txtpair_sim``.
        mu: Forwarded unchanged to ``txtpair_sim``.

    Returns:
        Tuple ``(correlation, sims)``: the off-diagonal entry of
        ``np.corrcoef(sims, true_scores)`` and the similarities as a numpy array.
    """
    pair_sims = [
        txtpair_sim(kg_pair, stop_words=stop_words, punct_marks=punct_marks,
                    embed_model=embed_model, mu=mu)
        for kg_pair in ds_txt_pairs
    ]
    corr_matrix = np.corrcoef(pair_sims, np.array(true_scores))
    # Entry [0, 1] of the 2x2 matrix is the correlation between sims and gold scores.
    return corr_matrix[0, 1], np.array(pair_sims)

#### Tests

In [43]:
# Evaluation data: dataset labels, KG-pair samples and gold scores, aligned by position
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def']
sets = [par_samples, vid_samples, answer_samples, def2def_samples]
true_scores = [par_gold_scores, vid_gold_scores, answer_gold_scores, def2def_gold_scores]
correlations = []

# Configurations to try: embed_model in ('w2v', 'glove'); mu in (0, 1, 'ratio')
for name, elem, scores in zip(names, sets, true_scores):
    corr, sims = txt_kgs_sim(
        elem, scores,
        stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio',
    )
    correlations.append(corr)
    # Optional diagnostics (similarities are rescaled by 5 for comparison with the gold scores):
    #print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for', name+':', corr, '\n')
print('\tCorrelations mean:', np.mean(np.array(correlations)))

# Reference values from the paper (W2V + BEST STR + F.INF + COS), mean = .57:
#paper_refs = [0.42, 0.82, 0.52, 0.52]
# This file: ok 20250318

Corr. for MSRpar: 0.5011930360856764 

Corr. for MSRvid: 0.8199665210729021 

Corr. for MSRanswer: 0.4308747028354711 

Corr. for def2def: 0.4860895691212907 

	Correlations mean: 0.5595309572788351
