#### Imports and dataset loading

In [1]:
import numpy as np
import pickle
from scipy.special import softmax
from utils import utils_vectorize as uv

# NOTE: pickle.load can execute arbitrary code while unpickling; only load dataset files from a trusted source.
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Sentences (each file unpickles to a 2-item object; the second item is discarded here)
par_txt_samples, _ = _load_pickle('datasets/msrpar_txt.pkl')
vid_txt_samples, _ = _load_pickle('datasets/msrvid_txt.pkl')
answer_txt_samples, _ = _load_pickle('datasets/msranswer_txt.pkl')
def2def_txt_samples, _ = _load_pickle('datasets/def2def_txt.pkl')

# Sentences as triplets, loaded together with their scores
par_samples, par_scores = _load_pickle('datasets/msrpar_samples.pkl')
vid_samples, vid_scores = _load_pickle('datasets/msrvid_samples.pkl')
answer_samples, answer_scores = _load_pickle('datasets/msranswer_samples.pkl')
def2def_samples, def2def_scores = _load_pickle('datasets/def2def_samples.pkl')

#### Unified datasets: all four datasets combined (4), or only the STS datasets (3)

In [36]:
# New unified datasets built from all four sources.
# Author's note: 80% of samples and 83% of triplets are from DEF2DEF.
sent_txt_samples = (
    par_txt_samples
    + vid_txt_samples
    + answer_txt_samples
    + def2def_txt_samples
)
samples = (
    par_samples
    + vid_samples
    + answer_samples
    + def2def_samples
)
# def2def scores are divided by 10 (0-50 -> 0-5) before being appended to the other scores
def2def_scores_rescaled = [raw_score / 10 for raw_score in def2def_scores]
scores = par_scores + vid_scores + answer_scores + def2def_scores_rescaled

In [43]:
# Tests with only the STS subdatasets (DEF2DEF left out).
# This rebinds sent_txt_samples / samples / scores, replacing whatever a previously run cell assigned to them.
sent_txt_samples = (
    par_txt_samples
    + vid_txt_samples
    + answer_txt_samples
)
samples = (
    par_samples
    + vid_samples
    + answer_samples
)
scores = (
    par_scores
    + vid_scores
    + answer_scores
)

#### Test with text sentences: one embedding per sentence

In [8]:
# Similarity between the two sentences of each pair, and its correlation with the true scores
def txt_sents_sim(ds_txt_pairs, true_scores, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Score every text pair of a dataset and correlate the scores with the true ones.

    Each element of `ds_txt_pairs` is indexed at [0] and [1]; both items are turned
    into vectors with `uv.icds_vectorize` (the keyword options are forwarded as-is)
    and compared with `uv.cos_sim`, with negative similarities clipped to 0.

    Returns:
        (correlation, sims): the off-diagonal entry of `np.corrcoef` between the
        similarities and `true_scores`, and the similarities as a NumPy array.
    """
    vectorize_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    pair_sims = [
        max(0, uv.cos_sim(uv.icds_vectorize(txt_pair[0], **vectorize_opts),
                          uv.icds_vectorize(txt_pair[1], **vectorize_opts)))
        for txt_pair in ds_txt_pairs
    ]
    corr_mtrx = np.corrcoef(pair_sims, np.array(true_scores))
    return corr_mtrx[0][1], np.array(pair_sims)

In [9]:
# Main: runs the one-embedding-per-sentence test and prints the correlation plus score statistics
corr, sims = txt_sents_sim(
    sent_txt_samples, scores,
    stop_words=True, punct_marks=True, embed_model='w2v', mu='ratio',
)
sims_x5 = sims * 5   # similarities multiplied by 5 for comparison with the printed true-score statistics

print('True scores min, max, mean and std:', np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
print('Sim scores min, max, mean and std:', np.min(sims_x5), np.max(sims_x5), np.mean(sims_x5), np.std(sims_x5))
print('Correlation:', corr)
# Correlations noted by the author:
# All datasets
#   0.38 
# Only STS subdatasets:
#   0.46

True scores min, max, mean and std: 0.0 5.0 2.6030066312997344 1.5355082496804318
Sim scores min, max, mean and std: 0.07803620708872415 5.000000000000001 3.51004306471365 1.099592301119218
Correlation: 0.45679290202770284


#### With text triplets: one embedding per sentence 

In [25]:
# Functions to get similarity between two KGs
# Get embedding of a sentence expressed as a knowledge graph 
def txtkg_to_vector(txt_kg, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio',
                    weights=(.4, 1., 2.5), dim=300):
    """Fold all facts of a knowledge graph into a single vector.

    Each fact is indexed at [0], [1], [2] (subject, relation, object). Every
    element is vectorized with `uv.icds_vectorize`, scaled by its weight, and
    the three are combined left to right with `uv.icds_composition`; the
    resulting fact vector is then composed into the running KG vector.

    Args:
        txt_kg: iterable of facts.
        stop_words, punct_marks, embed_model, mu: forwarded unchanged to
            `uv.icds_vectorize`.
        weights: (subject, relation, object) scale factors. The default is the
            setting previously hard-coded in this function, so experiments no
            longer require editing the body.
        dim: length of the zero vector the accumulation starts from (and the
            value returned for an empty KG). Presumably it must match the
            dimension of the chosen embedding model -- confirm when changing
            `embed_model`.

    Returns:
        The accumulated KG vector.
    """
    w_subj, w_rel, w_obj = weights
    vectorize_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    kg_vector = np.zeros(dim)
    for txt_fact in txt_kg:
        v_subj = uv.icds_vectorize(txt_fact[0], **vectorize_opts) * w_subj
        v_rel = uv.icds_vectorize(txt_fact[1], **vectorize_opts) * w_rel
        v_obj = uv.icds_vectorize(txt_fact[2], **vectorize_opts) * w_obj
        # (subject o relation) o object
        v_fact = uv.icds_composition(uv.icds_composition(v_subj, v_rel), v_obj)
        # Facts are merged with the same composition operator rather than a plain sum
        kg_vector = uv.icds_composition(kg_vector, v_fact)
    return kg_vector

# Correlation with the true scores over a dataset of KG pairs
def txt_kgs_sim(ds_txt_pairs, true_scores, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Score every KG pair of a dataset and correlate the scores with the true ones.

    Each element of `ds_txt_pairs` is indexed at [0] and [1]; both KGs are reduced
    to one vector with `txtkg_to_vector` (keyword options forwarded as-is) and
    compared with `uv.cos_sim`, with negative similarities clipped to 0.

    Returns:
        (correlation, sims): the off-diagonal entry of `np.corrcoef` between the
        similarities and `true_scores`, and the similarities as a NumPy array.
    """
    kg_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    pair_sims = [
        max(0, uv.cos_sim(txtkg_to_vector(kg_pair[0], **kg_opts),
                          txtkg_to_vector(kg_pair[1], **kg_opts)))
        for kg_pair in ds_txt_pairs
    ]
    corr_mtrx = np.corrcoef(pair_sims, np.array(true_scores))
    return corr_mtrx[0][1], np.array(pair_sims)

In [26]:
# Main: runs the one-embedding-per-KG test and prints the correlation plus score statistics
corr, sims = txt_kgs_sim(
    samples, scores,
    stop_words=True, punct_marks=True, embed_model='w2v', mu=1,
)
sims_x5 = sims * 5   # similarities multiplied by 5 for comparison with the printed true-score statistics

print('True scores min, max, mean and std:', np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
print('Sim scores min, max, mean and std:', np.min(sims_x5), np.max(sims_x5), np.mean(sims_x5), np.std(sims_x5))
print('Correlation:', corr)
# Correlations noted by the author (the three numbers are the subject / relation / object weights):
# All datasets
#   0.43  No self-att 0.8 0.7 2.5
# Only STS subdatasets:
#   0.60  No self-att 0.4 1.0 2.5

True scores min, max, mean and std: 0.0 5.0 2.6030066312997344 1.5355082496804318
Sim scores min, max, mean and std: 0.0 5.000000000000002 3.1544875409962536 1.4017510271528228
Correlation: 0.6044777287008982


#### Full model: one embedding per fact

In [None]:
# Gives the embedding of a single fact (subject, relation, object triplet)
def fact_to_vector(fact:tuple, stop_words=False, punct_marks=False, embed_model='w2v', mu='ratio', weights=(.5, 1.1, 2.4)):
    """Vectorize one fact and its three elements.

    Args:
        fact: indexed at [0], [1], [2] as subject, relation, object.
        stop_words, punct_marks, embed_model, mu: forwarded unchanged to
            `uv.icds_vectorize`.
        weights: (subject, relation, object) scale factors applied to the
            element vectors. The default is the setting previously hard-coded
            in this function.

    Returns:
        (v_fact, v_subj, v_rel, v_obj): the composed fact vector followed by
        the three weighted element vectors.
    """
    w_subj, w_rel, w_obj = weights
    vectorize_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    # 1. One vector per element of the triplet, scaled by its weight
    v_subj = uv.icds_vectorize(fact[0], **vectorize_opts) * w_subj
    v_rel = uv.icds_vectorize(fact[1], **vectorize_opts) * w_rel
    v_obj = uv.icds_vectorize(fact[2], **vectorize_opts) * w_obj
    # 2. Composition of the whole triplet as (subject o relation) o object.
    #    The previously computed-but-unused subject-object and relation-object
    #    compositions were dropped; this assumes uv.icds_composition has no side
    #    effects. Alternative groupings that were tried: subject o (relation o object)
    #    and (subject o object) o relation.
    v_subj_rel = uv.icds_composition(v_subj, v_rel)
    v_fact = uv.icds_composition(v_subj_rel, v_obj)
    return v_fact, v_subj, v_rel, v_obj

# Returns new, context embeddings of the facts of a KG, mixed through a fact-to-fact weight matrix
def kgtxt_to_selfatt_vectors(txt_kg, stop_words=False, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio'):
    """Build self-attention style context vectors for the facts of a KG.

    Every fact is vectorized with `fact_to_vector` (only the composed fact vector
    is kept). The weight between facts i and j is
    (min_norm / max_norm)**2 * uv.cos_sim(v_i, v_j), with norms and their ratio
    floored at 1e-125. The weights are used as they are (no softmax or other
    normalization is applied), and the context vectors are `weights @ fact_vectors`.

    `beta` is accepted for interface compatibility but is not used in this body.

    Returns:
        (self_att_mtrx, fact_vectors): the context vectors and the plain fact
        vectors stacked into a NumPy array.
    """
    floor = 1.e-125
    fact_vectors = [
        fact_to_vector(txt_fact, stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)[0]
        for txt_fact in txt_kg
    ]
    n_facts = len(txt_kg)
    # Norm of each fact vector, computed once and floored to avoid a zero denominator
    floored_norms = [np.max([floor, np.linalg.norm(vec)]) for vec in fact_vectors]
    # 1. Fact-wise similarity (self-attention scores)
    weight_mtrx = np.empty((n_facts, n_facts))
    for row in range(n_facts):
        for col in range(n_facts):
            norm_pair = [floored_norms[row], floored_norms[col]]
            norms_ratio = np.max(max(floor, np.min(norm_pair) / np.max(norm_pair)))
            weight_mtrx[row, col] = (norms_ratio**2) * uv.cos_sim(fact_vectors[row], fact_vectors[col])
    # 2. No normalization step: the raw weights are used directly
    # 3. Context vectors (new contextual embeddings)
    stacked_vectors = np.array(fact_vectors)
    return weight_mtrx @ stacked_vectors, stacked_vectors

In [None]:
# Computes similarity between two sentences expressed as knowledge graphs; uses self-attention if requested
def pair_sim(kg_pair, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio'):   # kg_pair is a list of tuples of 3 strings
    """Similarity of the two knowledge graphs in `kg_pair`.

    One vector is obtained per fact of each KG: the context vectors from
    `kgtxt_to_selfatt_vectors` when `self_att` is true, otherwise the composed
    fact vector from `fact_to_vector`. A len(kg0) x len(kg1) matrix of cosine
    similarities (negative values clipped to 0) is then reduced to one score by
    `uv.bidir_avgmax_sim(..., stdst='mean')`.

    Args:
        kg_pair: indexed at [0] and [1] to get the two KGs.
        self_att: whether to use the self-attention context vectors.
        stop_words, punct_marks, embed_model, mu: forwarded to the vectorizers.
        beta: forwarded to `kgtxt_to_selfatt_vectors` only.

    Returns:
        Whatever `uv.bidir_avgmax_sim` returns for the similarity matrix.
    """
    kg0 = kg_pair[0]
    kg1 = kg_pair[1]
    vector_opts = dict(stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model, mu=mu)
    if self_att:
        vectors0, _ = kgtxt_to_selfatt_vectors(kg0, beta=beta, **vector_opts)
        vectors1, _ = kgtxt_to_selfatt_vectors(kg1, beta=beta, **vector_opts)
    else:
        # Vectorize each fact exactly once; previously every fact of kg1 was
        # re-vectorized for each fact of kg0.
        vectors0 = [fact_to_vector(fact0, **vector_opts)[0] for fact0 in kg0]
        vectors1 = [fact_to_vector(fact1, **vector_opts)[0] for fact1 in kg1]
    sim_mtrx = np.empty((len(kg0), len(kg1)))
    for idx in range(len(kg0)):
        for jdx in range(len(kg1)):
            sim_mtrx[idx][jdx] = max(0, uv.cos_sim(vectors0[idx], vectors1[jdx]))
    # Alternatives that were tried for the reduction: np.mean(sim_mtrx), uv.bertscore(sim_mtrx)
    sents_sim = uv.bidir_avgmax_sim(sim_mtrx, stdst='mean')
    return sents_sim

# Receives a dataset of KG pairs, scores each pair, and returns the correlation with the true scores plus the scores themselves
def ds_sents_sim(ds, true_scores, self_att=True, stop_words=True, punct_marks=False, beta=1.2, embed_model='w2v', mu='ratio'):
    """Run `pair_sim` on every pair of `ds` and correlate the results with `true_scores`.

    All keyword options are forwarded unchanged to `pair_sim`.

    Returns:
        (correlation, sims): the off-diagonal entry of `np.corrcoef` between the
        predicted similarities and `true_scores`, and the predicted similarities
        as a NumPy array.
    """
    pair_opts = dict(self_att=self_att, stop_words=stop_words, punct_marks=punct_marks,
                     beta=beta, embed_model=embed_model, mu=mu)
    predicted = [pair_sim(kg_pair, **pair_opts) for kg_pair in ds]
    corr_mtrx = np.corrcoef(predicted, np.array(true_scores))
    return corr_mtrx[0][1], np.array(predicted)

In [46]:
# Main: computes the similarity correlation of the full model on the labeled dataset and prints additional info
corr, sims = ds_sents_sim(
    samples, scores,
    self_att=False,
    stop_words=True, punct_marks=False, beta=1.5, embed_model='w2v', mu=1,
)
sims_x5 = sims * 5   # similarities multiplied by 5 for comparison with the printed true-score statistics

print('True scores min, max, mean and std:', np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
print('Sim scores min, max, mean and std:', np.min(sims_x5), np.max(sims_x5), np.mean(sims_x5), np.std(sims_x5))
print('Correlation:', corr)
# Correlations noted by the author (the three numbers are the subject / relation / object weights):
# All datasets
#   0.42 No self-att 1.0 1.0 2.1 
#   0.45 Self-att    0.8 0.8 2.3 s_words=True mu=ratio
# Only STS subdatasets
#   0.66 No self-att 0.5 1.1 2.4 s_words=False
#   0.65 Self_att    0.4 1.1 2.6 s_words=False

True scores min, max, mean and std: 0.0 5.0 2.6030066312997344 1.5355082496804318
Sim scores min, max, mean and std: 0.0 5.0 3.150165341045448 1.2528576225134695
Correlation: 0.6556616326365252


In [None]:
# This file: ok 20250318