#### Imports and dataset loading

In [1]:
import numpy as np
import pickle
from utils import utils_vectorize as uv

# Load the datasets: each pickle file unpacks into (sentence pairs, scores).
# NOTE: pickle.load can execute arbitrary code -- only use trusted files here.
def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

par_txt_pairs, par_scores = _read_pickle('datasets/msrpar_txt.pkl')
vid_txt_pairs, vid_scores = _read_pickle('datasets/msrvid_txt.pkl')
answer_txt_pairs, answer_scores = _read_pickle('datasets/msranswer_txt.pkl')
def2def_txt_pairs, def2def_scores = _read_pickle('datasets/def2def_txt.pkl')

#### Functions

In [None]:
# Similarity functions between two sentences
# Combinations: composition = (sum | ICDS); sim = (cos | ICM | dot prod | euclid)
def txt_pair_sim(txt_pair, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Return the similarity between the two sentences of `txt_pair`.

    Both sentences (`txt_pair[0]` and `txt_pair[1]`) are turned into vectors
    with `uv.icds_vectorize`, and the vectors are compared with `uv.cos_sim`.
    A negative cosine is clamped to 0.

    Args:
        txt_pair: indexable holding the two sentences at positions 0 and 1.
        stop_words, punct_marks, embed_model, mu: forwarded unchanged, as
            keyword arguments, to `uv.icds_vectorize`.

    Returns:
        The cosine similarity of the two sentence vectors, or 0 when that
        cosine is negative.
    """
    # Alternative setups that were tried for this function:
    #   composition: uv.sum_vectorize(sentence, stop_words=..., punct_marks=..., embed_model=...)
    #   similarity:  uv.cos_sim(v0, v1)              # cosine without clamping
    #                uv.icm_sim(v0, v1, beta=1.1)    # ICM sim
    #                v0 @ v1                         # dot sim
    #                np.linalg.norm(v0 - v1)         # eucl sim
    vectorize_opts = dict(stop_words=stop_words, punct_marks=punct_marks,
                          embed_model=embed_model, mu=mu)
    # Sentence 0 is vectorized before sentence 1, one call per sentence.
    first_vec, second_vec = (uv.icds_vectorize(txt_pair[pos], **vectorize_opts)
                             for pos in (0, 1))
    cosine = uv.cos_sim(first_vec, second_vec)
    return max(0, cosine)

# Correlation between the computed similarities and the true scores over a dataset of pairs
def txt_sents_sim(ds_txt_pairs, true_scores, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Score every sentence pair of a dataset and correlate with `true_scores`.

    Args:
        ds_txt_pairs: iterable of sentence pairs; each one is handed to
            `txt_pair_sim`.
        true_scores: reference scores, one per pair, in the same order.
        stop_words, punct_marks, embed_model, mu: forwarded unchanged, as
            keyword arguments, to `txt_pair_sim`.

    Returns:
        A tuple `(correlation, sims)`: the off-diagonal entry of
        `np.corrcoef` between the computed similarities and `true_scores`,
        and the computed similarities as a NumPy array.
    """
    pair_opts = dict(stop_words=stop_words, punct_marks=punct_marks,
                     embed_model=embed_model, mu=mu)
    predicted = [txt_pair_sim(sent_pair, **pair_opts) for sent_pair in ds_txt_pairs]
    # corrcoef yields a 2x2 matrix; entry [0][1] relates predictions to references.
    corr_matrix = np.corrcoef(predicted, np.array(true_scores))
    return corr_matrix[0][1], np.array(predicted)

#### Tests

In [37]:
# Evaluation data: dataset labels, their sentence pairs and their reference scores
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def']
sets = [par_txt_pairs, vid_txt_pairs, answer_txt_pairs, def2def_txt_pairs]
true_scores = [par_scores, vid_scores, answer_scores, def2def_scores]
correlations = []

# Settings that can be combined here: embed_model in ('w2v', 'glove'); mu in (0, 1, 'ratio')
for name, pairs, scores in zip(names, sets, true_scores):
    corr, sims = txt_sents_sim(pairs, scores, stop_words=True, punct_marks=False,
                               embed_model='w2v', mu='ratio')
    correlations.append(corr)
    # Optional diagnostics on the score distributions:
    #print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for', name + ':', corr, '\n')

# Average of the per-dataset correlations
print('\tCorrelations mean:', np.mean(correlations))

#paper_refs = [0.42, 0.82, 0.52, 0.53] # -> mean = .57 | W2V + BEST STR + F.INF + COS
# This file: ok 20250318

Corr. for MSRpar: 0.4408478804523934 

Corr. for MSRvid: 0.7711317385208524 

Corr. for MSRanswer: 0.4575873463810465 

Corr. for def2def: 0.4784509796334444 

	Correlations mean: 0.5370044862469342
