#### Some imports and datasets loading

In [23]:
import numpy as np
import pickle
from utils import utils_vectorize as uv

# Loading datasets
def _load_pairs_and_scores(path):
    """Unpickle one dataset file and return whatever object it holds.

    Every call site below unpacks the result into a
    (text pairs, gold scores) 2-tuple.

    NOTE: pickle.load can execute arbitrary code embedded in the file, so
    only point this at dataset files that you built yourself or fully trust.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)

# MSR datasets
par_txt_pairs, par_scores = _load_pairs_and_scores('datasets/msrpar_txt.pkl')
vid_txt_pairs, vid_scores = _load_pairs_and_scores('datasets/msrvid_txt.pkl')
answer_txt_pairs, answer_scores = _load_pairs_and_scores('datasets/msranswer_txt.pkl')

# DEF2DEF dataset flavors
def2def_txt_pairs, def2def_scores = _load_pairs_and_scores('datasets/def2def_txt.pkl')
def2def_adj_txt_pairs, def2def_adj_scores = _load_pairs_and_scores('datasets/def2def_adjusted_txt.pkl')
def2def250_txt_pairs, def2def250_scores = _load_pairs_and_scores('datasets/def2def250_adjusted_txt.pkl')

#### Functions

In [24]:
# Similarity functions between two sentences
# Combinations: composition = (sum | ICDS); sim = (cos | ICM | dot prod | euclid)
def txt_pair_sim(txt_pair, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Score the similarity of the two sentences held in ``txt_pair``.

    Each sentence is turned into a vector by ``uv.icds_vectorize`` (all keyword
    options are forwarded unchanged) and the two vectors are compared with
    ``uv.cos_sim``. Negative results are clamped to 0.

    Args:
        txt_pair: indexable with the two sentences at positions 0 and 1.
        stop_words, punct_marks, embed_model, mu: passed straight through to
            ``uv.icds_vectorize``; their exact meaning is defined there.

    Returns:
        ``max(0, cos_sim)`` of the two sentence vectors.
    """
    vectorizer_opts = dict(stop_words=stop_words, punct_marks=punct_marks,
                           embed_model=embed_model, mu=mu)
    vec_first = uv.icds_vectorize(txt_pair[0], **vectorizer_opts)
    vec_second = uv.icds_vectorize(txt_pair[1], **vectorizer_opts)
    cosine = uv.cos_sim(vec_first, vec_second)
    # Alternative measures that were tried in place of the clamped cosine:
    #   uv.icm_sim(vec_first, vec_second, beta=1.1)     # ICM sim
    #   vec_first @ vec_second                          # dot sim
    #   np.linalg.norm(vec_first - vec_second)          # eucl sim
    return max(0, cosine)

# Correlation with trues in a dataset of pairs
def txt_sents_sim(ds_txt_pairs, true_scores, stop_words=True, punct_marks=False, embed_model='w2v', mu='ratio'):
    """Correlate predicted sentence-pair similarities with the gold scores.

    ``txt_pair_sim`` is applied to every pair in ``ds_txt_pairs`` (forwarding
    the keyword options as given) and the resulting similarities are compared
    with ``true_scores`` through ``np.corrcoef``.

    Args:
        ds_txt_pairs: iterable of sentence pairs.
        true_scores: gold scores, one per pair and in the same order.
        stop_words, punct_marks, embed_model, mu: forwarded to ``txt_pair_sim``.

    Returns:
        Tuple ``(correlation, sims)``: the correlation coefficient between
        predicted and gold scores, and the predicted similarities as a
        NumPy array.
    """
    predicted = [
        txt_pair_sim(sent_pair, stop_words=stop_words, punct_marks=punct_marks,
                     embed_model=embed_model, mu=mu)
        for sent_pair in ds_txt_pairs
    ]
    gold = np.array(true_scores)
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is
    # the predicted-vs-gold coefficient.
    pearson = np.corrcoef(predicted, gold)[0, 1]
    return pearson, np.array(predicted)

#### Individual datasets baseline by sentences

In [25]:
# Test data: dataset names, sentence pairs and gold scores, all in the same order
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def', 'def2def_adj', 'def2def250']
sets = [par_txt_pairs, vid_txt_pairs, answer_txt_pairs, def2def_txt_pairs, def2def_adj_txt_pairs, def2def250_txt_pairs]
true_scores = [par_scores, vid_scores, answer_scores, def2def_scores, def2def_adj_scores, def2def250_scores]
correlations = []

# Test combinations: embedding = ('w2v', 'glove'); mu = (0, 1, 'ratio')
# NOTE(review): this cell runs with punct_marks=False, whereas the unified-dataset
# cells further down run with punct_marks=True -- confirm the difference is intended.
for name, pairs, scores in zip(names, sets, true_scores):
    corr, sims = txt_sents_sim(pairs, scores, stop_words=True,
                               punct_marks=False, embed_model='w2v', mu='ratio')
    correlations.append(corr)
    #print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    #print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for', name+':', corr, '\n')
# Unweighted mean over all six per-dataset correlations
print('\tCorrelations mean:', np.mean(correlations))

#paper_refs = [0.42, 0.82, 0.52, 0.53] # -> mean = .57 | W2V + BEST STR + F.INF + COS
# This file: ok 20250615

Corr. for MSRpar: 0.4197711326094004 

Corr. for MSRvid: 0.7790563368288715 

Corr. for MSRanswer: 0.45504263740487616 

Corr. for def2def: 0.4851967199582447 

Corr. for def2def_adj: 0.5505453897281186 

Corr. for def2def250: 0.5950603614061716 

	Correlations mean: 0.5474454296559471


#### Unified dataset baseline by sentences with original DEF2DEF

In [26]:
# Unified dataset: the three MSR sets plus the original DEF2DEF.
# With the original DEF2DEF flavor, 80% of samples and 83% of triplets come from DEF2DEF.
samples = par_txt_pairs + vid_txt_pairs + answer_txt_pairs + def2def_txt_pairs
scores = par_scores + vid_scores + answer_scores + [score/10 for score in def2def_scores]   # def2def scores 0-50 -> 0-5
# Pairs and gold scores are concatenated independently, so make sure they still
# line up one-to-one before they are correlated.
assert len(samples) == len(scores), 'samples and scores are misaligned'
print('Unified dataset size:', len(samples))

Unified dataset size: 3747


In [28]:
# Main: correlation over the whole unified dataset, plus summary statistics
corr, sims = txt_sents_sim(samples, scores, stop_words=True, punct_marks=True, embed_model='w2v', mu='ratio')

# Similarities are multiplied by 5 so their statistics can be read against the 0-5 gold scores.
print('True scores min, max, mean and std:', *(stat(scores) for stat in (np.min, np.max, np.mean, np.std)))
print('Sim scores min, max, mean and std:', *(stat(sims*5) for stat in (np.min, np.max, np.mean, np.std)))
print('Correlation:', corr)

# This file: ok 20250615
# All datasets correlations
#   0.38 

True scores min, max, mean and std: 0.0 5.0 2.5230496397117688 1.2987121599071274
Sim scores min, max, mean and std: 0.0 5.000000000000001 1.9932547732950059 1.1623045610888645
Correlation: 0.377704549984576


#### Unified dataset baseline by sentences with adjusted DEF2DEF

In [29]:
# Unified dataset: the three MSR sets plus the *adjusted* DEF2DEF flavor.
# (The 80% / 83% DEF2DEF shares quoted for the original DEF2DEF do not apply to this smaller set.)
samples = par_txt_pairs + vid_txt_pairs + answer_txt_pairs + def2def_adj_txt_pairs
scores = par_scores + vid_scores + answer_scores + [score/10 for score in def2def_adj_scores]   # def2def scores 0-50 -> 0-5
# Pairs and gold scores are concatenated independently, so make sure they still
# line up one-to-one before they are correlated.
assert len(samples) == len(scores), 'samples and scores are misaligned'
print('Unified dataset size:', len(samples))

Unified dataset size: 3044


In [30]:
# Correlation over the unified dataset held in `samples` / `scores`, plus summary statistics
corr, sims = txt_sents_sim(samples, scores, stop_words=True, punct_marks=True, embed_model='w2v', mu='ratio')

# Similarities are multiplied by 5 so their statistics can be read against the 0-5 gold scores.
print('True scores min, max, mean and std:', *(stat(scores) for stat in (np.min, np.max, np.mean, np.std)))
print('Sim scores min, max, mean and std:', *(stat(sims*5) for stat in (np.min, np.max, np.mean, np.std)))
print('Correlation:', corr)

True scores min, max, mean and std: 0.0 5.0 2.4978209592641263 1.3166309708785562
Sim scores min, max, mean and std: 0.0 5.000000000000001 2.138213165021107 1.199493150939687
Correlation: 0.4168268607477843


#### Unified dataset baseline by sentences with adjusted DEF2DEF_250

In [31]:
# Unified dataset: the three MSR sets plus the adjusted DEF2DEF_250 flavor.
samples = par_txt_pairs + vid_txt_pairs + answer_txt_pairs + def2def250_txt_pairs
scores = par_scores + vid_scores + answer_scores + [score/10 for score in def2def250_scores]   # def2def scores 0-50 -> 0-5
# Pairs and gold scores are concatenated independently, so make sure they still
# line up one-to-one before they are correlated.
assert len(samples) == len(scores), 'samples and scores are misaligned'
print('Unified dataset size:', len(samples))

Unified dataset size: 1004


In [33]:
# Correlation over the unified dataset held in `samples` / `scores`, plus summary statistics
corr, sims = txt_sents_sim(samples, scores, stop_words=True, punct_marks=True, embed_model='w2v', mu='ratio')

# Similarities are multiplied by 5 so their statistics can be read against the 0-5 gold scores.
print('True scores min, max, mean and std:', *(stat(scores) for stat in (np.min, np.max, np.mean, np.std)))
print('Sim scores min, max, mean and std:', *(stat(sims*5) for stat in (np.min, np.max, np.mean, np.std)))
print('Correlation:', corr)

True scores min, max, mean and std: 0.0 5.0 2.550066733067729 1.4702609203245356
Sim scores min, max, mean and std: 0.03960421153787738 5.000000000000001 3.0525938375508277 1.3088540103541193
Correlation: 0.4125516395567429


#### Unified correlations summary

In [None]:
# This code: ok 20250615

# Individual baselines          stop_words=True, punct_marks=False

# Corr. for MSRpar: 0.4197711326094004 
# Corr. for MSRvid: 0.7790563368288715 
# Corr. for MSRanswer: 0.45504263740487616 
# Corr. for def2def: 0.4851967199582447 
#       Mean four original datasets: 0.535  
# Corr. for def2def_adj: 0.5505453897281186 
# Corr. for def2def250: 0.5950603614061716 

# Unified datasets              stop_words=True, punct_marks=True
# With original DEF2DEF = 0.378     
# With DEF2DEF adjusted = 0.417    
# With DEF2DEF_250 adj. = 0.413    