#### Imports and dataset loading

In [14]:
import numpy as np
import pickle
from utils import utils_vectorize as uv

# Loading datasets
def _load_pickled_dataset(path):
    """Open `path` in binary mode and return whatever object pickle.load yields.

    Each dataset file used below is unpacked by the caller into two names:
    the text pairs and their scores.
    NOTE: pickle.load can execute arbitrary code embedded in the file, so only
    trusted dataset files should be loaded this way.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


par_txt_pairs, par_scores = _load_pickled_dataset('datasets/msrpar_txt.pkl')
vid_txt_pairs, vid_scores = _load_pickled_dataset('datasets/msrvid_txt.pkl')
answer_txt_pairs, answer_scores = _load_pickled_dataset('datasets/msranswer_txt.pkl')

# DEF2DEF dataset flavors
def2def_txt_pairs, def2def_scores = _load_pickled_dataset('datasets/def2def_txt.pkl')
def2def_adjusted_txt_pairs, def2def_adjusted_scores = _load_pickled_dataset('datasets/def2def_adjusted_txt.pkl')
def2def250_txt_pairs, def2def250_scores = _load_pickled_dataset('datasets/def2def250_adjusted_txt.pkl')

#### Functions

In [23]:
# Correlation between Mihalcea similarity predictions and the true scores of a dataset of pairs
def txt_sents_sim(ds_txt_pairs, true_scores, stop_words=True, punct_marks=False, embed_model='w2v'):
    """Score every pair with `uv.mihalcea_sim` and correlate the results with `true_scores`.

    Parameters
    ----------
    ds_txt_pairs : iterable
        Items passed one at a time, unchanged, as the first argument of
        `uv.mihalcea_sim`.
    true_scores : sequence
        Reference scores, one per pair, in the same order as `ds_txt_pairs`.
    stop_words, punct_marks, embed_model :
        Forwarded verbatim as keyword arguments to `uv.mihalcea_sim`.

    Returns
    -------
    tuple
        `(correlation, sims)`: the off-diagonal entry of
        `np.corrcoef(sims, true_scores)` (Pearson correlation) and the
        computed similarities as a NumPy array.
    """
    predicted = np.array([
        uv.mihalcea_sim(txt_pair, stop_words=stop_words, punct_marks=punct_marks, embed_model=embed_model)
        for txt_pair in ds_txt_pairs
    ])
    # corrcoef returns a 2x2 matrix for two 1-D inputs; [0, 1] is the cross term
    pearson = np.corrcoef(predicted, np.array(true_scores))[0, 1]
    return pearson, predicted

#### Individual dataset tests

In [None]:
# Test data: three parallel lists (same order) holding the display name,
# the text pairs and the true scores of each dataset
names = ['MSRpar', 'MSRvid', 'MSRanswer', 'def2def', 'def2def_adjusted', 'def2def_250']
sets = [
    par_txt_pairs, vid_txt_pairs, answer_txt_pairs,
    def2def_txt_pairs, def2def_adjusted_txt_pairs, def2def250_txt_pairs,
]
true_scores = [
    par_scores, vid_scores, answer_scores,
    def2def_scores, def2def_adjusted_scores, def2def250_scores,
]
correlations = []

# Test combinations: embedding = ('w2v', 'glove')
for name, txt_pairs, scores in zip(names, sets, true_scores):
    corr, sims = txt_sents_sim(txt_pairs, scores, stop_words=True, punct_marks=False, embed_model='w2v')
    correlations.append(corr)
    # Optional diagnostics (sims are multiplied by 5 before being summarized):
    # print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
    # print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
    print('Corr. for', name + ':', corr, '\n')

# Average of the per-dataset correlations collected above
print('\tCorrelations mean:', np.mean(correlations))

Corr. for MSRpar: 0.5633251387251759 

Corr. for MSRvid: 0.7119601701442593 

Corr. for MSRanswer: 0.498384244352392 

Corr. for def2def: 0.4762962028471135 

Corr. for def2def_adjusted: 0.5363979457342523 

Corr. for def2def_250: 0.5828188187111102 

	Correlations mean: 0.5615304200857172


#### Unified dataset tests

With original DEF2DEF

In [31]:
# New unified dataset: the three MSR sets concatenated with the original DEF2DEF pairs.
# Author's note: 80% of samples and 83% of triplets are from DEF2DEF when the original dataset is used.
sent_txt_samples = (
    par_txt_pairs
    + vid_txt_pairs
    + answer_txt_pairs
    + def2def_txt_pairs
)
# DEF2DEF scores are divided by 10 (0-50 -> 0-5) before being appended to the other scores
scores = (
    par_scores
    + vid_scores
    + answer_scores
    + [raw_score / 10 for raw_score in def2def_scores]
)
print('Unified dataset size:', len(sent_txt_samples))

# Main: correlation over the whole unified set (sims kept for the optional diagnostics below)
corr, sims = txt_sents_sim(
    sent_txt_samples, scores,
    stop_words=True, punct_marks=False, embed_model='w2v',
)

# Optional diagnostics:
#print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
#print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
print('Correlation:', corr)

Unified dataset size: 3747
Correlation: 0.32799052627820124


With adjusted DEF2DEF

In [32]:
# Unified dataset: the three MSR sets concatenated with the adjusted DEF2DEF pairs
sent_txt_samples = (
    par_txt_pairs
    + vid_txt_pairs
    + answer_txt_pairs
    + def2def_adjusted_txt_pairs
)
# DEF2DEF scores are divided by 10 (0-50 -> 0-5) before being appended to the other scores
scores = (
    par_scores
    + vid_scores
    + answer_scores
    + [raw_score / 10 for raw_score in def2def_adjusted_scores]
)
print('Unified dataset size:', len(sent_txt_samples))

# Main: correlation over the whole unified set (sims kept for the optional diagnostics below)
corr, sims = txt_sents_sim(
    sent_txt_samples, scores,
    stop_words=True, punct_marks=False, embed_model='w2v',
)

# Optional diagnostics:
#print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
#print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
print('Correlation:', corr)

Unified dataset size: 3044
Correlation: 0.3653456542028317


With DEF2DEF_250

In [33]:
# Unified dataset: the three MSR sets concatenated with the DEF2DEF_250 pairs
sent_txt_samples = (
    par_txt_pairs
    + vid_txt_pairs
    + answer_txt_pairs
    + def2def250_txt_pairs
)
# DEF2DEF scores are divided by 10 (0-50 -> 0-5) before being appended to the other scores
scores = (
    par_scores
    + vid_scores
    + answer_scores
    + [raw_score / 10 for raw_score in def2def250_scores]
)
print('Unified dataset size:', len(sent_txt_samples))

# Main: correlation over the whole unified set (sims kept for the optional diagnostics below)
corr, sims = txt_sents_sim(
    sent_txt_samples, scores,
    stop_words=True, punct_marks=False, embed_model='w2v',
)

# Optional diagnostics:
#print('True scores min, max, mean and std:',np.min(scores), np.max(scores), np.mean(scores), np.std(scores))
#print('Sim scores min, max, mean and std:', np.min(sims*5), np.max(sims*5), np.mean(sims*5), np.std(sims*5))
print('Correlation:', corr)

Unified dataset size: 1004
Correlation: 0.41758350498863467


#### Tests summary

In [34]:
# Reference values from the paper -> mean = .57 | W2V + BEST STR + F.INF + COS
paper_refs = [0.42, 0.82, 0.52, 0.53]

# Individual datasets with Mihalcea similarity | This file: ok 20250425 | Rechecked 20250616
# Values are the per-dataset correlations rounded to three decimals.
MSRpar = [0.563]
MSRvid = [0.712]
MSRanswer = [0.498]
# Each "Corr. mean" below averages the three MSR values with that DEF2DEF flavor.
def2def = [0.476]              # Corr. mean ~= 0.56 (computed from the rounded values listed here)
def2def_adjusted = [0.536]     # Corr. mean=0.58
def2def250_adjusted = [0.583]  # Corr. mean=0.59

# Unified datasets with Mihalcea similarity | This file: ok 20250502 | Rechecked 20250616
with_original_DEF2DEF = [0.328]
with_adjusted_DEF2DEF = [0.365]
with_DEF2DEF_250 = [0.418]