In [3]:
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import kendalltau
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr
from tools import Tools
import tqdm

# Module-level store of computed scores, keyed by (word1, word2) tuples.
# It is filled by the driver code below and read by calculate().
target_similarity = defaultdict(list)

def calculate(target_similarity, pair_list):
    """Correlate reference scores with the scores stored in ``target_similarity``.

    Every ``(key, score)`` entry of ``pair_list`` whose ``key`` is present in
    ``target_similarity`` contributes one observation: ``score`` is the
    reference value and ``target_similarity[key]`` is the computed value.
    Entries whose key is absent are skipped. Spearman, cosine, Pearson and
    Kendall statistics of the two resulting score lists are printed.

    Args:
        target_similarity: Mapping from a pair key to its computed score.
        pair_list: Iterable of ``(key, reference_score)`` tuples.

    Returns:
        The Spearman correlation coefficient rounded to three decimals.

    Raises:
        ValueError: If no key of ``pair_list`` is found in
            ``target_similarity``, because no statistic can be computed.
    """
    original_score = []
    calculated_score = []
    for pair, reference in pair_list:
        # Membership test first (instead of plain indexing) so a defaultdict
        # is not populated with default entries for unknown pairs.
        if pair in target_similarity:
            original_score.append(reference)
            calculated_score.append(target_similarity[pair])

    # Fail early with an explicit message; otherwise the cosine computation
    # below rejects the empty input with a far less helpful error.
    if not original_score:
        raise ValueError(
            "No pair from pair_list is present in target_similarity; "
            "cannot compute correlations."
        )

    spearman_TM = round(spearmanr(original_score, calculated_score)[0], 3)
    print(f'Spearman TM: {spearman_TM}')

    # The two score lists are treated as two vectors; the result is the
    # pairwise similarity matrix between them.
    similarity = cosine_similarity([original_score, calculated_score])
    print(f'Cosine TM \n{similarity}')

    TM_corr = np.corrcoef(original_score, calculated_score)
    print(f'Pearson TM \n{TM_corr}')

    kendal_TM, _ = kendalltau(original_score, calculated_score)
    print(f'Kendal TM: {kendal_TM}')

    # Same correlation once more, rendered as a labelled pandas table.
    data = pd.DataFrame([original_score, calculated_score]).transpose()
    data.columns = ['Original', 'TM']
    correlation = data.corr()
    print("Pearson Corr \n", correlation)
    return spearman_TM

# Load the pickled vectorizer and embeddings from disk.
vectorizer_X = Tools.read_pickle_data("vectorizer_X.pickle")
# Query the feature names once and derive the feature count from them.
feature_names = vectorizer_X.get_feature_names_out()
number_of_features = feature_names.shape[0]
omni_embeddings = Tools.read_pickle_data("omni_embeddings.pickle")

# Dataset to evaluate; switch by uncommenting one of the alternatives.
target_dataset_name = "rg-65"
# target_dataset_name = "wordsim353-sim"
# target_dataset_name = "mturk-287"
# target_dataset_name = "mturk-771"
# target_dataset_name = "simlex999"
# target_dataset_name = "men"

pair_list = Tools.get_dataset_pairs(target_dataset_name + ".csv")
output_active, target_words = Tools.get_dataset_targets(target_dataset_name, vectorizer_X, pair_list)
print(f"Dataset: {target_dataset_name}, Number of pairs: {len(pair_list)}, Number of words: {len(target_words)}")

# Collect the embedding of every target word that the vocabulary knows,
# keyed by the word's vocabulary index.
token_embeddings = {}
for word in tqdm.tqdm(target_words, desc="Loading token embeddings"):
    word_id = vectorizer_X.vocabulary_.get(word, None)
    if word_id is not None:
        token_embeddings[word_id] = omni_embeddings[word_id]

# One row per target word. Zero-initialised (not np.empty) because rows of
# words missing from the vocabulary are never written below; with np.empty
# they would hold arbitrary memory that later leaks into target_similarity.
profile = np.zeros((len(target_words), number_of_features))
for i, word in enumerate(target_words):
    word_id = vectorizer_X.vocabulary_.get(word, None)
    if word_id is not None and word_id in token_embeddings:
        profile[i, :] = token_embeddings[word_id]
    else:
        print(f"Word '{word}' not found in vocabulary or embeddings.")

# Score of an ordered pair (word1, word2): the entry of word1's profile row
# at the vocabulary index of word2, or 0.0 when word2 has no index.
for i, word1 in enumerate(target_words):
    for j, word2 in enumerate(target_words):
        if i != j:
            word2_index = vectorizer_X.vocabulary_.get(word2, None)
            if word2_index is not None:
                target_similarity[(word1, word2)] = profile[i, word2_index]
            else:
                target_similarity[(word1, word2)] = 0.0

spearman = calculate(target_similarity, pair_list)
# Drop the reference to the per-word embedding cache once it is no longer needed.
token_embeddings = None

Dataset words count:  48
Dataset: rg-65, Number of pairs: 65, Number of words: 37


Loading token embeddings: 100%|██████████| 37/37 [00:00<00:00, 384131.80it/s]

Spearman TM: 0.471
Cosine TM 
[[ 1.         -0.18389329]
 [-0.18389329  1.        ]]
Pearson TM 
[[1.         0.45475507]
 [0.45475507 1.        ]]
Kendal TM: 0.3503817574348747
Pearson Corr 
           Original        TM
Original  1.000000  0.454755
TM        0.454755  1.000000



