In [1]:
import numpy as np
from scipy.stats import spearmanr
file_path = 'D:/Study and Projects/School Work/Year 25 - PhD 1/Data/Word Embeddings//'

In [2]:
# Function to import a word embedding model from a file
def import_model(model_name, full_import=False, vocab_set=[]):
    """ string -> None
    Imports an embedding model, storing it in the model_embed_storage dictionary.
    """
        
    # open relevant file
    filename = file_path+model_name
    with open(filename) as file:
        lines = [line.rstrip('\n') for line in file]

    model_dict = {} # create word dictionary for specific model
    for line in lines:
        word_list = line.split()
        word = word_list[0]
        if full_import==False and word in vocab_set: # only  words for testing if full_import==False
            embedding_list = [float(x) for x in word_list[1:-1]] # store embeddings
            embedding_np = np.array(embedding_list)
            model_dict[word] = embedding_np
        elif full_import==True: # this will import all words in the vocab set, not just those for testing
            embedding_list = [float(x) for x in word_list[1:-1]] # store embeddings
            embedding_np = np.array(embedding_list)
            model_dict[word] = embedding_np
        else:
            continue

    return(model_dict)


# Function to calculate cosine similarity between two embeddings
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Returns the cosine similarity (-1 to 1) between two embeddings, inputted as vectors.
    """
    if np.dot(embed_1,embed_2) == 0:
        similarity = 0 # don't normalise if similarity is zero
    else:
        similarity = np.dot(embed_1,embed_2)/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2))
        #similarity, _ = spearmanr(embed_1, embed_2)
    return(similarity)


# Function to load word similarity data for specified dataset
def import_dataset(dataset_name):
    """ string -> None
    Imports a dataset, storing a value of the form (list, numpy_array) in the dataset_storage dictionary.
    """
    file_loc = 'D:/Study and Projects/School Work/Year 25 - PhD 1/Data/Word Similarity Data/Word Similarities Final//'
    filename = file_loc+dataset_name
    with open(filename) as file:
        lines = file.readlines()

    wordpairs = [None]*len(lines) # initialise storage
    ratings = [None]*len(lines)
    i=0
    for line in lines:
        line = line.strip() # remove new line chars
        wordpairs[i] = line.split() # split at any whitespace chars
        ratings[i] = float(wordpairs[i][2])
        wordpair_str = wordpairs[i][0]+' '+wordpairs[i][1]
        i=i+1
    ratings = np.array(ratings)

    return(wordpairs,ratings)

In [3]:
# Load sense embeddings
model_name = 'ARES Embeddings/ares_bert_large_english.txt'
embeds = import_model(model_name, full_import=True)
word_sense_list = list(embeds.keys())

# Load wordnet
from nltk.corpus import wordnet as wn
from nltk.data import path # need to specify the location of the nltk data
path.append("D:\Study and Projects\School Work\Year 25 - PhD 1\Data\Frames and Structured Data\\nltk_data")

# Construct a dictionary of words from embeddings file with all their wordnet senses
word_sense_dict = {}
for word_sense in word_sense_list:
    word = word_sense.split('%')[0] # get base word
    try: # add subsequent senses to dictionary
        word_sense_dict[word].append(word_sense)
    except KeyError: # add first element
        word_sense_dict[word] = [word_sense]

print(len(word_sense_dict.keys()))

147307


In [68]:
# Import word similarity dataset and calculate correlation between dataset and sense embeddings
simverb = import_dataset('EN-SimVerb-3200-mod.txt')
calc_sims = []
expr_sims = []

for word_pair in simverb[0]:
    if word_pair[0] in word_sense_dict.keys() and word_pair[1] in word_sense_dict.keys():
        word_1 = word_pair[0]
        word_2 = word_pair[1]

        calc_sims_temp = [] # temporary storage
        for word_1_sense in word_sense_dict[word_1]:
            for word_2_sense in word_sense_dict[word_2]:
                sense_sim = cosine_sim(embeds[word_1_sense],embeds[word_2_sense])
                calc_sims_temp.append(sense_sim)
        
        # print(word_1_sense,word_2_sense,sense_sim)
        calc_sims.append(np.mean(calc_sims_temp))
        expr_sims.append(float(word_pair[2]))
    else:
        continue

spearman_r, p = spearmanr(calc_sims, expr_sims)
print(spearman_r)

0.45938633790649047


In [59]:
# Compare all senses of two words
for word_1_sense in word_sense_dict['admire']:
    for word_2_sense in word_sense_dict['like']:
        sense_sim = cosine_sim(embeds[word_1_sense],embeds[word_2_sense])
        print(word_1_sense, word_2_sense, sense_sim)

admire%2:39:00:: like%2:37:01:: 0.696030222379393
admire%2:39:00:: like%2:37:04:: 0.790853941551968
admire%2:39:00:: like%2:37:05:: 0.7961704219091083
admire%2:39:00:: like%2:37:06:: 0.6848265046387817
admire%2:39:00:: like%1:09:01:: 0.7021524368519955
admire%2:39:00:: like%2:31:00:: 0.6488576485727974
admire%2:39:00:: like%1:09:00:: 0.6962509991598497
admire%2:39:00:: like%3:00:04:: 0.5908880032517394
admire%2:39:00:: like%3:00:00:: 0.6379227243642132
admire%2:39:00:: like%3:00:02:: 0.5443994644041646
admire%2:39:00:: like%5:00:00:same:00 0.6742458614572683
admire%2:37:00:: like%2:37:01:: 0.7154149842796691
admire%2:37:00:: like%2:37:04:: 0.8526063352951069
admire%2:37:00:: like%2:37:05:: 0.8343727944158179
admire%2:37:00:: like%2:37:06:: 0.7002974664207727
admire%2:37:00:: like%1:09:01:: 0.7263593137316249
admire%2:37:00:: like%2:31:00:: 0.6884834533866895
admire%2:37:00:: like%1:09:00:: 0.7129674077552316
admire%2:37:00:: like%3:00:04:: 0.6240282241184786
admire%2:37:00:: like%3:00:

In [65]:
wn.lemma_from_key("admire%2:37:00::").synset().definition()

'feel admiration for'

In [45]:
wn.synset('curse.v.03').definition()

'wish harm upon; invoke evil upon'

In [89]:
# Saving results to a file
save_file = open('D:\Study and Projects\School Work\Year 25 - PhD 1\Data\Analysis Results\SimVerb_mod_ARES_sense_embed.txt', "a", encoding='utf-8')
all_data = np.column_stack([calc_sims, expr_sims])
i=0
for line in all_data:
    save_file.writelines(simverb[0][i][0]+' '+simverb[0][i][1]+','+str(line[0])+','+str(line[1]))
    save_file.write('\n')
    i=i+1
save_file.close()