In [1]:
import numpy as np
import json
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score, r2_score

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Root directory for all input data and analysis results. The loaders and cells below
# build file locations by plain string concatenation onto this prefix (no path joining),
# so the trailing slash must be kept.
path_base = 'D:/Study and Projects/School Work/Year 25 - PhD 1/Data/'

In [18]:
## Functions for sense embedding analysis

# Function to import a word embedding model from a file
def import_model(model_name, full_import=False, vocab_set=()):
    """ string, bool, collection of strings -> dict
    Reads the sense embedding file `model_name` from the 'Combined Embeddings'
    folder under path_base and returns it as a dictionary.

    model_name  -- name of the file inside the 'Combined Embeddings' folder
    full_import -- if true, every entry in the file is kept; otherwise only
                   entries whose first token is in vocab_set are kept
    vocab_set   -- first tokens to keep when full_import is false

    Each non-blank line contributes one item: the key is the line's first
    whitespace-separated token, the value is a numpy array built from the
    tokens after it, excluding the last one. Blank lines are skipped.
    """
    file_loc = 'Sense Embeddings/Dictionary Sense Embeddings/Combined Embeddings/'
    filename = path_base+file_loc+model_name
    wanted = set(vocab_set) # hashed lookup instead of scanning a list for every line

    model_dict = {} # create word dictionary for specific model
    with open(filename) as file:
        for line in file: # read line by line rather than holding the whole file
            word_list = line.split()
            if not word_list: # blank line: nothing to index, so skip it
                continue
            word = word_list[0]
            if full_import or word in wanted:
                # NOTE(review): the slice leaves out the final token on the line as
                # well as the word itself. Kept as in the original; confirm the files
                # really end each line with a token that is not an embedding value,
                # otherwise the last dimension is being dropped.
                model_dict[word] = np.array([float(x) for x in word_list[1:-1]])

    return(model_dict)


# Function to calculate cosine similarity between two embeddings
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Returns the cosine similarity (-1 to 1) between two embeddings, inputted as vectors.

    When the dot product is exactly zero the result is 0.0 and no normalisation
    is attempted; for finite inputs this also covers an all-zero vector, so the
    division by a zero norm never happens.
    """
    dot_product = np.dot(embed_1,embed_2) # computed once and reused below
    if dot_product == 0:
        return(0.0) # float, so both branches return the same kind of value
    return(dot_product/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2)))


# Function to load word similarity data for specified dataset
def import_dataset(dataset_name):
    """ string -> (list, numpy_array)
    Reads the word similarity file `dataset_name` from the
    'Word Similarities Final' folder under path_base.

    Returns a tuple (wordpairs, ratings):
      wordpairs -- one list per non-blank line, holding that line's
                   whitespace-separated tokens (all still strings)
      ratings   -- numpy array with the third token of each line as a float

    Blank lines are skipped. A non-blank line with fewer than three tokens
    raises IndexError, and a third token that is not a number raises ValueError.
    """
    file_loc = 'Word Similarity Data/Word Similarities Final/'
    filename = path_base+file_loc+dataset_name

    wordpairs = []
    ratings = []
    with open(filename) as file:
        for line in file:
            tokens = line.split() # split at any whitespace chars; also drops the newline
            if not tokens: # blank line: there is no rating to read
                continue
            wordpairs.append(tokens)
            ratings.append(float(tokens[2]))

    return(wordpairs,np.array(ratings))

# Function to load a specific word embedding model
def import_word_model(model_path):
    """ string -> dict
    Reads the UTF-8 embedding file at `model_path` and returns it as a dictionary.

    Each non-blank line contributes one item: the key is the line's first
    whitespace-separated token, the value is a numpy array built from the
    tokens after it, excluding the last one. Blank lines are skipped. If a
    key occurs on several lines, the last occurrence wins.
    """
    model_dict = {}
    with open(model_path, encoding='utf-8') as file:
        for line in file: # read line by line rather than holding the whole file
            word_list = line.split()
            if not word_list: # blank line: nothing to index, so skip it
                continue
            # NOTE(review): the slice leaves out the final token on the line as well
            # as the word itself. Kept as in the original; confirm the files really
            # end each line with a token that is not an embedding value, otherwise
            # the last dimension is being dropped.
            model_dict[word_list[0]] = np.array([float(x) for x in word_list[1:-1]])

    return(model_dict)

### Load and calculate correlations of Dictionary SimVerb embeddings

In [15]:
## Load the sense embeddings of every transformer layer, ready for the correlation step
full_word_sense_dict = {}
full_word_embeds_dict = {}

for transformer_layer in np.arange(1,13):
    # Each layer has its own file, named 'normalised_<layer>.txt'
    model_name = 'normalised_{}.txt'.format(transformer_layer)
    embeds = import_model(model_name, full_import=True)
    word_sense_list = list(embeds.keys())

    # Group the sense keys under their base word, i.e. the text before the first '_'
    word_sense_dict = {}
    for word_sense in word_sense_list:
        word = word_sense.split('_', 1)[0]
        word_sense_dict.setdefault(word, []).append(word_sense)

    # Keep both lookups, indexed by layer
    full_word_embeds_dict[transformer_layer] = embeds
    full_word_sense_dict[transformer_layer] = word_sense_dict

In [77]:
## Compute correlation between experimental and sense embeddings
#
# For every transformer layer, each dataset word pair whose two words both have sense
# embeddings is scored with the maximum cosine similarity over all sense combinations.
# The Spearman correlation between those scores and the experimental ratings is stored
# in correls_dict, and the per-pair rows (word_1, word_2, score, rating) in all_data_dict.
# Both dictionaries are keyed by the layer number as a plain int.

# Import word similarity dataset
dataset = import_dataset('EN-SimVerb-3200-mod.txt')
# Alternative dataset: import_dataset('combined_dataset_verbs.txt')
all_data_dict = {}
correls_dict = {}
expr_sims = np.array(dataset[1],dtype=float) # experimental similarities

# loop over all layers in transformer
for transformer_layer in np.arange(1,13):
    layer = int(transformer_layer) # plain int so both result dicts share one key type
    all_data = []
    calc_sims_included = []
    expr_sims_included = []
    embeds = full_word_embeds_dict[transformer_layer] # get relevant embeddings
    word_sense_dict = full_word_sense_dict[transformer_layer] # get word senses

    # loop over word pairs in dataset
    for word_pair in dataset[0]:
        word_1 = word_pair[0]
        word_2 = word_pair[1]
        if word_1 not in word_sense_dict or word_2 not in word_sense_dict:
            if transformer_layer==1:
                print(word_pair) # print missing word pairs (once, not for every layer)
            continue # skip word pairs without sense embeddings available

        calc_sims_temp = [] # similarities of every sense combination for this pair
        for word_1_sense in word_sense_dict[word_1]:
            for word_2_sense in word_sense_dict[word_2]:
                sense_sim = cosine_sim(embeds[word_1_sense],embeds[word_2_sense])
                calc_sims_temp.append(sense_sim)

        best_sim = np.max(calc_sims_temp) # define method to select similarity across senses
        expr_sim = float(word_pair[2]) # the dataset tokens are strings
        all_data.append((word_1,word_2,best_sim,expr_sim))
        calc_sims_included.append(best_sim)
        expr_sims_included.append(expr_sim)

    # store results
    # The correlation is computed from numeric lists. Slicing np.array(all_data) would
    # give strings, because each row mixes words and numbers, and the raw word_pair[2]
    # token is a string as well, so spearmanr would be handed text instead of numbers.
    spearman_r, p = spearmanr(calc_sims_included, expr_sims_included)
    correls_dict[layer] = spearman_r
    all_data_dict[layer] = all_data



In [78]:
## Code for saving results

# Save Dictionary similarities and word pairs to file, one file per transformer layer.
# Each output line is: "<word_1> <word_2>,<computed similarity>,<rating>,<rating - similarity>"
for transformer_layer in all_data_dict.keys():
    data_single_layer = all_data_dict[transformer_layer]
    # path_base already ends with a separator. The folder is written with forward
    # slashes because '\A' and '\e' are not valid escape sequences in a string literal.
    save_path = path_base+'Analysis Results/ernie_dictionary_max_'+str(transformer_layer)+'_SimVerb_mod_results.txt'

    # 'with' closes the file even if writing one of the lines fails
    with open(save_path, "w", encoding='utf-8') as save_file:
        for line in data_single_layer:
            diff = line[3]-line[2]
            # NOTE(review): [0:7] cuts the similarity's text to 7 characters. That
            # truncates rather than rounds, and would drop the exponent of a value
            # printed in scientific notation. Kept so the file format is unchanged.
            save_file.write(line[0]+' '+line[1]+','+str(line[2])[0:7]+','+str(line[3])+','+str(diff)+'\n')

In [54]:
## Compute the number of senses for each wordpair (for Simverb)

# Word pairs to be scored
dataset = import_dataset('EN-SimVerb-3200-mod.txt')

# Read the polysemy file: first token of a line is the word, second its sense count
with open(path_base+'Corpus Data//Dictionary Verb Corpus//Vocab lists//verb_polysemy_scores.txt', encoding='utf-8') as file:
    lines = [raw_line.rstrip('\n') for raw_line in file]

# word -> number of senses
polysemy_dict = {}
for entry in lines:
    fields = entry.split()
    polysemy_dict[fields[0]] = int(fields[1])

# One value per dataset word pair, in dataset order: the two words' sense counts added up
full_polysemy_score_list = [
    polysemy_dict[word_pair[0]] + polysemy_dict[word_pair[1]]
    for word_pair in dataset[0]
]