In [7]:
## Code for calculating Dictionary Sense Embeddings using ERNIE transformer
## James Fodor 2022
## Python 3.8

import numpy as np
import re
import os

from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import logging
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Set print option for numpy, needed for saving embeddings
np.set_printoptions(precision=4, threshold=10000, linewidth=100000, suppress=True, floatmode='fixed')

# Define base path location for data.
# Backslashes are written as '\\' explicitly: sequences such as '\S' or '\D' are not valid
# escapes and only worked by accident (they raise a SyntaxWarning from Python 3.12 onwards).
# The resulting string is unchanged and still ends with a single backslash.
path_base = 'D:\\Study and Projects\\School Work\\Year 25 - PhD 1\\Data\\'

# Get wordnet to work
from nltk.data import path # need to specify the location of the nltk data
path.append(path_base+"\\Frames and Structured Data\\FrameNet\\nltk_data")

### Functions for getting sentence embeddings

In [2]:
# Function to calculate cosine similarity between two embeddings
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Cosine similarity (-1 to 1) of two embeddings given as vectors.
    When the dot product is exactly zero, 0 is returned directly and the norms are never
    computed, so no division takes place in that case.
    """
    dot_product = np.dot(embed_1, embed_2)
    if dot_product == 0:
        return 0 # don't normalise if similarity is zero

    norm_product = np.linalg.norm(embed_1)*np.linalg.norm(embed_2)
    return dot_product/norm_product


# Get decontextualised transformer embedding for given word
def transformer_embed_decontext(model, tokenizer, word, layer=0, embed_type='decontext'):
    """ pytorch_model, pytorch_tokenizer, str, int, str -> np_array
    Extracts an embedding for a word that is passed to the model on its own.

    model: called with the tokenizer output as keyword arguments; the result must expose
        `hidden_states`, so the model has to be configured to output them.
    tokenizer: called as tokenizer(word, return_tensors='pt') to build the model inputs.
    word: the text to embed.
    layer: index into `hidden_states` that the embedding is read from.
    embed_type: how the token vectors of the first batch item are combined:
        'mean'      - mean over all tokens,
        'cls'       - the first token (the 'CLS' token),
        'decontext' - mean over the word tokens only (first and last token excluded).

    Raises ValueError if embed_type is not one of the three modes above.
    """
    # Reject unknown modes before doing any model work; previously an unknown mode only
    # failed at the return statement because no branch had assigned the result.
    valid_embed_types = ('mean', 'cls', 'decontext')
    if embed_type not in valid_embed_types:
        raise ValueError('embed_type must be one of '+str(valid_embed_types)+', got '+repr(embed_type))

    encoded_input = tokenizer(word, return_tensors='pt') #pt = pytorch
    model_output = model(**encoded_input)
    word_embedding_raw = model_output.hidden_states[layer].detach().numpy()[0] # [0] = first item in batch

    if embed_type=='mean': # take the mean of all tokens
        word_embedding = word_embedding_raw.mean(axis=0)
    elif embed_type=='cls': # use the 'CLS' token
        word_embedding = word_embedding_raw[0]
    else: # 'decontext': take the mean of word tokens only
        word_embedding = word_embedding_raw[1:-1].mean(axis=0)

    return(word_embedding)


# Load word similarity dataset
def load_sim_dataset(dataset):
    """ str -> (list_list_str, np_array)
    Loads a dataset of word similarities from
    path_base + 'Word Similarity Data\\Word Similarities Final\\' + dataset + '.txt'.

    Every non-blank line is split on whitespace; the third field is parsed as the rating.
    Returns (wordpairs, ratings): wordpairs holds the full list of fields of each line
    (so the rating is still present as a string at index 2), ratings is a numpy array of
    the parsed ratings in the same order.

    Blank lines are skipped. A non-blank line with fewer than three fields raises
    IndexError, and a third field that is not a number raises ValueError.
    """
    # '\\W' rather than '\W': the latter is an invalid escape sequence (same resulting string)
    path = path_base+'Word Similarity Data\\Word Similarities Final\\'
    filename = path+dataset+'.txt'
    with open(filename) as file:
        lines = file.readlines()

    wordpairs = []
    ratings = []
    for line in lines:
        fields = line.split() # split at any whitespace chars (also removes new line chars)
        if not fields: # blank line, e.g. an empty line at the end of the file
            continue
        wordpairs.append(fields)
        ratings.append(float(fields[2]))
    ratings = np.array(ratings)

    return(wordpairs, ratings)


# Save contextual embeddings for a given word to a new file for each word
def save_embeddings_word(word, word_embeddings, layer, path):
    """ str, np_array, int, str -> None
    Appends one line holding word_embeddings to the file path + word + '_' + layer + '.txt'.

    The values are written space-separated with four decimal places. `path` is used as a
    plain string prefix, so it must already end with a path separator.

    The line is formatted explicitly instead of slicing str(word_embeddings): numpy prints
    a leading negative value directly after the opening bracket ('[-0.1234 ...'), so
    cutting two characters off the front silently dropped the sign of the first value.
    """
    save_path = path+word+'_'+str(layer)+'.txt'
    final_string = ' '.join('%.4f' % value for value in np.ravel(word_embeddings))
    with open(save_path, "a", encoding='utf-8') as save_file: # append: one line per call
        save_file.write(final_string+'\n')


# Function to return a lemmatised list and dictionary for a given sentence
def lemmatise_sent(sentence):
    """ str -> list_str, dict
    Splits a sentence on whitespace and lemmatises every word as a verb.

    Returns (lemmatised_sentence_list, lemmatised_sentence_dict):
      - the list holds the lemma of each word, in sentence order;
      - the dict maps each lemma to the original word it came from. The original word is
        stored as it appeared in the sentence, i.e. with its punctuation still attached.
        If several words share a lemma, the last one in the sentence wins.
    """
    # Characters removed from a word before lemmatising. The parentheses are escaped with
    # '\\' because '\)' and '\(' are invalid escapes in a normal string literal; the
    # pattern that reaches `re` is unchanged.
    punctuation = re.compile('[‘`’\"\',;.?!\\)\\(]')

    lemmatised_sentence_list = []
    lemmatised_sentence_dict = {}
    lemmatizer = WordNetLemmatizer()
    for original_word in sentence.split():
        fixed_original_word = punctuation.sub('', original_word) # remove punctuation
        lemmatised_word = lemmatizer.lemmatize(fixed_original_word, wordnet.VERB) # lemmatise all words in sentence
        lemmatised_sentence_list.append(lemmatised_word)
        lemmatised_sentence_dict[lemmatised_word] = original_word
    return(lemmatised_sentence_list, lemmatised_sentence_dict)


# get encoding number for a specific word
def get_word_code(word):
    """ str -> int
    Tokenises the word with the notebook-level `tokenizer` and returns, as an int, the
    token id found at position 1 of the first encoded sequence.
    """
    token_ids = tokenizer(word, return_tensors='pt').input_ids[0] #pt = pytorch
    return int(token_ids[1]) # token code for the target word


# Function to import a set of word embeddings from a file
def import_model(layer, type={'raw','normalised'}, full_import=False, vocab_set=[]):
    """ int, string, boolean, list_str -> dict
    Imports a given layer of an embedding model and returns it as a dictionary mapping
    each word to a numpy array.

    layer: layer number, used to build the file name.
    type: prefix of the file name. It must be passed explicitly as a string; the set in
        the signature cannot be used to build a file name.
    full_import: if true, every word in the file is loaded.
    vocab_set: the words to load when full_import is false.

    The file read is path_base + 'Combined Embeddings\\' + type + '_' + layer + '.txt',
    with one word per line followed by whitespace-separated values.

    Raises TypeError if `type` is not a string.
    """
    # The default value is a set, which previously failed with an obscure "can only
    # concatenate str" TypeError while building the file name. Same exception type,
    # clearer message.
    if not isinstance(type, str):
        raise TypeError("type must be passed as a string, e.g. 'raw' or 'normalised'")

    # open relevant file
    filename = path_base+'Combined Embeddings\\'+type+'_'+str(layer)+'.txt'
    wanted_words = set(vocab_set) # set lookup instead of scanning a list for every line

    # loop over file and store embeddings in a dictionary
    model_dict = {} # create word dictionary for specific model
    with open(filename) as file:
        for line in file:
            word_list = line.split()
            if not word_list: # skip blank lines
                continue
            word = word_list[0]
            if full_import or word in wanted_words:
                # NOTE(review): [1:-1] discards the final field of every line. That is only
                # right if the files end each line with a non-numeric token - confirm
                # against the files in 'Combined Embeddings', otherwise use [1:].
                model_dict[word] = np.array([float(x) for x in word_list[1:-1]])

    return(model_dict)


# Get list of verbs that have already had their sense embeddings saved
def get_done_senses(path_to_done_senses):
    """ str -> list_str
    Lists the directory path_to_done_senses and returns the sorted, de-duplicated list of
    sense names found there. A sense name is the first two '_'-separated parts of a file
    name joined by '_' (e.g. 'run_1_12.txt' gives 'run_1').

    File names that contain no '_' are ignored; previously any such file in the folder
    raised an IndexError.
    """
    done_senses = set()
    for file_name in os.listdir(path=path_to_done_senses):
        name_parts = file_name.split('_')
        if len(name_parts) < 2: # not an embeddings file name
            continue
        done_senses.add(name_parts[0]+'_'+name_parts[1])
    return(sorted(done_senses))

### Load embedding model and sentences dataset

In [3]:
# Load transformer model
# The names assigned in this cell are notebook-level globals: `tokenizer` and `model` are
# read directly by the embedding code further down, so this cell has to run first.
model_name = 'ernie-2.0-base-en' # specify model to load
ernie_path = path_base+'Sentence Embeddings//'+model_name # presumably a local folder holding the model files - verify
logging.set_verbosity_error() # turn off annoying model initialisation warning
config_state = AutoConfig.from_pretrained(ernie_path, output_hidden_states=True) # get hidden states
tokenizer = AutoTokenizer.from_pretrained(ernie_path) # tokenizer loaded from the same path as the model
model = AutoModel.from_pretrained(ernie_path, config=config_state) # config passed in so hidden states are returned
print(model_name+' model loaded')

ernie-2.0-base-en model loaded


In [4]:
# Load vocab set
dataset_name = 'SimVerb_mod' # specify vocab set to load
dataset, _ = load_sim_dataset('EN-SimVerb-3200-mod-uk') # ratings are not needed here

# Both words of every pair, duplicates included
vocab = [word for word_pair in dataset for word in (word_pair[0], word_pair[1])]

# Unique words in sorted order
vocab_set = sorted(set(vocab))
print(dataset_name+' vocab loaded')
print(str(len(vocab_set))+' words')

SimVerb_mod vocab loaded
822 words


### Compute and save contextual embeddings

In [None]:
# Get contextual embeddings for each verb sense in the corpus folder and save each to separate file
irregular_verbs = {'bear':'bore', 'cope':'coping', 'depends':'depends'} # default lemmatiser doesn't work for these
# '\\D' rather than '\D': the latter is an invalid escape sequence (same resulting string)
verb_senses_corpus_folder = path_base+'Corpus Data\\Dictionary Verb Corpus\\'
verb_senses_list = os.listdir(path=verb_senses_corpus_folder)
save_path = verb_senses_corpus_folder+'Embeddings\\'

# Senses that already have saved embeddings. The folder is listed once up front rather
# than once per corpus file; each sense has its own corpus file, so the files written
# during this run never affect the check for a later file.
done_senses = set(get_done_senses(verb_senses_corpus_folder+'Embeddings'))

for verb_sense_file in verb_senses_list:
    if not verb_sense_file.endswith(".txt"):
        continue

    # Get target verb and verb sense
    # NOTE(review): the [0:-6] slice assumes the name ends in a separator, a
    # single-character sense ID and '.txt' (e.g. 'run_1.txt') - confirm no sense ID has two digits
    target_verb = verb_sense_file[0:-6] # remove filetype and sense ID to get just the target verb
    verb_sense = verb_sense_file[0:-4] # get the verb sense

    if verb_sense in done_senses: # skip verbs we've already got the embeddings for
        continue

    # Open list of corpus sentences for target word
    with open(verb_senses_corpus_folder+verb_sense_file, encoding="utf-8") as file:
        sentence_list = file.readlines()

    # Get embedding from each sentence in list
    for sentence in sentence_list:
        sentence = sentence.lower()
        lemmatised_sentence_list, lemmatised_sentence_dict = lemmatise_sent(sentence) # get lemmatised version of the sentence

        # get the conjugated form of the target verb as it appears in the sentence,
        # falling back to the irregular verb table when the lemmatiser didn't find it
        target_verb_conj = lemmatised_sentence_dict.get(target_verb, irregular_verbs.get(target_verb))
        if target_verb_conj is None:
            continue

        sent_encoded_input = tokenizer(sentence, return_tensors='pt') # note use of sentence not lemmatised sentence
        encoded_word_ids = np.array(sent_encoded_input.input_ids[0])
        try:
            target_verb_conj_code = int(tokenizer(target_verb_conj, return_tensors='pt').input_ids[0][1]) # extract the token code for the target verb
            target_index = list(encoded_word_ids).index(target_verb_conj_code) # first position of the target token in our sentence
        except (ValueError, IndexError):
            print('Couldn\'t find the word: '+target_verb_conj)
            # Skip this sentence. Without this the loop carried on with the index left
            # over from the previous sentence and saved the embedding of an unrelated token.
            continue

        # only run the model once we know the target token is in the sentence
        sent_model_output = model(**sent_encoded_input)

        # get embeddings for each layer of network
        for layer in range(1,13):
            sent_embedding_raw = sent_model_output.hidden_states[layer].detach().numpy()[0] # get sentence embeddings
            word_embedding = sent_embedding_raw[target_index]
            save_embeddings_word(verb_sense, word_embedding, layer, save_path) # save embeddings to file by layer

### Put together full set of embeddings into single file

In [25]:
# Construct dictionary of all contextualised word embeddings:
# word_embed_dict[layer][sense] = array loaded from '<sense>_<layer>.txt'
embeddings_folder = verb_senses_corpus_folder+'Embeddings\\'
file_list = os.listdir(path=verb_senses_corpus_folder+'Embeddings')

# Every sense has one file per layer, so the file list names each sense many times.
# Reduce it to the unique senses (in first-seen order) so each file is loaded only once
# per layer instead of once per file-list entry.
# Sense name = file name without file type, minus its last two characters, with any
# leftover '_' stripped (this removes both '_5' and '_12' style layer suffixes).
sense_names = list(dict.fromkeys(
    file_name.split('.')[0][0:-2].strip('_') for file_name in file_list))

word_embed_dict = {}
for layer in range(1,13):
    layer_dict = {}
    for sense in sense_names: # all senses that we have embeddings for
        filename = embeddings_folder+sense+'_'+str(layer)+'.txt' # get the file for this sense and layer
        try:
            with open(filename) as embedding_file:
                layer_dict[sense] = np.loadtxt(embedding_file)
        except FileNotFoundError: # no file for this sense at this layer
            continue
    word_embed_dict[layer] = layer_dict

In [32]:
# Save raw contextualised embeddings to a single text file per ERNIE layer
# Each line written is: <sense> <value> <value> ... (values with four decimal places)
for layer in range(1,13):
    layer_embeddings = word_embed_dict[layer]
    if not layer_embeddings: # nothing loaded for this layer, so don't create an empty file
        continue

    save_path = 'raw_ernie_dict_embed_'+str(layer)+'.txt'
    with open(save_path, "a", encoding='utf-8') as save_file: # append mode: re-running adds the lines again
        for word, saved_embeddings in layer_embeddings.items():
            if np.ndim(saved_embeddings) > 1: # several saved embeddings (one per row)
                contextual_embedding = np.mean(saved_embeddings, axis=0) # average over all saved embeddings
            else: # a single saved embedding
                contextual_embedding = saved_embeddings

            # Format the values explicitly. Slicing str(array)[2:-1] dropped the minus
            # sign whenever the first value was negative ('[-0.1234 ...').
            values_string = ' '.join('%.4f' % value for value in np.ravel(contextual_embedding))
            save_file.write(word+' '+values_string+'\n')

In [33]:
# Save normalised transformer embeddings to a single text file per ERNIE layer
for layer in range(1,13):

    # Open file with unnormalised embeddings
    filename = 'raw_ernie_dict_embed_'+str(layer)+'.txt'
    with open(filename) as file:
        lines = [line.rstrip('\n') for line in file]

    # Load values into dictionary (a word that appears on several lines keeps its last line)
    model_dict = {}
    for line in lines:
        word_list = line.split()
        if not word_list: # skip blank lines
            continue
        word = word_list[0]
        # Everything after the word is a value. The previous [1:-1] slice threw away the
        # last dimension of every embedding, since the raw files have no trailing token.
        model_dict[word] = np.array([float(x) for x in word_list[1:]])

    # Convert to numpy array: one row per word, in dictionary order (stacked in one call)
    model_np = np.vstack(list(model_dict.values()))

    # Normalise array
    # NOTE(review): the mean is taken per embedding dimension (axis=0) but the standard
    # deviation per word (axis=1); the transposes only make that division broadcast.
    # Kept exactly as it was - confirm this mix of axes is intended.
    mean_np = np.mean(model_np,axis=0)
    std_np = np.std(model_np, axis=1)
    mean_tp_np = np.transpose(model_np - mean_np)
    model_final_np = np.transpose(mean_tp_np/std_np)

    # Save normalised embeddings to new file
    save_path = 'normalised_ernie_dict_embed_'+str(layer)+'.txt'
    with open(save_path, "a", encoding='utf-8') as save_file: # append mode: re-running adds the lines again
        for word, normalised_row in zip(model_dict.keys(), model_final_np):
            final_string = word+' '+str(normalised_row)[1:-1] # remove brackets from numpy
            save_file.write(final_string+'\n')

### Combine senses together (senses to words)

In [67]:
# Load all the senses into a dictionary for each word:
# sentence_storage[verb] = lines of every file whose name starts with that verb
# ('\\D' and '\\E' rather than '\D' and '\E': those are invalid escape sequences; same resulting string)
file_location = path_base+'Sense Embeddings\\Dictionary Sense Embeddings\\Embeddings\\'
verb_senses_list = os.listdir(file_location)
sentence_storage = {}
for verb_sense in verb_senses_list:
    with open(file_location+verb_sense, encoding="utf-8") as file:
        sentence_list = [line.rstrip('\n') for line in file]
    verb = verb_sense.split('.')[0].split('_')[0] # extract verb from filename (text before the first '_')
    # setdefault replaces the bare try/except that was used to detect the first file of a
    # verb, which would also have hidden any unrelated error
    sentence_storage.setdefault(verb, []).extend(sentence_list)

In [None]:
# Save the sentences to a single file for each word
# One '<word>.txt' file per key, written to the current working directory in append mode
for word, stored_lines in sentence_storage.items():
    with open(word+'.txt', "a", encoding='utf-8') as save_file:
        for stored_line in stored_lines:
            save_file.write(stored_line.strip('\n')+'\n') # exactly one new line per entry

### Other code

In [122]:
# Generate a list of word pairs with available senses

# load list of verbs with senses available
# (each file name without its file extension, minus its last two characters, with any
# leading/trailing '_' stripped)
done_verbs = list({file_name.split('.')[0][0:-2].strip('_') for file_name in verb_senses_list})
done_verbs_lookup = set(done_verbs) # set for fast membership tests below

# store verb pairs with both words available
simverb = load_sim_dataset('EN-SimVerb-3200-mod')
verb_dataset = [
    word_pair for word_pair in simverb[0]
    if word_pair[0] in done_verbs_lookup and word_pair[1] in done_verbs_lookup
]

# Save a list of the words that have been used for further analysis
# (tab-separated: word, word, rating). The with-block closes the file even if a write fails.
with open(path_base+'\\vocab_mini.txt', "a", encoding='utf-8') as save_file:
    for line in verb_dataset:
        save_file.write(line[0]+'\t'+line[1]+'\t'+line[2]+'\n')