In [16]:
## Code to extract example sentences from Wikipedia articles
## James Fodor 2022
## Python 3.8

import numpy as np
import re
import os

from collections import Counter
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import logging
from nltk.stem import WordNetLemmatizer

# Set print option for numpy, needed for saving embeddings
# (code that writes str(array) to disk relies on fixed 4-decimal, single-line output)
np.set_printoptions(precision=4, threshold=10000, linewidth=10000, suppress=True, floatmode='fixed')

# Define base path location for data
# Every backslash is written as an explicit '\\' escape: sequences such as '\S' or '\D'
# are invalid escapes that newer Python versions warn about. The resulting string is
# unchanged and still ends with a single trailing backslash.
path_base = 'D:\\Study and Projects\\School Work\\Year 25 - PhD 1\\Data\\'

# Get wordnet to work
from nltk.corpus import wordnet
from nltk.data import path # need to specify the location of the nltk data
# NOTE: the name `path` is reassigned to the model directory string in the model-loading
# cell, so this cell must run before that one (or after a kernel restart).
# Backslashes are escaped explicitly ('\F' is an invalid escape sequence); the appended
# string is unchanged, including the doubled separator after path_base's trailing backslash.
path.append(path_base+"\\Frames and Structured Data\\FrameNet\\nltk_data")

### Functions for getting sentence embeddings

In [2]:
# Function to calculate cosine similarity between two embeddings
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Returns the cosine similarity (-1 to 1) between two embeddings, inputted as vectors.

    The dot product is computed once and reused. When it is exactly zero (which
    includes the case where either vector is all zeros) 0.0 is returned without
    dividing by the norms, so a zero vector never causes a 0/0 division.
    """
    dot_product = np.dot(embed_1, embed_2)
    if dot_product == 0:
        return 0.0 # don't normalise if similarity is zero
    return dot_product/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2))


# Get decontextualised transformer embedding for given word
def transformer_embed_decontext(model, tokenizer, word, layer=0, embed_type='decontext'):
    """ pytorch_model, pytorch_tokenizer, str, int, str -> np_array
    Extracts a word embedding given a pytorch model and tokenizer. Three modes of operation
    depending on how to get the word embedding:
      'mean'      - mean over every token position of the chosen layer
      'cls'       - the vector at the first token position (the 'CLS' token)
      'decontext' - mean over all positions except the first and last
                    (presumably the special start/end tokens - confirm for the tokenizer used)

    The model must expose `hidden_states` on its output (i.e. be configured to return them).
    Raises ValueError for any other embed_type; this is checked before the model is run.
    """
    if embed_type not in ('mean', 'cls', 'decontext'):
        raise ValueError("embed_type must be 'mean', 'cls' or 'decontext', got "+repr(embed_type))

    encoded_input = tokenizer(word, return_tensors='pt') #pt = pytorch
    model_output = model(**encoded_input)
    # [0] selects the only item in the batch, leaving one row per token
    token_embeddings = model_output.hidden_states[layer].detach().numpy()[0]

    if embed_type == 'mean': # take the mean of all tokens
        return token_embeddings.mean(axis=0)
    if embed_type == 'cls': # use the 'CLS' token
        return token_embeddings[0]
    return token_embeddings[1:-1].mean(axis=0) # 'decontext': mean of word tokens only


# Load word similarity dataset
def load_sim_dataset(dataset):
    """ str -> (list_str, np_array)
    Loads a dataset of word similarities, returning the word pairs and similarity ratings.

    Reads '<dataset>.txt' from the word-similarity folder under path_base. Each non-blank
    line is split on whitespace; the third field is parsed as the rating. Every entry of
    the returned word-pair list is the full list of fields on that line (so it also holds
    the rating string). Blank lines are skipped instead of raising IndexError.
    """
    # backslashes escaped explicitly ('\W' is an invalid escape); the folder string is unchanged
    folder = path_base+'Word Similarity Data\\Word Similarities Final\\'
    filename = folder+dataset+'.txt'
    with open(filename) as file:
        lines = file.readlines()

    wordpairs = []
    ratings = []
    for line in lines:
        fields = line.split() # split at any whitespace chars (also drops the newline)
        if not fields: # ignore empty lines, e.g. a trailing newline at end of file
            continue
        wordpairs.append(fields)
        ratings.append(float(fields[2]))

    return(wordpairs, np.array(ratings))


# Save contextual embeddings for a given word to a new file for each word
def save_embeddings_word(word, word_embeddings, layer, path):
    """ str, np_array, int, str -> None
    Saves word_embeddings for specified word to a specified path.

    Appends one line to '<path><word>_<layer>.txt' holding the embedding values separated
    by single spaces, each written with 4 decimal places.

    Values are formatted explicitly rather than by slicing str(array): that slice removed
    the first two characters of the printed array, which deletes the minus sign (or first
    digit) of the first value whenever numpy does not pad it with a leading space, and it
    also depended on the global numpy print options.
    """
    values = np.asarray(word_embeddings, dtype=float).ravel()
    final_string = ' '.join('%.4f' % value for value in values)
    save_path = path+word+'_'+str(layer)+'.txt'
    with open(save_path, "a", encoding='utf-8') as save_file: # append: one line per saved embedding
        save_file.write(final_string+'\n')


# Function to return a lemmatised list and dictionary for a given sentence
def lemmatise_sent(sentence):
    """ str -> list_str, dict
    Takes in a sentence and returns a tokenised and lemmatised list and dictionary
    for all words in the sentence.

    The sentence is split on whitespace. The characters , ; . ? ! ( ) are removed from each
    token, which is then lemmatised as a verb. The list holds the lemmas in sentence order.
    The dictionary maps each lemma to the original token it came from, with punctuation
    still attached; if several tokens share a lemma, the last one in the sentence is kept.
    """
    lemmatised_sentence_list = []
    lemmatised_sentence_dict = {}
    lemmatizer = WordNetLemmatizer()
    for original_word in sentence.split():
        # raw string: same pattern as before, without the invalid '\)' and '\(' string escapes
        fixed_original_word = re.sub(r'[,;.?!\)\(]', '', original_word) # remove punctuation
        lemmatised_word = lemmatizer.lemmatize(fixed_original_word, wordnet.VERB) # lemmatise all words in sentence
        lemmatised_sentence_list.append(lemmatised_word)
        lemmatised_sentence_dict[lemmatised_word] = original_word
    return(lemmatised_sentence_list, lemmatised_sentence_dict)


# get token number for a specific word
def get_word_code(word):
    """ str -> int
    Encodes the word with the notebook-level `tokenizer` and returns, as a plain int,
    the id found at position 1 of the encoded sequence (position 0 is presumably a
    leading special token - confirm for the tokenizer in use).
    """
    token_ids = tokenizer(word, return_tensors='pt').input_ids[0] #pt = pytorch
    return int(token_ids[1]) # token code for target word

### Analyse the sentences and get embeddings

In [3]:
# Load full corpus of sentences from file
# The corpus file sits under path_base and is read as UTF-8, one list entry per line.
filename = path_base+'Corpus Data/Wikipedia 10k corpus/full_corpus.txt'
with open(filename, encoding='utf-8') as file:
    # only the trailing newline is stripped; any other whitespace on the line is kept
    sentence_corpus = [line.rstrip('\n') for line in file]
print(str(len(sentence_corpus))+' sentences loaded')

2002787 sentences loaded


In [4]:
# Load transformer model
# The config, tokenizer and model are all loaded from the same local directory under path_base.
model_name = 'ernie-2.0-base-en'
# NOTE(review): `path` was imported from nltk.data in the first cell; this assignment replaces
# that name with a plain string, so the first cell cannot be re-run after this one without
# appending to this string's (non-existent) list interface - consider a distinct name.
path = path_base+'Sentence Embeddings//'+model_name
logging.set_verbosity_error() # turn off annoying model initialisation warning
config_state = AutoConfig.from_pretrained(path, output_hidden_states=True) # get hidden states
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path, config=config_state)
print(model_name+' model loaded')

ernie-2.0-base-en model loaded


In [17]:
# Load vocab set
# Collect both words of every pair in the similarity dataset, then keep the unique
# words in sorted order.
dataset_name = 'EN-SIMLEX-999-VERB'
dataset, _ = load_sim_dataset(dataset_name) # ratings are not needed here
vocab = []
for word_pair in dataset:
    vocab.extend((word_pair[0], word_pair[1])) # first two fields are the two words
vocab_set = sorted(set(vocab))
print(dataset_name+' vocab loaded')
print(str(len(vocab_set))+' words')

EN-SIMLEX-999-VERB vocab loaded
170 words


In [18]:
# Initialise vocab counter
# For every vocab word, count how many embeddings are already on disk so that a
# re-run resumes instead of appending duplicates past the limit.
vocab_storage_count = Counter()
save_path = '' # location to save word embeddings

for word in vocab_set: # looping over words in vocab set
    # Look in save_path, which is where save_embeddings_word is told to write. The check
    # previously used `path` (the model directory), so saved files were never found.
    filename = save_path+word+'_1.txt' # layer-1 file: one line per saved embedding
    if os.path.isfile(filename): # check if that word has any saved embeddings
        with open(filename, encoding='utf-8') as file:
            vocab_storage_count[word] = sum(1 for _line in file) # get count of number of embeddings saved
    else:
        vocab_storage_count[word] = 0 # no embeddings saved yet

In [None]:
# Get contextual embeddings for each word in vocab set and save each to separate file
start_num = 0 # corpus index to start at if not the beginning (raise it to resume an interrupted run)
count = start_num # corpus index of the sentence currently being processed
count_limit = 100 # max number of contextualised embeddings per word

# loop over all sentences in corpus
for count, sentence in enumerate(sentence_corpus[start_num:], start=start_num):

    # get vocab that we still need embeddings for; stop once every word has reached the limit
    remaining_vocab = {word for word in vocab_set if vocab_storage_count[word]<count_limit}
    if not remaining_vocab:
        break

    # lemmatise first (cheap) so the model is only run on sentences containing a wanted word
    lemmatised_sentence_list, lemmatised_sentence_dict = lemmatise_sent(sentence) # get lemmatised version of the sentence
    target_words = sorted(set(lemmatised_sentence_list) & remaining_vocab) # words in the vocab set
    if not target_words:
        continue

    # encode the sentence and run the model once per sentence
    sent_encoded_input = tokenizer(sentence, return_tensors='pt') # note use of sentence not lemmatised sentence
    encoded_word_ids = [int(token_id) for token_id in sent_encoded_input.input_ids[0]]
    sent_model_output = model(**sent_encoded_input)

    # get embeddings for all words from the vocab set
    for lemmatised_word in target_words:
        original_word = lemmatised_sentence_dict[lemmatised_word] # undo lemmatisation of matching word
        # strip the same punctuation that lemmatise_sent removes, so that e.g. a leading '('
        # is not mistaken for the first token of the word
        clean_word = re.sub(r'[,;.?!\)\(]', '', original_word)
        encoded_word = tokenizer(clean_word, return_tensors='pt')
        word_code = int(encoded_word.input_ids[0][1]) # get token code for target word (its first token only)
        if word_code not in encoded_word_ids:
            continue # token not present in the encoded sentence: skip rather than abort the whole run
        # first occurrence of the token code in the sentence gives the position of the word embedding
        target_index = encoded_word_ids.index(word_code)

        # get embeddings for each layer of network
        for layer in range(1,13):
            sent_embedding_raw = sent_model_output.hidden_states[layer].detach().numpy()[0] # get sentence embeddings
            word_embedding = sent_embedding_raw[target_index]
            save_embeddings_word(lemmatised_word, word_embedding, layer, save_path) # save embeddings to file by layer

        vocab_storage_count[lemmatised_word] += 1


### Put together full set of embeddings

In [20]:
# Construct dictionary of all contextualised word embeddings
# word_embed_dict[layer][word] holds whatever np.loadtxt reads from that word's file for
# that layer, or the layer-0 decontextualised embedding when the file cannot be read.
word_embed_dict = {}
fallback_embeddings = {} # layer-0 fallback per word, computed at most once and reused for every layer
for layer in range(1,13):
    layer_dict = {}
    for word in vocab_set: # read all words in vocab set
        filename = save_path+word+'_'+str(layer)+'.txt' # get the file
        try:
            with open(filename) as file:
                layer_dict[word] = np.loadtxt(file)
        except (OSError, ValueError): # missing/unreadable or malformed file: use non-contextual embedding
            # Only file problems trigger the fallback; a bare except would also swallow
            # KeyboardInterrupt and genuine programming errors.
            if word not in fallback_embeddings:
                fallback_embeddings[word] = transformer_embed_decontext(model, tokenizer, word, layer=0)
            layer_dict[word] = fallback_embeddings[word]
    word_embed_dict[layer] = layer_dict

In [32]:
# Save raw contextualised embeddings to a single text file
# One file per layer, one line per word: the word followed by its space-separated values.
# NOTE: files are opened in append mode, so re-running this cell adds a second copy of every line.
for layer in range(1,13):
    raw_embeds_file = save_path+'contextual_embeddings_layer_'+str(layer)+'.txt'

    with open(raw_embeds_file, "a", encoding='utf-8') as file: # opened once per layer, not once per word
        for word, stored_embeddings in word_embed_dict[layer].items():
            stored_embeddings = np.asarray(stored_embeddings, dtype=float)
            if stored_embeddings.ndim > 1: # several saved embeddings (one per row)
                contextual_embedding = np.mean(stored_embeddings, axis=0) # average over all saved embeddings
            else: # a single embedding: use as is
                contextual_embedding = stored_embeddings

            # Format values explicitly: slicing str(array)[2:-1] can cut off the sign or first
            # digit of the first value and depends on the global numpy print options.
            final_string = word+' '+' '.join('%.4f' % value for value in contextual_embedding)
            file.write(final_string+'\n')

In [26]:
# Save normalised transformer embeddings; see paper 'all bark and no bite'
for layer in range(1,13):

    # Open file with unnormalised embeddings
    raw_embeds_file = save_path+'contextual_embeddings_layer_'+str(layer)+'.txt'
    with open(raw_embeds_file, encoding='utf-8') as file:
        lines = [line.rstrip('\n') for line in file]

    # Load values into dictionary (a word that appears on several lines keeps its last line)
    model_dict = {}
    for line in lines:
        word_list = line.split()
        if not word_list: # skip blank lines
            continue
        word = word_list[0]
        # every field after the word is a value; the previous [1:-1] slice dropped the last dimension
        model_dict[word] = np.array([float(x) for x in word_list[1:]])

    # Convert to numpy array: one row per word, in dictionary order (stacked in a single call)
    model_np = np.vstack(list(model_dict.values()))

    # Normalise array
    mean_np = np.mean(model_np, axis=0) # mean of each dimension across words
    # NOTE(review): the std is taken per word (axis=1, over the uncentred values) while the mean
    # is per dimension (axis=0). This is the original behaviour and is preserved here; a
    # per-dimension z-score would use axis=0 instead - confirm which one is intended.
    std_np = np.std(model_np, axis=1)
    model_final_np = (model_np - mean_np)/std_np[:, np.newaxis] # centre, then scale each word's row

    # Save normalised embeddings to new file
    norm_embeds_file = save_path+'contextual_embeddings_layer_normalised_'+str(layer)+'.txt'
    with open(norm_embeds_file, "a", encoding='utf-8') as file: # append mode: re-running duplicates lines
        for word, normalised_row in zip(model_dict.keys(), model_final_np):
            # explicit 4-decimal formatting, independent of numpy print options
            final_string = word+' '+' '.join('%.4f' % value for value in normalised_row)
            file.write(final_string+'\n')

### Testing code

In [6]:
# Print the encoding ids for a whole sentence
# Scratch cell: encodes a fixed sentence, runs the model and prints the token ids.
# NOTE: this overwrites the notebook-level `sentence` variable used by the extraction loop.
sentence = 'here is a test sentence.'
encoded_input = tokenizer(sentence, return_tensors='pt') # note use of sentence not lemmatised sentence
model_output = model(**encoded_input)
# hidden state at index 12, first batch item; computed here but not printed
sent_embedding_raw = model_output.hidden_states[12].detach().numpy()[0] # get sentence embeddings
encoded_ids = np.array(encoded_input.input_ids[0])
print(encoded_ids)

[ 101 2182 2003 1037 3231 6251 1012  102]
