In [1]:
## Code to generate, save and normalise transformer word embeddings
## James Fodor 2022
## Python 3.8

import numpy as np

from transformers import AutoModel, AutoTokenizer, AutoConfig
# from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import logging
from transformers import RobertaTokenizerFast

# Restrict the transformers logger (imported above as `logging`) to errors only,
# which hides the model-initialisation warnings printed when checkpoints are loaded.
logging.set_verbosity_error() 

# Set print options for numpy, needed for saving embeddings: the cells below turn
# vectors into text with np.array_str()/str(), so precision, line width and the
# summarisation threshold configured here determine the layout of the saved files.
np.set_printoptions(precision=4, threshold=10000, linewidth=10000, suppress=True, floatmode='fixed')

# Define base path location for data.
# Every backslash is escaped explicitly. Unescaped sequences such as "\S", "\Y" and
# "\D" are invalid escapes that only work by accident and trigger a warning on newer
# Python versions. A raw string is not an option because the value must end with a
# backslash (later cells append sub-folder names directly to it).
path_base = 'D:\\Study and Projects\\School Work\\Year 25 - PhD 1\\Data\\'

In [2]:
## Key functions

# Function to calculate cosine similarity between two embeddings
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Cosine similarity (-1 to 1) of two embedding vectors.

    If the dot product is exactly zero the function returns 0 straight away,
    without normalising; this also avoids dividing by a zero norm when one of
    the inputs is the zero vector.
    """
    dot_product = np.dot(embed_1, embed_2)
    if dot_product == 0:
        return 0 # nothing to normalise

    # Alternative measure that was tried: similarity, _ = spearmanr(embed_1, embed_2)
    norm_product = np.linalg.norm(embed_1) * np.linalg.norm(embed_2)
    return dot_product / norm_product

# Reduce per-token embeddings to a single vector (shared by the model branches below)
def _pool_token_embeddings(token_embeddings, embed_type):
    """ numpy_array, str -> numpy_array
    Collapses an array with one row per token into a single vector.

    embed_type selects the pooling:
      'mean'      - average over all rows
      'cls'       - first row only (the 'CLS' token position)
      'decontext' - average over the rows between the first and the last one,
                    i.e. the word tokens without the surrounding special tokens.
                    If there are no such rows the mean of an empty slice (NaN) results.

    Raises ValueError for any other embed_type.
    """
    if embed_type == 'mean':
        return token_embeddings.mean(axis=0)
    if embed_type == 'cls':
        return token_embeddings[0]
    if embed_type == 'decontext':
        return token_embeddings[1:-1].mean(axis=0)
    raise ValueError("Unknown embed_type '" + str(embed_type) + "'; expected 'mean', 'cls' or 'decontext'")

# Get decontextualised transformer embedding for given word
def transformer_embed_decontext(model_name, model, tokenizer, word, layer=0, embed_type='decontext'):
    """ str, model, tokenizer, str, int, str -> (numpy_array, sequence)
    Embeds a single word in isolation (no sentence context) with the given model.

    model_name decides which code path is used:
      * 'gpt2' / 'gpt2-large': the vector is looked up in the model's input
        embedding table (model.transformer.wte); layer and embed_type are ignored.
      * 't5-base' / 't5-large' / 'comet-atomic' / 'libert-2m': the model's
        last_hidden_state is pooled according to embed_type; layer is ignored.
      * anything else: hidden_states[layer] is pooled according to embed_type.

    Returns the word embedding together with all hidden layers of the model
    output. The second element is an empty list for the first two groups of
    models, because only the generic path collects hidden states.

    Raises ValueError if embed_type is not one of 'mean', 'cls' or 'decontext'
    (only checked on the paths that pool token embeddings).
    """
    all_hidden_layers = []

    if model_name in ('gpt2', 'gpt2-large'): # gpt2 model uses a different code
        encoded_input = tokenizer.encode(word, add_prefix_space=True, return_tensors='pt')
        # NOTE(review): the [0][0] index keeps a single row of the lookup, which appears to be
        # the first token's vector only - confirm this is intended for multi-token words.
        word_embedding_raw = model.transformer.wte.weight[encoded_input,:].detach().numpy()[0][0]
        word_embedding = word_embedding_raw.reshape(-1)

    elif model_name in ('t5-base', 't5-large', 'comet-atomic', 'libert-2m'): # also uses different code
        encoded_input = tokenizer(word, return_tensors='pt') #pt = pytorch
        model_output = model(**encoded_input)
        word_embedding_raw = model_output.last_hidden_state.detach().numpy()[0] # layer=0 here gives gibberish
        word_embedding = _pool_token_embeddings(word_embedding_raw, embed_type)

    else: #most models use this code
        encoded_input = tokenizer(word, return_tensors='pt') #pt = pytorch
        model_output = model(**encoded_input)
        all_hidden_layers = model_output.hidden_states
        word_embedding_raw = all_hidden_layers[layer].detach().numpy()[0]
        word_embedding = _pool_token_embeddings(word_embedding_raw, embed_type)

    return(word_embedding, all_hidden_layers)

In [3]:
# Load transformer model
# Picks the tokenizer/model classes that match model_name and loads both from the
# local sub-folder of that name. All branches define the globals `tokenizer` and `model`.
model_name = 'ernie-2.0-base-en' #name of subfolder for model
filename = path_base+'Sentence Embeddings//'+model_name

if model_name in ('gpt2', 'gpt2-large'):
    # No hidden-state config needed: transformer_embed_decontext reads GPT-2 vectors
    # directly from the input embedding table (model.transformer.wte).
    tokenizer = GPT2Tokenizer.from_pretrained(filename)
    model = GPT2LMHeadModel.from_pretrained(filename)
elif model_name in ('t5-base', 't5-large'):
    # The T5 classes are not imported at the top of the notebook (that import line is
    # commented out), so this branch used to fail with a NameError. Import them here,
    # only when a T5 model is actually requested.
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    config_state = AutoConfig.from_pretrained(filename, output_hidden_states=True) # get hidden states
    tokenizer = T5Tokenizer.from_pretrained(filename)
    model = T5ForConditionalGeneration.from_pretrained(filename, config=config_state)
elif model_name == 'libert-2m':
    config_state = AutoConfig.from_pretrained(filename, output_hidden_states=True) # get hidden states
    tokenizer = RobertaTokenizerFast.from_pretrained('kssteven/ibert-roberta-base') # needs a different tokenizer for some reason
    model = AutoModel.from_pretrained(filename, config=config_state)
else: # for BERT-based models
    # NOTE(review): 'comet-atomic' is treated like T5 inside transformer_embed_decontext
    # but ends up in this generic branch when loading - confirm that is intended.
    config_state = AutoConfig.from_pretrained(filename, output_hidden_states=True) # get hidden states
    tokenizer = AutoTokenizer.from_pretrained(filename)
    model = AutoModel.from_pretrained(filename, config=config_state)

In [4]:
# Generate and save transformer single word embeddings (decontextualised)
# For every word in the vocab file, one line "<word> <v1> <v2> ..." is appended to a
# separate text file per hidden layer ('<model_name>-layer-<n>.txt').
vocab_file = 'Vocab_lists/combined_corpus_66k.txt' # from combined vocab set
vocab_path = path_base + vocab_file

skipped_words = [] # words for which no embedding could be produced

# Loop over all vocab in vocab file
with open(vocab_path, 'rt', encoding='utf-8') as vocab_handle:
    for line in vocab_handle:
        word = line.strip()
        if not word:
            continue # ignore blank lines instead of embedding an empty string

        try:
            # Get hidden states for the word (the pooled layer-0 embedding is not needed here)
            _, all_hidden_layers = transformer_embed_decontext(model_name, model, tokenizer, word)

            # Build the output line for every layer the model returned, before writing anything,
            # so a failure half-way through cannot leave the per-layer files out of step
            layer_lines = []
            for layer, hidden_state in enumerate(all_hidden_layers):
                word_embedding_raw = hidden_state.detach().numpy()[0]
                word_embedding = word_embedding_raw[1:-1].mean(axis=0) # mean of word tokens only
                # Convert np_array to string; relies on the numpy print options set in the
                # first cell to keep the whole vector on one line with 4 fixed decimals
                embed_string = np.array_str(word_embedding)
                # Remove only the enclosing brackets and the padding blanks. Cutting two leading
                # characters would also cut the minus sign of a negative first component,
                # because numpy pads with a blank only when the first value is non-negative.
                final_string = word+' '+embed_string[1:-1].strip() # add word to front of embed string
                save_path = path_base + 'Word Embeddings/'+model_name+'-layer-'+str(layer)+'.txt'
                layer_lines.append((save_path, final_string))
        except Exception: # not a bare except, so the long loop can still be interrupted from the keyboard
            skipped_words.append(word) # skip if we can't find the word embedding
            continue

        if not layer_lines:
            skipped_words.append(word) # model path returned no hidden layers, nothing to save
            continue

        # Save embeddings for all layers
        for save_path, final_string in layer_lines:
            with open(save_path, "a", encoding='utf-8') as save_file:
                save_file.write(final_string+'\n')

if skipped_words:
    print(str(len(skipped_words))+' words skipped (see skipped_words)')

In [7]:
# Save normalised transformer embeddings; see paper 'all bark and no bite'
# For each layer file: read "<word> <v1> <v2> ..." lines, standardise every embedding
# dimension (subtract its mean, divide by its standard deviation across all words)
# and write the result to a '..._normalised_<layer>.txt' file in the same format.
# NOTE(review): `save_path` is used here as a folder prefix, but the only assignment
# visible in this notebook sets it to a full per-layer file path inside the
# embedding-saving loop - confirm where the intended value comes from.
for layer in range(1,13):

    # Open file with unnormalised embeddings (utf-8, matching how the embedding files are written)
    raw_embeds_file = save_path+'contextual_embeddings_layer_'+str(layer)+'.txt'
    with open(raw_embeds_file, encoding='utf-8') as file:
        lines = [line.rstrip('\n') for line in file]

    # Load values into dictionary (a repeated word keeps its last embedding)
    model_dict = {}
    for line in lines:
        word_list = line.split()
        if not word_list:
            continue # ignore blank lines
        word = word_list[0]
        # Everything after the word is a vector component; slicing off the final entry
        # as well would silently drop the last embedding dimension.
        # NOTE(review): assumes each line ends with a numeric value - confirm against the file writer.
        model_dict[word] = np.array([float(x) for x in word_list[1:]])

    if not model_dict:
        print('No embeddings found in '+raw_embeds_file+', skipping layer '+str(layer))
        continue

    # Convert to numpy array (one row per word), stacked in a single call
    model_np = np.vstack(list(model_dict.values()))

    # Normalise array: z-score each dimension (column) over all words
    mean_np = np.mean(model_np, axis=0)
    std_np = np.std(model_np, axis=0)
    std_np[std_np == 0] = 1.0 # constant dimensions stay at zero instead of becoming NaN
    model_final_np = (model_np - mean_np)/std_np

    # Save normalised embeddings to new file; opened in write mode so that re-running
    # the cell replaces the file instead of appending a duplicate copy
    norm_embeds_file = save_path+'contextual_embeddings_layer_normalised_'+str(layer)+'.txt'
    with open(norm_embeds_file, "w", encoding='utf-8') as file:
        for word, normalised_row in zip(model_dict.keys(), model_final_np):
            final_string = word+' '+str(normalised_row)[1:-1] # remove brackets from numpy
            file.write(final_string+'\n')