In [1]:
## Code to extract embeddings from static word embedding models
## James Fodor 2022
## Python 3.8

import pandas as pd
import numpy as np

# Set numpy display properties needed for printing to file
# (embeddings are written out via str(array), so these options fix the number format,
# stop long arrays being summarised with '...', and keep each array on a single line)
np.set_printoptions(precision=5, threshold=10000, linewidth=10000, suppress=True, floatmode='fixed')

# Define base path location for data
# Every backslash is escaped explicitly: sequences such as '\S' or '\Y' are invalid
# escapes that Python only tolerates with a warning. The string value is unchanged.
path_base = 'D:\\Study and Projects\\School Work\\Year 25 - PhD 1\\Data\\'

In [4]:
## Key functions

# Function to load a specific word embedding model
def import_model(file_loc, vocab, full_import=False):
    """ (str, iterable_str, bool) -> dict
    Loads a whitespace-delimited text embedding file and returns a dictionary
    mapping each imported word to a numpy array of its embedding values.

    file_loc: location of the embedding file, appended to the module-level path_base.
    vocab: words to keep; only consulted when full_import is False.
    full_import: if True, every word in the file is imported, not just those in vocab.

    Each line is expected to hold a word followed by numeric values. Blank lines are
    skipped. If a word occurs more than once in the file, the last occurrence wins.
    Raises ValueError if a value on an imported line cannot be converted to float.
    """
    # Set membership is O(1) per file line; not needed when everything is imported
    wanted = None if full_import else set(vocab)

    model_dict = {} # create word dictionary for specific model
    # Stream the file line by line rather than holding every line in memory at once
    with open(path_base+file_loc, encoding='utf-8') as file:
        for line in file:
            word_list = line.split() # split at any whitespace chars
            if not word_list: # blank line, nothing to import
                continue
            word = word_list[0]
            if wanted is not None and word not in wanted:
                continue # only words for testing unless full_import is set
            # NOTE(review): the [1:-1] slice skips the word itself but also drops the
            # final value on each line; kept as in the original - confirm this is
            # intended for the embedding file format being read.
            model_dict[word] = np.array([float(x) for x in word_list[1:-1]])

    return model_dict
    
# Load word similarity dataset
def load_sim_dataset(dataset):
    """ str -> (list_list_str, np_array)
    Loads a dataset of word similarities, returning the word pairs and similarity ratings.

    dataset: name of the dataset; the file read is 'Vocab_lists/<dataset>.txt',
             resolved relative to the current working directory.

    Returns (wordpairs, ratings). wordpairs holds one list of whitespace-separated
    tokens per non-blank line (the two words, the rating string and any further
    tokens); ratings is a numpy array with the third token of each line as a float.
    Blank lines are skipped. Raises IndexError if a non-blank line has fewer than
    three tokens and ValueError if the third token is not a number.
    """
    filename = 'Vocab_lists/'+dataset+'.txt'

    wordpairs = []
    ratings = []
    with open(filename, encoding='utf-8') as file:
        for line in file:
            fields = line.split() # split at any whitespace chars (also drops the newline)
            if not fields: # blank line, e.g. trailing empty line at end of file
                continue
            wordpairs.append(fields)
            ratings.append(float(fields[2]))

    return wordpairs, np.array(ratings)


In [3]:
# Build the sorted list of unique words appearing in the similarity dataset
dataset_name = 'EN-SIMLEX-999-VERB'
dataset, _ = load_sim_dataset(dataset_name)

# Collect both words of every pair (duplicates are removed in the next step)
vocab = []
for word_pair in dataset:
    vocab.extend((word_pair[0], word_pair[1]))

vocab_set = sorted(set(vocab)) # unique words in alphabetical order
print(f'{dataset_name} vocab loaded')
print(f'{len(vocab_set)} words')

EN-SIMLEX-999-VERB vocab loaded
170 words


In [6]:
# Import embeddings for only the words in the vocabulary set
# (embed_file is appended to path_base inside import_model)
embed_file = 'Word Embeddings//WordNet Word Embeddings//wn2vec.txt'
embeddings = import_model(embed_file, vocab_set, full_import=False)

In [51]:
# Special code for certain models with unusual formatting

# Special code for Gensim Skipgram BNC
# data = np.loadtxt(embedding_loc, dtype = str) # special code for loading Gensim Skipgram BNC (regular code not working for some reason)
# embedding_model = pd.DataFrame(data[:,1:].astype(float), index = data[:,0])
# embedding_model.index = [x.replace('::',' ') for x in embedding_model.index.values]

# Special code for processing BNC data; remove _ and :: annotations
# embedding_model = pd.read_table(embedding_loc, index_col=0, header=None, delim_whitespace=True, quoting=csv.QUOTE_NONE, skip_blank_lines=True)
# embedding_model.index = [x.split('_')[0].replace('::',' ') for x in embedding_model.index.values] 
# embedding_model.index = remove_trailing_chars(embedding_model) # needed to remove trailing characters when present

# Code for removing trailing characters from words in index values; input is pandas df; needed for some models only
# def remove_trailing_chars(embedding_model):
#     new_indices = []
#     for word in embedding_model.index.values:
#         new_indices.append(word[0:-2])
#     return(new_indices)

In [11]:
# Save embeddings to new file, one line per word: the word followed by its values
save_path = 'wordnet.txt' # specify name of save file
# NOTE: the file is opened in append mode, so re-running this cell adds the same
# rows again; delete the file first if a clean copy is wanted.
with open(save_path, "a", encoding='utf-8') as file:
    for word in vocab_set:
        # Report words with no embedding instead of silently skipping them
        if word not in embeddings:
            print('missing '+word)
            continue
        embedding = embeddings[word]
        # str() of the array uses the numpy print options set earlier in the notebook
        file.write(word+' '+str(embedding)[1:-1]+'\n') # remove brackets