In [16]:
import numpy as np
import torch
import pandas as pd
import csv
import matplotlib.pyplot as plt
import tensorflow as tf
import simple_elmo

from transformers import AutoModel, AutoTokenizer, AutoConfig
from allennlp.modules.elmo import Elmo, batch_to_ids
from scipy.stats import spearmanr
from simple_elmo import ElmoModel

In [32]:
class word_analysis(object):
    
    def __init__(self):
        self.models = list(self.model_storage.keys())
        self.datasets = list(self.dataset_storage.keys())
        return(None)
    
    # Define file location variables
    folder_loc= 'D:/Study and Projects/School Work/Year 25 - PhD 1/PhD Work/Data//'
    model_loc = 'Word Embeddings//'
    dataset_loc = 'Word Similarity Data/Collection of Word Similarity Benchmarks//'
    path_base = 'D:/Study and Projects/School Work/Year 25 - PhD 1/PhD Work/Data/Sentence Embeddings' #must use '/'

    model_files = {'glove':'Glove Word Embeddings\glove.6B.300d.txt',
                   'fasttext':'fastText_Skipgram_true_wiki.txt',
                   'conceptnet':'conceptnet-numberbatch-300-en.txt',
                   'word2vec_skip':'Word2vec Skipgram CoNLL17\model.txt',
                   'lexvec':'lexvec_embeddings_wiki+newscrawl_300d.txt',
                   'bert':'bert-base-uncased-vocab.txt',
                   'elmo':'Elmo Embeddings\elmo_false_wiki2019',
                  }
        
    dataset_files = {'RG65':'EN-RG-65.txt',
                     'YP130':'EN-YP-130.txt',
                     'MTurk287':'EN-MTurk-287.txt',
                     'MTurk771':'EN-MTurk-771.txt',
                     'WS353':'EN-WS-353-ALL.txt',
                     'RW':'EN-RW-STANFORD.txt',
                     'MEN':'EN-MEN-TR-3k.txt',
                     'SimVerb':'EN-SimVerb-3500.txt',
                     'Simlex':'EN-SIMLEX-999.txt'
                    }
    
    model_storage = {'glove':0, 'fasttext':0, 'conceptnet':0, 
                     'word2vec_skip':0, 'lexvec':0, 'bert':0, 'elmo':0, 'gensim_skip':0}
     
    dataset_storage = {'RG65':0, 'YP130':0, 'MTurk287':0, 'MTurk771':0, 
                       'WS353':0, 'RW':0, 'MEN':0, 'SimVerb':0, 'Simlex':0}
    
        
    # Function to load a specific word embedding model
    def import_model(self, model_name):
        """ string -> None
        Imports an embedding model, storing it in the model_storage dictionary.
        """
        if self.model_storage[model_name]==0:
            if model_name=='bert':
                model_path = self.folder_loc + 'Sentence Embeddings/bert-base-uncased'
                config_state = AutoConfig.from_pretrained(model_path, output_hidden_states=True) # get hidden states
                tokenizer = AutoTokenizer.from_pretrained(model_path)
                model = AutoModel.from_pretrained(model_path, config=config_state)
                self.model_storage[model_name] = [(model,tokenizer)] # need the model and tokeniser
                           
            elif model_name=='elmo':
                options_path = folder_loc + model_loc + 'Elmo Embeddings/elmo_false_wiki2019/options.json'
                weights_path = folder_loc + model_loc + 'Elmo Embeddings/elmo_false_wiki2019/model.hdf5'
                elmo = Elmo(options_path, weights_path, 3, dropout=0)
                self.model_storage[model_name] = [elmo]
      #          model_path = self.folder_loc + self.model_loc + 'Elmo Embeddings/elmo_false_wiki2019'
      #          graph = tf.Graph()
      #          with graph.as_default() as elmo_graph:
      #              elmo_model = ElmoModel()
      #              elmo_model.load(model_path)
      #          with elmo_graph.as_default() as current_graph: # need this part so we can load multiple times
      #              tf_session = tf.compat.v1.Session(graph=elmo_graph) # TF_session must be passed with the model
      #              with tf_session.as_default() as sess:
      #                  elmo_model.elmo_sentence_input = simple_elmo.elmo.weight_layers("input", elmo_model.sentence_embeddings_op)
      #                  sess.run(tf.compat.v1.global_variables_initializer())
      #          self.model_storage[model_name] = (elmo_model, tf_session)
                        
            else: # for static word embedding models
                file_loc = self.model_files[model_name]
                filename = self.folder_loc+self.model_loc+file_loc
                self.model_storage[model_name] = [pd.read_table(filename, sep=' ', index_col=0, header=None, quoting=csv.QUOTE_NONE)]
        
            print(model_name+' loaded')
            
    
    # Function to load word similarity data for specified dataset
    def import_dataset(self, dataset_name):
        """ string -> None
        Imports a dataset, storing a value of the form (list, numpy_array) in the dataset_storage dictionary.
        """
        if self.dataset_storage[dataset_name]==0: # if dataset not yet loaded
            file_loc = self.dataset_files[dataset_name]
            filename = self.folder_loc+self.dataset_loc+file_loc
            with open(filename) as file:
                lines = file.readlines()

            wordpairs = [None]*len(lines) # initialise storage
            ratings = [None]*len(lines)
            i=0
            for line in lines:
                line = line.strip() # remove new line chars
                wordpairs[i] = line.split() # split at any whitespace chars
                ratings[i] = float(wordpairs[i][2])
                i=i+1
            ratings = np.array(ratings)

            self.dataset_storage[dataset_name] = (wordpairs, ratings)
            print(dataset_name+' loaded')   
        
    
    # Function to get the embedding for a specific word, given a model
    def get_word_embed(self, model_name, word, layer, comp_method):
        self.import_model(model_name)
        model = self.model_storage[model_name][0] # [0] to reach inside the array its in
        
        if model_name=='bert':
            model_main = model[0]
            tokenizer = model[1] # transformers need a tokenizer as well
            encoded_input = tokenizer(word, return_tensors='pt') #pt = pytorch
            model_output = model_main(**encoded_input)
            word_embedding_raw = np.array(model_output[2][layer].detach()[0])
            # embeddings depend on the comp_method chosen
            if comp_method=='mean': # take the mean of all tokens
                word_embedding = word_embedding_raw.mean(axis=0)
            elif comp_method=='cls': # use the 'CLS' token
                word_embedding = word_embedding_raw[0]
            elif comp_method=='decontext': # take the mean of word tokens only
                word_embedding = word_embedding_raw[1:-1].mean(axis=0)

        elif model_name=='elmo':
            character_ids = batch_to_ids(list([word]))
            embeddings_allennlp = model(character_ids)['elmo_representations'][0]
            word_embedding = embeddings_allennlp[0].mean(axis=0).detach().numpy()
            #tf_session = self.model_storage[model_name][1] # need TF session information for elmo
            #char_embeddings = model.get_elmo_vector_average(word, layers='average', warmup=False, session=tf_session)
            #word_embedding = char_embeddings.mean(axis=0)
            
        else: # for static word embedding models
            embed_dim = round(model.loc[['man']].shape[1],-2) # get embedding length
            if word in list(model.index.values):
                word_embed = model.loc[[word]] # get embedding from pandas array
                word_embedding = np.array(word_embed)[0][0:embed_dim]
            else: # if the word can't be found in the model
                print('missing '+word)
                word_embedding = 4*(np.random.rand(1,embed_dim))[0] # random embedding
                
        return(word_embedding)
    
    
    # Function to compute the similarities of all word pairs from a given database, for a given model
    def compute_model_sims(self, model_name, dataset_name, layer, comp_method):
        self.import_dataset(dataset_name) # load dataset if needed
        dataset_words = self.dataset_storage[dataset_name][0] # word pairs in [0]
        
        embed_sims = [None]*len(dataset_words)
        i=0
        for word_pair in dataset_words:
            word_embed_1 = self.get_word_embed(model_name, word_pair[0], layer, comp_method)
            word_embed_2 = self.get_word_embed(model_name, word_pair[1], layer, comp_method)
            embed_sims[i] = self.cosine_sim(word_embed_1, word_embed_2)
            i=i+1
        return(embed_sims)
    
    
    # Function to calculate cosine similarity between two embeddings
    def cosine_sim(self, embed_1, embed_2):
        """ numpy_array, numpy_array -> float
        Returns the cosine similarity (-1 to 1) between two embeddings, inputted as vectors.
        """
        if np.dot(embed_1,embed_2) == 0:
            similarity = 0 # don't normalise if similarity is zero
        else:
            similarity = np.dot(embed_1,embed_2)/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2))
        return(similarity)
       
    
    # Function to compute the correlation between model and dataset embedding similarities
    def compute_embed_correls(self, model_name, dataset_name, layer, comp_method):
        """ string, string, int, string -> (float, float)
        Computes the pearson_r and spearman_r between word similarities for a dataset and model.
        """
        self.import_dataset(dataset_name) # load model and dataset if needed
        self.import_model(model_name)
        model_sims = self.compute_model_sims(model_name, dataset_name, layer, comp_method)
        dataset_sims = self.dataset_storage[dataset_name][1] # similarities stored in [1]
        pearson_r = np.corrcoef(model_sims, dataset_sims)[0,1]
        spearman_r, p = spearmanr(model_sims, dataset_sims)
        return(pearson_r, spearman_r)
    
    def model_vs_data(self, model_name, dataset_name, layer=0, comp_method='mean'):
        correlations = self.compute_embed_correls(model_name, dataset_name, layer, comp_method)
        print("Evaluating "+model_name+" against "+dataset_name)
        print("pearson: {:.3f}".format(correlations[0]), "\nspearman: {:.3f}\n".format(correlations[1]))

In [33]:
embedding_analysis = word_analysis()

In [34]:
embedding_analysis.import_dataset('RG65')

RG65 loaded


In [35]:
embedding_analysis.model_vs_data('elmo','RG65')

2022-02-18 16:21:46,874 : INFO : Initializing ELMo


elmo loaded
Evaluating elmo against RG65
pearson: -0.005 
spearman: -0.035



In [11]:
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Returns the cosine similarity (-1 to 1) between two embeddings, inputted as vectors.
    """
    if np.dot(embed_1,embed_2) == 0:
        similarity = 0 # don't normalise if similarity is zero
    else:
        similarity = np.dot(embed_1,embed_2)/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2))
    return(similarity)

In [40]:
folder_loc= 'D:/Study and Projects/School Work/Year 25 - PhD 1/PhD Work/Data//'
model_loc = 'Word Embeddings//'
model_path = folder_loc + model_loc + 'Elmo Embeddings/elmo_false_wiki2019'
graph = tf.Graph()
with graph.as_default() as elmo_graph:
    elmo_model = ElmoModel()
    elmo_model.load(model_path)
with elmo_graph.as_default() as current_graph: # need this part so we can load multiple times
    tf_session = tf.compat.v1.Session(graph=elmo_graph) # TF_session must be passed with the model
    with tf_session.as_default() as sess:
        elmo_model.elmo_sentence_input = simple_elmo.elmo.weight_layers("input", elmo_model.sentence_embeddings_op)
        sess.run(tf.compat.v1.global_variables_initializer())

2022-02-18 16:31:02,763 : INFO : Loading model from D:/Study and Projects/School Work/Year 25 - PhD 1/PhD Work/Data//Word Embeddings//Elmo Embeddings/elmo_false_wiki2019...
2022-02-18 16:31:02,764 : INFO : We will cache the vocabulary of 100 tokens.
  lstm_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(
  self._kernel = self.add_variable(
  self._bias = self.add_variable(
  self._proj_kernel = self.add_variable(


In [46]:
char_embeddings1 = elmo_model.get_elmo_vectors('cord', layers='top', warmup=False, session=tf_session)
word_1 = char_embeddings1.mean(axis=0)[0]
char_embeddings2 = elmo_model.get_elmo_vectors('smile', layers='top', warmup=False, session=tf_session)
word_2 = char_embeddings2.mean(axis=0)[0]

cosine_sim(word_1, word_2)

0.7654833868676729

In [50]:
folder_loc= 'D:/Study and Projects/School Work/Year 25 - PhD 1/PhD Work/Data//'
model_loc = 'Word Embeddings//'
options_path = folder_loc + model_loc + 'Elmo Embeddings/elmo_false_wiki2019/options.json'
weights_path = folder_loc + model_loc + 'Elmo Embeddings/elmo_false_wiki2019/model.hdf5'
elmo = Elmo(options_path, weights_path, num_output_representations=1, dropout=0)

2022-02-18 16:45:56,886 : INFO : Initializing ELMo


In [43]:
character_ids_1 = batch_to_ids(list(['gem']))
embeddings_allennlp_1 = elmo(character_ids_1)['elmo_representations'][0]
embed_1 = embeddings_allennlp_1[0].mean(axis=0).detach().numpy()

character_ids_2 = batch_to_ids(list(['jewel']))
embeddings_allennlp_2 = elmo(character_ids_2)['elmo_representations'][0]
embed_2 = embeddings_allennlp_2[0].mean(axis=0).detach().numpy()

cosine_sim(embed_1, embed_2)

0.7927116

In [44]:
character_ids_1 = batch_to_ids(list(['cord']))
embeddings_allennlp_1 = elmo(character_ids_1)['elmo_representations'][0]
embed_1 = embeddings_allennlp_1[0].mean(axis=0).detach().numpy()

character_ids_2 = batch_to_ids(list(['smile']))
embeddings_allennlp_2 = elmo(character_ids_2)['elmo_representations'][0]
embed_2 = embeddings_allennlp_2[0].mean(axis=0).detach().numpy()

cosine_sim(embed_1, embed_2)

0.7310033