In [1]:
import csv
import os
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import torch
import wikipedia
from alive_progress import alive_bar
from nltk.stem import WordNetLemmatizer
from scipy.stats import spearmanr
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import logging

# Compact fixed-point printing of numpy arrays (4 decimals, no scientific notation, very wide lines).
np.set_printoptions(precision=4, threshold=1000, linewidth=10000, suppress=True, floatmode='fixed')
# Root data folder (Windows path, ends with a backslash so file names can be appended directly).
# Every backslash is escaped explicitly: sequences such as '\S' or '\Y' are invalid escapes that
# newer Python versions warn about and will eventually reject. The resulting string is unchanged.
path_base = 'D:\\Study and Projects\\School Work\\Year 25 - PhD 1\\Data\\'

# Load wordnet
from nltk.corpus import wordnet
from nltk.data import path # nltk's list of data search directories; need to specify the location of the nltk data
# Raw string so the Windows backslashes are taken literally (the previous literal relied on
# invalid escape sequences such as '\S' and '\F'); the appended directory is unchanged.
# NOTE: the name `path` is re-bound to plain strings by later cells, so only use it here.
path.append(r"D:\Study and Projects\School Work\Year 25 - PhD 1\Data\Frames and Structured Data\nltk_data")

### Functions for loading wiki articles

In [2]:
# Check to see if given wiki article can be loaded, needed to avoid crashes for loading non-existent articles
def check_wiki_article(title, printing=False):
    """ string -> bool
    Returns True if the article corresponding to the inputted title can be loaded, False if not.

    Up to three titles are tried in order, stopping at the first one that loads:
    the title itself, the title with an 's' appended (plural sometimes works), and
    the title with its last character repeated (sometimes this works for some reason).

    title: article title to look up.
    printing: if True, print the loaded title, the alternative title used (if any) and a
        100-character preview, or 'not found' when every attempt failed.
    """
    title_variants = (
        lambda t: t,
        lambda t: t+'s',
        lambda t: t+t[-1],
    )

    for attempt, make_title in enumerate(title_variants):
        try:
            candidate = make_title(title) # built inside the try so a bad title counts as a failed attempt
            article = wikipedia.page(candidate) # load the wiki article
            if printing:
                print('Loaded: '+article.title)
                if attempt > 0:
                    print('Title used: '+candidate)
                print(article.content[0:100]) # show preview of article
            return True
        except Exception:
            # Any lookup/parsing error means this variant is unusable. Unlike a bare `except:`,
            # this lets KeyboardInterrupt/SystemExit through so a long run can still be stopped.
            continue

    if printing:
        print('not found')
    return False


# Load plain text of single wiki article
def load_wiki_article(article_title):
    """ string -> list
    Loads the wikipedia article corresponding to the given title, returning its content as a list of sentences.

    The same title variants as check_wiki_article are tried in order (the title itself, with an
    's' appended, and with its last character repeated), so an article that passes the check can
    also be loaded here. If no variant can be loaded and split, '<title> not found' is printed
    and an empty list is returned.
    """
    title_variants = (
        lambda t: t,
        lambda t: t+'s', # plural sometimes works
        lambda t: t+t[-1], # repeated last character, as in check_wiki_article
    )

    for make_title in title_variants:
        try:
            article = wikipedia.page(make_title(article_title)) # load the wiki article
            return split_to_list(article)
        except Exception:
            # Try the next variant. The previous version returned from a `finally` block, which
            # silently discarded every in-flight exception, including KeyboardInterrupt.
            continue

    print(article_title+' not found')
    return [] # return empty list


# Split article content into list of one sentence per line
def split_to_list(article):
    """ article_object -> list
    Takes a wikipedia article object and extracts the contents as text, splitting to one sentence per line 
    and removing some irrelevant punctuation and short sentences. Returns a list of sentences.

    article: object exposing `.content` (full plain text) and `.title`.

    A line is kept only if it is non-empty, is not a heading (does not start with '='), has at
    least 50 characters, ends with a full stop and does not mention an ISBN. Double quotation
    marks are removed from kept lines. A one-line summary is printed before returning.
    """
    sentences = nltk.sent_tokenize(article.content, language="english") # split article into sentences
    sentences_final = []
    skip_set = ('i.e.','e.g.')
    min_sentence_len = 50
    skip = False

    for i, sentence in enumerate(sentences):
        if skip: # this sentence was already appended to the previous one
            skip = False
            continue

        # The tokenizer ends a sentence after 'i.e.'/'e.g.', so glue the following sentence back on.
        # The bounds check avoids an IndexError when the article's last sentence ends that way.
        if sentence.endswith(skip_set) and i+1 < len(sentences):
            sentence = sentence+' '+sentences[i+1] # combine with next line
            skip = True # skip the next line as we just added it on to this line

        # Non-greedy, so two separate bracket groups do not swallow the text between them.
        sentence = re.sub(r'\[.+?\]', '', sentence) # remove anything in square brackets (mostly the pronunciation guide)
        sentence = re.sub(r'(\W);', r'\1', sentence) # drop a semicolon that directly follows a non-word character
        sentence = re.sub(r'([a-z]{2,}\.)([A-Z][a-z])', '\\1\n\\2', sentence) # split the weird sentences with .New format
        sentence = re.sub(r':\s(\d|,|\s|\–){2,}', '', sentence) # remove lingering page numbers

        for new_sentence in sentence.split('\n'): # split multi-line paragraphs
            if new_sentence=='' or new_sentence[0]=='=': # remove blank lines and headings
                continue
            if len(new_sentence)<min_sentence_len or new_sentence[-1]!='.': # too short, or no final full stop
                continue
            if 'ISBN ' in new_sentence: # ignore lines with ISBNs (also when the line starts with one)
                continue
            sentences_final.append(new_sentence.replace('"','')) # remove quotation marks

    print('Loaded: '+article.title+', Sentences: '+str(len(sentences_final))) # number of sentences
    return(sentences_final)


# Save list of sentences from a given article to a file
def save_sentences(sentences_list, filename, path):
    """ list, string, string -> None
    Appends each sentence in sentences_list as one line to the UTF-8 text file path+filename+'.txt'.

    path is concatenated directly with the file name, so it must already end with a separator.
    The file is opened in append mode: repeated calls keep adding to the same file.
    """
    save_path = path+filename+'.txt'
    # The context manager closes the file even if a write fails part-way through.
    with open(save_path, "a", encoding='utf-8') as save_file:
        for sentence in sentences_list:
            save_file.write(sentence)
            save_file.write('\n')

### Functions for getting sentence embeddings

In [3]:
# Cosine similarity of two embedding vectors
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Computes the cosine similarity (between -1 and 1) of two embedding vectors.
    A dot product of exactly zero short-circuits to 0, which also avoids dividing
    by a zero norm when either vector is all zeros.
    """
    dot_product = np.dot(embed_1, embed_2)
    if dot_product == 0:
        return 0 # nothing to normalise
    norm_product = np.linalg.norm(embed_1)*np.linalg.norm(embed_2)
    return dot_product/norm_product


# Get decontextualised transformer embedding for given word
def transformer_embed_decontext(model, tokenizer, word, layer=0, embed_type='decontext'):
    """ model, tokenizer, string -> numpy_array
    Encodes the word on its own (no sentence context), runs the model and returns one vector
    taken from hidden_states[layer].

    model: callable accepting the tokenizer output as keyword arguments; its result must expose
        `hidden_states` (the model must be configured to output hidden states).
    tokenizer: callable returning the encoded input for `word`.
    layer: index into hidden_states.
    embed_type: how the per-token vectors are reduced to one vector
        'mean'      - mean over all token positions
        'cls'       - vector at the first position (the 'CLS' token)
        'decontext' - mean over all positions except the first and last
                      (presumably the special start/end tokens)

    Raises ValueError for any other embed_type (previously this surfaced as an
    UnboundLocalError at the return statement, after the model had already been run).
    """
    if embed_type not in ('mean', 'cls', 'decontext'):
        raise ValueError("embed_type must be 'mean', 'cls' or 'decontext', got "+repr(embed_type))

    encoded_input = tokenizer(word, return_tensors='pt') #pt = pytorch
    model_output = model(**encoded_input)
    word_embedding_raw = model_output.hidden_states[layer].detach().numpy()[0] # first (only) item of the batch

    if embed_type=='mean': # take the mean of all tokens
        return word_embedding_raw.mean(axis=0)
    if embed_type=='cls': # use the 'CLS' token
        return word_embedding_raw[0]
    return word_embedding_raw[1:-1].mean(axis=0) # take the mean of word tokens only


# Get contextualised transformer embedding for single word over the entire corpus
def transformer_embed_context(model, tokenizer, target_word, sentence_corpus, count_limit=100, layer=0):
    """ model, tokenizer, string, iterable of strings -> (numpy_array, int)
    Collects contextual embeddings of target_word from the sentences of sentence_corpus.

    A sentence is used when one of its whitespace-separated words, after stripping punctuation
    and lemmatising, equals target_word. The sentence is run through the model and the vector at
    the position of the word's first token is taken from hidden_states[layer].

    count_limit: maximum number of embeddings to collect; the scan stops once it is reached.
    layer: index into hidden_states. This argument is now honoured; it used to be ignored in
        favour of a hard-coded layer 12.

    Returns (embeddings, count): an array with one row per collected embedding, and the number
    of rows. Sentences in which the word's token cannot be located are skipped and not counted.
    """
    embeddings_storage = []
    lemmatizer = WordNetLemmatizer()
    punctuation = re.compile(r'[,;.?!\)\(]')

    count = 0
    for sentence in sentence_corpus:
        if count >= count_limit: # stop before doing any work once enough embeddings are collected
            break

        # map every lemma to the surface form it came from (last occurrence wins)
        lemmatised_sentence_dict = {}
        for original_word in sentence.split():
            fixed_original_word = punctuation.sub('', original_word) # remove punctuation
            lemmatised_sentence_dict[lemmatizer.lemmatize(fixed_original_word)] = original_word
        if target_word not in lemmatised_sentence_dict:
            continue

        encoded_input = tokenizer(sentence, return_tensors='pt') # note use of sentence not lemmatised sentence
        sentence_token_ids = np.array(encoded_input.input_ids[0]).tolist()

        original_word = lemmatised_sentence_dict[target_word] # undo lemmatisation of matching word
        encoded_target_word = tokenizer(original_word, return_tensors='pt') #pt = pytorch
        target_code = int(encoded_target_word.input_ids[0][1]) # first token after the start token
        if target_code not in sentence_token_ids:
            continue # tokenised differently in context; skip rather than crash on .index()
        target_index = sentence_token_ids.index(target_code) # position of the word's first token in the sentence

        model_output = model(**encoded_input)
        sent_embedding_raw = model_output.hidden_states[layer].detach().numpy()[0] # get sentence embeddings
        embeddings_storage.append(sent_embedding_raw[target_index]) # get embedding of target word from sentence
        count += 1

    return(np.array(embeddings_storage), count)


# Load word similarity dataset
def load_sim_dataset(model):
    """ string -> (list, numpy_array)
    Reads the file '<model>.txt' from the 'Word Similarity Data\\Word Similarities Final'
    folder under path_base. Despite its name, `model` is the file's base name.

    Each non-blank line is split on whitespace; the third field is parsed as the rating.
    Returns (wordpairs, ratings): wordpairs is a list with the fields of each line (the rating
    string included), ratings is a float array with one entry per line.
    Blank lines are skipped (they used to raise IndexError); a non-blank line with fewer than
    three fields still raises IndexError.
    """
    # backslashes escaped explicitly; '\W' is an invalid escape sequence
    path = path_base+'Word Similarity Data\\Word Similarities Final\\'
    filename = path+model+'.txt'

    wordpairs = []
    with open(filename) as file:
        for line in file:
            fields = line.split() # split at any whitespace chars (also drops the newline)
            if fields:
                wordpairs.append(fields)
    ratings = np.array([float(fields[2]) for fields in wordpairs])

    return(wordpairs, ratings)


# Save contextual embeddings for a given word to a new file for each word
def save_embeddings_word(word, word_embeddings, layer, path):
    """ string, numpy_array, int, string -> None
    Appends one embedding as a line of space-separated numbers to path+word+'_'+str(layer)+'.txt'.

    Values are written with four decimals. The line is built explicitly instead of slicing
    str(array): numpy prints a negative first element as '[-0.1234 ...', so dropping the first
    two characters removed its minus sign (or its leading digit when no value was negative),
    and the output also depended on the global numpy print options.
    """
    save_path = path+word+'_'+str(layer)+'.txt'
    final_string = ' '.join('%.4f' % value for value in np.ravel(word_embeddings))
    with open(save_path, "a", encoding='utf-8') as save_file:
        save_file.write(final_string)
        save_file.write('\n')


# Function to return a lemmatised list and dictionary for a given sentence
def lemmatise_sent(sentence, pos=None):
    """ string -> (list, dict)
    Splits the sentence on whitespace, strips the characters , ; . ? ! ( ) from each word and
    lemmatises it.

    pos: WordNet part-of-speech tag passed to the lemmatizer. Defaults to wordnet.VERB, which
        is what this function always used; pass e.g. wordnet.NOUN when working with a noun vocab.

    Returns (lemmas, lemma_to_original): the lemmas in sentence order, and a dict mapping each
    lemma to the original word it came from (punctuation included; the last occurrence wins).
    """
    if pos is None:
        pos = wordnet.VERB
    lemmatizer = WordNetLemmatizer()
    lemmatised_sentence_list = []
    lemmatised_sentence_dict = {}
    for original_word in sentence.split():
        # raw string: '\)' and '\(' are invalid escape sequences in a normal string literal
        fixed_original_word = re.sub(r'[,;.?!\)\(]', '', original_word) # remove punctuation
        lemmatised_word = lemmatizer.lemmatize(fixed_original_word, pos) # lemmatise all words in sentence
        lemmatised_sentence_list.append(lemmatised_word)
        lemmatised_sentence_dict[lemmatised_word] = original_word
    return(lemmatised_sentence_list, lemmatised_sentence_dict)


# Look up the token id the tokenizer assigns to a word
def get_word_code(word):
    """ string -> int
    Encodes the word with the notebook-level `tokenizer` and returns the id at position 1 of
    the encoded sequence, i.e. the first token after the leading special token.
    """
    token_ids = tokenizer(word, return_tensors='pt').input_ids[0] #pt = pytorch
    return int(token_ids[1])

### Load list of articles and save the sentences to file

In [5]:
# Load list of wikipedia articles to use
# One title per line; the titles become the index of the frame, and QUOTE_NONE keeps
# quotation marks inside titles as literal characters.
# Backslashes are escaped explicitly ('\W' is an invalid escape sequence); the path is unchanged.
titles_file = path_base+'Corpus Data\\Wikipedia 10k corpus\\article_list.txt'
article_titles_pd = pd.read_table(titles_file, index_col=0, header=None, quoting=csv.QUOTE_NONE, skip_blank_lines=True)
article_titles_list = article_titles_pd.index.values
print('List of '+str(len(article_titles_list))+' articles loaded')

List of 10001 articles loaded


In [8]:
# Smoke test: check that a single article can be found, then load and split it
article_title = 'Königstiger'
article_available = check_wiki_article(article_title)
if not article_available:
    print('not found')
else:
    article_content = load_wiki_article(article_title) # list of cleaned sentences

Loaded: Tiger II, Sentences: 169


In [9]:
# Download every article in the title list and append its cleaned sentences to one corpus file.
# save_sentences opens the file in append mode, so re-running this cell adds to the existing file.
corpus_filename = 'full_corpus_4'
save_path = path_base+'/Corpus Data/'
for article_title in article_titles_list:
    sentences_list = load_wiki_article(article_title) # empty list when the article is not found
    save_sentences(sentences_list, corpus_filename, save_path)

Loaded: Lillian Gish, Sentences: 139
Loaded: Buster Keaton, Sentences: 246
Loaded: Harold Lloyd, Sentences: 155
Loaded: Mary Pickford, Sentences: 221
Loaded: Gloria Swanson, Sentences: 236
Loaded: Asta Nielsen, Sentences: 78
Loaded: Fred Astaire, Sentences: 267


### Analyse the sentences and get embeddings

In [4]:
# Load full corpus of sentences from file
path = path_base+'Corpus Data/Wikipedia 10k corpus/full_corpus.txt'
# The corpus is one sentence per line, so read it line by line. Newer pandas versions raise a
# ValueError for sep="\n", and plain reading also keeps tabs and quotes inside sentences as-is.
# The file is written as UTF-8 by save_sentences, so read it back with the same encoding.
with open(path, encoding='utf-8') as corpus_file:
    corpus_lines = [line.rstrip('\n') for line in corpus_file if line.strip()] # skip blank lines
# Same shape as before: a column-less frame indexed by sentence, and the sentences as an array.
sentence_corpus_pd = pd.DataFrame(index=pd.Index(corpus_lines, dtype=object))
sentence_corpus_list = sentence_corpus_pd.index.values
print(str(len(sentence_corpus_list))+' sentences loaded')

2002787 sentences loaded


In [5]:
# Load the pretrained transformer (config, tokenizer and weights) from a local folder
logging.set_verbosity_error() # silence the model initialisation warning
model_name = 'ernie-2.0-en'
path = path_base+'Sentence Embeddings//'+model_name
tokenizer = AutoTokenizer.from_pretrained(path)
# output_hidden_states=True makes the model return the hidden states that the embedding code indexes by layer
config_state = AutoConfig.from_pretrained(path, output_hidden_states=True)
model = AutoModel.from_pretrained(path, config=config_state)
print(model_name+' model loaded')

ernie-2.0-en model loaded


In [9]:
# Build the sorted vocabulary of unique words appearing in the similarity dataset
dataset_name = 'all_nouns'
dataset, _ = load_sim_dataset('combined_dataset_nouns')
# the first two fields of every entry are the two words of the pair
vocab = [word for word_pair in dataset for word in (word_pair[0], word_pair[1])]
vocab_set = sorted(set(vocab)) # unique words in alphabetical order (a list, despite the name)
print(dataset_name+' vocab loaded')
print(str(len(vocab_set))+' words')

all_nouns vocab loaded
5824 words


In [10]:
# Initialise vocab counter
# For every vocab word, count how many embeddings are already saved (one per line of its
# layer-1 file), so that the extraction loop can resume where an earlier run stopped.
vocab_storage_count = Counter()
# Backslashes escaped explicitly ('\E' is an invalid escape sequence); the path is unchanged.
# NOTE(review): this folder says 'Verbs' while the vocab cell loads nouns and the cell that
# assembles the embeddings reads from 'Ernie Contextual Nouns' - confirm the intended folder.
path = path_base+'Word Embeddings\\Ernie Contextual Verbs\\'

for word in vocab_set: # looping over words in vocab set
    filename = path+word+'_1.txt'
    if os.path.isfile(filename): # check if that word has any saved embeddings
        with open(filename) as file:
            vocab_storage_count[word] = sum(1 for _ in file) # count lines without keeping them in memory
    else:
        vocab_storage_count[word] = 0 # no embeddings saved yet

In [13]:
# Get contextual embeddings for each word in vocab set and save each to separate file
# For every corpus sentence that contains at least one vocab word still below its quota, run the
# model once and append the word's embedding from each of layers 1-12 to that word's files.
start_num = 0 # number to start at if not the beginning
count = start_num
count_limit = 100 # max number of contextualised embeddings per word

# Words that still need embeddings (limit not reached). Kept as a set and updated as quotas
# fill up, instead of re-scanning the whole vocabulary for every sentence.
remaining_vocab = {word for word in vocab_set if vocab_storage_count[word]<count_limit}

with alive_bar(len(sentence_corpus_list)-start_num, force_tty=True) as bar: # progress bar to show progress

    for sentence in sentence_corpus_list[start_num:]:

        if not remaining_vocab: # every word has reached its quota, nothing left to collect
            break

        # Lemmatise first: it is cheap, and most sentences contain no wanted word, so the
        # expensive model call below is only made when it is actually needed.
        lemmatised_sentence_list, lemmatised_sentence_dict = lemmatise_sent(sentence) # get lemmatised version of the sentence
        target_words = remaining_vocab.intersection(lemmatised_sentence_list) # get words in the vocab set

        if target_words:
            sent_encoded_input = tokenizer(sentence, return_tensors='pt') # note use of sentence not lemmatised sentence
            encoded_word_ids = np.array(sent_encoded_input.input_ids[0]).tolist()
            with torch.no_grad(): # inference only, no need to record gradients
                sent_model_output = model(**sent_encoded_input)
            layer_embeddings = {} # layer -> token embeddings of this sentence, converted on first use

            # get embeddings for all words from the vocab set
            for lemmatised_word in sorted(target_words):
                original_word = lemmatised_sentence_dict[lemmatised_word] # undo lemmatisation of matching word
                encoded_word = tokenizer(original_word, return_tensors='pt')
                word_code = int(encoded_word.input_ids[0][1]) # get token code for target word
                if word_code not in encoded_word_ids:
                    continue # word is tokenised differently in context; skip instead of crashing the run
                target_index = encoded_word_ids.index(word_code) # look for token code in sentence to find the right word embedding

                # get embeddings for each layer of network
                for layer in range(1,13):
                    if layer not in layer_embeddings:
                        layer_embeddings[layer] = sent_model_output.hidden_states[layer].detach().numpy()[0] # get sentence embeddings
                    word_embedding = layer_embeddings[layer][target_index]
                    save_embeddings_word(lemmatised_word, word_embedding, layer, path) # save embeddings to file by layer

                vocab_storage_count[lemmatised_word] += 1
                if vocab_storage_count[lemmatised_word] >= count_limit:
                    remaining_vocab.discard(lemmatised_word) # quota reached, stop collecting this word

        bar() # needed for progress bar

|██████▊                                 | ▄▆█ 336397/2002787 [17%] in 4:09:04 (22.5/s, eta: 20:33:46) 18547/2002787 [1%] in 15:15 (20.3/s, eta: 27:11:21) in 35:07 (20.5/s, eta: 26:36:50) (20.4/s, eta: 26:21:32) in 1:02:38 (20.7/s, eta: 25:48:29) (21.2/s, eta: 24:50:13)  ▅▃▁ 115060/2002787 [6%] in 1:29:45 (21.4/s, eta: 24:32:37)  ▅▇▇ 139488/2002787 [7%] in 1:47:19 (21.7/s, eta: 23:53:33) in 2:48:08 (22.2/s, eta: 22:13:12) ▃▁▃ 229093/2002787 [11%] in 2:51:40 (22.2/s, eta: 22:09:07)  ▆█▆ 230097/2002787 [11%] in 2:52:26 (22.2/s, eta: 22:08:28) 232277/2002787 [12%] in 2:53:59 (22.3/s, eta: 22:06:12) (22.3/s, eta: 21:48:46) (22.4/s, eta: 21:26:50) 278053/2002787 [14%] in 3:27:18 (22.4/s, eta: 21:25:52)  279805/2002787 [14%] in 3:28:33 (22.4/s, eta: 21:24:13) in 3:47:49 (22.4/s, eta: 20:59:16) ▆▄▂ 319652/2002787 [16%] in 3:57:04 (22.5/s, eta: 20:48:18) ▃▁▃ 329118/2002787 [16%] in 4:03:42 (22.5/s, eta: 20:39:19) (22.5/s, eta: 20:34:52) 

### Put together full set of embeddings

In [10]:
# Construct dictionary of all contextualised word embeddings
# word_embed_dict[layer][word] holds everything saved for that word and layer: a 2-D array with
# one row per saved embedding, or a 1-D array when a single embedding is available.
word_embed_dict = {}
# Backslashes escaped explicitly ('\E' is an invalid escape sequence); the path is unchanged.
path = path_base+'Word Embeddings\\Ernie Contextual Nouns\\'
fallback_embeddings = {} # word -> decontextualised embedding, computed at most once per word
for layer in range(1,13):
    layer_dict = {}
    for word in vocab_set: # read all words in vocab set
        filename = path+word+'_'+str(layer)+'.txt' # get the file
        try:
            with open(filename) as file:
                np_lines = np.loadtxt(file)
            if np_lines.size == 0:
                raise ValueError('no embeddings in '+filename) # an empty file counts as missing
            layer_dict[word] = np_lines
        except (OSError, ValueError): # file missing, empty or unreadable: use non-contextual embedding
            # Only file problems trigger the fallback; other errors (and Ctrl-C) are no longer hidden.
            # The fallback is the layer-0 embedding for every layer, as before.
            if word not in fallback_embeddings:
                fallback_embeddings[word] = transformer_embed_decontext(model, tokenizer, word, layer=0)
            layer_dict[word] = fallback_embeddings[word]
    word_embed_dict[layer] = layer_dict

In [11]:
# Save raw contextualised embeddings to a single text file
# One file per layer, one line per word: the word followed by its embedding, averaged over all
# saved contextual embeddings when there are several.
# NOTE: files are opened in append mode (as before), so re-running this cell adds duplicate lines.
for layer in range(1,13):
    # backslash escaped explicitly ('\c' is an invalid escape sequence); the path is unchanged
    save_path = path+'\\contextual_embeddings_layer_'+str(layer)+'.txt'

    with open(save_path, "a", encoding='utf-8') as save_file: # opened once per layer, not once per word
        for word, saved_embeddings in word_embed_dict[layer].items():
            saved_embeddings = np.asarray(saved_embeddings)
            # A 2-D array holds several embeddings (one per row). Testing the number of dimensions
            # works for any embedding size, unlike the previous 'more than 800 values' test.
            if saved_embeddings.ndim > 1:
                contextual_embedding = saved_embeddings.mean(axis=0) # average over all saved embeddings
            else:
                contextual_embedding = saved_embeddings
            # Format the values explicitly: slicing str(array)[2:-1] dropped the minus sign of a
            # negative first value and depended on the global numpy print options.
            final_string = word+' '+' '.join('%.4f' % value for value in contextual_embedding)
            save_file.write(final_string)
            save_file.write('\n')

In [12]:
# Save normalised transformer embeddings; see paper 'all bark and no bite'
# For each layer: read the averaged embeddings, standardise every dimension across the
# vocabulary (subtract its mean, divide by its standard deviation) and write the result.
# NOTE: output files are opened in append mode (as before), so re-running adds duplicate lines.
for layer in range(1,13):

    # Open file with unnormalised embeddings
    # backslash escaped explicitly ('\c' is an invalid escape sequence); the path is unchanged
    filename = path+'\\contextual_embeddings_layer_'+str(layer)+'.txt'

    # Load values into dictionary (a word that occurs twice keeps its last line)
    model_dict = {}
    with open(filename, encoding='utf-8') as file: # the file is written as UTF-8
        for line in file:
            word_list = line.split()
            if not word_list: # skip blank lines
                continue
            word = word_list[0]
            # every field after the word is a value; the previous [1:-1] slice dropped the last dimension
            model_dict[word] = np.array([float(x) for x in word_list[1:]])

    # Convert to numpy array (one row per word), stacked in one go
    words = list(model_dict.keys())
    model_np = np.vstack([model_dict[word] for word in words])

    # Normalise array
    # Mean and standard deviation are both taken per dimension (axis 0). The standard deviation
    # used to be taken per word (axis 1), which is inconsistent with the per-dimension mean.
    mean_np = np.mean(model_np, axis=0)
    std_np = np.std(model_np, axis=0)
    std_np[std_np == 0] = 1.0 # constant dimensions: avoid division by zero, they become all zeros
    model_final_np = (model_np - mean_np)/std_np

    # Save normalised embeddings to new file
    save_path = path+'contextual_embeddings_layer_normalised_'+str(layer)+'.txt'
    with open(save_path, "a", encoding='utf-8') as save_file:
        for word, normalised_embedding in zip(words, model_final_np):
            # explicit formatting, independent of the global numpy print options
            final_string = word+' '+' '.join('%.4f' % value for value in normalised_embedding)
            save_file.write(final_string)
            save_file.write('\n')

### Testing code

In [6]:
# Show the token ids the tokenizer produces for a short sentence
sentence = 'here is a test sentence.'
encoded_input = tokenizer(sentence, return_tensors='pt') # the raw sentence is encoded, not a lemmatised version
encoded_ids = np.array(encoded_input.input_ids[0])
# also run the model and keep the token embeddings of hidden state 12 for inspection
model_output = model(**encoded_input)
final_hidden_state = model_output.hidden_states[12]
sent_embedding_raw = final_hidden_state.detach().numpy()[0]
print(encoded_ids)

[ 101 2182 2003 1037 3231 6251 1012  102]


In [82]:
# Test single words in the loaded transformer model (decontextualised, layer 0)
embed_1 = transformer_embed_decontext(model, tokenizer, 'machine', layer=0)
embed_2 = transformer_embed_decontext(model, tokenizer, 'industry', layer=0)
print(cosine_sim(embed_1,embed_2))

# Test single words with context: average up to 100 contextual embeddings per word
# sentence_corpus_list is passed explicitly; the call used to omit this required argument
# and therefore raised a TypeError.
embed_all_1, count_1 = transformer_embed_context(model, tokenizer, 'machine', sentence_corpus_list, count_limit=100, layer=12)
embed_all_2, count_2 = transformer_embed_context(model, tokenizer, 'industry', sentence_corpus_list, count_limit=100, layer=12)
mean_embed_1 = np.mean(embed_all_1, axis=0)
mean_embed_2 = np.mean(embed_all_2, axis=0)
print(cosine_sim(mean_embed_1, mean_embed_2))

0.19416906
