# Dependencies

In [58]:
from transformers import BertTokenizer, BertModel
import pandas as pd
pd.options.display.max_colwidth = 100
import numpy as np
from scipy.spatial.distance import cosine
import nltk
import torch

In [59]:
# Pre-trained BERT encoder
# ------------------------
# Hidden states of every layer are requested because the token
# embeddings used below are built from them.
model = BertModel.from_pretrained(
    'bert-large-uncased',
    output_hidden_states=True,
)

# Matching tokenizer
# ------------------
# Loaded under the same checkpoint name as the model so that the
# token ids it produces agree with the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
###Others
from ipynb.fs.full.SQL_wikidata import find_Qid_search


# Functions

In [61]:
###Inspiration from
####https://towardsdatascience.com/3-types-of-contextualized-word-embeddings-from-bert-using-transfer-learning-81fcefe3fe6d
###https://github.com/arushiprakash/MachineLearning/blob/main/BERT%20Word%20Embeddings.ipynb

def bert_text_preparation(text, tokenizer):
    """Turn raw text into the inputs expected by BERT.

    The text is wrapped in the ``[CLS]`` / ``[SEP]`` special tokens,
    tokenized and converted to token ids. Every token is given
    segment id 1.

    Args:
        text (str): Text to encode.
        tokenizer (obj): Object exposing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        list: Tokens produced by the tokenizer, special tokens included.
        obj: Torch tensor holding the token ids, wrapped in one
            outer (batch) dimension.
        obj: Torch tensor holding the segment ids (all ones), wrapped
            the same way.
    """
    # Special tokens are separated from the text by single spaces.
    tokens = tokenizer.tokenize(" ".join(("[CLS]", text, "[SEP]")))
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [1 for _ in token_ids]

    # The extra list level adds the batch dimension.
    return tokens, torch.tensor([token_ids]), torch.tensor([segment_ids])


In [62]:
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Compute one embedding per token from a model's hidden states.

    The embedding of a token is the element-wise mean of its vectors
    in the last two hidden states returned by the model.

    Args:
        tokens_tensor (obj): Torch tensor of token ids, passed to the
            model as its first positional argument.
        segments_tensors (obj): Torch tensor of segment ids, passed to
            the model as its second positional argument.
        model (obj): Callable whose result exposes the per-layer
            hidden states at index 2.

    Returns:
        list: List of lists of floats, one inner list per token
            (assumes a batch of one sequence, as the leading
            dimension is squeezed away).

    NOTE(review): in current Hugging Face transformers the second
    positional parameter of ``BertModel.forward`` is ``attention_mask``,
    not ``token_type_ids`` -- confirm which one is intended here.
    """
    # No gradients are tracked during the forward pass.
    with torch.no_grad():
        model_output = model(tokens_tensor, segments_tensors)

    # Skip the first entry: it is the input (embedding) state.
    layer_states = model_output[2][1:]

    # Mean of the last two layers.
    last_layer, previous_layer = layer_states[-1], layer_states[-2]
    averaged = (last_layer + previous_layer) / 2

    # Drop the leading batch dimension.
    averaged = torch.squeeze(averaged, dim=0)

    # One plain Python list per token.
    return [token_vector.tolist() for token_vector in averaged]

In [63]:
### Recompose words that were split into multiple sub-tokens, for search purposes.
def recompose_tokens(tokenized_text_, list_token_embeddings_ = None):
    """Merge WordPiece sub-tokens back into whole words.

    A token starting with ``##`` is treated as the continuation of the
    previous word: its text (without the ``##`` marker) is appended to
    that word. When embeddings are supplied, the embedding of a merged
    word is the element-wise mean of the embeddings of its pieces.

    Args:
        tokenized_text_ (list): Tokens, possibly containing ``##``
            continuation pieces.
        list_token_embeddings_ (list, optional): One embedding (list of
            numbers) per token, aligned with ``tokenized_text_``.

    Returns:
        list: The recomposed words, when no embeddings are given.
        tuple: ``(words, embeddings)`` when embeddings are given, with
            one embedding per recomposed word.

    Empty input yields an empty result. A continuation piece with no
    preceding word starts a new word (marker stripped).
    """
    with_embeddings = list_token_embeddings_ is not None

    words = []
    sums = []    # running element-wise sum of the pieces of each word
    counts = []  # number of pieces merged into each word

    for position, token in enumerate(tokenized_text_):
        is_piece = str(token).startswith('##')
        if is_piece:
            token = str(token)[2:]

        if is_piece and words:
            # Continuation: extend the current word.
            words[-1] = str(words[-1]) + token
            if with_embeddings:
                piece_embedding = list_token_embeddings_[position]
                sums[-1] = [float(acc) + float(val)
                            for acc, val in zip(sums[-1], piece_embedding)]
                counts[-1] += 1
            continue

        # Start of a new word.
        words.append(token)
        if with_embeddings:
            sums.append(list_token_embeddings_[position])
            counts.append(1)

    if not with_embeddings:
        return words

    # Averaging happens once, after the scan, so that a word whose pieces
    # end the sequence is normalized like any other.
    embeddings = [
        total if n_pieces == 1 else [value / n_pieces for value in total]
        for total, n_pieces in zip(sums, counts)
    ]
    return words, embeddings



In [64]:
###Return position of the search object inside the text
def find_s_object(s_object, tokenized_text):
    """Locate the search object among the recomposed tokens of a text.

    The search object is tokenized with the module-level ``tokenizer``,
    its sub-tokens are merged back into words, and the words are
    lower-cased before being looked up.

    Args:
        s_object (str): Entity to look for.
        tokenized_text (list): Recomposed tokens of the text.

    Returns:
        list: Index (single word) or consecutive indices (several
            words) of the entity in ``tokenized_text``.

    Note:
        A single-word entity that is absent raises ``ValueError`` (from
        ``list.index``), whereas a multi-word entity that is absent
        yields the NaN placeholders returned by ``sublist_indices``.
    """
    pieces = tokenizer.tokenize(s_object)
    words = [word.lower() for word in recompose_tokens(pieces)]

    # Several words (or none): delegate to the sub-list search.
    if len(words) != 1:
        return sublist_indices(words, tokenized_text)

    # Exactly one word: position of its first occurrence.
    return [tokenized_text.index(words[0])]

def sublist_indices(sub, lst):
    """Locate the first contiguous occurrence of ``sub`` inside ``lst``.

    Args:
        sub (list): Sequence to look for.
        lst (list): Sequence to search in.

    Returns:
        list: The consecutive indices of ``lst`` covered by the first
            match, or ``len(sub)`` NaN values when there is no match
            (including when ``sub`` is longer than ``lst``).
    """
    ln = len(sub)
    for start in range(len(lst) - ln + 1):
        if all(sub[offset] == lst[start + offset] for offset in range(ln)):
            return list(range(start, start + ln))

    # The placeholder must not rely on the loop variable: the loop body
    # never runs when `sub` is longer than `lst`.
    return [np.nan] * ln



In [1]:
### Embedding of an entity as it appears inside a given text
def _entity_embedding(text, entity):
    """Return the contextual embedding of ``entity`` inside ``text``.

    The text is encoded with the module-level ``tokenizer`` and
    ``model``, sub-tokens are merged back into words, and the
    embeddings of the word(s) matching ``entity`` are averaged.
    """
    raw_tokens, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    raw_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

    # Sub-tokens must be merged so that the entity can be matched as words.
    words, word_embeddings = recompose_tokens(raw_tokens, raw_embeddings)

    # Position(s) of the entity among the recomposed words.
    word_index = find_s_object(entity, words)

    # Mean over the words spanned by the entity (could be more than one).
    span = word_embeddings[word_index[0]:word_index[-1] + 1]
    return [sum(dimension) / len(word_index) for dimension in zip(*span)]


### Search entity s_object inside sentence, and understand what it refers to based on context
def contextual_entity(s_object, news, unlemmatized_s_object = None, verbose = 0):
    """Score candidate descriptions of an entity against its context.

    Candidate descriptions ("snippets") are obtained from
    ``find_Qid_search(s_object)``. The contextual embedding of the
    entity in ``news`` is compared with its contextual embedding in
    each snippet.

    Args:
        s_object (str): Entity passed to ``find_Qid_search`` and looked
            up inside each snippet.
        news (str): Text in which the entity occurs.
        unlemmatized_s_object (str, optional): Form of the entity to
            look up inside ``news``. Defaults to ``s_object``.
        verbose (int): When equal to 1, print the snippets and their
            scores from highest to lowest.

    Returns:
        pandas.DataFrame: Columns ``snippet`` and ``distance``. Despite
            its name, ``distance`` holds ``1 - cosine distance``, i.e.
            the cosine similarity: higher means closer.

    NOTE(review): ``find_Qid_search`` is defined outside this notebook;
    its result is assumed to expose an iterable ``"snippet"`` entry of
    strings -- confirm.
    """
    if unlemmatized_s_object is None:
        unlemmatized_s_object = s_object

    ### Possible descriptions (disambiguation pages carry no meaning of their own)
    s_results = find_Qid_search(s_object)
    snippets = [snippet for snippet in s_results["snippet"]
                if "Wikimedia disambiguation page" not in snippet]

    ### Embedding of the entity in the news (surface form as written there)
    news_word_embedding = _entity_embedding(news, unlemmatized_s_object)

    ### Embedding of the entity in every search result
    snip_word_embeddings = [_entity_embedding(res, s_object) for res in snippets]

    ### Compare each snippet with the news context
    list_of_distances = []
    for snippet, embed_s in zip(snippets, snip_word_embeddings):
        # scipy's `cosine` is a distance, so this is the cosine similarity.
        cos_dist = 1 - cosine(embed_s, news_word_embedding)
        list_of_distances.append([snippet, cos_dist])

    distances_df = pd.DataFrame(list_of_distances, columns=['snippet', 'distance'])

    if verbose == 1:
        indices = np.argsort(distances_df["distance"].values)[::-1]
        for ind in indices:
            print(distances_df.iloc[ind]["snippet"], distances_df.iloc[ind]["distance"])
        print("")
    return distances_df

# Example

In [2]:
if __name__ == "__main__":

    # Entity to disambiguate, and the headline in which it appears
    # (in its plural form).
    s_object = 'MacBook'
    news = "Apple Unleashes New MacBooks , AirPods, Low-Cost Apple Music Plan."

    distances_df = contextual_entity(
        s_object,
        news,
        unlemmatized_s_object='MacBooks',
    )

    # Print the candidates from highest to lowest score.
    indices = np.argsort(distances_df["distance"].values)[::-1]
    for ind in indices:
        ranked_row = distances_df.iloc[ind]
        print(ranked_row["snippet"], ranked_row["distance"])

NameError: name 'find_Qid_search' is not defined