In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine

import torch
from transformers import BertModel, BertTokenizer

import matplotlib.pyplot as plt
import seaborn as sn

## Example sentences

In [4]:
# Tokens whose rows are selected from the encoded sentences.
words_of_interest = ['man', 'woman', 'king', 'queen']

# One definition-style sentence per word of interest, listed in the same order.
# NOTE(review): the first sentence has no trailing period, unlike the other three,
# so it tokenizes to one token fewer — confirm whether that is intentional.
sentences = [
    "A man is an adult male person",
    "A woman is an adult female person.",
    "A king is a male sovereign or monarch.",
    "A queen is a female sovereign or monarch.",
]

## Encode words

In [9]:
def encode_words(model_name_or_path, sentences, words_of_interest):
    """
    Encode sentences with a BERT model and return the contextual embeddings of selected tokens.

    Each sentence is wrapped in [CLS]/[SEP], tokenized, padded to the longest
    sentence of the batch and sent through the model in a single forward pass.
    The embedding of a token is the sum of its hidden states over the last four
    layers of the model.

    Args:
        model_name_or_path: Name or path passed to `BertTokenizer.from_pretrained`
            and `BertModel.from_pretrained`.
        sentences: Non-empty list of raw sentences (str).
        words_of_interest: Tokens to keep. They are compared for exact equality
            with the tokens produced by the tokenizer, so matching is
            case-sensitive and (presumably) a word that the tokenizer splits
            into several pieces will not be matched — verify for new words.

    Returns:
        pandas.DataFrame with one row per kept token occurrence, ordered by
        sentence and then by token position. Columns are 'feat0'..'feat{H-1}'
        (H being the last dimension of the hidden states), followed by 'Token'
        and 'Sentence'.

    Raises:
        ValueError: If `sentences` is empty.
    """
    if len(sentences) == 0:
        raise ValueError("`sentences` must contain at least one sentence.")

    print("   Loading pretrained model/tokenizer...")
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True, cache_dir ='../_cache') # Will output all hidden_states.
    # `from_pretrained` already returns the model in evaluation mode; made explicit
    # here because the embeddings must not be affected by dropout.
    model.eval()

    print("   Tokenizing sentences...")
    marked_text = ["[CLS] " + sent + " [SEP]" for sent in sentences]
    tokenized_text = [tokenizer.tokenize(sent) for sent in marked_text]
    indexed_tokens = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_text]

    max_len = max(len(ids) for ids in indexed_tokens)
    print("   Maximum length in dataset: {}".format(max_len))

    print("   Padding/Truncating sentences according to the maximum length...")
    # Use the tokenizer's own padding id rather than assuming it is 0 (fall back to 0 if undefined).
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    padded = [ids + [pad_id] * (max_len - len(ids)) for ids in indexed_tokens]

    print("   Creating attention masks...")
    # Build the mask from the true sequence lengths instead of from the id values,
    # so the mask stays correct whatever the padding id is.
    mask = [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in indexed_tokens]

    print("   Converting inputs to torch tensors...")
    input_ids = torch.tensor(padded, dtype=torch.long)
    attention_mask = torch.tensor(mask, dtype=torch.long)

    print("   Encoding sentences...")
    with torch.no_grad():
        # With `output_hidden_states=True`, the output can be indexed as:
        #  - output[0]: last_hidden_state, a tensor of shape (batch_size, sequence_length, hidden_size).
        #  - output[1]: pooler_output, a tensor of shape (batch_size, hidden_size).
        #  - output[2]: all hidden states, a tuple of tensors of shape (batch_size, sequence_length, hidden_size):
        #               the initial embedding output followed by the output of each encoder layer.
        output = model(input_ids, attention_mask=attention_mask)

    # Sum the last four layers of each token as its embedding.
    # Stacking creates a leading "layers" dimension, which the sum then removes,
    # leaving an array of shape (batch_size, sequence_length, hidden_size).
    sentence_vecs = torch.stack(output[2][-4:], dim=0).sum(dim=0).numpy()

    # Build the dataframe in one go: one row per (sentence, token position), padding included.
    hidden_size = sentence_vecs.shape[2]
    cols = ['feat'+str(i) for i in range(hidden_size)]
    df = pd.DataFrame(data=sentence_vecs.reshape(-1, hidden_size), columns=cols)
    df['Token'] = [tok
                   for tokens in tokenized_text
                   for tok in tokens + ['[PAD]'] * (max_len - len(tokens))]
    df['Sentence'] = [sent for sent in sentences for _ in range(max_len)]

    # Keep only the rows of the requested tokens.
    df = df.loc[df['Token'].isin(words_of_interest)].reset_index(drop=True)
    return df


# Encode the same sentences with the generic BERT model...
print("BERT-base")
bert_df = encode_words(
    model_name_or_path='bert-base-cased',
    sentences=sentences,
    words_of_interest=words_of_interest,
)

# ...and with the NetBERT checkpoint, so both sets of embeddings can be compared.
print("NetBERT")
netbert_df = encode_words(
    model_name_or_path='../_models/netbert/checkpoint-1027000/',
    sentences=sentences,
    words_of_interest=words_of_interest,
)

BERT-base
   Loading pretrained model/tokenizer...
   Tokenizing sentences...
   Maximum length in dataset: 11
   Padding/Truncating sentences according to the maximum length...
   Creating attention masks...
   Converting inputs to torch tensors...
   Encoding sentences...
NetBERT
   Loading pretrained model/tokenizer...
   Tokenizing sentences...
   Maximum length in dataset: 11
   Padding/Truncating sentences according to the maximum length...
   Creating attention masks...
   Converting inputs to torch tensors...
   Encoding sentences...


In [None]:
def compute_cosine_matrix(embeddings):
    """
    Compute the pairwise cosine-similarity matrix of a set of embedding vectors.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, n_features), one
            embedding per row.

    Returns:
        numpy.ndarray of shape (n_vectors, n_vectors) where entry (i, j) is
        `1 - scipy.spatial.distance.cosine(embeddings[i], embeddings[j])`,
        i.e. the cosine similarity between rows i and j.
    """
    # Coerce to a float array so that object-dtype input (e.g. from a DataFrame) is accepted.
    embeddings = np.asarray(embeddings, dtype=np.float64)

    # Size the matrix from the embeddings actually passed in, not from a global.
    n_vectors = len(embeddings)
    matrix = np.zeros((n_vectors, n_vectors))
    for i in range(n_vectors):
        for j in range(n_vectors):
            # `cosine` returns a distance; convert it to a similarity.
            matrix[i, j] = 1 - cosine(embeddings[i], embeddings[j])
    return matrix


# Extract the numeric feature columns of each dataframe, leaving out the
# 'Token' and 'Sentence' metadata columns.
bert_embeddings = bert_df.drop(columns=['Token', 'Sentence'], errors='ignore').values
netbert_embeddings = netbert_df.drop(columns=['Token', 'Sentence'], errors='ignore').values

# Pairwise cosine similarities between the embeddings of each model.
bert_matrix = compute_cosine_matrix(bert_embeddings)
netbert_matrix = compute_cosine_matrix(netbert_embeddings)

In [12]:
# Feature vectors of the rows whose token is 'man' in the BERT-base dataframe
# (metadata columns 'Token' and 'Sentence' excluded).
man_embeddings = bert_df[bert_df['Token'] == 'man'].drop(columns=['Token', 'Sentence'], errors='ignore').values
man_embeddings

array([[-8.3719724e-01, -3.0992615e+00,  1.5333366e+00,  1.1977975e+00,
        -2.7423725e+00,  8.6249113e-01, -6.6017288e-01, -5.5595934e-01,
        -1.6599344e+00, -1.6755519e+00,  1.8551106e+00,  1.7558801e+00,
         1.8281502e+00,  9.6179539e-01,  1.8294655e+00, -1.5017356e-01,
         1.1233219e+00, -2.1325142e+00, -1.7152760e+00,  4.1317788e-01,
         2.9944005e+00, -3.1503868e+00, -2.0412574e+00, -2.0504624e-02,
         3.1516033e-01,  2.9274256e+00,  5.2282352e+00,  2.7195036e+00,
        -9.3632948e-01,  1.6387001e+00, -1.6784500e+00,  3.1639194e-01,
         4.6398473e+00,  7.8341359e-01, -1.6638021e+00,  6.4319801e-01,
        -1.6758496e-01,  4.2268267e+00,  1.9415925e-01,  8.4248018e-01,
         2.7305216e-02, -2.9846883e+00, -2.9454646e+00, -2.4893360e-01,
         6.1950266e-02,  3.9726882e+00,  1.6171968e+00, -4.4875637e-01,
        -4.6565962e+00,  1.8632925e-01,  5.5215921e+00,  7.3540235e-01,
         1.1853963e+00,  4.2297858e-01,  3.6933780e-01, -2.84467

In [None]:
# List Mark
# Groups of related networking acronyms, stored as strings so that running this
# cell no longer raises a NameError on bare identifiers.
# NOTE(review): presumably candidate words for further embedding comparisons —
# confirm the intended use.
acronym_groups = [
    ['DNS', 'mDNS'],
    ['LAN', 'WAN', 'CAN', 'MAN'],
    ['ACL', 'PACL', 'VACL'],
    ['HSRP', 'VRRP', 'GLBP'],
    ['EIGRP', 'IGRP', 'BGP', 'EBGP'],
    ['STP', 'DTP', 'VTP'],
]