In [1]:
import torch
from transformers import BertTokenizer,BertForSequenceClassification

In [2]:
# The implementation is based on https://github.com/abhijeet3922/finbert_embedding
class FinbertEmbedding(object):
    """Word- and sentence-level embeddings from the ``yiyanghkust/finbert-tone`` model.

    ``word_vector`` returns one vector per word (sum of the last four hidden
    layers, word pieces merged back into whole words); ``sentence_vector``
    returns the mean of the last hidden layer over all tokens.
    """

    def __init__(self):
        # Tokens produced by the most recent word_vector() / sentence_vector() call.
        self.tokens = ""
        self.sentence_tokens = ""
        self.tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
        # Load pre-trained model (weights). `num_labels` is a model option, so it
        # is passed here rather than to the tokenizer.
        self.model = BertForSequenceClassification.from_pretrained(
            'yiyanghkust/finbert-tone',
            num_labels=3,
            output_attentions=True,
            output_hidden_states=True,
        )
        print("Initialization Done !!")

    def process_text(self, text):
        """Tokenize `text` into word pieces wrapped in ``[CLS]`` / ``[SEP]``.

        At most 510 word pieces are kept so that the sequence, including the two
        special tokens, never exceeds 512 tokens.
        """
        word_pieces = self.tokenizer.tokenize(text)[:510]
        return ['[CLS]'] + word_pieces + ['[SEP]']

    def handle_oov(self, tokenized_text, word_embeddings):
        """Merge ``##`` continuation pieces back into whole words.

        Args:
            tokenized_text: word-piece strings.
            word_embeddings: one vector per entry of `tokenized_text`.

        Returns:
            ``(tokens, embeddings)`` where each token is a whole word and each
            embedding is the mean of the vectors of the pieces forming that word.
            The input vectors are not modified.
        """
        tokens = []
        sums = []
        piece_counts = []
        for token, word_embedding in zip(tokenized_text, word_embeddings):
            is_continuation = token.startswith('##')
            if is_continuation and tokens:
                tokens[-1] += token[2:]
                # Out-of-place add: do not mutate the caller's tensors.
                sums[-1] = sums[-1] + word_embedding
                piece_counts[-1] += 1
            else:
                # A continuation piece with no preceding word starts a new word.
                tokens.append(token[2:] if is_continuation else token)
                sums.append(word_embedding)
                piece_counts.append(1)

        # Average per word, using that word's own piece count (tracked per word
        # so the last word is averaged too and counts never leak between words).
        embeddings = [
            total / count if count > 1 else total
            for total, count in zip(sums, piece_counts)
        ]
        return tokens, embeddings

    def eval_fwdprop_finbert(self, tokenized_text):
        """Run a forward pass and return the model's ``hidden_states`` tuple."""
        # Map the token strings to their vocabulary indices.
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # Convert inputs to PyTorch tensors (batch of one sequence).
        tokens_tensor = torch.tensor([indexed_tokens])
        # Every position is a real token (no padding), so attend to all of them.
        # The second positional argument of the transformers forward() is the
        # attention mask, so it is passed by keyword to make that explicit.
        attention_mask = torch.ones_like(tokens_tensor)

        # Put the model in "evaluation" mode, meaning feed-forward operation.
        self.model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
            outputs = self.model(input_ids=tokens_tensor, attention_mask=attention_mask)
        return outputs.hidden_states

    def word_vector(self, text, handle_oov=True, filter_extra_tokens=True):
        """Return a list with one embedding vector per word of `text`.

        Args:
            text: input string.
            handle_oov: merge ``##`` word pieces into whole words (averaged).
            filter_extra_tokens: drop the ``[CLS]`` and ``[SEP]`` positions.

        Side effect: stores the resulting token strings in ``self.tokens``.
        """
        tokenized_text = self.process_text(text)
        hidden_states = self.eval_fwdprop_finbert(tokenized_text)

        print("Summing last 4 layers for each token")
        # Stack the last four layers -> [4 x 1 x seq x hidden], sum over the
        # layer axis and drop the batch axis -> [seq x hidden].
        summed = torch.stack(tuple(hidden_states[-4:]), dim=0).sum(dim=0)[0]
        # One vector per token.
        word_embeddings = list(summed.unbind(dim=0))

        self.tokens = tokenized_text
        if filter_extra_tokens:
            # filter_spec_tokens: filter [CLS], [SEP] tokens.
            word_embeddings = word_embeddings[1:-1]
            self.tokens = tokenized_text[1:-1]

        if handle_oov:
            self.tokens, word_embeddings = self.handle_oov(self.tokens, word_embeddings)
        print(self.tokens)
        print("Shape of Word Embeddings = %s" % len(word_embeddings))
        return word_embeddings

    def sentence_vector(self, text):
        """Return a single vector for `text`: the mean of the last hidden layer.

        The mean is taken over every token position, including ``[CLS]`` and
        ``[SEP]``. Side effect: stores the tokens in ``self.sentence_tokens``.
        """
        print("Taking last layer embedding of each word.")
        print("Mean of all words for sentence embedding.")
        tokenized_text = self.process_text(text)
        self.sentence_tokens = tokenized_text
        hidden_states = self.eval_fwdprop_finbert(tokenized_text)

        # `hidden_states` holds the embedding output followed by one entry per
        # layer, so the last layer is index -1 (a fixed index of 11 would pick
        # the second-to-last layer of a 12-layer model).
        # `token_vecs` has shape [seq x hidden] after dropping the batch axis.
        token_vecs = hidden_states[-1][0]

        # Average over all token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)
        print("Shape of Sentence Embeddings = %s" % len(sentence_embedding))
        return sentence_embedding

In [3]:
# Sample sentence used to exercise both embedding entry points.
text = (
    "Another PSU bank, Punjab National Bank which also reported numbers "
    "managed to see a slight improvement in asset quality."
)

# Constructing the embedder loads the tokenizer and the model weights.
finbert = FinbertEmbedding()
# One vector per word, then a single vector for the whole sentence.
word_embeddings = finbert.word_vector(text)
sentence_embedding = finbert.sentence_vector(text)



Initialization Done !!
Summing last 4 layers for each token
['another', 'psu', 'bank', ',', 'punjab', 'national', 'bank', 'which', 'also', 'reported', 'numbers', 'managed', 'to', 'see', 'a', 'slight', 'improvement', 'in', 'asset', 'quality', '.']
Shape of Word Embeddings = %s 21
Taking last layer embedding of each word.
Mean of all words for sentence embedding.
Shape of Sentence Embeddings = %s 768


In [14]:
def euclidean_distance(vec1, vec2):
    """Euclidean distance between the first vectors of two embedding sequences.

    Only ``vec1[0]`` and ``vec2[0]`` are compared; any further entries are ignored.
    """
    first, second = vec1[0], vec2[0]
    distance = torch.dist(first, second)
    return distance

def cosine_distance(vec1, vec2):
    """Cosine distance (1 - cosine similarity) between ``vec1[0]`` and ``vec2[0]``.

    Only the first vector of each sequence is used.
    """
    first, second = vec1[0], vec2[0]
    similarity = torch.nn.functional.cosine_similarity(first, second, dim=0)
    return 1 - similarity

In [19]:
# Compare "Kongo" with two finance-related words.
w1 = "Kongo"
w2 = "investment"
w3 = "debt"

# word_vector returns a list of per-word vectors; the distance helpers
# compare the first vector of each list.
w1e = finbert.word_vector(w1)
w2e = finbert.word_vector(w2)
w3e = finbert.word_vector(w3)

# One print for both comparisons so the labels stay consistent.
# `%.2f` prints two decimals (the former `%2f` was only a minimum width).
for other_word, other_emb in ((w2, w2e), (w3, w3e)):
    print("Distance between '%s' and '%s' - euclidean: %.2f , cosine: %.2f " % (
        w1, other_word,
        euclidean_distance(w1e, other_emb),
        cosine_distance(w1e, other_emb),
    ))


Summing last 4 layers for each token
['kongo']
Shape of Word Embeddings = %s 1
Summing last 4 layers for each token
['investment']
Shape of Word Embeddings = %s 1
Summing last 4 layers for each token
['debt']
Shape of Word Embeddings = %s 1
Distance between 'Kongo' and 'investment' - euclidean: 151.991226 , cosine: 0.563182 
Distance between 'Kongo' and 'debt':  - euclidean: 154.009674 , cosine: 0.583373 


In [20]:
# Compare "Belgium" with two finance-related words.
w1 = "Belgium"
w2 = "investment"
w3 = "debt"

# word_vector returns a list of per-word vectors; the distance helpers
# compare the first vector of each list.
w1e = finbert.word_vector(w1)
w2e = finbert.word_vector(w2)
w3e = finbert.word_vector(w3)

# One print for both comparisons so the labels stay consistent.
# `%.2f` prints two decimals (the former `%2f` was only a minimum width).
for other_word, other_emb in ((w2, w2e), (w3, w3e)):
    print("Distance between '%s' and '%s' - euclidean: %.2f , cosine: %.2f " % (
        w1, other_word,
        euclidean_distance(w1e, other_emb),
        cosine_distance(w1e, other_emb),
    ))


Summing last 4 layers for each token
['belgium']
Shape of Word Embeddings = %s 1
Summing last 4 layers for each token
['investment']
Shape of Word Embeddings = %s 1
Summing last 4 layers for each token
['debt']
Shape of Word Embeddings = %s 1
Distance between 'Belgium' and 'investment' - euclidean: 95.267677 , cosine: 0.450154 
Distance between 'Belgium' and 'debt':  - euclidean: 88.436066 , cosine: 0.391294 
