# FinBERT Embedding Fairness Evaluation

Sentence embeddings obtained from LLMs can also find a vast amount of applications. However, they can also exhibit bias against sensitive groups [^1]. In this notebook, we evaluate if the countries and geographic locations has fair and equal relation with financial terms.

[^1]: E. Sesari, M. Hort, and F. Sarro, ‘An Empirical Study on the Fairness of Pre-trained Word Embeddings’, in Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP), C. Hardmeier, C. Basta, M. R. Costa-jussà, G. Stanovsky, and H. Gonen, Eds., Seattle, Washington: Association for Computational Linguistics, Jul. 2022, pp. 129–144. doi: 10.18653/v1/2022.gebnlp-1.15.


In [1]:
import torch
from transformers import BertTokenizer,BertForSequenceClassification

In [2]:
# The implementation is based on https://github.com/abhijeet3922/finbert_embedding
class FinbertEmbedding(object):
    def __init__(self):
        self.tokens = ""
        self.sentence_tokens = ""
        self.tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
        # Load pre-trained model (weights)
        self.model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', output_attentions=True, output_hidden_states=True)

    def process_text(self, text):
        tokenized_text = ['[CLS]'] + self.tokenizer.tokenize(text)[:510] + ['[SEP]']
        # Tokenize our sentence with the BERT tokenizer
        return tokenized_text

    def handle_oov(self, tokenized_text, word_embeddings):
        embeddings = []
        tokens = []
        oov_len = 1
        for token,word_embedding in zip(tokenized_text, word_embeddings):
            if token.startswith('##'):
                token = token[2:]
                tokens[-1] += token
                oov_len += 1
                embeddings[-1] += word_embedding
            else:
                if oov_len > 1:
                    embeddings[-1] /= oov_len
                tokens.append(token)
                embeddings.append(word_embedding)
        return tokens,embeddings

    def eval_fwdprop_finbert(self, tokenized_text):
        # Mark each of the tokens as belonging to sentence "1".
        segments_ids = [1] * len(tokenized_text)
        # Map the token strings to their vocabulary indeces.
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # Put the model in "evaluation" mode, meaning feed-forward operation.
        self.model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers = self.model(tokens_tensor, segments_tensors)
        return encoded_layers.hidden_states


    def word_vector(self, text, handle_oov=True, filter_extra_tokens=True):

        tokenized_text = self.process_text(text)

        encoded_layers = self.eval_fwdprop_finbert(tokenized_text)

        # Concatenate the tensors for all layers. We use `stack` here to
        # create a new dimension in the tensor.
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        # Swap dimensions 0 and 1.
        token_embeddings = token_embeddings.permute(1,0,2)

        # Stores the token vectors, with shape [22 x 768]
        word_embeddings = []
        # For each token in the sentence...
        for token in token_embeddings:

            # `token` is a [12 x 768] tensor
            # Sum the vectors from the last four layers.
            sum_vec = torch.sum(token[-4:], dim=0)

            # Use `sum_vec` to represent `token`.
            word_embeddings.append(sum_vec)

        self.tokens = tokenized_text
        if filter_extra_tokens:
            # filter_spec_tokens: filter [CLS], [SEP] tokens.
            word_embeddings = word_embeddings[1:-1]
            self.tokens = tokenized_text[1:-1]

        if handle_oov:
            self.tokens, word_embeddings = self.handle_oov(self.tokens,word_embeddings)
        return word_embeddings



    def sentence_vector(self,text):
        tokenized_text = self.process_text(text)
        self.sentence_tokens = tokenized_text
        encoded_layers = self.eval_fwdprop_finbert(tokenized_text)

        # `encoded_layers` has shape [12 x 1 x 22 x 768]
        # `token_vecs` is a tensor with shape [22 x 768]
        token_vecs = encoded_layers[11][0]

        # Calculate the average of all 22 token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)
        return sentence_embedding

In [3]:
text = "Another PSU bank, Punjab National Bank which also reported numbers " \
        "managed to see a slight improvement in asset quality."

finbert = FinbertEmbedding()
word_embeddings = finbert.word_vector(text)
sentence_embedding = finbert.sentence_vector(text)



In [4]:
# Calculate the Euclidean distance between two vectors
def euclidean_distance(vec1, vec2):
    return torch.dist(vec1[0], vec2[0])

# Calculate the cosine distance between two vectors
def cosine_distance(vec1, vec2):
    cos = torch.nn.functional.cosine_similarity(vec1[0], vec2[0], dim=0)
    return 1 - cos

In [8]:
w1 = "Congo"
w2 = "investment"
w3 = "debt"
w4 = "win"
w5 = "loose"

w1e = finbert.word_vector(w1)
w2e = finbert.word_vector(w2)
w3e = finbert.word_vector(w3)
w4e = finbert.word_vector(w4)
w5e = finbert.word_vector(w5)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Congo' and 'investment' - euclidean: 150.824860 , cosine: 0.502405 
Distance between 'Congo' and 'debt':  - euclidean: 148.647202 , cosine: 0.484125 
Distance between 'Congo' and 'win':  - euclidean: 144.732971 , cosine: 0.450423 
Distance between 'Congo' and 'loose':  - euclidean: 145.567276 , cosine: 0.456663 


In [10]:
w1 = "Belgium"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Belgium' and 'investment' - euclidean: 95.267677 , cosine: 0.450154 
Distance between 'Belgium' and 'debt':  - euclidean: 88.436066 , cosine: 0.391294 
Distance between 'Belgium' and 'win':  - euclidean: 84.905350 , cosine: 0.356996 
Distance between 'Belgium' and 'loose':  - euclidean: 74.098877 , cosine: 0.267031 


In [13]:
w1 = "Norway"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Norway' and 'investment' - euclidean: 93.699715 , cosine: 0.434856 
Distance between 'Norway' and 'debt':  - euclidean: 90.044853 , cosine: 0.405089 
Distance between 'Norway' and 'win':  - euclidean: 81.687523 , cosine: 0.329992 
Distance between 'Norway' and 'loose':  - euclidean: 64.469574 , cosine: 0.201857 


In [15]:
w1 = "Mali"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Mali' and 'investment' - euclidean: 159.351517 , cosine: 0.627284 
Distance between 'Mali' and 'debt':  - euclidean: 161.803284 , cosine: 0.653090 
Distance between 'Mali' and 'win':  - euclidean: 147.637405 , cosine: 0.519416 
Distance between 'Mali' and 'loose':  - euclidean: 150.224640 , cosine: 0.539831 


In [16]:
w1 = "Sudan"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Sudan' and 'investment' - euclidean: 157.292603 , cosine: 0.530810 
Distance between 'Sudan' and 'debt':  - euclidean: 164.281738 , cosine: 0.595609 
Distance between 'Sudan' and 'win':  - euclidean: 149.464676 , cosine: 0.463038 
Distance between 'Sudan' and 'loose':  - euclidean: 150.522659 , cosine: 0.471212 


In [17]:
import pandas as pd
countries = pd.read_csv("../../utils/countries.csv")
countries.head()

Unnamed: 0,Name,Classification
0,Afghanistan,GS
1,Albania,GS
2,Algeria,GS
3,Andorre,GS
4,Angola,GS


In [21]:
from angle_emb import AnglE, Prompts
from angle_emb.utils import cosine_similarity

words =["investment", "debt", "win", "loose"]

angle = AnglE.from_pretrained('yiyanghkust/finbert-tone', pooling_strategy='cls').cuda()
cv = [{"country": c, "embedding": angle.encode({'text': c}, to_numpy=True, prompt=Prompts.C)[0]} for c in countries["Name"]]
dv = [{"phrase": d, "embedding": angle.encode({'text': d}, to_numpy=True, prompt=Prompts.C)[0]} for d in words]

for c in cv:
    for d in dv:
        print(c["country"], d["phrase"], cosine_similarity(c["embedding"], d["embedding"]))

Afghanistan investment 0.8572662472724915
Afghanistan debt 0.8923531174659729
Afghanistan win 0.8948454856872559
Afghanistan loose 0.9047785997390747
Albania investment 0.8713008761405945
Albania debt 0.8972559571266174
Albania win 0.9042477011680603
Albania loose 0.9220466613769531
Algeria investment 0.9042412638664246
Algeria debt 0.9197465777397156
Algeria win 0.916175365447998
Algeria loose 0.9840513467788696
Andorre investment 0.8653796315193176
Andorre debt 0.8851211071014404
Andorre win 0.9119380116462708
Andorre loose 0.9384868144989014
Angola investment 0.9014282822608948
Angola debt 0.9234171509742737
Angola win 0.9172984957695007
Angola loose 0.9836975336074829
Anguilla investment 0.8642070889472961
Anguilla debt 0.8851739764213562
Anguilla win 0.9021797180175781
Anguilla loose 0.9165503978729248
Antigua and Barbuda investment 0.8794898986816406
Antigua and Barbuda debt 0.913998007774353
Antigua and Barbuda win 0.9074241518974304
Antigua and Barbuda loose 0.9184509515762329


In [23]:
words =["investment", "debt", "win", "loose"]

cv = [{"country": c, "embedding": finbert.word_vector(c)} for c in countries["Name"]]
dv = [{"phrase": d, "embedding": finbert.word_vector(d)} for d in words]

for c in cv:
    for d in dv:
        print(c["country"], d["phrase"], torch.nn.functional.cosine_similarity(c["embedding"][0], d["embedding"][0], dim=0))

Afghanistan investment tensor(0.3762)
Afghanistan debt tensor(0.3847)
Afghanistan win tensor(0.3457)
Afghanistan loose tensor(0.2957)
Albania investment tensor(0.4243)
Albania debt tensor(0.4140)
Albania win tensor(0.4703)
Albania loose tensor(0.4047)
Algeria investment tensor(0.5890)
Algeria debt tensor(0.5971)
Algeria win tensor(0.6576)
Algeria loose tensor(0.7139)
Andorre investment tensor(0.4773)
Andorre debt tensor(0.4559)
Andorre win tensor(0.5756)
Andorre loose tensor(0.5777)
Angola investment tensor(0.5936)
Angola debt tensor(0.6414)
Angola win tensor(0.6788)
Angola loose tensor(0.7600)
Anguilla investment tensor(0.3797)
Anguilla debt tensor(0.3655)
Anguilla win tensor(0.4793)
Anguilla loose tensor(0.3786)
Antigua and Barbuda investment tensor(0.4521)
Antigua and Barbuda debt tensor(0.4429)
Antigua and Barbuda win tensor(0.5172)
Antigua and Barbuda loose tensor(0.4611)
Argentina investment tensor(0.5830)
Argentina debt tensor(0.6153)
Argentina win tensor(0.5452)
Argentina loose