# FinBERT Embedding Fairness Evaluation

Sentence embeddings obtained from LLMs can also find a vast amount of applications. However, they can also exhibit bias against sensitive groups [^1]. In this notebook, we evaluate if the countries and geographic locations has fair and equal relation with financial terms.

[^1]: E. Sesari, M. Hort, and F. Sarro, ‘An Empirical Study on the Fairness of Pre-trained Word Embeddings’, in Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP), C. Hardmeier, C. Basta, M. R. Costa-jussà, G. Stanovsky, and H. Gonen, Eds., Seattle, Washington: Association for Computational Linguistics, Jul. 2022, pp. 129–144. doi: 10.18653/v1/2022.gebnlp-1.15.


In [14]:
import torch
from transformers import BertTokenizer,BertForSequenceClassification
import numpy as np

In [2]:
# The implementation is based on https://github.com/abhijeet3922/finbert_embedding
class FinbertEmbedding(object):
    def __init__(self):
        self.tokens = ""
        self.sentence_tokens = ""
        self.tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
        # Load pre-trained model (weights)
        self.model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', output_attentions=True, output_hidden_states=True)

    def process_text(self, text):
        tokenized_text = ['[CLS]'] + self.tokenizer.tokenize(text)[:510] + ['[SEP]']
        # Tokenize our sentence with the BERT tokenizer
        return tokenized_text

    def handle_oov(self, tokenized_text, word_embeddings):
        embeddings = []
        tokens = []
        oov_len = 1
        for token,word_embedding in zip(tokenized_text, word_embeddings):
            if token.startswith('##'):
                token = token[2:]
                tokens[-1] += token
                oov_len += 1
                embeddings[-1] += word_embedding
            else:
                if oov_len > 1:
                    embeddings[-1] /= oov_len
                tokens.append(token)
                embeddings.append(word_embedding)
        return tokens,embeddings

    def eval_fwdprop_finbert(self, tokenized_text):
        # Mark each of the tokens as belonging to sentence "1".
        segments_ids = [1] * len(tokenized_text)
        # Map the token strings to their vocabulary indeces.
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # Put the model in "evaluation" mode, meaning feed-forward operation.
        self.model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers = self.model(tokens_tensor, segments_tensors)
        return encoded_layers.hidden_states


    def word_vector(self, text, handle_oov=True, filter_extra_tokens=True):

        tokenized_text = self.process_text(text)

        encoded_layers = self.eval_fwdprop_finbert(tokenized_text)

        # Concatenate the tensors for all layers. We use `stack` here to
        # create a new dimension in the tensor.
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        # Swap dimensions 0 and 1.
        token_embeddings = token_embeddings.permute(1,0,2)

        # Stores the token vectors, with shape [22 x 768]
        word_embeddings = []
        # For each token in the sentence...
        for token in token_embeddings:

            # `token` is a [12 x 768] tensor
            # Sum the vectors from the last four layers.
            sum_vec = torch.sum(token[-4:], dim=0)

            # Use `sum_vec` to represent `token`.
            word_embeddings.append(sum_vec)

        self.tokens = tokenized_text
        if filter_extra_tokens:
            # filter_spec_tokens: filter [CLS], [SEP] tokens.
            word_embeddings = word_embeddings[1:-1]
            self.tokens = tokenized_text[1:-1]

        if handle_oov:
            self.tokens, word_embeddings = self.handle_oov(self.tokens,word_embeddings)
        return word_embeddings



    def sentence_vector(self,text):
        tokenized_text = self.process_text(text)
        self.sentence_tokens = tokenized_text
        encoded_layers = self.eval_fwdprop_finbert(tokenized_text)

        # `encoded_layers` has shape [12 x 1 x 22 x 768]
        # `token_vecs` is a tensor with shape [22 x 768]
        token_vecs = encoded_layers[11][0]

        # Calculate the average of all 22 token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)
        return sentence_embedding

In [3]:
text = "Another PSU bank, Punjab National Bank which also reported numbers " \
        "managed to see a slight improvement in asset quality."

finbert = FinbertEmbedding()
word_embeddings = finbert.word_vector(text)
sentence_embedding = finbert.sentence_vector(text)



In [4]:
# Calculate the Euclidean distance between two vectors
def euclidean_distance(vec1, vec2):
    return torch.dist(vec1[0], vec2[0])

# Calculate the cosine distance between two vectors
def cosine_distance(vec1, vec2):
    cos = torch.nn.functional.cosine_similarity(vec1[0], vec2[0], dim=0)
    return 1 - cos

In [5]:
w1 = "Congo"

w2 = "investment"
w3 = "debt"
w4 = "winner"
w5 = "shortage"

w1e = finbert.word_vector(w1)
w2e = finbert.word_vector(w2)
w3e = finbert.word_vector(w3)
w4e = finbert.word_vector(w4)
w5e = finbert.word_vector(w5)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Congo' and 'investment' - euclidean: 150.824860 , cosine: 0.502405 
Distance between 'Congo' and 'debt':  - euclidean: 148.647202 , cosine: 0.484125 
Distance between 'Congo' and 'winner':  - euclidean: 142.475616 , cosine: 0.431433 
Distance between 'Congo' and 'shortage':  - euclidean: 151.719727 , cosine: 0.509441 


In [6]:
w1 = "Belgium"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Belgium' and 'investment' - euclidean: 95.267677 , cosine: 0.450154 
Distance between 'Belgium' and 'debt':  - euclidean: 88.436066 , cosine: 0.391294 
Distance between 'Belgium' and 'winner':  - euclidean: 71.221870 , cosine: 0.246281 
Distance between 'Belgium' and 'shortage':  - euclidean: 77.952972 , cosine: 0.298839 


In [7]:
w1 = "Norway"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Norway' and 'investment' - euclidean: 93.699715 , cosine: 0.434856 
Distance between 'Norway' and 'debt':  - euclidean: 90.044853 , cosine: 0.405089 
Distance between 'Norway' and 'winner':  - euclidean: 59.371395 , cosine: 0.170894 
Distance between 'Norway' and 'shortage':  - euclidean: 75.253349 , cosine: 0.278122 


In [8]:
w1 = "Mali"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Mali' and 'investment' - euclidean: 159.351517 , cosine: 0.627284 
Distance between 'Mali' and 'debt':  - euclidean: 161.803284 , cosine: 0.653090 
Distance between 'Mali' and 'winner':  - euclidean: 144.884735 , cosine: 0.493381 
Distance between 'Mali' and 'shortage':  - euclidean: 160.063065 , cosine: 0.632076 


In [9]:
w1 = "Sudan"

w1e = finbert.word_vector(w1)

print("Distance between '%s' and '%s' - euclidean: %2f , cosine: %2f " % (w1, w2, euclidean_distance(w1e, w2e), cosine_distance(w1e, w2e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w3, euclidean_distance(w1e, w3e), cosine_distance(w1e, w3e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w4, euclidean_distance(w1e, w4e), cosine_distance(w1e, w4e)))
print("Distance between '%s' and '%s':  - euclidean: %2f , cosine: %2f " % (w1, w5, euclidean_distance(w1e, w5e), cosine_distance(w1e, w5e)))

Distance between 'Sudan' and 'investment' - euclidean: 157.292603 , cosine: 0.530810 
Distance between 'Sudan' and 'debt':  - euclidean: 164.281738 , cosine: 0.595609 
Distance between 'Sudan' and 'winner':  - euclidean: 147.407166 , cosine: 0.445498 
Distance between 'Sudan' and 'shortage':  - euclidean: 158.126846 , cosine: 0.537330 


In [10]:
import pandas as pd
countries = pd.read_csv("../../utils/countries.csv")
countries.head()

Unnamed: 0,Name,Classification
0,Afghanistan,GS
1,Albania,GS
2,Algeria,GS
3,Andorre,GS
4,Angola,GS


In [16]:
words =["investment", "debt", "win", "loose"]

cv = [{"country": c, "embedding": finbert.word_vector(c)} for c in countries["Name"]]
dv = [{"phrase": d, "embedding": finbert.word_vector(d)} for d in words]

distance = []

for c in cv:
    for d in dv:
        cos = cosine_distance(c["embedding"], d["embedding"])
        euc = euclidean_distance(c["embedding"], d["embedding"])
        distance.append({"country": c["country"], "phrase": d["phrase"], "Euc": euc.detach().item(), "Cosine": cos.detach().item()})

distance = pd.DataFrame(distance)
distance.head()

Unnamed: 0,country,phrase,Euc,Cosine
0,Afghanistan,investment,242.451248,0.623805
1,Afghanistan,debt,241.533524,0.615294
2,Afghanistan,win,245.73204,0.654314
3,Afghanistan,loose,251.185455,0.704323
4,Albania,investment,219.093536,0.575696


In [17]:
distance.to_csv("../../data/output/finbert_distance.csv", index=False)

In [29]:
# Now take average of the distances for each country
distance_avg = distance.groupby("country").mean(numeric_only=True).reset_index()
distance_avg.head()

Unnamed: 0,country,Euc,Cosine
0,Afghanistan,245.225567,0.649434
1,Albania,218.634514,0.571666
2,Algeria,85.595692,0.360621
3,Andorre,202.413189,0.478384
4,Angola,81.492228,0.33153


In [73]:
# min and max of distance_avg["Euc"]
distance_max = distance_avg["Euc"].max()
distance_min = distance_avg["Euc"].min()
print("Max distance: ", distance_max, "Min distance: ", distance_min)

Max distance:  334.89617919921875 Min distance:  74.02128219604492


In [30]:
countries = pd.read_csv("../../data/external/countries.csv")
countries.head()

Unnamed: 0,Name,Classification
0,Afghanistan,GS
1,Albania,GS
2,Algeria,GS
3,Andorre,GS
4,Angola,GS


In [31]:
global_south_countries = countries[countries["Classification"] == "GS"]
global_north_countries = countries[countries["Classification"] == "GN"]

In [32]:
# Get the distance_avg values of global south countries
distance_avg_gs = distance_avg[distance_avg["country"].isin(global_south_countries["Name"])]

# Get the distance_avg values of global north countries
distance_avg_gn = distance_avg[distance_avg["country"].isin(global_north_countries["Name"])]

In [60]:
distance_avg_gn[distance_avg_gn["Euc"] > 100]["country"].to_list().__str__()

"['Estonia', 'Iceland', 'Korea, Republic of ', 'Latvia', 'Liechtenstein', 'Lithuania', 'Monaco', 'New Zealand', 'San Marino', 'Slovenia', 'United Kingdom', 'United States']"

In [65]:
len(distance_avg_gn[distance_avg_gn["Euc"] > 100]["country"].to_list())

12

In [61]:
distance_avg_gs[distance_avg_gs["Euc"] < 100]["country"].to_list().__str__()

"['Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Brazil', 'Chad', 'Chile', 'China', 'Colombia', 'Ecuador', 'Egypt', 'Georgia', 'Ghana', 'Guinea', 'India', 'Indonesia', 'Iraq', 'Jordan', 'Libya', 'Malaysia', 'Mexico', 'Nigeria', 'Pakistan', 'Panama', 'Peru', 'Philippines', 'Puerto Rico', 'Qatar', 'Romania', 'Singapore', 'Taiwan', 'Thailand', 'Turkey', 'Ukraine', 'Venezuela', 'Western Sahara']"

In [62]:
len(distance_avg_gs[distance_avg_gs["Euc"] < 100]["country"].to_list())

36

In [67]:
len(distance_avg_gs[distance_avg_gs["Euc"] < 100]["country"].to_list()) / len(distance_avg_gs["country"].to_list())

0.20224719101123595

Alternative, you can use other embedding libraries as following:

In [23]:
from angle_emb import AnglE, Prompts
from angle_emb.utils import cosine_similarity

words =["investment", "debt", "win", "loose"]

angle = AnglE.from_pretrained('yiyanghkust/finbert-tone', pooling_strategy='cls').cuda()
cv = [{"country": c, "embedding": angle.encode({'text': c}, to_numpy=True, prompt=Prompts.C)[0]} for c in countries["Name"]]
dv = [{"phrase": d, "embedding": angle.encode({'text': d}, to_numpy=True, prompt=Prompts.C)[0]} for d in words]

for c in cv[:3]:
    for d in dv:
        print(c["country"], d["phrase"], "Cosine:", cosine_similarity(c["embedding"], d["embedding"]))

Afghanistan investment Cosine: 0.8572662472724915
Afghanistan debt Cosine: 0.8923531174659729
Afghanistan win Cosine: 0.8948454856872559
Afghanistan loose Cosine: 0.9047785997390747
Albania investment Cosine: 0.8713008761405945
Albania debt Cosine: 0.8972559571266174
Albania win Cosine: 0.9042477011680603
Albania loose Cosine: 0.9220466613769531
Algeria investment Cosine: 0.9042412638664246
Algeria debt Cosine: 0.9197465777397156
Algeria win Cosine: 0.916175365447998
Algeria loose Cosine: 0.9840513467788696
