# Use Case: Mitigating Bias with Perturbed Data

This notebook uses the [facebook/perturber](https://huggingface.co/facebook/perturber) model to perturb sensitive attributes in text.

In [1]:
from pathlib import Path

import torch
from tqdm import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline

In [2]:

# Load the perturber model and its tokenizer from the same Hugging Face Hub repo id.
PERTURBER_CHECKPOINT = 'facebook/perturber'
model = BartForConditionalGeneration.from_pretrained(PERTURBER_CHECKPOINT)
tokenizer = BartTokenizer.from_pretrained(PERTURBER_CHECKPOINT)

In [7]:
# Check the model working mechanism here: https://huggingface.co/facebook/perturber
# The input appears to follow the form "<word>, <attribute> <PERT_SEP> <text>"
# (see the model card linked above to confirm the accepted attribute values).
sentence_masked = "Europe, Africa <PERT_SEP> Growth is strong in the Europe and they have plenty of liquidity."
# Tokenize to PyTorch tensors, generate from the input ids only, then decode the
# generated ids back to text with special tokens stripped.
batch = tokenizer(sentence_masked, return_tensors="pt")
generated_ids = model.generate(batch["input_ids"])
# In the recorded output the sentence comes back unchanged for this input.
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

['Growth is strong in the Europe and they have plenty of liquidity.']


In [4]:
# Or we can use the pipeline
# Wrap the already-loaded model and tokenizer in a text2text-generation pipeline
# so that several input strings can be processed in one call.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# The first two inputs carry a "<word>, <attribute> <PERT_SEP>" prefix;
# the last two are plain sentences without one.
sentences = [
    sentence_masked,
    'his, woman <PERT_SEP> His growth is strong and he has plenty of liquidity.',
    'there is a shortage of capital, and we need extra financing.',
    'formulation patents might protect Vasotec to a limited extent.',
]

results = pipe(sentences)
results

[{'generated_text': 'Growth is strong in the Europe and they have plenty of liquidity.'},
 {'generated_text': 'Her growth is strong and she has plenty of liquidity.'},
 {'generated_text': 'There is a shortage of capital, and we need extra financing.'},
 {'generated_text': 'Formulation patents might protect Vasotec to a limited extent.'}]

In [25]:
# Tokenize the first example sentence and run a plain forward pass through the
# model (as opposed to generate()); the raw model output is kept in `output`.
encoded_input = tokenizer(sentences[0], padding=True, return_tensors='pt')
output = model(**encoded_input)

In [8]:
# Baseline fill-mask pipeline built on the roberta-base checkpoint; it is compared
# with the FairBERTa pipeline in later cells.
unmasker = pipeline('fill-mask', model='roberta-base')

In [10]:
# Ask the baseline pipeline for its candidates for the masked token in a sentence
# that mentions two named surgeons.
unmasker("Sarah was a much better surgeon than Maria so <mask> always got the easier cases.")

[{'score': 0.6202154159545898,
  'token': 79,
  'token_str': ' she',
  'sequence': 'Sarah was a much better surgeon than Maria so she always got the easier cases.'},
 {'score': 0.17070521414279938,
  'token': 4143,
  'token_str': ' Sarah',
  'sequence': 'Sarah was a much better surgeon than Maria so Sarah always got the easier cases.'},
 {'score': 0.1437147557735443,
  'token': 5011,
  'token_str': ' Maria',
  'sequence': 'Sarah was a much better surgeon than Maria so Maria always got the easier cases.'},
 {'score': 0.0211197342723608,
  'token': 51,
  'token_str': ' they',
  'sequence': 'Sarah was a much better surgeon than Maria so they always got the easier cases.'},
 {'score': 0.007806079462170601,
  'token': 38,
  'token_str': ' I',
  'sequence': 'Sarah was a much better surgeon than Maria so I always got the easier cases.'}]

In [10]:
# Also, let's test their FairBERTa model
# NOTE(review): the load warning recorded for this cell reports that the lm_head.*
# weights were not initialized from the checkpoint and are newly initialized, so the
# mask-filling head is untrained as loaded here. Treat the predictions returned by
# `fair_unmasker` with caution until that is resolved.
fair_unmasker = pipeline('fill-mask', model='facebook/FairBERTa')

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at facebook/FairBERTa and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Quick check of the FairBERTa pipeline on a generic masked prompt.
fair_unmasker("Hello I'm a <mask> model.")

[{'score': 0.10161170363426208,
  'token': 5047,
  'token_str': ' referendum',
  'sequence': "Hello I'm a referendum model."},
 {'score': 0.08171280473470688,
  'token': 15147,
  'token_str': 'uity',
  'sequence': "Hello I'm auity model."},
 {'score': 0.04981056600809097,
  'token': 14547,
  'token_str': ' quo',
  'sequence': "Hello I'm a quo model."},
 {'score': 0.047292739152908325,
  'token': 12976,
  'token_str': ' ast',
  'sequence': "Hello I'm a ast model."},
 {'score': 0.04559711366891861,
  'token': 25482,
  'token_str': 'е',
  'sequence': "Hello I'm aе model."}]

In [12]:
# Baseline (roberta-base) candidates for the masked subject of a growth statement.
unmasker("In 2020, <mask>'s growth was strong and had plenty of opportunities.")

[{'score': 0.5763139128684998,
  'token': 436,
  'token_str': ' China',
  'sequence': "In 2020, China's growth was strong and had plenty of opportunities."},
 {'score': 0.1911449283361435,
  'token': 666,
  'token_str': ' India',
  'sequence': "In 2020, India's growth was strong and had plenty of opportunities."},
 {'score': 0.014853106811642647,
  'token': 1429,
  'token_str': ' Japan',
  'sequence': "In 2020, Japan's growth was strong and had plenty of opportunities."},
 {'score': 0.013153301551938057,
  'token': 1645,
  'token_str': ' Amazon',
  'sequence': "In 2020, Amazon's growth was strong and had plenty of opportunities."},
 {'score': 0.0121165681630373,
  'token': 1257,
  'token_str': ' Apple',
  'sequence': "In 2020, Apple's growth was strong and had plenty of opportunities."}]

In [13]:
# Same growth-statement prompt through the FairBERTa pipeline, for comparison with
# the roberta-base result.
fair_unmasker("In 2020, <mask>'s growth was strong and had plenty of opportunities.")

[{'score': 0.1978205293416977,
  'token': 43375,
  'token_str': 'orem',
  'sequence': "In 2020,orem's growth was strong and had plenty of opportunities."},
 {'score': 0.06638503819704056,
  'token': 43674,
  'token_str': ' whis',
  'sequence': "In 2020, whis's growth was strong and had plenty of opportunities."},
 {'score': 0.05731979012489319,
  'token': 25327,
  'token_str': ' mater',
  'sequence': "In 2020, mater's growth was strong and had plenty of opportunities."},
 {'score': 0.03942045941948891,
  'token': 12607,
  'token_str': 'tle',
  'sequence': "In 2020,tle's growth was strong and had plenty of opportunities."},
 {'score': 0.035004548728466034,
  'token': 35681,
  'token_str': 'ulet',
  'sequence': "In 2020,ulet's growth was strong and had plenty of opportunities."}]

In [26]:
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)

In [27]:
# File names of the PANDA dataset splits on the Hugging Face Hub.
# Only the validation split is read in this cell.
splits = {'train': 'train.jsonl', 'validation': 'valid.jsonl'}
df = pd.read_json(
    f"hf://datasets/facebook/panda/{splits['validation']}",
    lines=True,  # the file is JSON Lines: one record per line
)

In [28]:
# Preview the first rows of the PANDA validation split.
df.head()

Unnamed: 0,original,selected_word,target_attribute,perturbed
0,Mike Leigh populates his movie with a wonderfu...,with,asian,Mike Leigh populates his movie with a wonderfu...
1,What Boy Scout merit badge did Spielberg help ...,Scout,woman,What Girl Scout merit badge did Spielberg help...
2,Meagen Marree Nay (born 5 October 1988) is a c...,Peter,woman,Meagen Marree Nay (born 5 October 1988) is a c...
3,Coming to power in the year 1966 after the bri...,Gandhi,woman,Coming to power in the year 1966 after the bri...
4,Brings awareness to an issue often overlooked ...,issue,white,Brings awareness to an issue often overlooked ...


In [30]:
# Full original text of the first validation example.
df.original[0]

'Mike Leigh populates his movie with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life'

In [31]:
# Full perturbed text of the first validation example, to compare with its original.
df.perturbed[0]

'Mike Leigh populates his movie with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life.'

In [34]:
# Load the Financial PhraseBank CSV. Column names are supplied explicitly through
# `names`; bytes that cannot be decoded as UTF-8 are replaced instead of raising.
filename = "../data/external/financialphrasebank.csv"
#DATASET_CONFIG = { "path": filename, "name": "sentiment"}
# LABEL_MAPPING = { 0: "negative", 1: "neutral", 2: "positive"}
TEXT_COLUMN = "text"
TARGET_COLUMN = "sentiment"
raw_data = (
    pd.read_csv(filename, names=[TARGET_COLUMN, TEXT_COLUMN], encoding="utf-8", encoding_errors="replace")
    # Keep the first occurrence of each sentence. Done as a chained call (no
    # in-place mutation) and keyed on TEXT_COLUMN rather than a repeated literal.
    .drop_duplicates(subset=[TEXT_COLUMN])
)
raw_data.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [36]:
from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer

In [37]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
# NOTE(review): no cell visible in this notebook moves `model` to this device —
# confirm the model and the tensors built below end up on the same device.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

def generate_perturbed_out(inputs):
    """Call the global `model` on `inputs` and return element 0 of its output.

    NOTE(review): presumably `inputs` is a tensor of token ids and element 0 is
    the logits tensor — confirm against the model's output type.
    """
    model_output = model(inputs)
    return model_output[0]

# Special-token ids read from the tokenizer.
# Padding token id: fills every text position of the baseline ("reference") sequence.
ref_token_id = tokenizer.pad_token_id
# Separator token id: appended after the text ids when the input/baseline sequences are built.
sep_token_id = tokenizer.sep_token_id
# Classifier token id: placed before the text ids when the input/baseline sequences are built.
cls_token_id = tokenizer.cls_token_id

def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id):
    """Build the input id tensor and a same-length baseline id tensor for `text`.

    The text is encoded with the global `tokenizer` without special tokens and
    then wrapped as ``[cls] + text + [sep]``. The baseline keeps the same two
    wrapper tokens and replaces every text position with `ref_token_id`.

    Returns:
        A tuple ``(input_ids, ref_input_ids, n_text_tokens)``: two tensors with a
        leading batch dimension of 1, created on the global `device`, and the
        number of text tokens (wrapper tokens excluded).
    """
    encoded = tokenizer.encode(text, add_special_tokens=False)
    n_text_tokens = len(encoded)

    # Real sequence and its baseline share the cls/sep wrapper tokens.
    wrapped_ids = [cls_token_id, *encoded, sep_token_id]
    baseline_ids = [cls_token_id, *([ref_token_id] * n_text_tokens), sep_token_id]

    input_tensor = torch.tensor([wrapped_ids], device=device)
    baseline_tensor = torch.tensor([baseline_ids], device=device)
    return input_tensor, baseline_tensor, n_text_tokens

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    """Build token-type ids for `input_ids` and an all-zero baseline of the same shape.

    Positions ``0..sep_ind`` (inclusive) get type 0; every later position gets
    type 1. Both results are int64 tensors of shape ``(1, seq_len)``.

    The tensors are created on ``input_ids.device`` rather than on a module-level
    device, so they are always co-located with the ids they describe and the
    function has no hidden dependency on notebook state.

    Args:
        input_ids: 2-D tensor of token ids; only ``size(1)`` and the device are used.
        sep_ind: index of the last position that belongs to segment 0.

    Returns:
        ``(token_type_ids, ref_token_type_ids)``.
    """
    seq_len = input_ids.size(1)
    positions = torch.arange(seq_len, device=input_ids.device)
    # Vectorized form of "0 if i <= sep_ind else 1" for every position i.
    token_type_ids = (positions > sep_ind).long().unsqueeze(0)
    ref_token_type_ids = torch.zeros_like(token_type_ids)
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    """Build position ids for `input_ids` and an all-zero baseline of the same shape.

    Position ids count ``0..seq_length-1`` along dimension 1 and are expanded to
    the shape of `input_ids`; the baseline uses position 0 everywhere.

    The tensors are created on ``input_ids.device`` rather than on a module-level
    device, so they are always co-located with the ids they describe.

    Args:
        input_ids: 2-D tensor of token ids.

    Returns:
        ``(position_ids, ref_position_ids)``, both int64 and shaped like `input_ids`.
    """
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
    # we could potentially also use random permutation with
    # `torch.randperm(seq_length, device=input_ids.device)`
    ref_position_ids = torch.zeros_like(position_ids)

    # Add the batch dimension and broadcast (as a view) to the shape of input_ids.
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_ids
    
def construct_attention_mask(input_ids):
    """Return an all-ones mask shaped like `input_ids` (every position attended)."""
    mask = torch.ones_like(input_ids)
    return mask

def custom_forward(inputs):
    """Forward function handed to the attribution method.

    Takes the output of `generate_perturbed_out`, applies a softmax along
    dimension 1, selects index ``[0][0]`` and appends a trailing axis of size 1.

    NOTE(review): if the model output is (batch, sequence, vocab) logits, dim=1
    normalizes across sequence positions rather than across the vocabulary, and
    ``[0][0]`` only ever reads the first example — confirm this is intended.
    """
    logits = generate_perturbed_out(inputs)
    probabilities = torch.softmax(logits, dim=1)
    return probabilities[0][0].unsqueeze(-1)

In [48]:
# Attribute with respect to the decoder's token-embedding layer, using
# `custom_forward` as the function whose output is explained.
lig = LayerIntegratedGradients(custom_forward, model.model.decoder.embed_tokens)

# Build the input/baseline id tensors for the example sentence; the third return
# value (the number of text tokens) is reused as the separator index below.
input_ids, ref_input_ids, sep_id = construct_input_ref_pair(sentence_masked, ref_token_id, sep_token_id, cls_token_id)
# NOTE(review): the token-type ids, position ids and attention mask built here are
# not passed to `lig.attribute` in the cells visible in this notebook.
token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
attention_mask = construct_attention_mask(input_ids)

# Recover the token strings for the input ids; they label the visualization.
indices = input_ids[0].detach().tolist()
all_tokens = tokenizer.convert_ids_to_tokens(indices)

In [40]:
def summarize_attributions(attributions):
    """Reduce attributions to one score per position, scaled by the vector norm.

    Sums over the last dimension, drops a leading dimension of size 1, and
    divides the result by its norm.
    """
    per_position = attributions.sum(dim=-1).squeeze(0)
    scale = torch.norm(per_position)
    return per_position / scale

In [42]:
# Compute layer attributions for the real input ids against the baseline ids;
# `delta` is the convergence delta returned alongside the attributions.
attributions, delta = lig.attribute(inputs=input_ids,
                                    baselines=ref_input_ids,
                                    return_convergence_delta=True)

# Collapse the attributions to one normalized score per token.
attributions_sum = summarize_attributions(attributions)

In [44]:
# storing couple samples in an array for visualization purposes
score_vis = viz.VisualizationDataRecord(
                        attributions_sum,
                        torch.softmax(score, dim = 1)[0][0],
                        torch.argmax(torch.softmax(score, dim = 1)[0]),
                        1, # Positive Sentiment
                        sentence,
                        attributions_sum.sum(),       
                        all_tokens,
                        delta)

print('\033[1m', 'Visualization For Score', '\033[0m')
viz.visualize_text([score_vis])

[1m Visualization For Score [0m


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.00),"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",3.01,"[CLS] for the last quarter of 2010 , component ##a ' s net sales doubled to eur ##13 ##1m from eur ##76 ##m for the same period a year earlier , while it moved to a zero pre - tax profit from a pre - tax loss of eur ##7m . [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.00),"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",3.01,"[CLS] for the last quarter of 2010 , component ##a ' s net sales doubled to eur ##13 ##1m from eur ##76 ##m for the same period a year earlier , while it moved to a zero pre - tax profit from a pre - tax loss of eur ##7m . [SEP]"
,,,,


# 3. Mitigating Bias with Data Augmentation

In the analysis, currency codes (e.g. "EUR") and country names emerged as potential protected attributes in the training process. One way to improve fairness is to introduce counterfactual inputs that reduce the impact of protected attributes on the classification decision. For example, if the currency "EUR" biases the model towards a "positive" prediction, we can generate more samples with various currencies. For instance:

Original sentence: "For the last quarter of 2010, Componenta's net sales doubled to EUR131m from EUR76m for the same period a year earlier, while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m."
Sentiment: Positive

If all sentences with the EUR currency are labeled as positive, the model might incorrectly associate the occurrence of EUR with positivity. To mitigate this issue, we can introduce the same dataset instance with different currencies from around the world.


In [None]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../../utils/')
from counterfactual_generator import generate_random_counterfactual, generate_counterfactuals

In [None]:
# Generate one random counterfactual of a sentence that mentions EUR amounts.
# `target` selects which vocabulary field is perturbed (presumably a column of
# codes-all.csv — confirm against counterfactual_generator).
sentence = "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m ."
# Path fixed to match the '../../utils/' directory used for the module import and
# by the other counterfactual cells (it was '../utils/' here).
vocab_path =  "../../utils/codes-all.csv"
target = "AlphabeticCode"
example_cf = generate_random_counterfactual(sentence, vocab_path, target)
example_cf

In [None]:
# Now that the example counterfactual is generated, we can use the pipeline to predict the sentiment of the counterfactual.
# Note that the counterfactual is almost meaningless: it uses three different currencies, so it is unclear whether
# the change is a positive or negative increase, but the overall statement is still positive.
# NOTE(review): in the cells visible in this notebook, `pipe` is the text2text-generation
# pipeline built around the perturber model, not a sentiment classifier — confirm which
# pipeline is intended here.
print(pipe(sentence))
print(pipe(example_cf))

In [None]:
# Generate one random counterfactual of a sentence that mentions a country, this
# time with target "Entity" instead of a currency code.
sentence = "According to Gran , the company has no plans to move all production to Germany, although that is where the company is growing ."
vocab_path =  "../../utils/codes-all.csv"
target = "Entity"
example_cf = generate_random_counterfactual(sentence, vocab_path, target)
example_cf

In [None]:
# Run the original sentence and its counterfactual through `pipe` and print both results.
# NOTE(review): `pipe` as defined in the visible cells is the perturber
# text2text-generation pipeline — confirm this is the intended model.
print(pipe(sentence))
print(pipe(example_cf))

In [None]:
vocab_path =  "../../utils/codes-all.csv"
target = "Entity"

# Number of leading rows of `raw_data` to expand into counterfactuals.
# Kept at 1 for a quick demo; raise it (up to len(raw_data)) for a full run.
CF_SOURCE_ROWS = 1
cf_output_path = Path('../data/output/counterfactual/financialphrasebank_cfs.csv')

# Save counterfactuals in a new dataframe with the sentiment

sents = []
cfarr = []

# Walk the selected rows once, pairing each label with its text, instead of
# looking every cell up through `iloc` by position.
cf_source = raw_data.iloc[:CF_SOURCE_ROWS]
for sentiment, text in zip(cf_source[TARGET_COLUMN], cf_source[TEXT_COLUMN]):
    # Every counterfactual inherits the sentiment label of its source sentence.
    for cf in generate_counterfactuals(text, vocab_path, target):
        sents.append(sentiment)
        cfarr.append(cf)

cf_df = pd.DataFrame({TARGET_COLUMN: sents, TEXT_COLUMN: cfarr})

# Save it to file, creating the output directory first so a fresh checkout
# does not fail on the write.
cf_output_path.parent.mkdir(parents=True, exist_ok=True)
cf_df.to_csv(cf_output_path, index=False)

# Conclusion

Xing et al.'s recent review[^2] identifies six key challenges in the task of financial sentiment analysis: (1) irrealis mood, (2) rhetoric, (3) dependent opinions, (4) unspecified aspects, (5) unrecognized words, and (6) external references.

- Irrealis mood (conditional mood, subjunctive mood, imperative mood)
- Rhetoric (negative assertion, personification, sarcasm)
- Dependent opinions
- Unspecified aspects
- Unrecognized words (entities, microtext, jargon)
- External references

![Financial Sentiment Analysis Overview Diagram](../../media/finsentiment-flow.png)

*Financial sentiment and impacting factor. Diagram is from [^1]*

I believe that developing effective approaches to these challenges in the financial domain can both improve accuracy and mitigate bias.

[^1]: Du, Kelvin, et al. "Financial Sentiment Analysis: Techniques and Applications." ACM Computing Surveys (2024). <https://dl.acm.org/doi/10.1145/3649451>
[^2]: Xing, Frank, et al. "Financial sentiment analysis: an investigation into common mistakes and silver bullets." Proceedings of the 28th international conference on computational linguistics. 2020. <https://aclanthology.org/2020.coling-main.85.pdf>