In [4]:
# tested in transformers==4.18.0 
from transformers import BertTokenizer,BertForSequenceClassification, BertConfig, pipeline, utils
from tqdm import tqdm
import torch
import os
import pandas as pd

In [2]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
config = BertConfig.from_pretrained('yiyanghkust/finbert-tone')



In [3]:
pipe = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

In [12]:
input_folders = [
    "../../data/output/synth-findata/mistral-large/gn_negative_tone_positive_phrase",
    "../../data/output/synth-findata/mistral-large/gs_negative_tone_positive_phrase",
    "../../data/output/synth-findata/mistral-large/gn_positive_tone_positive_phrase",
    "../../data/output/synth-findata/mistral-large/gs_positive_tone_positive_phrase",
]

for input_folder in input_folders:
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            file_path = os.path.join(input_folder, filename)
            df = pd.read_csv(file_path)
            pred = pipe(df['response'].to_list())

            df['label'] = [p['label'] for p in pred]
            df['score'] = [p['score'] for p in pred]
            df.to_csv(file_path, index=False)

In [22]:
data = {'y_true': [], 'y_pred': [], 'country': [], 'pos': []}
gsdf = pd.DataFrame(data)

gs_folders = [
    "../../data/output/synth-findata/mistral-large/gs_negative_tone_positive_phrase",
    "../../data/output/synth-findata/mistral-large/gs_positive_tone_positive_phrase",
]

out_file = "../../data/output/synth-findata/mistral-large/gs_predictions.csv"

y_true = []
y_pred = []
country = []

for input_folder in gs_folders:
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            file_path = os.path.join(input_folder, filename)
            df = pd.read_csv(file_path)
            y_true.extend(df["sentiment"].to_list())
            y_pred.extend(df["label"].str.lower().to_list())
            country.extend(df["country"].to_list())

gsdf["y_true"] = y_true
gsdf["y_pred"] = y_pred
gsdf["country"] = country
gsdf["pos"] = ["GS"] * len(gsdf)

gsdf.to_csv(out_file, index=False)

In [23]:
data = {'y_true': [], 'y_pred': [], 'country': [], 'pos': []}
gndf = pd.DataFrame(data)

gn_folders = [
    "../../data/output/synth-findata/mistral-large/gn_negative_tone_positive_phrase",
    "../../data/output/synth-findata/mistral-large/gn_positive_tone_positive_phrase",
]

out_file = "../../data/output/synth-findata/mistral-large/gn_predictions.csv"

y_true = []
y_pred = []
country = []

for input_folder in gn_folders:
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            file_path = os.path.join(input_folder, filename)
            df = pd.read_csv(file_path)
            y_true.extend(df["sentiment"].to_list())
            y_pred.extend(df["label"].str.lower().to_list())
            country.extend(df["country"].to_list())

gndf["y_true"] = y_true
gndf["y_pred"] = y_pred
gndf["country"] = country
gndf["pos"] = ["GN"] * len(gndf)

gndf.to_csv(out_file, index=False)

In [47]:
out_file = "../../data/output/synth-findata/mistral-large/all_predictions.csv"
merged_df = pd.concat([gndf, gsdf])
merged_df.to_csv(out_file, index=False)

In [48]:
y_true = merged_df["y_true"].map({"positive": 1, "negative": 0})
y_pred = merged_df["y_pred"].map({"positive": 1, "negative": 0})
sf_data = merged_df["pos"]

In [49]:
nan_locations = y_pred[y_pred.isna()].index
y_pred = y_pred.drop(nan_locations)
y_true = y_true.drop(nan_locations)
sf_data = sf_data.drop(nan_locations)

nan_locations = y_true[y_true.isna()].index
y_pred = y_pred.drop(nan_locations)
y_true = y_true.drop(nan_locations)
sf_data = sf_data.drop(nan_locations)

In [50]:
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import count, \
                              false_positive_rate, \
                              selection_rate

In [51]:
from sklearn.metrics import recall_score

# Define a custom recall function with average='macro' and pos_label='positive'
def recall_macro(y_true, y_pred):
    return recall_score(y_true, y_pred, average='macro')

# Construct a function dictionary
my_metrics = {
    'tpr' : recall_macro,
    'fpr' : false_positive_rate,
    'sel' : selection_rate,
    'count' : count
}

# Construct a MetricFrame
mf = MetricFrame(
    metrics=my_metrics,
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sf_data,
)

In [52]:
mf.overall

tpr          0.987420
fpr          0.024713
sel          0.508557
count    13497.000000
dtype: float64

In [53]:
mf.by_group

Unnamed: 0_level_0,tpr,fpr,sel,count
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GN,0.980725,0.038247,0.511867,6699.0
GS,0.994116,0.01118,0.505296,6798.0


In [54]:
mf.group_min()

tpr      0.980725
fpr       0.01118
sel      0.505296
count      6699.0
dtype: object

In [55]:
mf.group_max()

tpr      0.994116
fpr      0.038247
sel      0.511867
count      6798.0
dtype: object

In [56]:
mf.difference()

tpr       0.013391
fpr       0.027067
sel       0.006572
count    99.000000
dtype: float64

In [57]:
mf.ratio()

tpr      0.986530
fpr      0.292308
sel      0.987161
count    0.985437
dtype: float64

In [58]:
mf.difference(method='to_overall')

tpr         0.006696
fpr         0.013533
sel         0.003310
count    6798.000000
dtype: float64

In [59]:
mf.ratio(method='to_overall')

tpr      0.993220
fpr      0.452381
sel      0.993534
count    0.496333
dtype: float64

In [60]:
from fairlearn.metrics import demographic_parity_ratio
print(demographic_parity_ratio(y_true,
                               y_pred,
                               sensitive_features=sf_data))

0.9871611922296122


In [61]:
from fairlearn.metrics import equalized_odds_ratio
print(equalized_odds_ratio(y_true,
                               y_pred,
                               sensitive_features=sf_data))

0.2923076923076924


In [62]:
min(mf.ratio(method="between_groups"))

0.2923076923076924