# Results

In [7]:
MODELS = [
    'bert-base-uncased',
    'deepseek-ai/deepseek-llm-7b-base',
    'huggyllama/llama-7b'
]
TASKS = [
    "cola",
    "sst2",
    "mrpc",
    "stsb",
    "qqp",
    "mnli",
    "qnli",
    "rte",
    "wnli"
]
TASK_LABELS = {
    "cola": 2,
    "sst2": 2,
    "mrpc": 2,
    "qqp": 2,
    "stsb": 1,
    "mnli": 3,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
}
DEBIAS_METHODS = [
    "none",
    "cda",
    "blind",
    "embedding",
    "ear",
    "adele",
    "selective",
    "eat",
    "diff"
]
TASK_METRICS = {
    "cola": "eval_matthews_correlation",
    "sst2": "eval_accuracy",
    "mrpc": "eval_accuracy",
    "stsb": "eval_pearson",
    "mnli": "eval_accuracy",
    "qnli": "eval_accuracy",
    "rte": "eval_accuracy",
    "wnli": "eval_accuracy",
}
CDA_METHOD = {
    "none": False,
    "cda": True,
    "blind": False,
    "embedding": False,
    "ear": False,
    "adele": True,
    "selective": True,
    "eat": False,
    "diff": False
}

In [None]:
import json

MODEL_NAME = 'bert-base-uncased'
EVAL_OR_TEST = 'eval'

latex_eval = {task: {} for task in TASKS}
latex_bias = {task: {} for task in TASKS}

average = True
if average:
    latex_eval['Average'] = {debias: 0.0 for debias in DEBIAS_METHODS}
    latex_bias['Average'] = {debias: 0.0 for debias in DEBIAS_METHODS}

for task in TASKS:
    for debias in DEBIAS_METHODS:
        path = f"../output/{task}-{debias}-{MODEL_NAME.replace('/', '-')}/results.json"
        try:
            with open(path, "r") as f:
                resultsDict = json.load(f)
                
                print('='*50)
                print(task, debias)
                print('-'*50)
                print('eval: ', resultsDict[EVAL_OR_TEST][TASK_METRICS[task]])
                print('bias: ', resultsDict['bias']['effect_size'])

                latex_eval[task][debias] = resultsDict[EVAL_OR_TEST][TASK_METRICS[task]]
                latex_bias[task][debias] = resultsDict['bias']['effect_size']
                
                if average:
                    latex_eval['Average'][debias] += latex_eval[task][debias] / len(TASKS)
                    latex_bias['Average'][debias] += latex_bias[task][debias] / len(TASKS)

        except:
            pass



cola none
--------------------------------------------------
eval:  0.5573810919249996
bias:  0.004630552139133215
cola cda
--------------------------------------------------
eval:  0.5573810919249996
bias:  -0.08825618028640747
cola blind
--------------------------------------------------
eval:  0.5599279872250126
bias:  0.3914726972579956
cola embedding
--------------------------------------------------
eval:  0.5627810283916928
bias:  -0.20695161819458008
cola ear
--------------------------------------------------
eval:  0.559827888599451
bias:  0.17955416440963745
cola adele
--------------------------------------------------
eval:  0.0592680243795702
bias:  0.17894725501537323
cola selective
--------------------------------------------------
eval:  0.488445640649576
bias:  -0.09232144057750702
cola eat
--------------------------------------------------
eval:  0.4845725578053649
bias:  -0.013136533088982105
cola diff
--------------------------------------------------
eval:  0.0
bias

In [None]:
dashed = True

debias_names = {debias: debias for debias in DEBIAS_METHODS}
debias_names['selective'] = 'sel'
debias_names['embedding'] = 'emb'

header_cs = 'c|'*(len(latex_eval.keys())) + 'c'
header = '\\begin{table}[h] \n \t \\small \n \t \\centering \n \t \\begin{tabular}{' + header_cs + '}\n'
table_eval = header + '\t\t \\hline Debias & '
table_bias = header + '\t\t \\hline Debias & '

for (i, task) in enumerate(latex_eval.keys()):
    if i == len(latex_eval.keys())-1:
        table_eval += f'{task.upper() if task != 'Average' else 'Average'} \\\\ \\hline '
        table_bias += f'{task.upper() if task != 'Average' else 'Average'} \\\\ \\hline '
    else:
        table_eval += f'{task.upper()} & '
        table_bias += f'{task.upper()} & '

for debias in DEBIAS_METHODS:
    if dashed and debias == 'cda':
        table_eval += f'\n \t\t \\hdashline {debias} & '
        table_bias += f'\n \t\t \\hdashline {debias} & '
    else:
        table_eval += f'\n \t\t {debias_names[debias]} & '
        table_bias += f'\n \t\t {debias_names[debias]} & '
    for (i, task) in enumerate(latex_eval.keys()):
        if i == len(latex_eval.keys())-1:
            try:
                table_eval += f'{latex_eval[task][debias]:.4f} \\\\ '
                table_bias += f'{latex_bias[task][debias]:.4f} \\\\ '
            except:
                table_eval += r'- \\ '
                table_bias += r'- \\ '
                
        else:
            try:
                table_eval += f'{latex_eval[task][debias]:.4f} & '
                table_bias += f'{latex_bias[task][debias]:.4f} & '
            except:
                table_eval += f'- & '
                table_bias += f'- & '

table_eval += '\\hline \n \t \\end{tabular} \n \t \\caption{Performance of the different model on GLUE tasks.} \n \t \\label{tab:performance} \n \\end{table}'
table_bias += '\\hline \n \t \\end{tabular} \n \t \\caption{WEAT 7 test for the debiasing methods.} \n \t \\label{tab:bias} \n \\end{table}'

print('EVALUATION TABLE \n', '-'*50)
print(table_eval)
print('='*50, '\n'*3)
print('BIAS TABLE \n', '-'*50)
print(table_bias)
print('='*50, '\n'*3)

EVALUATION TABLE 
 --------------------------------------------------
\begin{table}[h] 
 	 \small 
 	 \centering 
 	 \begin{tabular}{c|c|c|c|c|c|c|c|c|c|c}
		 \hline Debias & COLA & SST2 & MRPC & STSB & QQP & MNLI & QNLI & RTE & WNLI & Average \\ \hline 
 		 none & 0.5574 & 0.9243 & 0.8284 & 0.8789 & - & - & 0.9141 & 0.6390 & 0.3803 & 0.5692 \\ 
 		 \hdashline cda & 0.5574 & 0.9255 & 0.8333 & 0.8802 & - & - & 0.9127 & 0.6606 & 0.3662 & 0.5707 \\ 
 		 blind & 0.5599 & 0.9232 & 0.8064 & - & - & - & 0.9131 & 0.6354 & 0.3944 & 0.4703 \\ 
 		 emb & 0.5628 & 0.9266 & 0.8186 & 0.5535 & - & - & 0.9131 & 0.6318 & 0.4366 & 0.5381 \\ 
 		 ear & 0.5598 & 0.9255 & 0.8235 & 0.8845 & - & - & 0.9149 & 0.6245 & 0.5352 & 0.5853 \\ 
 		 adele & 0.0593 & 0.8716 & 0.6838 & 0.5281 & - & - & 0.8342 & 0.5343 & 0.5634 & 0.4527 \\ 
 		 sel & 0.4884 & 0.9174 & 0.8039 & 0.8163 & - & - & 0.9001 & 0.6173 & 0.5634 & 0.5674 \\ 
 		 eat & 0.4846 & 0.9232 & 0.8039 & 0.8721 & - & - & 0.8931 & 0.6426 & 0.3803 & 0.5555 \\