# Results

In [1]:
# Configuration shared by the rest of the notebook.

# Model identifiers; a '/' in a name is replaced by '-' when the output
# directory path is built.
MODELS = [
    'bert-base-uncased',
    'deepseek-ai/deepseek-llm-7b-base',
    'huggyllama/llama-7b'
]

# Task names, in the order they appear as table columns.
TASKS = [
    "cola",
    "sst2",
    "mrpc",
    "stsb",
    "qqp",
    "mnli",
    "qnli",
    "rte",
    "wnli"
]

# Number of labels per task.
TASK_LABELS = {
    "cola": 2,
    "sst2": 2,
    "mrpc": 2,
    "qqp": 2,
    "stsb": 1,
    "mnli": 3,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
}

# Debiasing method names, in the order they appear as table rows.
DEBIAS_METHODS = [
    "none",
    "cda",
    "blind",
    "embedding",
    "ear",
    "adele",
    "selective",
    "eat",
    "diff"
]

# Key under results['eval'] that is reported for each task.
# Every entry of TASKS needs a metric here, otherwise that task's results can
# never be read. "qqp" was previously missing.
# NOTE(review): "eval_accuracy" is assumed for qqp to match the other
# classification tasks; switch to "eval_f1" if F1 should be reported instead.
TASK_METRICS = {
    "cola": "eval_matthews_correlation",
    "sst2": "eval_accuracy",
    "mrpc": "eval_accuracy",
    "stsb": "eval_pearson",
    "qqp": "eval_accuracy",
    "mnli": "eval_accuracy",
    "qnli": "eval_accuracy",
    "rte": "eval_accuracy",
    "wnli": "eval_accuracy",
}

# Per-method boolean flag (presumably whether the method relies on CDA-style
# data augmentation; confirm against the training code).
CDA_METHOD = {
    "none": False,
    "cda": True,
    "blind": False,
    "embedding": False,
    "ear": False,
    "adele": True,
    "selective": True,
    "eat": False,
    "diff": False
}

In [2]:
import json

MODEL_NAME = 'bert-base-uncased'

# task -> debias method -> score. Combinations whose results file is missing
# or unusable are left out, so later cells can render them as "-".
latex_eval = {}
latex_bias = {}

for task in TASKS:
    latex_eval[task] = {}
    latex_bias[task] = {}

    # A task without a configured metric cannot be read; report it instead of
    # letting a KeyError be silently swallowed for every debias method.
    metric = TASK_METRICS.get(task)
    if metric is None:
        print(f'skipping task {task!r}: no entry in TASK_METRICS')
        continue

    for debias in DEBIAS_METHODS:
        path = f"../output/{task}-{debias}-{MODEL_NAME.replace('/', '-')}/results.json"
        try:
            # Keep the file open only for the read itself.
            with open(path, "r") as f:
                results = json.load(f)
            eval_score = results['eval'][metric]
            bias_score = results['bias']['effect_size']
        except FileNotFoundError:
            # Expected: this task/method combination has not been run yet.
            continue
        except (OSError, json.JSONDecodeError, KeyError, TypeError) as err:
            # Unexpected: the file exists but is unreadable or has the wrong
            # shape. Skip it, but say so instead of hiding the problem.
            print(f'skipping {path}: {type(err).__name__}: {err}')
            continue

        print('='*50)
        print(task, debias)
        print('-'*50)
        print('eval: ', eval_score)
        print('bias: ', bias_score)

        # Both scores are stored together so the two dicts always have the
        # same set of (task, debias) entries.
        latex_eval[task][debias] = eval_score
        latex_bias[task][debias] = bias_score


# Header row: task names separated by '&' and terminated by a LaTeX row end
# ('\\'). The previous version compared the index to a range object, so the
# terminating branch never ran, and it emitted a single backslash.
header_cells = ' & '.join(task.upper() for task in latex_eval)
table_eval = 'Task & ' + header_cells + ' \\\\ '
table_bias = 'Task & ' + header_cells + ' \\\\ '

cola none
--------------------------------------------------
eval:  0.5573810919249996
bias:  0.004630552139133215
cola cda
--------------------------------------------------
eval:  0.5573810919249996
bias:  -0.08825618028640747
cola blind
--------------------------------------------------
eval:  0.5599279872250126
bias:  0.3914726972579956
cola embedding
--------------------------------------------------
eval:  0.5627810283916928
bias:  -0.20695161819458008
cola ear
--------------------------------------------------
eval:  0.559827888599451
bias:  0.17955416440963745
cola selective
--------------------------------------------------
eval:  0.488445640649576
bias:  -0.09232144057750702
sst2 none
--------------------------------------------------
eval:  0.9243119266055045
bias:  0.1490212231874466
sst2 cda
--------------------------------------------------
eval:  0.9254587155963303
bias:  0.3088687062263489
sst2 blind
--------------------------------------------------
eval:  0.9231651376

In [13]:
def _latex_cell(scores, task, debias):
    """Return the score for (task, debias) formatted to 4 decimals, or '-' if absent or non-numeric."""
    value = scores.get(task, {}).get(debias)
    if isinstance(value, (int, float)):
        return f'{value:.4f}'
    return '-'


def _latex_table(scores, tasks, methods, caption, label):
    """Render one LaTeX table with a row per debias method and a column per task.

    scores  -- mapping task -> debias method -> numeric score
    tasks   -- column order
    methods -- row order
    caption -- text placed in \\caption{...}
    label   -- text placed in \\label{...}; must be unique per table
    """
    column_spec = 'c|' * len(tasks) + 'c'
    table = '\\begin{table}[h] \n \t \\centering \n \t \\begin{tabular}{' + column_spec + '}\n'

    # Header row.
    table += '\t\t \\hline Debias method & '
    table += ' & '.join(task.upper() for task in tasks) + ' \\\\ \\hline '

    # Each cell is formatted independently, so a missing value in one table
    # can never add an extra cell to the other table's row.
    for debias in methods:
        cells = [_latex_cell(scores, task, debias) for task in tasks]
        table += f'\n \t\t {debias} & ' + ' & '.join(cells) + ' \\\\ '

    table += (
        '\\hline \n \t \\end{tabular} \n \t \\caption{' + caption + '} \n \t \\label{'
        + label + '} \n \\end{table}'
    )
    return table


# Both tables use the column order of latex_eval.
table_tasks = list(latex_eval.keys())

table_eval = _latex_table(
    latex_eval,
    table_tasks,
    DEBIAS_METHODS,
    'Performance of the different model on GLUE tasks.',
    'tab:performance',
)
# The bias table gets its own label: two tables sharing \label{tab:performance}
# makes LaTeX report a multiply-defined label and breaks \ref.
table_bias = _latex_table(
    latex_bias,
    table_tasks,
    DEBIAS_METHODS,
    'WEAT 7 test for the debiasing methods.',
    'tab:bias',
)

print('EVALUATION TABLE \n', '-'*50)
print(table_eval)
print('='*50, '\n'*3)
print('BIAS TABLE \n', '-'*50)
print(table_bias)

EVALUATION TABLE 
 --------------------------------------------------
\begin{table}[h] 
 	 \centering 
 	 \begin{tabular}{c|c|c|c|c|c|c|c|c|c}
		 \hline Debias method & COLA & SST2 & MRPC & STSB & QQP & MNLI & QNLI & RTE & WNLI \\ \hline 
 		 none & 0.5574 & 0.9243 & 0.8284 & 0.8789 & - & - & - & - & - \\ 
 		 cda & 0.5574 & 0.9255 & 0.8333 & 0.8802 & - & - & - & - & - \\ 
 		 blind & 0.5599 & 0.9232 & 0.8064 & - & - & - & - & - & - \\ 
 		 embedding & 0.5628 & 0.9266 & 0.8186 & - & - & - & - & - & - \\ 
 		 ear & 0.5598 & 0.9255 & 0.8235 & 0.8845 & - & - & - & - & - \\ 
 		 adele & - & 0.8716 & 0.6838 & 0.5281 & - & - & - & - & - \\ 
 		 selective & 0.4884 & 0.9174 & 0.8039 & 0.8163 & - & - & - & - & - \\ 
 		 eat & - & - & - & - & - & - & - & - & - \\ 
 		 diff & - & 0.4874 & 0.6838 & 0.0493 & - & - & - & - & - \\ \hline 
 	 \end{tabular} 
 	 \caption{Performance of the different model on GLUE tasks.} 
 	 \label{tab:performance} 
 \end{table}



BIAS TABLE 
 -------------------------