# Results

In [7]:
MODELS = [
    'bert-base-uncased',
    'deepseek-ai/deepseek-llm-7b-base',
    'huggyllama/llama-7b'
]
TASKS = [
    "cola",
    "sst2",
    "mrpc",
    "stsb",
    "qqp",
    "mnli",
    "qnli",
    "rte",
    "wnli"
]
TASK_LABELS = {
    "cola": 2,
    "sst2": 2,
    "mrpc": 2,
    "qqp": 2,
    "stsb": 1,
    "mnli": 3,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
}
DEBIAS_METHODS = [
    "none",
    "cda",
    "blind",
    "embedding",
    "ear",
    "adele",
    "selective",
    "eat",
    "diff"
]
TASK_METRICS = {
    "cola": "eval_matthews_correlation",
    "sst2": "eval_accuracy",
    "mrpc": "eval_f1",
    "qqp": "eval_f1",
    "stsb": "eval_pearson",
    "mnli": "eval_accuracy",
    "qnli": "eval_accuracy",
    "rte": "eval_accuracy",
    "wnli": "eval_accuracy",
}
CDA_METHOD = {
    "none": False,
    "cda": True,
    "blind": False,
    "embedding": False,
    "ear": False,
    "adele": True,
    "selective": True,
    "eat": False,
    "diff": False
}

In [48]:
import json

MODEL_NAME = 'bert-base-uncased'
EVAL_OR_TEST = 'eval'

latex_eval = {task: {} for task in TASKS}
latex_bias = {task: {} for task in TASKS}

average = True
if average:
    latex_eval['Average'] = {debias: 0.0 for debias in DEBIAS_METHODS}
    latex_bias['Average'] = {debias: 0.0 for debias in DEBIAS_METHODS}

for task in TASKS:
    for debias in DEBIAS_METHODS:
        path = f"../output/{task}-{debias}-{MODEL_NAME.replace('/', '-')}/results.json"
        try:
            with open(path, "r") as f:
                resultsDict = json.load(f)
                
                print('='*50)
                print(task, debias)
                print('-'*50)
                if task != 'mnli':
                    print('eval: ', resultsDict['eval'][TASK_METRICS[task]]*100)
                else:
                    for sufix in ['_matched', '_mismatched']:
                        print('eval' + sufix + ': ', resultsDict['eval' + sufix][TASK_METRICS[task]]*100)
                print('bias: ', resultsDict['bias']['effect_size'])

                if task != 'mnli':
                    latex_eval[task][debias] = resultsDict[EVAL_OR_TEST][TASK_METRICS[task]]*100
                elif task == 'mnli':
                    latex_eval[task][debias] = [resultsDict[EVAL_OR_TEST + sufix][TASK_METRICS[task]]*100 for sufix in ['_matched', '_mismatched']]*100
                    
                latex_bias[task][debias] = resultsDict['bias']['effect_size']
                
                if average:
                    latex_eval['Average'][debias] += latex_eval[task][debias] / len(TASKS)
                    latex_bias['Average'][debias] += latex_bias[task][debias] / len(TASKS)

        except:
            pass



cola none
--------------------------------------------------
eval:  55.73810919249996
bias:  0.004630552139133215
cola cda
--------------------------------------------------
eval:  55.73810919249996
bias:  -0.08825618028640747
cola blind
--------------------------------------------------
eval:  55.992798722501256
bias:  0.3914726972579956
cola embedding
--------------------------------------------------
eval:  56.27810283916928
bias:  -0.20695161819458008
cola ear
--------------------------------------------------
eval:  55.982788859945096
bias:  0.17955416440963745
cola adele
--------------------------------------------------
eval:  5.926802437957019
bias:  0.17894725501537323
cola selective
--------------------------------------------------
eval:  48.8445640649576
bias:  -0.09232144057750702
cola eat
--------------------------------------------------
eval:  48.45725578053649
bias:  -0.013136533088982105
cola diff
--------------------------------------------------
eval:  0.0
bias:  0.

In [49]:
dashed = True

debias_names = {debias: debias for debias in DEBIAS_METHODS}
debias_names['selective'] = 'sel'
debias_names['embedding'] = 'emb'

header_cs = 'c|'*(len(latex_eval.keys())) + 'c'
header = '\\begin{table}[h] \n \t \\small \n \t \\centering \n \t \\begin{tabular}{' + header_cs + '}\n'
table_eval = header + '\t\t \\hline Debias & '
table_bias = header + '\t\t \\hline Debias & '

for (i, task) in enumerate(latex_eval.keys()):
    if i == len(latex_eval.keys())-1:
        table_eval += f'{task.upper() if task != 'Average' else 'Average'} \\\\ \\hline '
        table_bias += f'{task.upper() if task != 'Average' else 'Average'} \\\\ \\hline '
    else:
        table_eval += f'{task.upper()} & '
        table_bias += f'{task.upper()} & '

for debias in DEBIAS_METHODS:
    if dashed and debias == 'cda':
        table_eval += f'\n \t\t \\hdashline {debias} & '
        table_bias += f'\n \t\t \\hdashline {debias} & '
    else:
        table_eval += f'\n \t\t {debias_names[debias]} & '
        table_bias += f'\n \t\t {debias_names[debias]} & '
    
    for (i, task) in enumerate(latex_eval.keys()):

        if i == len(latex_eval.keys())-1:
            try:
                table_eval += f'{latex_eval[task][debias]:.1f} \\\\ '
                table_bias += f'{latex_bias[task][debias]:.4f} \\\\ '
            except:
                table_eval += r'- \\ '
                table_bias += r'- \\ '
                
        else:
            entry_eval = f'- & '
            entry_bias = f'- & '
            try:
                if task != 'mnli':
                    entry_eval = f'{latex_eval[task][debias]:.1f} & '
                    entry_bias = f'{latex_bias[task][debias]:.4f} & '
                else:
                    entry_eval = f'{latex_eval[task][debias][0]:.1f}/{latex_eval[task][debias][1]:.1f} & '
                    entry_bias = f'{latex_bias[task][debias]:.4f} & '
            except:
                pass
            finally:
                table_eval += entry_eval
                table_bias += entry_bias

table_eval += '\\hline \n \t \\end{tabular} \n \t \\caption{Performance of the different model on GLUE tasks.} \n \t \\label{tab:performance} \n \\end{table}'
table_bias += '\\hline \n \t \\end{tabular} \n \t \\caption{WEAT 7 test for the debiasing methods.} \n \t \\label{tab:bias} \n \\end{table}'

print('EVALUATION TABLE \n', '-'*50)
print(table_eval)
print('='*50, '\n'*3)
print('BIAS TABLE \n', '-'*50)
print(table_bias)
print('='*50, '\n'*3)

EVALUATION TABLE 
 --------------------------------------------------
\begin{table}[h] 
 	 \small 
 	 \centering 
 	 \begin{tabular}{c|c|c|c|c|c|c|c|c|c|c}
		 \hline Debias & COLA & SST2 & MRPC & STSB & QQP & MNLI & QNLI & RTE & WNLI & Average \\ \hline 
 		 none & 55.7 & 92.4 & 88.2 & 87.9 & 87.2 & 84.1/84.7 & 91.4 & 63.9 & 38.0 & 67.2 \\ 
 		 \hdashline cda & 55.7 & 92.5 & 88.4 & 88.0 & 87.5 & 84.3/84.3 & 91.3 & 66.1 & 36.6 & 67.3 \\ 
 		 blind & 56.0 & 92.3 & 87.0 & - & 87.7 & - & 91.3 & 63.5 & 39.4 & 57.5 \\ 
 		 emb & 56.3 & 92.7 & 87.5 & 55.4 & 87.6 & 84.4/84.1 & 91.3 & 63.2 & 43.7 & 64.2 \\ 
 		 ear & 56.0 & 92.5 & 88.0 & 88.4 & - & 84.4/84.4 & 91.5 & 62.5 & 53.5 & 59.2 \\ 
 		 adele & 5.9 & 87.2 & 81.2 & 52.8 & - & - & 83.4 & 53.4 & 56.3 & 46.7 \\ 
 		 sel & 48.8 & 91.7 & 86.7 & 81.6 & - & - & 90.0 & 61.7 & 56.3 & 57.4 \\ 
 		 eat & 48.5 & 92.3 & 86.6 & 87.2 & - & - & 89.3 & 64.3 & 38.0 & 56.2 \\ 
 		 diff & 0.0 & 48.7 & 81.2 & 4.9 & - & - & 49.7 & 49.1 & 47.9 & 31.3 \\ \hline 