In [34]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [35]:
repo_dir = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"
results_dir = f"{repo_dir}/artifacts/results"
date = '2022-05-11'

master_data_location  = f"{repo_dir}/data/labelled_master_data_{date}.csv"

In [36]:
labels = ['severe_toxicity', 'toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

truth_labels = {label : f'y_truth_{label}' for label in labels}
pred_labels = {label : f'y_pred_{label}' for label in labels}

selected_columns = ['example_id'] + labels

In [40]:
pred_sets = {
    'human': {
        "file_name": "baseline_human-1_2022-05-07.csv",
    },
#     'human_2': {
#         "file_name": "baseline_human-2_2022-05-07.csv",
#     },
    'BERT': {
        "file_name": "baseline_bert_5_11.csv",
    },
    'DeBERTa v3': {
        "file_name": "baseline_deberta_5_11.csv",
    },
    'RoBERTa': {
        "file_name": "baseline_roberta_5_11.csv",
    },
    'GPT-3': {
        "file_name": "baseline_gpt3_2022-05-12.csv",
    },
    
            }

In [41]:
for model in pred_sets:
    file_location = f"{results_dir}/{pred_sets[model]['file_name']}"
    pred_sets[model]["df"] = pd.read_csv( file_location) 

In [42]:
truth_set = pd.read_csv(master_data_location)
truth_set = truth_set[selected_columns]

overall_truth_set = pd.melt(truth_set, id_vars = ['example_id'], value_vars=labels).rename(columns={'value':'y_truth'})

In [43]:
results_list = []
for model in pred_sets:
    model_results = pred_sets[model]["df"]
    
    test_set_ids = list(model_results['example_id'].values)
    model_truth_set = truth_set[truth_set['example_id'].isin(test_set_ids)]
    model_truth_set_renamed = model_truth_set.rename(columns=truth_labels)
    
    model_preds = model_results[selected_columns]
    model_preds_renamed = model_results.rename(columns=pred_labels)
    overall_model_preds = pd.melt(model_results, id_vars = ['example_id'], value_vars=labels).rename(columns={'value':'y_pred'})
    
    y_labels = model_truth_set_renamed.merge(model_preds_renamed, on='example_id')
    overall_y_labels = overall_truth_set.merge(overall_model_preds, on='example_id')
    
    precision, recall, f1_score, support = precision_recall_fscore_support(
        overall_y_labels['y_truth'], overall_y_labels['y_pred'], 
        labels=[0.0,1.0], beta=1, zero_division=0)
    eval_results = {
            'model': model,
            'attribute': 'overall',
            'precision': precision[1],
            'recall': recall[1],
            'f1_score': f1_score[1],
            'count':len(overall_y_labels['y_truth'])
        }
    
    results_list.append(eval_results)
    for label in labels:
        y_truth_label = f'y_truth_{label}'
        y_pred_label = f'y_pred_{label}'
        precision, recall, f1_score, support = precision_recall_fscore_support(y_labels[y_truth_label].values, y_labels[y_pred_label].values,
                                                                               labels=[0.0,1.0], beta=1, zero_division=0)

        eval_results = {
            'model': model,
            'attribute': label,
            'precision': precision[1],
            'recall': recall[1],
            'f1_score': f1_score[1],
            'count':len(y_labels[y_truth_label].values)
        }

        results_list.append(eval_results)

In [44]:
full_model_results = pd.DataFrame(results_list)

In [45]:
model_results_melt = pd.melt(full_model_results, id_vars=['model','attribute'], value_vars=['precision','f1_score','recall'])

The below should be the Results Table

In [46]:
pd.pivot_table(
    model_results_melt, 
         values=['value'],
         index=['variable','attribute'],
         columns='model'
)

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value,value,value
Unnamed: 0_level_1,model,BERT,DeBERTa v3,GPT-3,RoBERTa,human
variable,attribute,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
f1_score,identity_attack,0.085714,0.08,0.16,0.111111,0.5
f1_score,insult,0.403226,0.343891,0.480211,0.421875,0.686869
f1_score,overall,0.270904,0.258457,0.264572,0.265495,0.48037
f1_score,profanity,0.788774,0.793558,0.827014,0.793558,0.882883
f1_score,severe_toxicity,0.074074,0.097561,0.388889,0.0,0.375
f1_score,threat,0.0,0.0,0.117647,0.4,0.666667
f1_score,toxicity,0.344371,0.290749,0.070796,0.309795,0.703704
precision,identity_attack,1.0,0.375,0.75,0.8,0.372093
precision,insult,0.423729,0.417582,0.365462,0.428571,0.641509
precision,overall,0.270355,0.258108,0.266758,0.264957,0.412698


In [18]:
pd.pivot_table(
    model_results_melt, 
         values=['value'],
         index=['variable','attribute'],
         columns='model'
).style.to_latex()

'\\begin{tabular}{llrrrrr}\n &  & \\multicolumn{5}{r}{value} \\\\\n & model & BERT & DeBERTa v3 & RoBERTa & human_1 & human_2 \\\\\nvariable & attribute &  &  &  &  &  \\\\\n\\multirow[c]{7}{*}{f1_score} & identity_attack & 0.085714 & 0.080000 & 0.111111 & 0.500000 & 0.413793 \\\\\n & insult & 0.403226 & 0.343891 & 0.421875 & 0.686869 & 0.708861 \\\\\n & overall & 0.270904 & 0.258457 & 0.265495 & 0.480370 & 0.336957 \\\\\n & profanity & 0.788774 & 0.793558 & 0.793558 & 0.882883 & 0.882353 \\\\\n & severe_toxicity & 0.074074 & 0.097561 & 0.000000 & 0.375000 & 0.000000 \\\\\n & threat & 0.000000 & 0.000000 & 0.400000 & 0.666667 & 0.000000 \\\\\n & toxicity & 0.344371 & 0.290749 & 0.309795 & 0.703704 & 0.285714 \\\\\n\\multirow[c]{7}{*}{precision} & identity_attack & 1.000000 & 0.375000 & 0.800000 & 0.372093 & 0.750000 \\\\\n & insult & 0.423729 & 0.417582 & 0.428571 & 0.641509 & 0.848485 \\\\\n & overall & 0.270355 & 0.258108 & 0.264957 & 0.412698 & 0.489474 \\\\\n & profanity & 0.981618