In [56]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [57]:
repo_dir = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"
results_dir = f"{repo_dir}/artifacts/results"
date = '2022-05-11'

master_data_location  = f"{repo_dir}/data/labelled_master_data_{date}.csv"

In [58]:
labels = ['severe_toxicity', 'toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

truth_labels = {label : f'y_truth_{label}' for label in labels}
pred_labels = {label : f'y_pred_{label}' for label in labels}

selected_columns = ['example_id'] + labels

In [59]:
pred_sets = {
    'Human': {
        "file_name": "baseline_human-1_2022-05-07.csv",
    },
#     'human_2': {
#         "file_name": "baseline_human-2_2022-05-07.csv",
#     },
    'BERT': {
        "file_name": "baseline_bert_5_11.csv",
    },
    'DeBERTa v3': {
        "file_name": "baseline_deberta_5_11.csv",
    },
    'RoBERTa': {
        "file_name": "baseline_roberta_5_11.csv",
    },
    'GPT-3 (One-Shot)': {
        "file_name": "baseline_gpt3_2022-05-12.csv",
    },
    'GPT-3 (Few-Shot)': {
        "file_name": "baseline_GPT-3_FewShot_2022_5_11.csv",
    },
    
            }

In [60]:
for model in pred_sets:
    file_location = f"{results_dir}/{pred_sets[model]['file_name']}"
    pred_sets[model]["df"] = pd.read_csv( file_location) 

In [61]:
truth_set = pd.read_csv(master_data_location)
truth_set = truth_set[selected_columns]

overall_truth_set = pd.melt(truth_set, id_vars = ['example_id'], value_vars=labels).rename(columns={'value':'y_truth'})

In [62]:
results_list = []
for model in pred_sets:
    model_results = pred_sets[model]["df"]
    
    test_set_ids = list(model_results['example_id'].values)
    model_truth_set = truth_set[truth_set['example_id'].isin(test_set_ids)]
    model_truth_set_renamed = model_truth_set.rename(columns=truth_labels)
    
    model_preds = model_results[selected_columns]
    model_preds_renamed = model_results.rename(columns=pred_labels)
    overall_model_preds = pd.melt(model_results, id_vars = ['example_id'], value_vars=labels).rename(columns={'value':'y_pred'})
    
    y_labels = model_truth_set_renamed.merge(model_preds_renamed, on='example_id')
    overall_y_labels = overall_truth_set.merge(overall_model_preds, on='example_id')
    
    precision, recall, f1_score, support = precision_recall_fscore_support(
        overall_y_labels['y_truth'], overall_y_labels['y_pred'], 
        labels=[0.0,1.0], beta=1, zero_division=0)
    eval_results = {
            'model': model,
            'attribute': 'overall',
            'precision': precision[1],
            'recall': recall[1],
            'f1_score': f1_score[1],
            'count':len(overall_y_labels['y_truth'])
        }
    
    results_list.append(eval_results)
    for label in labels:
        y_truth_label = f'y_truth_{label}'
        y_pred_label = f'y_pred_{label}'
        precision, recall, f1_score, support = precision_recall_fscore_support(y_labels[y_truth_label].values, y_labels[y_pred_label].values,
                                                                               labels=[0.0,1.0], beta=1, zero_division=0)

        eval_results = {
            'model': model,
            'attribute': label,
            'precision': precision[1],
            'recall': recall[1],
            'f1_score': f1_score[1],
            'count':len(overall_y_labels['y_truth'])
        }

        results_list.append(eval_results)

In [63]:
full_model_results = pd.DataFrame(results_list)

In [64]:
model_results_melt = pd.melt(full_model_results, id_vars=['model','attribute'], value_vars=['precision','f1_score','recall'])

The below should be the Results Table

In [69]:
baseline_results_table = pd.pivot_table(
    model_results_melt, 
         values=['value'],
         index=['variable','attribute'],
         columns='model'
).reset_index()

In [73]:
baseline_results_table.columns = ['Metric','Attribute','BERT','DeBERTa v3','GPT-3 (Few-Shot)','GPT-3 (One-Shot)','Human','RoBERTa']

In [79]:
baseline_results_table = baseline_results_table[['metric','attribute','Human','BERT','RoBERTa','DeBERTa v3','GPT-3 (One-Shot)','GPT-3 (Few-Shot)']]

In [81]:
baseline_results_table.to_csv(f'{repo_dir}/reporting/assets/baseline-results.csv',index=False)

In [80]:
baseline_results_table

Unnamed: 0,metric,attribute,Human,BERT,RoBERTa,DeBERTa v3,GPT-3 (One-Shot),GPT-3 (Few-Shot)
0,f1_score,identity_attack,0.5,0.085714,0.111111,0.08,0.16,0.391304
1,f1_score,insult,0.686869,0.403226,0.421875,0.343891,0.480211,0.458472
2,f1_score,overall,0.48037,0.270904,0.265495,0.258457,0.264572,0.249668
3,f1_score,profanity,0.882883,0.788774,0.793558,0.793558,0.827014,0.513562
4,f1_score,severe_toxicity,0.375,0.074074,0.0,0.097561,0.388889,0.181818
5,f1_score,threat,0.666667,0.0,0.4,0.0,0.117647,0.25
6,f1_score,toxicity,0.703704,0.344371,0.309795,0.290749,0.070796,0.38806
7,precision,identity_attack,0.372093,1.0,0.8,0.375,0.75,0.72
8,precision,insult,0.641509,0.423729,0.428571,0.417582,0.365462,0.403509
9,precision,overall,0.412698,0.270355,0.264957,0.258108,0.266758,0.30303
