In [None]:
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics

In [None]:
def f1_score(true, pred_result):
    """Compute accuracy and micro precision/recall/F1 over non-negative labels.

    A label counts as "positive" when it is not one of the negative
    placeholders listed in ``negative_labels`` below. Precision is the share
    of positive predictions that match the gold label, recall is the share of
    positive gold labels that were predicted exactly, and F1 is their harmonic
    mean. Accuracy is computed over all examples, negatives included.

    Args:
        true: Indexable sequence of gold labels.
        pred_result: Indexable sequence of predicted labels; it is read at
            every index of ``true``, so it must be at least as long.

    Returns:
        dict with keys ``'acc'``, ``'p'``, ``'r'`` and ``'f1'``. Any ratio
        whose denominator is zero is reported as 0, which includes accuracy
        for empty input.
    """
    # Labels that denote "no relation"; they are excluded from P/R/F1.
    negative_labels = ('NA', 'na', 'no_relation', 'Other', 'Others', 'false', 'unanswerable')

    total = len(true)
    correct = 0
    correct_positive = 0
    pred_positive = 0
    gold_positive = 0

    for i in range(total):
        golden = true[i]
        predicted = pred_result[i]
        gold_is_positive = golden not in negative_labels

        if golden == predicted:
            correct += 1
            if gold_is_positive:
                correct_positive += 1
        if gold_is_positive:
            gold_positive += 1
        if predicted not in negative_labels:
            pred_positive += 1

    # Explicit zero checks instead of bare `except:` so that only the
    # undefined-ratio case is mapped to 0 and real errors still surface.
    acc = correct / total if total else 0
    micro_p = correct_positive / pred_positive if pred_positive else 0
    micro_r = correct_positive / gold_positive if gold_positive else 0
    pr_sum = micro_p + micro_r
    micro_f1 = 2 * micro_p * micro_r / pr_sum if pr_sum else 0

    return {'acc': acc, 'p': micro_p, 'r': micro_r, 'f1': micro_f1}

In [None]:
def f1_score_na(true, pred_result):
    """Compute accuracy and micro P/R/F1 with every label counted as positive.

    No label is treated as a negative placeholder here: every example
    contributes to both the predicted-positive and gold-positive counts, so
    precision and recall both equal the exact-match accuracy.

    Args:
        true: Indexable sequence of gold labels.
        pred_result: Indexable sequence of predicted labels; it is read at
            every index of ``true``, so it must be at least as long.

    Returns:
        dict with keys ``'acc'``, ``'p'``, ``'r'`` and ``'f1'``. All four are
        0 for empty input, and ``'f1'`` is 0 when nothing matches.
    """
    total = len(true)
    if total == 0:
        # Nothing to score: report zeros rather than dividing by zero.
        return {'acc': 0, 'p': 0, 'r': 0, 'f1': 0}

    correct = sum(1 for i in range(total) if true[i] == pred_result[i])

    # Every example is both a predicted and a gold positive, so the
    # denominators of precision and recall are both `total`.
    acc = correct / total
    micro_p = correct / total
    micro_r = correct / total
    pr_sum = micro_p + micro_r
    micro_f1 = 2 * micro_p * micro_r / pr_sum if pr_sum else 0

    return {'acc': acc, 'p': micro_p, 'r': micro_r, 'f1': micro_f1}

In [None]:
# Identifiers of the evaluated systems. Each one is used as a directory
# component when the per-fold result files are located.
methods = [
    '009',
    '010',
    '027',
    '020',
    '025',
    '039',
    'Roberta_base',
    'KnowPrompt',
]

# Dataset identifiers; the same names appear as directory components of the
# result-file paths.
datasets = [
    'crossRE',
    'NYT10',
    'FewRel',
    'tacred',
    'retacred',
    'WebNLG',
    'sem_eval_task_8',
]

In [None]:
# Score every (method, dataset, fold) result file and collect one row per file.
#
# NOTE(review): `base_path` is not defined in any cell visible in this
# notebook. It must be set before this cell runs. The previous bare `except:`
# swallowed the resulting NameError and reported every file as "Missing";
# it now surfaces as an error instead.
rows = []
for method in methods:
    print(method)
    # Methods given as 'GenPT/<variant>' all write '<...>/GenPT_test.jsonl';
    # every other method uses its own identifier as the file prefix.
    name = 'GenPT' if method.split('/')[0] == 'GenPT' else method

    for data in datasets:
        for k in [1, 2, 3, 4, 5]:
            res_file = f'{base_path}/{method}/cv_output/{data}/Fold-{k}/{name}_test.jsonl'
            try:
                with open(res_file) as f:
                    batch = [json.loads(line) for line in f.read().splitlines() if line != '']
                true_label = [x['label_true'] for x in batch]
                pred_label = [x['label_pred'] for x in batch]
            except OSError:
                # Result file absent or unreadable: skip this fold (best effort).
                print(f'Missing {data}, {k}')
                continue
            except (ValueError, KeyError, TypeError) as err:
                # Malformed JSON line or a record without the expected keys.
                print(f'Skipping {data}, {k}: {err!r}')
                continue

            if not batch:
                # An empty file carries no predictions; treat it like a missing one.
                print(f'Missing {data}, {k}')
                continue

            results = f1_score(true_label, pred_label)
            rows.append({
                'Method': method,
                'Dataset': data,
                'fold': k,
                'f1': results['f1'],
            })

    print('\n')

# Build the frame once: DataFrame.append was removed in pandas 2.0 and
# appending row by row is quadratic.
df = pd.DataFrame(rows, columns=['Method', 'Dataset', 'fold', 'f1'])
        
    

In [None]:
# Collapse the per-fold rows into one mean row per (Method, Dataset) pair.
# The grouping keys stay as ordinary columns (as_index=False); every remaining
# column, including 'fold', is averaged.
per_method_dataset = df.groupby(['Method', 'Dataset'], as_index=False)
g_df = per_method_dataset.mean()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal box plot of the fold-averaged F1 per method, overlaid with one
# coloured marker per dataset.

# Display names keyed by the raw identifiers stored in g_df. Looking labels up
# by key (instead of assigning them positionally) keeps every tick and legend
# entry correct even when a method or dataset is absent from g_df.
# NOTE(review): the id -> name pairing reproduces the previous positional
# label list under the sorted Method order produced by groupby; confirm it
# against the experiment registry.
method_labels = {
    '009': 'Att-BLSTM',
    '010': 'Entity-Att',
    '020': 'RBERT',
    '025': 'LUKE',
    '027': 'PAWARE',
    '039': 'ERNIE',
    'KnowPrompt': 'KnowPrompt',
    'Roberta_base': 'Roberta_base',
}
dataset_labels = {
    'FewRel': 'FewRel',
    'NYT10': 'NYT10',
    'WebNLG': 'WebNLG',
    'crossRE': 'CrossRE',
    'retacred': 'RETACRED',
    'sem_eval_task_8': 'SemEval',
    'tacred': 'TACRED',
}

# Pin the category order (order of first appearance in g_df) so both plot
# layers and the relabelling below agree on it.
method_order = list(g_df['Method'].unique())
dataset_order = list(g_df['Dataset'].unique())

fig, ax = plt.subplots(figsize=(14, 5))

# Transparent boxes with black outlines. No palette is passed: the faces are
# unfilled, and `palette` without `hue` is deprecated in recent seaborn.
sns.boxplot(y='Method', x='f1', data=g_df, order=method_order,
            boxprops={'facecolor': 'none', 'edgecolor': 'black'},
            whiskerprops={'color': 'black'},
            capprops={'color': 'black'},
            medianprops={'color': 'black'},
            ax=ax)

# Overlay the individual (method, dataset) means, coloured by dataset.
sns.stripplot(y='Method', x='f1', data=g_df, hue='Dataset',
              order=method_order, hue_order=dataset_order,
              dodge=False, palette='bright',
              marker='o', alpha=0.7, size=8, ax=ax)

# Replace the raw method identifiers on the y axis with display names;
# identifiers without an entry in method_labels are shown unchanged.
ax.set_yticklabels([method_labels.get(m, m) for m in method_order], rotation=0)

ax.set_xlabel('F1 Score', fontsize=14)
ax.set_ylabel('Methods', fontsize=14)

# Larger tick labels on both axes.
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14)

# Restrict the x axis to the 0.4-1 range.
ax.set_xlim(0.4, 1)

# Rebuild the legend above the axes with display names for the datasets.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [dataset_labels.get(label, label) for label in labels],
          title='Datasets', loc='upper center', fontsize=14, title_fontsize=14,
          bbox_to_anchor=(0.5, 1.2), ncol=len(labels), frameon=False)

# Save as PNG at 600 DPI with a tight bounding box. The output directory is
# created first so a fresh checkout does not fail here.
output_path = './images/supervised_all.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path, format='png', dpi=600, bbox_inches='tight')

# Show plot
plt.show()
