In [2]:
import pandas as pd
from sklearn.metrics import confusion_matrix

In [1]:
def get_dev_result(model_name: str, evaluations_dir: str = "../evaluations") -> tuple:
    """Read gold and predicted labels from a model's dev-set evaluation file.

    The file read is ``<evaluations_dir>/<model_name>_dev.txt``. Each line is
    stripped and split on single spaces; only lines that yield exactly three
    fields are kept. The second field is collected as the gold label and the
    third as the predicted label (the first field is presumably the token --
    it is not used here).

    Args:
        model_name: Prefix of the evaluation file name (without ``_dev.txt``).
        evaluations_dir: Directory holding the evaluation files. Defaults to
            the sibling ``evaluations`` folder of the working directory.

    Returns:
        A ``(y_true, y_pred)`` pair of equally long lists of label strings.

    Raises:
        OSError: If the evaluation file cannot be opened.
    """
    # Forward slashes are accepted by open() on Windows as well as POSIX, and
    # avoid the invalid "\e" escape of a backslash-separated string literal.
    model_path = f"{evaluations_dir}/{model_name}_dev.txt"
    y_true, y_pred = [], []
    with open(model_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split(" ")
            # Blank separator lines and malformed rows do not have three fields.
            if len(fields) == 3:
                y_true.append(fields[1])
                y_pred.append(fields[2])
    return y_true, y_pred
# Gold / predicted label sequences of the "PTOIE_pos" model on the dev split.
y_true, y_pred = get_dev_result(model_name="PTOIE_pos")

In [3]:
def get_confusion_matrix(y_true, y_pred):
    """Build a labelled confusion matrix as a pandas DataFrame.

    Rows are indexed by gold label (axis name "Y-True") and columns by
    predicted label (axis name "Y-Pred").

    Args:
        y_true: Sequence of gold labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.

    Returns:
        A square DataFrame of counts whose row and column labels are the
        sorted union of the labels found in ``y_true`` and ``y_pred``.
    """
    # Use the sorted union of both sequences:
    #  * sorting makes the row/column order reproducible (set iteration order
    #    of strings changes between interpreter runs);
    #  * including labels seen only in y_pred keeps those predictions in the
    #    matrix -- confusion_matrix drops samples whose label is not listed,
    #    which would silently shrink the totals derived from this table.
    labels = sorted(set(y_true) | set(y_pred))
    matrix = pd.DataFrame(
        data=confusion_matrix(y_true, y_pred, labels=labels),
        columns=labels,
        index=labels,
    )
    matrix.index.name = "Y-True"
    matrix.columns.name = "Y-Pred"
    return matrix
# Confusion matrix of the dev-set labels; the styled frame is the cell's
# output, shaded with the "Purples" colour map.
df = get_confusion_matrix(y_true=y_true, y_pred=y_pred)
df.style.background_gradient(cmap="Purples")

Y-Pred,I-ARG1,O,I-V,I-ARG0,B-V,B-ARG0,B-ARG1
Y-True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
I-ARG1,25,25,0,1,0,0,5
O,88,522,0,7,15,3,32
I-V,13,45,0,0,4,0,7
I-ARG0,0,41,0,40,0,3,0
B-V,0,21,0,0,30,0,0
B-ARG0,0,31,0,0,0,20,0
B-ARG1,4,32,1,0,1,1,12


In [4]:
#
# Local (metrics per class)
#
# Labels come from the confusion matrix itself and are sorted, so the printed
# dictionaries have the same order on every run (set iteration order of
# strings is not stable between interpreter sessions).
labels = sorted(df.index)
tps = {}
fps = {}
fns = {}
for label in labels:
    # Counts are cast to plain int so the dictionaries print as bare numbers
    # regardless of the installed NumPy version's scalar repr.
    tp = int(df.loc[label, label])        # diagonal cell
    tps[label] = tp
    fps[label] = int(df[label].sum()) - tp      # rest of the predicted column
    fns[label] = int(df.loc[label].sum()) - tp  # rest of the gold row

#
# Global
#
# Placeholders: not populated anywhere in this cell.
micro_averages = {}
macro_averages = {}

correct_predictions = sum(tps.values())

total_predictions = int(df.values.sum())
# Guard against an empty matrix to avoid dividing by zero.
accuracy_global = round(correct_predictions / total_predictions, 4) if total_predictions > 0 else 0.

print("#-- Local measures --#")
print("True Positives:", tps)
print("False Positives:", fps)
print("False Negatives:", fns)

print("\n#-- Global measures --#")
print("Correct predictions:", correct_predictions)
print("Total predictions:", total_predictions)
print("Accuracy:", accuracy_global)

#-- Local measures --#
True Positives: {'I-ARG1': 25, 'O': 522, 'I-V': 0, 'I-ARG0': 40, 'B-V': 30, 'B-ARG0': 20, 'B-ARG1': 12}
False Positives: {'I-ARG1': 105, 'O': 195, 'I-V': 1, 'I-ARG0': 8, 'B-V': 20, 'B-ARG0': 7, 'B-ARG1': 44}
False Negatives: {'I-ARG1': 31, 'O': 145, 'I-V': 69, 'I-ARG0': 44, 'B-V': 21, 'B-ARG0': 31, 'B-ARG1': 39}

#-- Global measures --#
Correct predictions: 649
Total predictions: 1029
Accuracy: 0.6307


In [6]:
# Cross-check the hand-computed figures against scikit-learn's per-class
# precision / recall / F1 report (four decimal places).
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(y_true=y_true, y_pred=y_pred, digits=4))

              precision    recall  f1-score   support

      B-ARG0     0.7407    0.3922    0.5128        51
      B-ARG1     0.2143    0.2353    0.2243        51
         B-V     0.6000    0.5882    0.5941        51
      I-ARG0     0.8333    0.4762    0.6061        84
      I-ARG1     0.1923    0.4464    0.2688        56
         I-V     0.0000    0.0000    0.0000        69
           O     0.7280    0.7826    0.7543       667

    accuracy                         0.6307      1029
   macro avg     0.4727    0.4173    0.4229      1029
weighted avg     0.6275    0.6307    0.6190      1029

