In [None]:
!pip install seqeval

In [None]:
from collections import Counter, OrderedDict, defaultdict

import numpy as np
from seqeval.metrics import accuracy_score, classification_report, f1_score
from seqeval.scheme import IOB2

In [None]:
# Sample gold and predicted tags, each holding two sequences.
# The first sequence uses bare labels with no B-/I- prefix; the second uses
# B-/I- prefixed tags and is identical in both lists.
y_true = [
    [
        "O",
        "O",
        "O",
        "O",
        "EVENT",
        "O",
        "O",
        "O",
        "O",
        "EVENT",
        "HELLO",
        "TESTING",
        "O",
        "O",
    ],
    ["B-PER", "I-PER", "O"],
]
# In the first sequence the prediction differs from the gold tags at
# indices 9 and 11 (both predicted as "HELLO").
y_pred = [
    [
        "O",
        "O",
        "O",
        "O",
        "EVENT",
        "O",
        "O",
        "O",
        "O",
        "HELLO",
        "HELLO",
        "HELLO",
        "O",
        "O",
    ],
    ["B-PER", "I-PER", "O"],
]

In [None]:
# In file order, `f1_score` here is the function imported from
# seqeval.metrics; a later cell defines a local function with the same name.
f1_score(y_true, y_pred)

In [None]:
# Print seqeval's report for the nested sample data, passing the IOB2 scheme.
# NOTE(review): seqeval appears to honour `scheme` only together with
# mode="strict" - confirm whether strict mode was intended here.
print(classification_report(y_true, y_pred, scheme=IOB2))

In [None]:
def get_entities(llist):
    """Collapse a flat tag sequence into labelled spans.

    Consecutive positions carrying the same tag are merged into one run, and
    every run whose tag is not ``"O"`` is reported.

    Args:
        llist: Flat sequence of tags, one per token.

    Returns:
        A list of ``(tag, start, end)`` tuples in order of appearance, where
        ``start`` and ``end`` are inclusive indices into ``llist``. An empty
        input yields an empty list.
    """
    runs = []
    for i, ent in enumerate(llist):
        # Check for "no run started yet" explicitly rather than comparing
        # against an empty-string sentinel: a leading "" tag used to match
        # the sentinel and index into the still-empty run list.
        if runs and runs[-1][0] == ent:
            runs[-1][2] = i
        else:
            runs.append([ent, i, i])
    return [tuple(run) for run in runs if run[0] != "O"]


def f1_score(y_true, y_pred):
    """Return the span-level F1 between gold and predicted tags.

    Spans are extracted with ``get_entities``; a predicted span counts as
    correct only when its tag, start and end all match a gold span.

    Args:
        y_true: Gold tag sequence.
        y_pred: Predicted tag sequence.

    Returns:
        The harmonic mean of precision and recall, or ``0`` when both are
        zero.
    """
    gold_spans = set(get_entities(y_true))
    predicted_spans = set(get_entities(y_pred))

    # Exact (tag, start, end) matches between the two span sets.
    hits = len(gold_spans & predicted_spans)

    precision = hits / len(predicted_spans) if predicted_spans else 0
    recall = hits / len(gold_spans) if gold_spans else 0

    total = precision + recall
    if total > 0:
        return 2 * precision * recall / total
    return 0


def precision_score(y_true, y_pred):
    """Return the fraction of predicted spans that exactly match a gold span.

    Args:
        y_true: Gold tag sequence.
        y_pred: Predicted tag sequence.

    Returns:
        Matched spans divided by predicted spans, or ``0`` when nothing was
        predicted.
    """
    gold_spans = set(get_entities(y_true))
    predicted_spans = set(get_entities(y_pred))

    if not predicted_spans:
        return 0
    return len(gold_spans & predicted_spans) / len(predicted_spans)


def recall_score(y_true, y_pred):
    """Return the fraction of gold spans that were predicted exactly.

    Args:
        y_true: Gold tag sequence.
        y_pred: Predicted tag sequence.

    Returns:
        Matched spans divided by gold spans, or ``0`` when there are no gold
        spans.
    """
    gold_spans = set(get_entities(y_true))
    predicted_spans = set(get_entities(y_pred))

    matched = gold_spans.intersection(predicted_spans)
    return len(matched) / len(gold_spans) if gold_spans else 0


def classification_report(y_true, y_pred, digits=2):
    """Build a text report of per-type and averaged span-level metrics.

    Spans are extracted with ``get_entities`` and grouped by tag. One row is
    emitted for every tag that occurs in ``y_true`` (tags that appear only in
    ``y_pred`` get no row), followed by three summary rows:

    * ``micro avg``    - scores over all spans pooled together, taken from
      ``precision_score`` / ``recall_score`` / ``f1_score``.
    * ``macro avg``    - unweighted mean of the per-type scores.
    * ``weighted avg`` - mean of the per-type scores weighted by support.

    Args:
        y_true: Gold tag sequence.
        y_pred: Predicted tag sequence.
        digits: Number of decimal places used for the score columns.

    Returns:
        The formatted report as a single string.
    """
    gold_by_type = defaultdict(set)
    pred_by_type = defaultdict(set)
    for tag, start, end in get_entities(y_true):
        gold_by_type[tag].add((start, end))
    for tag, start, end in get_entities(y_pred):
        pred_by_type[tag].add((start, end))

    micro_heading = "micro avg"
    macro_heading = "macro avg"
    weighted_heading = "weighted avg"

    # The label column must fit the longest type name and summary heading.
    name_width = max((len(tag) for tag in gold_by_type), default=0)
    width = max(
        name_width,
        len(micro_heading),
        len(macro_heading),
        len(weighted_heading),
        digits,
    )

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = "{:>{width}s} " + " {:>9}" * len(headers)
    row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n"

    parts = [head_fmt.format("", *headers, width=width), "\n\n"]

    ps, rs, f1s, supports = [], [], [], []
    # Sort the type names so the row order does not depend on set/hash
    # iteration order and the report is reproducible between runs.
    for type_name in sorted(gold_by_type):
        gold_spans = gold_by_type[type_name]
        pred_spans = pred_by_type.get(type_name, frozenset())
        nb_correct = len(gold_spans & pred_spans)
        nb_pred = len(pred_spans)
        nb_true = len(gold_spans)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        parts.append(
            row_fmt.format(type_name, p, r, f1, nb_true, width=width, digits=digits)
        )

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        supports.append(nb_true)

    parts.append("\n")

    total_support = sum(supports)

    parts.append(
        row_fmt.format(
            micro_heading,
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            total_support,
            width=width,
            digits=digits,
        )
    )

    if supports:
        macro = [float(np.mean(col)) for col in (ps, rs, f1s)]
        weighted = [
            float(np.average(col, weights=supports)) for col in (ps, rs, f1s)
        ]
    else:
        # With no gold spans there is nothing to average; np.average would
        # raise because the weights sum to zero, so report zeros instead.
        macro = [0.0, 0.0, 0.0]
        weighted = [0.0, 0.0, 0.0]

    parts.append(
        row_fmt.format(
            macro_heading, *macro, total_support, width=width, digits=digits
        )
    )
    parts.append(
        row_fmt.format(
            weighted_heading, *weighted, total_support, width=width, digits=digits
        )
    )

    return "".join(parts)

In [None]:
# Flat (single-sequence) gold tags using bare labels with no B-/I- prefix.
y_true = [
    "O",
    "O",
    "O",
    "O",
    "EVENT",
    "O",
    "O",
    "O",
    "O",
    "EVENT",
    "HELLO",
    "TESTING",
    "O",
    "O",
]
# Differs from y_true only at index 10 ("TESTING" instead of "HELLO"), so
# indices 10-11 form a single "TESTING" run in the prediction.
y_pred = [
    "O",
    "O",
    "O",
    "O",
    "EVENT",
    "O",
    "O",
    "O",
    "O",
    "EVENT",
    "TESTING",
    "TESTING",
    "O",
    "O",
]
# `digits=4` matches the signature of the classification_report defined in
# this file (assuming cells are run in file order).
print(classification_report(y_true, y_pred, digits=4))
# accuracy_score is the function imported from seqeval.metrics.
print(accuracy_score(y_true, y_pred))