In [None]:
# !pip install seqeval datasets

In [None]:
from collections import Counter, OrderedDict, defaultdict

import numpy as np
from datasets import inspect_metric
from seqeval.metrics import accuracy_score, classification_report, f1_score
from seqeval.scheme import IOB1, IOB2, IOBES

In [None]:
# NOTE(review): `google.colab` is presumably importable only inside a Google
# Colab runtime, so this cell is expected to fail elsewhere — confirm.
from google.colab import drive

# Mount the drive at /content/gdrive.
drive.mount("/content/gdrive")

## Test 1 - Simple

In [None]:
# One sequence; the prediction is identical to the ground truth.
y_true = [["A", "B", "B", "A", "C"]]
y_pred = [["A", "B", "B", "A", "C"]]

In [None]:
# Print the F1 score, accuracy score and classification report for the
# current y_true / y_pred fixture.
print(f"F1 Score: {f1_score(y_true, y_pred)}")
print(f"Acc Score: {accuracy_score(y_true, y_pred)}")
# The report is a multi-line table: start it on its own line so the header
# row is not pushed out of alignment by the "CR Report: " prefix.
print(f"CR Report: \n{classification_report(y_true, y_pred)}")

## Test 2

In [None]:
# Two sequences; the first is predicted exactly, the second differs only in
# its last tag ("C" in the truth, "B" in the prediction).
y_true = [["A", "B", "B", "A", "C"], ["A", "B", "C"]]
y_pred = [["A", "B", "B", "A", "C"], ["A", "B", "B"]]

In [None]:
# Print the F1 score, accuracy score and classification report for the
# current y_true / y_pred fixture.
print(f"F1 Score: {f1_score(y_true, y_pred)}")
print(f"Acc Score: {accuracy_score(y_true, y_pred)}")
# The report is a multi-line table: start it on its own line so the header
# row is not pushed out of alignment by the "CR Report: " prefix.
print(f"CR Report: \n{classification_report(y_true, y_pred, digits=3)}")

## Test 3

In [None]:
# Bare "O" / "B" / "I" / "E" tags with no entity-type suffix. The first
# sequence is predicted exactly; the second differs only in its last tag
# ("E" in the truth, "B" in the prediction).
y_true = [["O", "B", "B", "I", "E"], ["O", "B", "E"]]
y_pred = [["O", "B", "B", "I", "E"], ["O", "B", "B"]]

In [None]:
# Print the F1 score, accuracy score and classification report for the
# current y_true / y_pred fixture.
print(f"F1 Score: {f1_score(y_true, y_pred)}")
print(f"Acc Score: {accuracy_score(y_true, y_pred)}")
# The report is a multi-line table: start it on its own line so the header
# row is not pushed out of alignment by the "CR Report: " prefix.
print(f"CR Report: \n{classification_report(y_true, y_pred, digits=3)}")

## Test 3b - Mixed prefixed and bare tags

In [None]:
# Mix of prefixed tags ("I-A", "I-B") and bare tags ("B", "C"). The first
# sequence is predicted exactly; the second differs only in its last tag
# ("C" in the truth, "I-B" in the prediction).
y_true = [["I-A", "I-B", "B", "I-A", "C"], ["I-A", "I-B", "C"]]
y_pred = [["I-A", "I-B", "B", "I-A", "C"], ["I-A", "I-B", "I-B"]]

In [None]:
# Print the F1 score, accuracy score and classification report for the
# current y_true / y_pred fixture.
print(f"F1 Score: {f1_score(y_true, y_pred)}")
print(f"Acc Score: {accuracy_score(y_true, y_pred)}")
# The report is a multi-line table: start it on its own line so the header
# row is not pushed out of alignment by the "CR Report: " prefix.
print(f"CR Report: \n{classification_report(y_true, y_pred, digits=3)}")

## Test 3c - Full entity-type names with I-/B- prefixes

In [None]:
# Long entity-type names with I-/B- prefixes plus a bare "C" tag.
# Sequence 1 differs at index 2 ("I-BUSINESS" in the truth, "B-BUSINESS" in
# the prediction); sequence 2 differs at its last tag ("C" vs "B-BUSINESS").
y_true = [
    ["I-AGRICULTURAL", "I-BUSINESS", "I-BUSINESS", "I-AGRICULTURAL", "C"],
    ["I-AGRICULTURAL", "B-BUSINESS", "C"],
]
y_pred = [
    ["I-AGRICULTURAL", "I-BUSINESS", "B-BUSINESS", "I-AGRICULTURAL", "C"],
    ["I-AGRICULTURAL", "B-BUSINESS", "B-BUSINESS"],
]

In [None]:
# Print the F1 score, accuracy score and classification report for the
# current y_true / y_pred fixture; the report starts on its own line.
print(f"F1 Score: {f1_score(y_true, y_pred)}")
print(f"Acc Score: {accuracy_score(y_true, y_pred)}")
print(f"CR Report: \n{classification_report(y_true, y_pred, digits=3)}")

## Test 4 - Custom Metric

In [None]:
def get_entities(llist):
    """Collapse a flat tag sequence into maximal runs of identical tags.

    Every run of consecutive equal tags becomes one ``(tag, start, end)``
    tuple, where ``start`` and ``end`` are the inclusive indices of the run's
    first and last element. Runs whose tag is exactly ``"O"`` are dropped.

    Args:
        llist: Flat sequence of tags, e.g. ``["a", "a", "O", "b"]``.

    Returns:
        A list of ``(tag, start, end)`` tuples in order of appearance; an
        empty list for empty input.
    """
    runs = []
    for i, ent in enumerate(llist):
        # `not runs` handles the first element explicitly. Comparing against
        # an "" sentinel instead would make a leading "" tag try to extend a
        # run that does not exist yet (IndexError).
        if not runs or ent != runs[-1][0]:
            runs.append([ent, i, i])
        else:
            # Same tag as the current run: push its inclusive end forward.
            runs[-1][2] = i
    return [tuple(run) for run in runs if run[0] != "O"]


def f1_score(y_true, y_pred):
    """Entity-level F1 between two flat tag sequences.

    Both sequences are converted to sets of ``(tag, start, end)`` entities
    with ``get_entities``; an entity is correct only if the identical tuple
    appears in both sets.

    Note: defining this function rebinds the name ``f1_score`` that the
    notebook imported from ``seqeval.metrics``.

    Args:
        y_true: Flat sequence of reference tags.
        y_pred: Flat sequence of predicted tags.

    Returns:
        The harmonic mean of precision and recall, or ``0`` when both are
        zero (including when either side has no entities).
    """
    gold = set(get_entities(y_true))
    guessed = set(get_entities(y_pred))

    # Entities found, with identical tag and span, on both sides.
    hits = len(gold & guessed)

    precision = hits / len(guessed) if guessed else 0
    recall = hits / len(gold) if gold else 0

    denominator = precision + recall
    if denominator > 0:
        return 2 * precision * recall / denominator
    return 0


def precision_score(y_true, y_pred):
    """Entity-level precision between two flat tag sequences.

    Args:
        y_true: Flat sequence of reference tags.
        y_pred: Flat sequence of predicted tags.

    Returns:
        The fraction of predicted ``(tag, start, end)`` entities that also
        occur in the reference, or ``0`` when nothing was predicted.
    """
    gold = set(get_entities(y_true))
    guessed = set(get_entities(y_pred))

    # No predicted entities: precision is defined as 0 here.
    if not guessed:
        return 0
    return len(gold & guessed) / len(guessed)


def recall_score(y_true, y_pred):
    """Entity-level recall between two flat tag sequences.

    Args:
        y_true: Flat sequence of reference tags.
        y_pred: Flat sequence of predicted tags.

    Returns:
        The fraction of reference ``(tag, start, end)`` entities that were
        also predicted, or ``0`` when the reference has no entities.
    """
    gold = set(get_entities(y_true))
    guessed = set(get_entities(y_pred))

    # No reference entities: recall is defined as 0 here.
    if not gold:
        return 0
    return len(gold & guessed) / len(gold)


def _precision_recall_f1(nb_correct, nb_pred, nb_true):
    """Return ``(precision, recall, f1)`` from raw counts, using 0 for any undefined ratio."""
    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0
    f1 = 2 * p * r / (p + r) if p + r > 0 else 0
    return p, r, f1


def classification_report(y_true, y_pred, digits=2):
    """Build a text table of entity-level precision, recall, F1 and support.

    Entities are extracted from both sequences with ``get_entities`` and
    grouped by tag. An entity is correct only when its tag, start and end all
    match a reference entity.

    The table has one row per entity type present in ``y_true`` (sorted by
    name so the output is reproducible), followed by three summary rows:

    * ``micro avg``    - computed from the pooled entity counts.
    * ``macro avg``    - unweighted mean of the per-type scores.
    * ``weighted avg`` - mean of the per-type scores weighted by support.

    Types that occur only in ``y_pred`` get no row of their own, but their
    entities still count as predictions in the micro average.

    Note: defining this function rebinds the name ``classification_report``
    that the notebook imported from ``seqeval.metrics``.

    Args:
        y_true: Flat sequence of reference tags.
        y_pred: Flat sequence of predicted tags.
        digits: Number of decimal places for the floating-point columns.

    Returns:
        The formatted report as a single string.
    """
    true_entities = set(get_entities(y_true))
    pred_entities = set(get_entities(y_pred))

    # Group the (start, end) spans by entity type.
    true_spans = defaultdict(set)
    pred_spans = defaultdict(set)
    for label, start, end in true_entities:
        true_spans[label].add((start, end))
    for label, start, end in pred_entities:
        pred_spans[label].add((start, end))

    summary_headings = ("micro avg", "macro avg", "weighted avg")
    name_width = max((len(label) for label in true_spans), default=0)
    width = max(name_width, max(len(h) for h in summary_headings), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = "{:>{width}s} " + " {:>9}" * len(headers)
    report = head_fmt.format("", *headers, width=width)
    report += "\n\n"

    row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n"

    ps, rs, f1s, supports = [], [], [], []
    # Sorted so the row order does not depend on set/dict iteration order.
    for label in sorted(true_spans):
        gold = true_spans[label]
        # .get avoids inserting empty entries into the defaultdict.
        guessed = pred_spans.get(label, set())
        p, r, f1 = _precision_recall_f1(len(gold & guessed), len(guessed), len(gold))

        report += row_fmt.format(label, p, r, f1, len(gold), width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        supports.append(len(gold))

    report += "\n"

    total_support = sum(supports)

    # Micro average: pool every entity regardless of type.
    micro = _precision_recall_f1(
        len(true_entities & pred_entities), len(pred_entities), len(true_entities)
    )

    if supports:
        macro = (np.mean(ps), np.mean(rs), np.mean(f1s))
        weighted = (
            np.average(ps, weights=supports),
            np.average(rs, weights=supports),
            np.average(f1s, weights=supports),
        )
    else:
        # No reference entities at all: np.average would raise on zero total
        # weight, so report zeros instead.
        macro = weighted = (0, 0, 0)

    for heading, scores in zip(summary_headings, (micro, macro, weighted)):
        report += row_fmt.format(
            heading, *scores, total_support, width=width, digits=digits
        )

    return report


# Smoke test of the custom report on flat (un-nested) tag lists.
# NOTE(review): y_pred has 4 tags while y_true has 7 — confirm the length
# mismatch is intentional.
# NOTE(review): get_entities only drops the exact tag "O", so the lowercase
# "o" tags below are kept as entities — confirm this is intended.
y_true = ["a", "a", "b", "o", "o", "i", "a"]
y_pred = ["a", "a", "O", "o"]
print(classification_report(y_true, y_pred, digits=4))
# accuracy_score is not redefined in this notebook section, so this still
# calls the function imported from seqeval.metrics.
print(accuracy_score(y_true, y_pred))

## Inspect 'seqeval' metric