In [9]:
# !pip install seqeval datasets

In [3]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.scheme import IOB1, IOB2, IOBES
from datasets import inspect_metric
import numpy as np
from collections import defaultdict, Counter, OrderedDict

In [10]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Test 1 - Simple

In [None]:
y_true = [['A', 'B', 'B', 'A', 'C']]
y_pred = [['A', 'B', 'B', 'A', 'C']]

In [None]:
print("F1 Score: {}".format(f1_score(y_true, y_pred)))
print("Acc Score: {}".format(accuracy_score(y_true, y_pred)))
print("CR Report: {}".format(classification_report(y_true, y_pred)))

F1 Score: 1.0
Acc Score: 1.0
CR Report:               precision    recall  f1-score   support

           _       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1





## Test 2

In [None]:
y_true = [['A', 'B', 'B', 'A', 'C'], ['A', 'B', 'C']]
y_pred = [['A', 'B', 'B', 'A', 'C'], ['A', 'B', 'B']]

In [None]:
print("F1 Score: {}".format(f1_score(y_true, y_pred)))
print("Acc Score: {}".format(accuracy_score(y_true, y_pred)))
print("CR Report: {}".format(classification_report(y_true, y_pred, digits=3)))

F1 Score: 0.5
Acc Score: 0.875
CR Report:               precision    recall  f1-score   support

           _      0.333     1.000     0.500         1

   micro avg      0.333     1.000     0.500         1
   macro avg      0.333     1.000     0.500         1
weighted avg      0.333     1.000     0.500         1





## Test 3

In [None]:
y_true = [['O', 'B', 'B', 'I', 'E'], ['O', 'B', 'E']]
y_pred = [['O', 'B', 'B', 'I', 'E'], ['O', 'B', 'B']]

In [None]:
print("F1 Score: {}".format(f1_score(y_true, y_pred)))
print("Acc Score: {}".format(accuracy_score(y_true, y_pred)))
print("CR Report: {}".format(classification_report(y_true, y_pred, digits=3)))

F1 Score: 0.5714285714285715
Acc Score: 0.875
CR Report:               precision    recall  f1-score   support

           _      0.500     0.667     0.571         3

   micro avg      0.500     0.667     0.571         3
   macro avg      0.500     0.667     0.571         3
weighted avg      0.500     0.667     0.571         3



## Test 3

In [None]:
y_true = [['I-A', 'I-B', 'B', 'I-A', 'C'], ['I-A', 'I-B', 'C']]
y_pred = [['I-A', 'I-B', 'B', 'I-A', 'C'], ['I-A', 'I-B', 'I-B']]

In [None]:
print("F1 Score: {}".format(f1_score(y_true, y_pred)))
print("Acc Score: {}".format(accuracy_score(y_true, y_pred)))
print("CR Report: {}".format(classification_report(y_true, y_pred, digits=3)))

F1 Score: 0.8333333333333334
Acc Score: 0.875
CR Report:               precision    recall  f1-score   support

           A      1.000     1.000     1.000         3
           B      0.500     0.500     0.500         2
           _      1.000     1.000     1.000         1

   micro avg      0.833     0.833     0.833         6
   macro avg      0.833     0.833     0.833         6
weighted avg      0.833     0.833     0.833         6





## Test 3

In [None]:
y_true = [['I-AGRICULTURAL', 'I-BUSINESS', 'I-BUSINESS', 'I-AGRICULTURAL', 'C'], ['I-AGRICULTURAL', 'B-BUSINESS', 'C']]
y_pred = [['I-AGRICULTURAL', 'I-BUSINESS', 'B-BUSINESS', 'I-AGRICULTURAL', 'C'], ['I-AGRICULTURAL', 'B-BUSINESS', 'B-BUSINESS']]

In [None]:
print("F1 Score: {}".format(f1_score(y_true, y_pred)))
print("Acc Score: {}".format(accuracy_score(y_true, y_pred)))
print("CR Report: \n{}".format(classification_report(y_true, y_pred, digits=3)))

F1 Score: 0.6666666666666666
Acc Score: 0.75
CR Report: 
              precision    recall  f1-score   support

AGRICULTURAL      1.000     1.000     1.000         3
    BUSINESS      0.250     0.500     0.333         2

   micro avg      0.571     0.800     0.667         5
   macro avg      0.625     0.750     0.667         5
weighted avg      0.700     0.800     0.733         5





## Test 4 - Custom Metric

In [None]:
def get_entities(llist):
    prev_tag = ""
    indices = []
    for i, ent in enumerate(llist):
        if ent!=prev_tag:
            indices.append([ent,i,i])
        else:
            indices[-1][2] = i
        prev_tag = ent
    return [tuple(i) for i in indices if i[0]!="O"]    

def f1_score(y_true, y_pred):
    true_entities = set(get_entities(y_true))
    pred_entities = set(get_entities(y_pred))
    
    # intersection of predicted and true indexed named
    # entities
    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)
    nb_true = len(true_entities)
    
    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0

    return 2 * p * r / (p + r) if p + r > 0 else 0

def precision_score(y_true, y_pred):
    true_entities = set(get_entities(y_true))
    pred_entities = set(get_entities(y_pred))
    
    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)
    
    return nb_correct / nb_pred if nb_pred > 0 else 0


def recall_score(y_true, y_pred):
    true_entities = set(get_entities(y_true))
    pred_entities = set(get_entities(y_pred))
    
    nb_correct = len(true_entities & pred_entities)
    nb_true = len(true_entities)

    return nb_correct / nb_true if nb_true > 0 else 0

def classification_report(y_true, y_pred, digits=2):
    true_entities = set(get_entities(y_true))
    pred_entities = set(get_entities(y_pred))

    name_width = 0
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
        name_width = max(name_width, len(e[0]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    last_line_heading = 'macro avg'
    width = max(name_width, len(last_line_heading), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
    report = head_fmt.format(u'', *headers, width=width)
    report += u'\n\n'

    row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'

    ps, rs, f1s, s = [], [], [], []
    for type_name, true_entities in d1.items():
        pred_entities = d2[type_name]
        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(*[type_name, p, r, f1, nb_true], 
                                 width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    report += u'\n'

    # compute averages
    report += row_fmt.format('micro avg',
                             precision_score(y_true, y_pred),
                             recall_score(y_true, y_pred),
                             f1_score(y_true, y_pred),
                             np.sum(s),
                             width=width, digits=digits)
    report += row_fmt.format(last_line_heading,
                             np.average(ps, weights=s),
                             np.average(rs, weights=s),
                             np.average(f1s, weights=s),
                             np.sum(s),
                             width=width, digits=digits)

    return report

y_true = ['a','a','b','o','o','i','a']
y_pred = ['a','a','O','o']
print(classification_report(y_true, y_pred, digits=4))
print(accuracy_score(y_true, y_pred))

           precision    recall  f1-score   support

        b     0.0000    0.0000    0.0000         1
        a     1.0000    0.5000    0.6667         2
        o     0.0000    0.0000    0.0000         1
        i     0.0000    0.0000    0.0000         1

micro avg     0.5000    0.2000    0.2857         5
macro avg     0.4000    0.2000    0.2667         5

0.42857142857142855


## Inspect 'seqeval' metric