# Sequence labeling (NER & POS) evaluation

This notebook reproduces the evaluation step of the mGPT sequence labeling experiments: it scores the predictions produced by *sequence_labeling_prediction.ipynb*.


# Metrics


In [None]:
#!pip install seqeval

from seqeval.metrics import f1_score, precision_score, accuracy_score, recall_score
import numpy as np
import pandas as pd
import math
import pickle
import os
import warnings
import random
from collections import Counter

# Silence every Python warning for the rest of the session; this also hides
# any warnings raised while the metrics below are computed.
warnings.filterwarnings('ignore')
# Show floating-point values in pandas output with 3 digits of precision.
pd.set_option("display.precision", 3)
# NOTE(review): no random seed is set in the cells visible here, so the
# random-baseline columns will differ between runs -- confirm whether a seed
# should be fixed.

In [None]:
# metrics
def calculate_scores(answers, predictions):
    """Compute the accuracy for every language found in ``answers``.

    Parameters
    ----------
    answers : mapping
        Gold labels keyed by language.
    predictions : mapping
        Predicted labels; looked up with the same keys as ``answers``.

    Returns
    -------
    list
        One ``[language, accuracy]`` pair per key of ``answers``, in the
        iteration order of ``answers.keys()``.
    """
    return [
        [lang, accuracy_score(answers[lang], predictions[lang])]
        for lang in answers.keys()
    ]


def sequence_general_metrics_(true_label, pred_label):
    """Score flattened tag sequences and a random-tag baseline.

    All sentences are concatenated into one flat list of tags before the
    metric functions are called.

    Parameters
    ----------
    true_label : sequence of sequences
        Gold tags, one inner sequence per sentence.
    pred_label : sequence of sequences
        Predicted tags; sentence ``i`` is paired with ``true_label[i]``.
        The number of sentences processed is ``len(pred_label)``.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1, random precision, random f1]``.
        Precision, recall and F1 are requested with ``average='weighted'``.
        The two "random" values score a baseline that draws, for every gold
        token, one tag uniformly from the set of gold tags.
    """
    flat_true_label = []
    flat_pred_label = []
    # extend() keeps the flattening linear; repeated `a = a + b` re-copies the
    # whole accumulated list for every sentence.
    for i in range(len(pred_label)):
        flat_true_label.extend(true_label[i])
        flat_pred_label.extend(pred_label[i])

    tag_list = sorted(set(flat_true_label))
    # random.sample(..., 1) returns a one-element list; unwrap it so the
    # baseline is a flat list of tags with the same shape as flat_true_label.
    random_choice = [random.sample(tag_list, 1)[0] for _ in flat_true_label]

    return [accuracy_score(flat_true_label, flat_pred_label),
            precision_score(flat_true_label, flat_pred_label, average = 'weighted'),
            recall_score(flat_true_label, flat_pred_label, average = 'weighted'),
            f1_score(flat_true_label, flat_pred_label, average = 'weighted'),
            precision_score(flat_true_label, random_choice, average = 'weighted'),
            f1_score(flat_true_label, random_choice, average = 'weighted')]

def sequence_general_metrics(true_label, pred_label):
    """Score predicted tag sequences and a random-tag baseline.

    Empty gold tags (``''``) are scored as ``'X'``. The replacement is done on
    a copy, so the caller's ``true_label`` is left unmodified. Predicted tags
    are passed to the metric functions unchanged.

    Parameters
    ----------
    true_label : sequence of sequences
        Gold tags, one inner sequence per sentence.
    pred_label : sequence of sequences
        Predicted tags, passed to the metric functions as-is.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1, random precision, random f1,
        random accuracy]``. The "random" values score a baseline that draws,
        for every gold token, one tag uniformly from the set of gold tags
        (after the ``'' -> 'X'`` replacement).
    """
    # Cleaned copy of the gold labels; avoids mutating the caller's data.
    clean_true = [
        ['X' if tag == '' else tag for tag in sentence]
        for sentence in true_label
    ]
    tag_list = sorted({tag for sentence in clean_true for tag in sentence})
    # Random baseline with the same sentence lengths as the gold labels.
    random_choice = [
        [random.sample(tag_list, 1)[0] for _ in sentence]
        for sentence in clean_true
    ]

    return [accuracy_score(clean_true, pred_label),
            precision_score(clean_true, pred_label),
            recall_score(clean_true, pred_label),
            f1_score(clean_true, pred_label),
            precision_score(clean_true, random_choice),
            f1_score(clean_true, random_choice),
            accuracy_score(clean_true, random_choice)]



def calculate_sequence_labeling_scores(answers, predictions):
    """Build a per-language table of sequence labeling metrics.

    Parameters
    ----------
    answers : mapping
        Gold tag sequences keyed by language.
    predictions : mapping
        Predicted tag sequences; looked up with the same keys as ``answers``.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``answers``: the language followed by the values
        returned by ``sequence_general_metrics`` for that language.
    """
    column_names = ['Language', 'Acc', 'Precision', 'Recall', 'F1',
                    'Random Precision', 'Random F1', 'Random accuracy']
    rows = [
        [lang] + sequence_general_metrics(answers[lang], predictions[lang])
        for lang in answers.keys()
    ]
    return pd.DataFrame(rows, columns = column_names)

#text preprocessing

#regular expression for tags generated by the model (POS-tags, NER-tags)
def words_only(text, regex):
    """Join every match of ``regex`` in ``text`` with single spaces.

    Parameters
    ----------
    text : str
        Text to search.
    regex : compiled pattern
        Object exposing ``findall``.

    Returns
    -------
    str
        The space-joined matches, or ``""`` when matching or joining fails
        (for example when ``text`` is not a string).
    """
    try:
        return " ".join(regex.findall(text))
    except Exception:
        # Best-effort fallback for malformed input. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt and SystemExit
        # propagate, so the cell can still be interrupted.
        return ""

In [None]:
def show_tag_statistics(true_label, pred_label, lang, task = 'ner'):
    """Print per-tag statistics of the predictions for one language.

    Two reports are printed:

    1. For every tag that occurs in the gold data: its share of all gold
       tokens and its recall, i.e. the fraction of tokens carrying that gold
       tag that were predicted with the same tag.
    2. For every known tag: the (up to) five most frequent predictions made
       for tokens carrying that gold tag.

    Parameters
    ----------
    true_label : sequence of sequences
        Gold tags, one inner sequence per sentence.
    pred_label : sequence of sequences
        Predicted tags; sentence ``i`` is paired with ``true_label[i]``.
        The number of sentences processed is ``len(pred_label)``.
    lang : str
        Language name, used only in the printed header.
    task : str, optional
        ``'ner'`` selects the NER tag set; any other value selects the POS
        tag set. Gold tokens whose tag is outside the selected set are still
        counted in the total but are not reported.

    Returns
    -------
    None
    """
    if task == 'ner':
        tags = {'O', 'I-LOC', 'I-ORG', 'I-PER', 'I-MISC'}
    else:
        tags =  {'NOUN', 'SCONJ', 'AUX', 'INTJ', 'ADP', 'ADJ', 'PRON', 'DET', 'VERB', 'PUNCT', 'X', 'SYM', 'PART', 'NUM', 'ADV', 'PROPN', 'CCONJ'}

    flat_true_label = []
    flat_pred_label = []
    # extend() keeps the flattening linear in the number of tokens.
    for i in range(len(pred_label)):
        flat_true_label.extend(true_label[i])
        flat_pred_label.extend(pred_label[i])
    total = len(flat_true_label)

    # Predictions grouped by gold tag; keys are inserted in sorted tag order,
    # which is also the order of the printed report.
    res_dict = {key: [] for key in sorted(tags)}
    for i, tag in enumerate(flat_true_label):
        if tag in tags:
            res_dict[tag].append(flat_pred_label[i])
    # Count the predictions once per gold tag and reuse for both reports.
    pred_counts = {key: Counter(preds) for key, preds in res_dict.items()}

    print('Language: ', lang, 'Example number: ', total)
    for key, preds in res_dict.items():
        if preds:
            # Correct predictions divided by the number of gold tokens with
            # this tag: that is recall, so it is labelled as such.
            print(key,'\tPerc in data:  ', round(len(preds)/total, 3), '   Tag recall:\t', round(pred_counts[key][key]/len(preds), 3))
    print('\n\n')
    for key, preds in res_dict.items():
        print('Predicted tags for ', key, '(', len(preds),'out of',total,'):')
        if preds:
            for item in pred_counts[key].most_common(5):
                print(item[0],'\t', item[1],'\tPerc: ', round(item[1]/len(preds), 4))
        print('\n\n')


# 4-shot XGLUE NER

In [None]:
task_name = 'NER_clf'
num_examples = 4
pred_path = './'+ task_name +'/pred_few_shot_'+ str(num_examples) +'pred.pkl'
# NOTE: unpickling can execute arbitrary code -- only load prediction files
# you generated yourself.
# The context manager closes the file handle once the predictions are loaded.
with open(pred_path, 'rb') as pred_file:
    res4 = pickle.load(pred_file)
# The first two items of the pickled object are used as the gold labels and
# the predictions (presumably keyed by language -- confirm against the
# prediction notebook).
y_true, y_pred = res4[0], res4[1]
print('Task name: ', task_name)
print('Num few-shot examples: ', num_examples)
# reset_index gives the sorted table a clean 0..n-1 index, matching the
# other evaluation cells of this notebook.
scores_df = calculate_sequence_labeling_scores(y_true, y_pred).sort_values('Language').reset_index(drop = True)
scores_df

# 4-shot XGLUE POS

In [None]:
# `os` is not used in this cell; the import is kept unchanged.
import os
task_name = 'POS_clf'
num_examples = 4
pred_path = './'+ task_name +'/pred_few_shot_'+ str(num_examples) +'pred.pkl'
# NOTE: unpickling can execute arbitrary code -- only load prediction files
# you generated yourself.
# The context manager closes the file handle once the predictions are loaded.
with open(pred_path, 'rb') as pred_file:
    res4 = pickle.load(pred_file)
# The first two items of the pickled object are used as the gold labels and
# the predictions (presumably keyed by language -- confirm against the
# prediction notebook).
y_true, y_pred = res4[0], res4[1]
print('Task name: ', task_name)
print('Num few-shot examples: ', num_examples)
scores_df = calculate_sequence_labeling_scores(y_true, y_pred).sort_values('Language').reset_index(drop = True)
scores_df

# 4-shot CIS & low-resource POS evaluation


In [None]:
task_name = 'UD_POS_clf'
num_examples = 4
pred_path = './'+ task_name +'/pred_few_shot_'+ str(num_examples) +'pred.pkl'
# NOTE: unpickling can execute arbitrary code -- only load prediction files
# you generated yourself.
# The context manager closes the file handle once the predictions are loaded.
with open(pred_path, 'rb') as pred_file:
    res4 = pickle.load(pred_file)
# The first two items of the pickled object are used as the gold labels and
# the predictions (presumably keyed by language -- confirm against the
# prediction notebook).
y_true, y_pred = res4[0], res4[1]
print('Task name: ', task_name)
print('Num few-shot examples: ', num_examples)
scores_df = calculate_sequence_labeling_scores(y_true, y_pred).sort_values('Language').reset_index(drop = True)
scores_df