# Sequence labeling prediction

This notebook contains code that reproduces Sequence Labeling mGPT experiments. Namely, this code can be used to obtain predictions that are further evaluated in *sequence_labeling_evalution.ipynb*.

To run the experiments, you need to download the data and specify the corresponding data folder, as well as the other paths, in the task configs.

XGLUE data can be downloaded from [XGLUE Leaderboard](https://microsoft.github.io/XGLUE/). For this you need to agree to the terms of service. After you do so a download link will be made available.

Data for POS evaluation on CIS & Low-Resource UD languages can be found in *data/UD_POS_data.tar.gz*. Unpack the archive and specify the path to the extracted data directory in `POSUDTaskConfig`. If you work in the original repo, you can simply run the following command to extract the files into the data folder:

`!tar xvzf data/UD_POS_data.tar.gz -C data/`

In [None]:
#!pip install transformers==4.22.2

In [None]:
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import transformers
from tqdm import tqdm
import sys
import os
import pickle
import random
from dataclasses import dataclass, field
from typing import List, Dict, Union
import json
from tqdm import tqdm
import re
import pandas as pd
from sklearn.metrics import *


from inference import load_mgpt, get_dataloader


import warnings
warnings.simplefilter("ignore")

# Model load

To evaluate $mGPT_{13B}$, load the $mGPT_{13B}$ checkpoint instead of the default model below.

In [None]:
# Load the "sberbank-ai/mGPT" checkpoint through the project's `load_mgpt` helper
# (imported from `inference`); every prediction cell below reuses this object.
model = load_mgpt("sberbank-ai/mGPT")

# Technical functions

In [None]:
# metrics
def calculate_scores(answers, predictions):
    """Compute the accuracy for every language.

    Args:
        answers: mapping from language code to the gold labels.
        predictions: mapping from language code to the predicted labels;
            it is indexed with every key of ``answers``.

    Returns:
        A list of ``[language, accuracy]`` pairs, in the key order of
        ``answers``.
    """
    return [
        [lang, accuracy_score(gold, predictions[lang])]
        for lang, gold in answers.items()
    ]


def sequence_general_metrics(true_label, pred_label):
    """Compute token-level precision, recall and F1 over all sentences.

    The per-sentence tag lists are flattened into two parallel token-level
    lists before scoring.

    Args:
        true_label: list of gold tag sequences, one per sentence.
        pred_label: list of predicted tag sequences, one per sentence.

    Returns:
        ``[precision, recall, f1]``, each computed with ``average='weighted'``.
    """
    flat_true_label = []
    flat_pred_label = []
    # Extend in place: rebuilding the lists with ``+`` on every sentence is
    # quadratic in the total number of tokens.
    # Iterate over the number of predictions so that a shorter gold list still
    # raises an IndexError rather than being silently truncated.
    for i in range(len(pred_label)):
        flat_true_label.extend(true_label[i])
        flat_pred_label.extend(pred_label[i])
    return [
        precision_score(flat_true_label, flat_pred_label, average='weighted'),
        recall_score(flat_true_label, flat_pred_label, average='weighted'),
        f1_score(flat_true_label, flat_pred_label, average='weighted'),
    ]


def sequence_labeling_em(true_label, pred_label):
    """Average the per-sentence share of correctly predicted tags.

    Sentences whose predicted and gold sequences differ in length are printed
    together with a 'Size mismatch' notice and left out of the average.

    Args:
        true_label: list of gold tag sequences, one per sentence.
        pred_label: list of predicted tag sequences, one per sentence.

    Returns:
        The mean of the per-sentence match ratios.
    """
    per_sentence = []
    for position, predicted in enumerate(pred_label):
        expected = true_label[position]
        if len(predicted) == len(expected):
            hits = [int(p == t) for p, t in zip(predicted, expected)]
            per_sentence.append(np.mean(hits))
            continue
        print(predicted, expected)
        print('Size mismatch')
    return np.mean(per_sentence)

def calculate_sequence_labeling_scores(answers, predictions):
    """Build a per-language table of sequence labeling metrics.

    Args:
        answers: mapping from language code to the gold tag sequences.
        predictions: mapping from language code to the predicted tag
            sequences; it is indexed with every key of ``answers``.

    Returns:
        A ``pandas.DataFrame`` with one row per language and the columns
        ``Language``, ``EM``, ``Precision``, ``Recall`` and ``F1``.
    """
    columns = ['Language', 'EM', 'Precision', 'Recall', 'F1']
    rows = []
    for lang, gold in answers.items():
        predicted = predictions[lang]
        em = sequence_labeling_em(gold, predicted)
        rows.append([lang, em, *sequence_general_metrics(gold, predicted)])
    return pd.DataFrame(rows, columns=columns)

#text preprocessing
#regular expression for tags generated by the model (POS-tags, NER-tags)
def words_only(text, regex):
    """Keep only the fragments of ``text`` matched by ``regex``.

    Args:
        text: string to filter (e.g. raw model output).
        regex: compiled pattern whose matches are kept.

    Returns:
        The matches joined by single spaces, or ``""`` when ``text`` or
        ``regex`` has an unsuitable type (for example ``None``).
    """
    try:
        return " ".join(regex.findall(text))
    # Only bad inputs are swallowed here; a bare ``except`` would also hide
    # KeyboardInterrupt and SystemExit.
    except (AttributeError, TypeError):
        return ""

# Task configs

**Warning:** to run the code, you need to set the data folder where the datasets are stored and the other directories in the configs below.

In [None]:
@dataclass
class TaskConfig:
    """Settings shared by all task configurations in this notebook."""

    # Name of the dataset split.
    split: str = "test"
    # Cache directory; copied onto the task object as ``cache_dir``.
    cache_dir: str = "./data/"
    # Root directory for outputs; files go to ``output_dir/task_name/``.
    output_dir: str = "./"
    # When True, ``Task.predict`` pickles the per-language score matrices.
    save_perplexities: bool = False
    # When True, the gold and predicted labels are pickled after prediction.
    save_predictions: bool = False


@dataclass
class POSTaskConfig(TaskConfig):
    """Configuration of the XGLUE POS tagging task.

    Every setting is a real dataclass field, so it can be overridden through
    the constructor, and each instance gets its own copy of the mutable
    defaults (``prompts``, ``langs``, ``tag_set``).
    """

    task_name: str = 'POS_clf'
    # Set this to your own path to the dataset.
    data_dir: str = './data/xglue_full_dataset/POS/'
    pred_dir: str = './'
    # Number of few-shot examples placed in front of each test prompt.
    num_examples: int = 4
    # Language of the training file the few-shot examples are sampled from.
    train_lang: str = 'en'
    # Alternative prompt layout, kept for reference:
    # {0: "<s>lang: ", 1: "\nSentence: ", 2: ' Parts of speech: ', 3:'</s>'}
    prompts: Dict[int, str] = field(
        default_factory=lambda: {0: "<s>lang: ", 1: "\nTagged sentence: ", 2: '</s>'}
    )
    # Languages whose test files are evaluated.
    langs: List[str] = field(
        default_factory=lambda: [
            'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'nl', 'pl', 'pt',
            'ru', 'th', 'tr', 'ur', 'vi', 'ar', 'bg', 'zh',
        ]
    )
    # Reference POS tag inventory. SequenceLabelingClassificationTask does not
    # read it: it derives its own tag set from the training file.
    tag_set: set = field(
        default_factory=lambda: {
            'NOUN', 'SCONJ', 'AUX', 'INTJ', 'ADP', 'ADJ', 'PRON', 'DET', 'VERB',
            'PUNCT', 'X', 'SYM', 'PART', 'NUM', 'ADV', 'PROPN', 'CCONJ',
        }
    )

    # Expression used to filter all but tags from the generation result.
    tags_regex: re.Pattern = re.compile("[A-Z]+")
    # When True, progress is appended to a log file in the task's output folder.
    logging: bool = True

@dataclass
class POSUDTaskConfig(TaskConfig):
    """Configuration of the POS tagging task on the UD treebanks in ``langs``.

    Every setting is a real dataclass field, so it can be overridden through
    the constructor, and each instance gets its own copy of the mutable
    defaults (``prompts``, ``langs``, ``tag_set``).
    """

    task_name: str = 'UD_POS_clf'
    # Set this to your own path to the dataset.
    data_dir: str = './data/UD/'
    # Set this to the proper path.
    pred_dir: str = './'
    # Number of few-shot examples placed in front of each test prompt
    # (0 means zero-shot).
    num_examples: int = 0
    # Language of the training file the few-shot examples are sampled from.
    train_lang: str = 'en'
    prompts: Dict[int, str] = field(
        default_factory=lambda: {0: "<s>lang: ", 1: "\nTagged sentence: ", 2: '</s>'}
    )
    # Treebank identifiers; each one is the prefix of a ``*-ud-test.conllu`` file.
    langs: List[str] = field(
        default_factory=lambda: [
            'be_hse', 'uk_iu', 'hy_armtdp', 'kk_ktb', 'bxr_bdt', 'sah_yktdt', 'tt_nmctt',
        ]
    )
    # Reference POS tag inventory. SequenceLabelingClassificationTask does not
    # read it: it derives its own tag set from the training file.
    tag_set: set = field(
        default_factory=lambda: {
            'NOUN', 'SCONJ', 'AUX', 'INTJ', 'ADP', 'ADJ', 'PRON', 'DET', 'VERB',
            'PUNCT', 'X', 'SYM', 'PART', 'NUM', 'ADV', 'PROPN', 'CCONJ',
        }
    )

    # Expression used to filter all but tags from the generation result.
    tags_regex: re.Pattern = re.compile("[A-Z]+")
    # When True, progress is appended to a log file in the task's output folder.
    logging: bool = True

@dataclass
class NERTaskConfig(TaskConfig):
    """Configuration of the XGLUE NER task.

    Every setting is a real dataclass field, so it can be overridden through
    the constructor, and each instance gets its own copy of the mutable
    defaults (``prompts``, ``langs``).
    """

    task_name: str = 'NER_clf'
    # Set this to your own path to the dataset.
    data_dir: str = './data/xglue_full_dataset/NER/'
    # Set this to the proper path.
    pred_dir: str = './'
    # Number of few-shot examples placed in front of each test prompt.
    num_examples: int = 4
    # Language of the training file the few-shot examples are sampled from.
    train_lang: str = 'en'
    # Languages whose test files are evaluated.
    langs: List[str] = field(default_factory=lambda: ['de', 'en', 'es', 'nl'])
    prompts: Dict[int, str] = field(
        default_factory=lambda: {
            0: "<s>lang: ", 1: "\nSentence: ", 2: ' Named Entities: ', 3: '</s>',
        }
    )

    # Expression used to filter all but tags from the generation result.
    # Raw string: "\-" is an invalid escape sequence in a normal literal.
    tags_regex: re.Pattern = re.compile(r"[A-Z\-]+")
    # When True, progress is appended to a log file in the task's output folder.
    logging: bool = True



# One configuration object per task, all built from the declared defaults.
# No positional arguments are passed: the first positional field of every
# config is `split`, so a stray positional value would overwrite it.
config_pos = POSTaskConfig()
config_pos_ud = POSUDTaskConfig()
config_ner = NERTaskConfig()

# General prediction class

In [None]:
class Task:
    """Base class for tasks solved by scoring one verbalization per label.

    Subclasses implement ``load_data`` and ``verbalize_samples``. For each
    sample, ``predict`` picks the label whose verbalized text gets the lowest
    summed loss from the model.
    """

    def __init__(self, config):
        """Copy the shared settings from ``config`` onto the task."""
        self.prompts = config.prompts
        self.langs = config.langs
        self.split = config.split
        self.cache_dir = config.cache_dir
        self.output_dir = config.output_dir
        self.task_name = config.task_name
        self.save_perplexities = config.save_perplexities
        self.save_predictions = config.save_predictions

    def verbalize_samples(self, dataset, prompt):
        """Turn ``dataset`` into text samples for ``prompt`` (subclass hook)."""
        raise NotImplementedError

    def load_data(self):
        """Load the task data (subclass hook)."""
        raise NotImplementedError

    def calculate_scores(self, data, model, batch_size=8):
        """Return one score per text in ``data``.

        The score is the sum of the per-position losses found in the first
        element of what ``model.forward`` returns.
        """
        losses = model.forward(data, loss_per_pos=True, batch_size=batch_size)
        return np.asarray([sum(l) for l in losses[0]])

    def predict_subset(self, lang, dataset_lang, model):
        """Predict labels for the samples of one language.

        Returns:
            ``(true_label, pred_label, scores)``: the gold labels taken from
            ``dataset_lang['label']``, the predicted labels as a list, and the
            score matrix with the label ids prepended as its first row.
        """
        scores, labels = [], []
        for i, prompt in self.prompts.items():
            labels.append(i)
            text_samples = self.verbalize_samples(dataset_lang, prompt)
            print(lang, 'example: "' + text_samples[0] + '"')
            scores.append(self.calculate_scores(text_samples, model))

        # After transposing: one row per sample, one column per label.
        scores = np.array(scores).T
        labels = np.array(labels)

        # The label with the lowest summed loss wins.
        idx = np.argmin(scores, axis=1)
        pred_label = np.take_along_axis(labels, idx, axis=0)
        true_label = dataset_lang['label']
        scores = np.concatenate((labels.reshape(1, -1), scores))

        return true_label, pred_label.tolist(), scores

    def _output_path(self, filename):
        """Return ``output_dir/task_name/filename``, creating the folder if needed."""
        directory = os.path.join(self.output_dir, self.task_name)
        os.makedirs(directory, exist_ok=True)
        return os.path.join(directory, filename)

    def predict(self, model):
        """Run prediction for every language in ``self.langs``.

        Depending on ``save_perplexities`` and ``save_predictions``, the score
        matrices and the ``[y_true, y_pred]`` pair are pickled under
        ``output_dir/task_name/``.

        Returns:
            ``(y_true, y_pred)``: dictionaries mapping each language to its
            gold and predicted labels.
        """
        dataset = self.load_data()
        y_true = {}
        y_pred = {}
        for i, lang in enumerate(self.langs):
            print(i, '/', len(self.langs), ':', lang)
            dataset_lang = dataset.filter(lambda example: example['language'] == lang)

            true_label, pred_label, scores = self.predict_subset(lang, dataset_lang, model)
            y_true[lang] = true_label
            y_pred[lang] = pred_label
            print('%.2f' % accuracy_score(true_label, pred_label))

            if self.save_perplexities:
                # Context manager so the file is flushed and closed right away.
                with open(self._output_path(f"{lang}_scores.pkl"), 'wb') as f:
                    pickle.dump(scores, f)

        if self.save_predictions:
            with open(self._output_path("pred.pkl"), 'wb') as f:
                pickle.dump([y_true, y_pred], f)
        return y_true, y_pred

# Sequence Labeling Task

Universal Sequence Labeling Task solution can be used for both POS and NER.

In [None]:
class SequenceLabelingClassificationTask(Task):
    """Sequence labeling (POS / NER) done word by word via candidate scoring.

    For each word of a test sentence, one prompt per candidate tag is built
    and the tag whose prompt gets the lowest score from ``calculate_scores``
    is kept. Tags already predicted for the previous words of the sentence
    are part of the prompt.
    """

    def __init__(self, config):
        """Copy the task settings from ``config`` and set the runtime defaults."""
        super().__init__(config)
        # Set to True to read the test data from UD ``.conllu`` files.
        self.UD_ = False
        self.data_dir = config.data_dir
        self.prompts = config.prompts
        self.num_examples = config.num_examples
        # Language of the few-shot examples.
        # In POS and NER tasks the few-shot examples come from the English train set.
        self.train_lang = config.train_lang
        # Expression used to filter all but tags from the generation result.
        self.tags_regex = config.tags_regex
        self.logging = config.logging
        # Filled from the training file by load_data / load_UD_data.
        self.tag_set = set()
        # Upper bound on the number of test sentences processed per language.
        self.max_examples = 10000
        self.prog = re.compile(r'\</?[a-z]+\>?')

        # Set to True to print debugging output during classification.
        self.print_ = False

    def _task_dir(self):
        """Return ``output_dir/task_name``, creating the folder if needed."""
        directory = os.path.join(self.output_dir, self.task_name)
        os.makedirs(directory, exist_ok=True)
        return directory

    def _log_path(self):
        """Return the path of the progress log for the current shot count."""
        return os.path.join(self._task_dir(), str(self.num_examples) + "logs.txt")

    def _predictions_path(self):
        """Return the path of the predictions pickle for the current shot count."""
        return os.path.join(
            self._task_dir(), "pred_few_shot_" + str(self.num_examples) + "pred.pkl"
        )

    def _load_train(self, skip_first_char=False):
        """Read the training sentences and refresh ``self.tag_set``.

        Args:
            skip_first_char: drop the first character of the file before
                splitting it into sentences (used for the UD data).

        Returns:
            The training sentences, one string per sentence.
        """
        with open(self.data_dir + self.train_lang + '.train', 'r', encoding='utf-8') as f:
            raw = f.read()
        if skip_first_char:
            raw = raw[1:]

        # Delete extra tag markup.
        train = [x for x in raw.split('\n\n') if '_ ' not in x]
        # NOTE(review): this collects only the tag of the FIRST token of each
        # sentence - confirm that this yields the full tag inventory.
        self.tag_set = {s.split(' ')[1].split('\n')[0] for s in train if ' ' in s}
        return train

    # Formatting of few-shot examples.
    def format_train_data(self, sent):
        """Format one training sentence as a few-shot example.

        The "word TAG" lines of ``sent`` become space-separated "word_TAG"
        tokens wrapped in prompts 0, 1 and 2.
        """
        sent = sent.replace(' ', '_').replace('\n', ' ').strip()
        return self.prompts[0] + self.train_lang + self.prompts[1] + sent + self.prompts[2]

    def format_test_data(self, sent, i, lang, word_res):
        """Build the prompt that asks for the tag of word ``i`` of ``sent``.

        The first ``i`` words carry the tags in ``word_res``; word ``i`` is
        followed by a dangling "_" to which a candidate tag is appended.
        """
        if len(word_res) != i:
            print('len(word_res) != i')
        splitted_sent = sent.replace(' ', '_').replace('\n', ' ').strip().split()
        # Keep only the word part of each "word_TAG" token.
        splitted_sent = [x.split('_')[0] for x in splitted_sent]
        for j in range(i):
            splitted_sent[j] = splitted_sent[j] + '_' + word_res[j]
        return (
            self.prompts[0] + lang + self.prompts[1]
            + ' '.join(splitted_sent[:i]) + ' ' + splitted_sent[i] + '_'
        )

    def load_data(self):
        """Load the training sentences and the ``<lang>.test`` files.

        Returns:
            ``(train, tests)`` where ``tests`` maps each language in
            ``self.langs`` to its list of test sentences.
        """
        # In POS & NER tasks train is available only in English (self.train_lang),
        # thus all the examples for few-shot are in English.
        train = self._load_train()
        tests = {}
        for lang in self.langs:
            with open(self.data_dir + lang + '.test', 'r', encoding='utf-8') as f:
                tests[lang] = [
                    x for x in f.read().split('\n\n') if len(x) > 0 and not x == '\n'
                ]
        return train, tests

    def read_conllu(self, filename):
        """Read ``<filename>-ud-test.conllu`` as "word TAG" sentences.

        Only lines starting with a digit 1-9 are kept; the second and fourth
        tab-separated columns of each such line are used.
        """
        with open(self.data_dir + filename + '-ud-test.conllu', 'r', encoding='utf-8') as f:
            blocks = f.read().split('# sent_id ')[1:]
        parsed_examples = []
        for block in blocks:
            rows = [x.split('\t') for x in block.split('\n') if len(x) > 0 and x[0] in '123456789']
            parsed_examples.append('\n'.join(cols[1] + ' ' + cols[3] for cols in rows))
        return parsed_examples

    def load_UD_data(self):
        """Load the training sentences and the UD ``.conllu`` test files.

        Returns:
            ``(train, tests)`` where ``tests`` maps each language in
            ``self.langs`` to its list of test sentences.
        """
        # In POS & NER tasks train is available only in English (self.train_lang),
        # thus all the examples for few-shot are in English.
        train = self._load_train(skip_first_char=True)
        tests = {lang: self.read_conllu(lang) for lang in self.langs}
        return train, tests

    def classify_with_examples(self, prompt, examples, model, lang='en', top_k = 1, top_p = 0.90, seed = 1337):
        """Tag one test sentence word by word.

        Args:
            prompt: the test sentence as "word TAG" lines.
            examples: training sentences to sample the few-shot examples from.
            model: model passed on to ``calculate_scores``.
            lang: language code written into the test prompt.
            top_k, top_p, seed: not used by this method; kept so that existing
                calls keep working.

        Returns:
            ``(true_tags, word_res)``: the gold tags parsed from ``prompt`` and
            the predicted tags.
        """
        if self.num_examples > 0:
            # Few-shot.
            some = random.sample(examples, self.num_examples)

            # In POS & NER tasks train is available only in English (self.train_lang),
            # thus all the examples for few-shot are in English.
            train_examples = [self.format_train_data(s) for s in some]
            text = '\n'.join(train_examples) + '\n'
        else:
            # Zero-shot.
            text = ''

        # Sentence split into word_TAG tokens.
        tagged_sent = [x for x in prompt.replace(' ', '_').replace('\n', ' ').strip().split() if '_' in x]
        true_tags = [x.split('_')[1] for x in tagged_sent]

        word_num = len(true_tags)

        if self.print_:
            print('Word_num: ', word_num, true_tags)
        word_res = []

        # Word by word generation; sorting fixes the candidate order.
        tags = sorted(self.tag_set)
        for i in range(word_num):
            # Example: "Tagged sentence: What_PRON if_SCONJ Google_"
            # Use this instance, not the notebook-level variable `example`.
            test_prompt = self.format_test_data(prompt, i, lang, word_res)
            tagged_candidates = [text + test_prompt + tag for tag in tags]
            scores = self.calculate_scores(tagged_candidates, model)
            word_res.append(tags[np.argmin(scores)])

        return true_tags, word_res

    def predict_lang(self, dataset_lang, train, lang, model):
        """Tag up to ``max_examples`` sentences of one language.

        When ``self.logging`` is set, progress is appended to the log file
        after every tenth sentence.

        Returns:
            ``(true_label, pred_label)``: lists of gold and predicted tag
            sequences, one entry per processed sentence.
        """
        true_label = []
        pred_label = []
        total = min(self.max_examples, len(dataset_lang))

        log_path = self._log_path() if self.logging else None
        if log_path is not None:
            with open(log_path, 'a') as f:
                f.write('Processing language: ' + lang + '\n')

        for i in tqdm(range(total)):
            true, gen = self.classify_with_examples(dataset_lang[i], train, model, lang=lang)
            true_label.append(true)
            pred_label.append(gen)
            if log_path is not None and i % 10 == 0:
                with open(log_path, 'a') as f:
                    f.write(lang + '\t' + str(i+1) + ' out of ' + str(total) + '\n')

        return true_label, pred_label

    def predict(self, model):
        """Run prediction for every language in ``self.langs``.

        With ``save_predictions`` set, the ``[y_true, y_pred]`` pair is pickled
        after every language and once more at the end.

        Returns:
            ``(y_true, y_pred)``: dictionaries mapping each language to its
            gold and predicted tag sequences.
        """
        if self.UD_:
            train, tests = self.load_UD_data()
        else:
            train, tests = self.load_data()
        y_true = {}
        y_pred = {}
        if self.logging:
            # 'w' starts a fresh log for this run.
            with open(self._log_path(), 'w') as f:
                f.write('Start processing ' + self.task_name + '\n')
        for lang in tqdm(self.langs):
            dataset_lang = tests[lang]
            true_label, pred_label = self.predict_lang(dataset_lang, train, lang, model)
            y_true[lang] = true_label
            y_pred[lang] = pred_label
            if self.save_predictions:
                # Checkpoint after each language. The path helper creates the
                # output folder, so this also works when logging is off.
                with open(self._predictions_path(), 'wb') as f:
                    pickle.dump([y_true, y_pred], f)
        if self.save_predictions:
            with open(self._predictions_path(), 'wb') as f:
                pickle.dump([y_true, y_pred], f)
        return y_true, y_pred

# 4-shot XGLUE NER prediction

XGLUE data can be downloaded from [XGLUE Leaderboard](https://microsoft.github.io/XGLUE/). For this you need to agree to the terms of service. After you do so a download link will be made available.

In [None]:
# Languages evaluated for NER, taken from the config.
LANGS = config_ner.langs

# Build the NER task and switch on saving of predictions.
example = SequenceLabelingClassificationTask(config_ner)
example.langs = LANGS
example.save_predictions = True
example.num_examples = 4  # number of few-shot examples per prompt

# Sanity check: load the data once, then show the tag set and the number of
# test sentences per language.
train, tests = example.load_data()
print(example.tag_set)
for key in tests:
    print(key, len(tests[key]))


In [None]:
# Run NER prediction for every configured language; both results are
# dictionaries keyed by language code.
y_true, y_pred = example.predict(model)

# 4-shot XGLUE POS predictions

XGLUE data can be downloaded from [XGLUE Leaderboard](https://microsoft.github.io/XGLUE/). For this you need to agree to the terms of service. After you do so a download link will be made available on the site.

In [None]:
# Languages evaluated for XGLUE POS, taken from the config.
LANGS = config_pos.langs

# Build the POS task and switch on saving of predictions.
example = SequenceLabelingClassificationTask(config_pos)
example.langs = LANGS
example.save_predictions = True
example.num_examples = 4  # number of few-shot examples per prompt

# Sanity check: load the data once, then show the tag set and the number of
# test sentences per language.
train, tests = example.load_data()
print(example.tag_set)
for key in tests:
    print(key, len(tests[key]))

In [None]:
# Run XGLUE POS prediction for every configured language; both results are
# dictionaries keyed by language code.
y_true, y_pred = example.predict(model)

# 4-shot CIS & Low resource POS predictions

Data for POS evaluation on CIS & Low-Resource UD languages can be found in *./data/UD_POS_data.tar.gz*. If you work in the original repo folder, uncomment and run the cell below to extract the files into the data folder.

In [None]:
#!tar xvzf data/UD_POS_data.tar.gz -C data/

In [None]:
# Treebanks evaluated for UD POS, taken from the config.
LANGS = config_pos_ud.langs

# Build the UD POS task; `UD_` makes predict() read the .conllu test files.
example = SequenceLabelingClassificationTask(config_pos_ud)
example.UD_ = True
example.langs = LANGS
example.save_predictions = True
example.num_examples = 4  # number of few-shot examples per prompt

# Sanity check: load the data once, then show the tag set and the number of
# test sentences per treebank.
train, tests = example.load_UD_data()
print(example.tag_set)
for key in tests:
    print(key, len(tests[key]))

In [None]:
# Run UD POS prediction for every configured treebank; both results are
# dictionaries keyed by treebank identifier.
y_true, y_pred = example.predict(model)