In [None]:
%%capture
# Install the Python packages this notebook imports (transformers, datasets) plus
# seqeval, which is the metric loaded later with load_metric("seqeval").
! pip install transformers datasets seqeval #tensorboard matplotlib pandas sklearn
# NOTE(review): git-lfs is presumably needed for pushing model files to the Hub — confirm.
! apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training arguments further down set
# push_to_hub=True and the last cell calls trainer.push_to_hub().
notebook_login()

In [None]:
# Pretrained checkpoint handed to AutoTokenizer and AutoModelForTokenClassification below.
model_id = "xlm-roberta-base"

# Dataset identifier and the list of configurations passed to load_dataset
# (each entry is loaded in turn as `lang` in the dataset preparation cell).
dataset_id = "cartesinus/leyzer-fedcsis"
dataset_configs=["en-US"] #,"de-DE","fr-FR","it-IT","pt-PT","es-ES","nl-NL"]

# Passed to TrainingArguments below as the output directory.
repository_id = "fedcsis-slot_baseline-xlm_r-en"

#### dataset preparation

In [None]:
import transformers
from datasets import load_dataset, concatenate_datasets, DatasetDict
from datasets import Features, Sequence, Value, ClassLabel
import numpy as np

# the columns we want to keep in the dataset
keep_columns = ["utterance", "bio"]

def convert_to_bio(sentence):
    """Convert a bracket-annotated sentence into a space-separated BIO tag string.

    Slots are written as ``[slot_name : value words]``: a word starting with
    ``[`` opens a slot and carries its name, a lone ``:`` inside a slot is a
    separator, and a word ending with ``]`` is the last word of the slot value.

    Args:
        sentence: Annotated sentence whose words are separated by spaces.

    Returns:
        One tag per word, joined by single spaces: ``B-<slot>`` for the first
        word of a slot value, ``I-<slot>`` for the following ones and ``O`` for
        words outside any slot. Slot names are lower-cased. Empty pieces caused
        by repeated spaces are ignored, so an empty sentence yields ``""``.
    """
    tags = []
    in_slot = False
    # Initialised up front so a stray "]" outside a slot can never hit an unbound name.
    b_slot = False
    slot = ""
    for word in sentence.split(' '):
        word = word.strip()
        if not word:
            # Leading/trailing/repeated spaces produce empty pieces, which are not words.
            continue
        if word.startswith('['):
            # Slot opener: remember the slot name, the next value word gets the B- tag.
            in_slot = True
            b_slot = True
            slot = word[1:].lower()
        elif word == ':' and in_slot:
            # Separator between the slot name and its value; carries no tag.
            continue
        elif in_slot:
            tags.append(("B-" if b_slot else "I-") + slot)
            b_slot = False
            if word.endswith(']'):
                # Last word of the slot value closes the slot.
                in_slot = False
        else:
            # Includes words that merely end with "]" while no slot is open.
            tags.append("O")

    return " ".join(tags)


def convert_to_flattag(sentence):
    """Convert a bracket-annotated sentence into a space-separated flat tag string.

    Uses the same ``[slot_name : value words]`` annotation as the BIO
    conversion, but every word of a slot value receives the bare lower-cased
    slot name (no ``B-``/``I-`` prefix).

    Args:
        sentence: Annotated sentence whose words are separated by spaces.

    Returns:
        One tag per word, joined by single spaces: the slot name for words
        inside a slot value and ``O`` for every other word. Empty pieces caused
        by repeated spaces are ignored, so an empty sentence yields ``""``.
    """
    tags = []
    in_slot = False
    slot = ""
    for word in sentence.split(' '):
        word = word.strip()
        if not word:
            # Leading/trailing/repeated spaces produce empty pieces, which are not words.
            continue
        if word.startswith('['):
            # Slot opener: remember the slot name for the value words that follow.
            in_slot = True
            slot = word[1:].lower()
        elif word == ':' and in_slot:
            # Separator between the slot name and its value; carries no tag.
            continue
        elif in_slot:
            tags.append(slot)
            if word.endswith(']'):
                # Last word of the slot value closes the slot.
                in_slot = False
        else:
            # Includes words that merely end with "]" while no slot is open.
            tags.append("O")

    return " ".join(tags)


def get_all_iob_tokens(dataset):
    """Collect the distinct tags that occur in an iterable of tag strings.

    Args:
        dataset: Iterable of strings, each holding tags separated by a single space.

    Returns:
        List of unique tags in order of first appearance.
    """
    # dict.fromkeys de-duplicates with constant-time lookups while keeping
    # first-seen order, instead of scanning a growing list for every tag.
    return list(dict.fromkeys(
        iob_token
        for x in dataset
        for iob_token in x.split(' ')
    ))


# process individual datasets
proc_lan_dataset_list = []
iob = []
split_names = ("train", "test", "validation")

# Pass 1: load every configured language and normalise its columns.
for lang in dataset_configs:
    # load dataset for language
    lang_ds = load_dataset(dataset_id, lang)
    # only keep the columns listed in keep_columns
    lang_ds = lang_ds.remove_columns([col for col in lang_ds["train"].column_names if col not in keep_columns])
    # "utterance" becomes "text"; the raw tag string moves to "bio_raw" so that
    # the name "bio" is free for the integer-encoded labels added below
    lang_ds = lang_ds.rename_column("utterance", "text")
    lang_ds = lang_ds.rename_column("bio", "bio_raw")
    proc_lan_dataset_list.append(lang_ds)

# Build ONE label inventory over all languages and splits. Building it per
# language would give each language its own tag -> id mapping, and only the
# mapping of the last language would reach the model configuration.
bio_uniq = get_all_iob_tokens([
    s
    for lang_ds in proc_lan_dataset_list
    for split in split_names
    for s in lang_ds[split]["bio_raw"]
])
bio = ClassLabel(num_classes=len(bio_uniq), names=bio_uniq)

# Pass 2: add whitespace-split words, tag strings and integer label ids to every split.
for lang_ds in proc_lan_dataset_list:
    for split in split_names:
        split_ds = lang_ds[split]
        bio_tokens = [s.split() for s in split_ds["bio_raw"]]
        split_ds = split_ds.add_column("tokens", [x.split() for x in split_ds["text"]])
        split_ds = split_ds.add_column("bio_tokens", bio_tokens)
        split_ds = split_ds.add_column("bio", [bio.str2int(s) for s in bio_tokens])
        lang_ds[split] = split_ds


# concat single splits into one
train_dataset = concatenate_datasets([ds["train"] for ds in proc_lan_dataset_list])
test_dataset = concatenate_datasets([ds["test"] for ds in proc_lan_dataset_list])
eval_dataset = concatenate_datasets([ds["validation"] for ds in proc_lan_dataset_list])

# create dataset dict for easier processing
dataset = DatasetDict(dict(train=train_dataset,validation=eval_dataset,test=test_dataset))
# show the resulting schema and one processed example as a sanity check
print(dataset['train'].features)
print(dataset['train'][200])

#### tokenization

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Sanity-check the tokenizer on one pre-split example. With is_split_into_words=True
# the input must be a list of words (or a list of such lists); passing the flat list
# of utterance strings would treat the whole training set as a single sequence.
tokenized_input = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True, max_length=512, truncation=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

#### token alignment with labels

In [None]:
# Read by tokenize_and_align_labels: when True every sub-token of a word gets the
# word's label, when False only the first sub-token does and the rest get -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and produce one label id per resulting token.

    Args:
        examples: Batch mapping with ``"tokens"`` (a list of word lists) and
            ``"bio"`` (a list of per-word label id lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every
        example the label list has exactly one entry per token: ``-100`` for
        tokens without a word id, the word's label for its first sub-token,
        and for the remaining sub-tokens either the word's label or ``-100``
        depending on the module-level ``label_all_tokens`` flag. Tokens of
        words that have no label (more words than labels) also get ``-100``.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["bio"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        first_unlabelled_word = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are
            # automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # The example has more words than labels (probably an error in the dataset).
            # Emit -100 instead of dropping the entry so the label list keeps one entry
            # per token.
            elif word_idx >= len(label):
                label_ids.append(-100)
                if first_unlabelled_word is None:
                    first_unlabelled_word = word_idx
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or
            # -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        if first_unlabelled_word is not None:
            # Report the problematic example once instead of once per sub-token.
            print("label", label)
            print("word_idx", first_unlabelled_word)

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the alignment over every split in batches; each split gains the tokenizer
# outputs plus the "labels" entry produced by tokenize_and_align_labels.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

#### model details and train

In [None]:
from transformers import AutoModelForTokenClassification,DataCollatorWithPadding, TrainingArguments, Trainer
#from huggingface_hub import HfFolder

# create label2id, id2label dicts for nice outputs for the model
num_labels = bio.num_classes
labels = bio.names
# Class ids are integers: keep them as ints on both sides of the mapping rather
# than stringifying them, so the stored config maps tag -> int and int -> tag.
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}


model = AutoModelForTokenClassification.from_pretrained(model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

In [None]:
import numpy as np
from datasets import load_dataset, load_metric

# Metric object whose compute() is called by compute_metrics and evaluate_on_dataset.
# NOTE(review): load_metric is reportedly deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval")
# Index -> tag name lookup (the ClassLabel names from the model cell), used to
# turn predicted and reference label ids back into tag strings.
label_list = labels

def compute_metrics(p):
    """Turn raw model outputs into overall precision / recall / F1 / accuracy.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 to one class id per token; ``labels`` holds
            the reference ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries of the metric result.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        kept_predictions = []
        kept_labels = []
        for predicted_id, reference_id in zip(predicted_row, reference_row):
            # Positions labelled -100 are excluded from scoring
            if reference_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[reference_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    for name in ("precision", "recall", "f1", "accuracy"):
        summary[name] = results["overall_" + name]
    return summary

##### training args

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer for batching the token-classification examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluate and save once per epoch; checkpoints go to `repository_id` and are
# pushed to the Hub.
# (hub_strategy = "all_checkpoints" is left out for now, kept as an idea for future testing)
args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)

# Train on the "train" split and evaluate on "validation"; the "test" split is
# not given to the Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

#### evaluate model

In [None]:
def evaluate_on_dataset(dataset):
    """Run the trainer on ``dataset`` and return the full metric report.

    Args:
        dataset: Tokenized split passed to ``trainer.predict``; its labels use
            ``-100`` for positions that must be ignored.

    Returns:
        Whatever ``metric.compute`` returns for the decoded predictions and
        references (unlike ``compute_metrics``, the result is not reduced to
        the overall scores).
    """
    predictions, eval_labels, _ = trainer.predict(dataset)
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, eval_labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, eval_labels)
    ]

    # The unreachable bare `results` statement that followed the return was dropped.
    return metric.compute(predictions=true_predictions, references=true_labels)

# Full metric report on the validation split (the split the Trainer also
# evaluates on during training).
evaluate_on_dataset(tokenized_datasets["validation"])

In [None]:
# Same report on the test split, which is not passed to the Trainer.
evaluate_on_dataset(tokenized_datasets["test"])

#### send model to huggingface

In [None]:
# Push the trainer's model to the Hugging Face Hub (requires the login done at the top).
trainer.push_to_hub()