In [None]:
%%capture
!pip install transformers==4.28.1 datasets seqeval optuna #tensorboard matplotlib pandas sklearn

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably required because later cells push the model to the
# Hub (push_to_hub=True / trainer.push_to_hub()) — confirm.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!apt install git-lfs

In [None]:
# Print the version of transformers active in this kernel
# (the install cell requests 4.28.1).
import transformers

print(transformers.__version__)

In [None]:
# Pretrained checkpoint loaded by both the tokenizer and the classification model.
model_id = "xlm-roberta-base"
# Dataset identifier passed to load_dataset().
dataset_id = "cartesinus/leyzer-fedcsis"
# Dataset configurations to load; each entry is loaded separately and the
# resulting splits are concatenated.
dataset_configs=["pl-PL"]


# Passed as the first argument to TrainingArguments.
# NOTE(review): presumably also the name of the Hub repository the model is
# pushed to — confirm.
repository_id = "fedcsis-intent_baseline-xlm_r-pl"

In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

# Source columns that survive preprocessing; every other column is dropped.
keep_columns = ["utterance", "intent"]

# Load and normalise each configured language variant on its own.
proc_lan_dataset_list = []
for lang in dataset_configs:
    lang_ds = load_dataset(dataset_id, lang)
    # Everything except the utterance text and its intent gets removed.
    drop_columns = [
        name for name in lang_ds["train"].column_names if name not in keep_columns
    ]
    # The surviving columns are renamed to "text" / "label", the names that the
    # tokenization and label-mapping cells read.
    lang_ds = (
        lang_ds.remove_columns(drop_columns)
        .rename_column("utterance", "text")
        .rename_column("intent", "label")
    )
    proc_lan_dataset_list.append(lang_ds)

# Merge the per-language datasets split by split.
merged_splits = {
    split_name: concatenate_datasets(
        [lang_splits[split_name] for lang_splits in proc_lan_dataset_list]
    )
    for split_name in ("train", "validation", "test")
}
train_dataset = merged_splits["train"]
eval_dataset = merged_splits["validation"]
test_dataset = merged_splits["test"]

# Bundle the merged splits into a single DatasetDict for the downstream cells.
dataset = DatasetDict(merged_splits)
print(dataset)

In [None]:
import pandas as pd

# Convert the training split to a pandas DataFrame for a quick look at the data.
df = dataset["train"].to_pandas()

# Plot a histogram for each numeric column of the frame.
# NOTE(review): presumably only the integer-encoded "label" column qualifies,
# which makes this a view of the class balance — confirm.
df.hist()

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint named by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
def process(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Batch handed over by ``dataset.map(process, batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, produced with truncation enabled
        and ``padding="max_length"``.
    """
    # NOTE(review): "max_length" padding pads every example to a fixed length
    # instead of to the longest example in a batch — confirm this is intended.
    return tokenizer(examples["text"], truncation=True, padding="max_length")

# Tokenize every split in batches using process().
tokenized_datasets = dataset.map(process, batched=True)
# Display the resulting schema of the training split.
tokenized_datasets["train"].features

In [None]:
from datasets import load_metric
import numpy as np

# Load the F1 and accuracy metrics once so compute_metrics can reuse them.
# NOTE(review): load_metric is deprecated in newer datasets releases in favour
# of the separate `evaluate` library — confirm the installed version still ships it.
f1_metric = load_metric("f1")
accuracy_metric = load_metric( "accuracy")

def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and micro-averaged F1.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is reduced with ``argmax`` over axis 1.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    accuracy = accuracy_metric.compute(
        predictions=predicted_ids, references=references
    )["accuracy"]
    # NOTE(review): for single-label multi-class predictions the micro-averaged
    # F1 coincides with accuracy — confirm that "micro" is the intended average.
    micro_f1 = f1_metric.compute(
        predictions=predicted_ids, references=references, average="micro"
    )["f1"]

    return {"accuracy": accuracy, "f1": micro_f1}

In [None]:
from transformers import AutoModelForSequenceClassification,DataCollatorWithPadding
#from optimum.habana import GaudiTrainer, GaudiTrainingArguments
from huggingface_hub import HfFolder

# Build the label-name <-> class-id lookup tables handed to the model config.
# Class ids are kept as integers (label2id: name -> int, id2label: int -> name),
# which is the form the transformers configuration expects; storing them as
# strings would leave string ids in the saved label2id mapping.
labels = tokenized_datasets["train"].features["label"].names
num_labels = len(labels)
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a sequence-classification head sized to
# the label set, passing the label-name mappings along to its configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
)

In [None]:
# Training configuration. repository_id is used as the output directory.
args = TrainingArguments(
    output_dir=repository_id,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on the same per-epoch cadence.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Publish to the Hugging Face Hub.
    push_to_hub=True,
)

In [None]:
# Wire the model, training configuration, data splits, tokenizer and metric
# function into a Trainer, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)
trainer.train()

In [None]:
# Evaluate the trained model on the held-out test split.
# NOTE(review): no metric_key_prefix is passed, so the returned metric names
# presumably carry the default "eval_" prefix, same as validation — confirm.
trainer.evaluate(tokenized_datasets["test"])

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
trainer.push_to_hub()