In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

from code_.process_conll import process_file, extract_features

from code_.bert import Tokenizer, convert_to_dataset, compute_metrics, get_labels_list_from_dataset, task, batch_size, model_checkpoint
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric

  metric = load_metric("seqeval")
  torch.utils._pytree._register_pytree_node(


In [2]:
# Parse the three CoNLL-U splits into data frames with the project's
# `process_file` helper. The files are read in dev, train, test order.
conllu_paths = (
    'data/raw/en_ewt-up-dev.conllu',
    'data/raw/en_ewt-up-train.conllu',
    'data/raw/en_ewt-up-test.conllu',
)
df_val, df_train, df_test = map(process_file, conllu_paths)

process_file(): dataframe len: 4979
process_file(): dataframe len: 40498
process_file(): dataframe len: 4802


In [3]:
# df_val = extract_features(df_val)
# df_train = extract_features(df_train)
# df_test = extract_features(df_test)

In [4]:
# Bundle the train / validation / test frames into a single dataset object.
# Its tokenized counterpart is indexed by split name ("train", "validation")
# when the Trainer is built further down.
dataset = convert_to_dataset(df_train, df_val, df_test)

In [5]:
# Collect the label inventory found in the dataset. The position of a label in
# this list is used as its class id (predictions are decoded with
# `labels_list[id]` in the evaluation cell).
# NOTE(review): the printed order looks arbitrary; if the helper builds it from
# a set, ids may differ between kernel sessions — confirm the order is stable
# before reusing a saved checkpoint.
labels_list = get_labels_list_from_dataset(dataset)
print(labels_list)

['', 'C-ARGM-EXT', 'C-ARGM-PRR', 'R-ARG4', 'C-ARG4', 'C-ARGM-COM', 'C-ARGM-PRP', 'ARGM-REC', 'ARGM-NEG', 'R-ARGM-ADV', 'C-ARGM-LOC', 'R-ARGM-LOC', 'ARG1', 'ARGM-LVB', 'R-ARGM-CAU', 'R-ARGM-MNR', 'R-ARGM-DIR', 'R-ARGM-GOL', 'C-ARGM-GOL', 'R-ARGM-ADJ', 'ARG3', 'R-ARG2', 'R-ARG0', 'C-ARGM-MNR', 'ARGM-PRD', 'ARGM-MNR', 'R-ARG1', 'ARGM-PRP', 'R-ARGM-TMP', 'V', 'ARGM-ADV', 'ARGM-EXT', 'ARGA', 'ARG1-DSP', 'C-ARGM-DIR', 'ARGM-COM', 'C-ARG3', 'ARGM-DIS', 'C-ARGM-CXN', 'C-V', 'ARGM-GOL', 'C-ARG1', 'C-ARGM-ADV', 'ARGM-TMP', 'ARG2', 'ARGM-PRR', 'ARGM-DIR', 'ARGM-CXN', 'ARG4', 'C-ARG2', 'R-ARG3', 'ARG5', 'R-ARGM-COM', '_', 'C-ARG1-DSP', 'ARGM-CAU', 'C-ARG0', 'ARGM-LOC', 'ARGM-MOD', 'C-ARGM-TMP', 'ARG0', 'ARGM-ADJ']


In [6]:
# Project wrapper built from the checkpoint name and the label list. Later
# cells use `tok.tokenizer` (for the collator and Trainer) and
# `tok.tokenize_and_align_labels` (for dataset mapping).
tok = Tokenizer(model_checkpoint, labels_list)

In [8]:
# Run tokenization + label alignment over the dataset; `batched=True` hands the
# mapping function batches of examples rather than one example at a time.
tokenized_datasets = dataset.map(tok.tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/40498 [00:00<?, ? examples/s]

Map:   0%|          | 0/4979 [00:00<?, ? examples/s]

Map:   0%|          | 0/4802 [00:00<?, ? examples/s]

In [9]:
# initialise model

In [10]:
# Load the pretrained encoder with a token-classification head sized to the
# label set. A label's position in `labels_list` is its class id (the
# evaluation cell decodes predictions via `labels_list[id]`), so record that
# mapping in the model config as well: a saved checkpoint then carries the real
# label names instead of generic LABEL_<n> placeholders.
id2label = dict(enumerate(labels_list))
label2id = {label: idx for idx, label in id2label.items()}
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Name the run after the last path component of the checkpoint id plus the
# task, and use that as the Trainer's output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
output_dir = f"{model_name}-finetuned-{task}"

# Hyperparameters: evaluate once per epoch, same batch size for train and eval,
# nothing is pushed to the Hub.
args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
)

In [12]:
# Collator built from the wrapped tokenizer; it is passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tok.tokenizer)
# seqeval metric object, used by the final cell's `metric.compute(...)` call.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed
# version before upgrading.
metric = load_metric("seqeval")

In [13]:
# Assemble the Trainer: fine-tune on the "train" split, evaluate on the
# "validation" split, batch with the token-classification collator and score
# with the project's `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tok.tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# train

In [16]:
# Start fine-tuning; the schedule (epochs, batch size, learning rate, per-epoch
# evaluation) comes from the `args` object given to the Trainer.
trainer.train()

  0%|          | 0/7596 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Persist the fine-tuned model.
# NOTE(review): `Trainer.save_model` takes an output *directory*, so this is
# expected to create a folder literally named `some_model_name.pth` rather than
# a single .pth file; the name also looks like a placeholder — confirm the
# intended location.
trainer.save_model('model_checkpoints/some_model_name.pth')

In [None]:
# results, plots, reports etc.

In [None]:
# Evaluate with the Trainer's configured eval dataset (the "validation" split
# passed at construction) and display the resulting metrics.
trainer.evaluate()

In [None]:
# Score the validation split with seqeval.
# The prediction output is kept in a named variable and read by field instead
# of unpacking into `_`: assigning `_` at notebook top level shadows IPython's
# "last output" variable for the rest of the session.
pred_output = trainer.predict(tokenized_datasets["validation"])
logits = pred_output.predictions
labels = pred_output.label_ids

# Most likely class id per token (argmax over the last, per-class axis).
predictions = np.argmax(logits, axis=2)

# Positions labelled -100 are skipped: they are excluded from both the
# predicted and the reference sequences. Class ids are decoded to label strings
# by indexing `labels_list`.
IGNORED_LABEL_ID = -100
true_predictions = [
    [
        labels_list[pred_id]
        for pred_id, label_id in zip(pred_row, label_row)
        if label_id != IGNORED_LABEL_ID
    ]
    for pred_row, label_row in zip(predictions, labels)
]
true_labels = [
    [labels_list[label_id] for label_id in label_row if label_id != IGNORED_LABEL_ID]
    for label_row in labels
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results