Run these commands if you are working on Google Colab.

In [None]:
# Colab-only setup: uncomment the lines below to install the dependencies.
# NOTE(review): `accelerate` is pinned to 0.21.0 and then upgraded again by the
# final `-U` line, which overrides the pin — confirm which version is intended.
# !pip install datasets
# !pip install seqeval
# !pip install accelerate==0.21.0
# !pip install transformers[torch]
# !pip install accelerate -U

## Import all required libraries

In [1]:
import time
import pandas as pd
import transformers
import numpy as np
from transformers import AutoTokenizer,AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_metric
from datasets import Dataset
from utils import read_data_as_sentence,map_labels_in_dataframe,tokenize_and_align_labels,get_label_mapping,get_labels_from_map,load_srl_model,load_dataset,compute_metrics,write_predictions_to_csv,compute_evaluation_metrics_from_csv

## Preprocessing data

Read the data files and save them as DataFrames.

Use the `read_data_as_sentence` function from `utils.py` to read each file into a DataFrame.

`read_data_as_sentence` needs two inputs:
1. The path of the CoNLL-U file.
2. The path where the DataFrame is saved.

In [2]:
# Read the train, dev and test splits (in that order); each call gets the
# CoNLL-U input path and the CSV path for the preprocessed frame.
train_data, dev_data, test_data = (
    read_data_as_sentence(
        f'data/en_ewt-up-{split}.conllu',
        f'data/en_ewt-up-{split}.preprocessed.csv',
    )
    for split in ('train', 'dev', 'test')
)

Show the first 20 input forms and their gold argument lists in the test data.

In [3]:
# Print the first 20 (input tokens, gold arguments) pairs of the test split.
# `head(20)` selects rows by position, so this works for any index and does
# not fail when the frame holds fewer than 20 rows.
for form, gold in zip(test_data.input_form.head(20), test_data.argument.head(20)):
    print(f"{form}\t:\t{gold}")

['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?', '[SEP]', 'Morphed']	:	['_', '_', 'ARG1', '_', '_', 'ARG2', '_', None, None]
['What', 'if', 'Google', 'expanded', 'on', 'its', 'search', '-', 'engine', '(', 'and', 'now', 'e-mail', ')', 'wares', 'into', 'a', 'full', '-', 'fledged', 'operating', 'system', '?', '[SEP]', 'expanded']	:	['_', '_', 'ARG0', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'ARG1', '_', '_', '_', '_', '_', '_', 'ARG4', '_', None, None]
['(', 'And', ',', 'by', 'the', 'way', ',', 'is', 'anybody', 'else', 'just', 'a', 'little', 'nostalgic', 'for', 'the', 'days', 'when', 'that', 'was', 'a', 'good', 'thing', '?', ')', '[SEP]', 'way']	:	['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', None, None]
['(', 'And', ',', 'by', 'the', 'way', ',', 'is', 'anybody', 'else', 'just', 'a', 'little', 'nostalgic', 'for', 'the', 'days', 'when', 'that', 'was', 'a', 'good', 'thing', '?', ')', '[

Head of the test data after preprocessing

In [4]:
# Preview the first rows of the preprocessed test DataFrame.
test_data.head()

Unnamed: 0,input_form,argument
0,"[What, if, Google, Morphed, Into, GoogleOS, ?,...","[_, _, ARG1, _, _, ARG2, _, None, None]"
1,"[What, if, Google, expanded, on, its, search, ...","[_, _, ARG0, _, _, _, _, _, _, _, _, _, _, _, ..."
2,"[(, And, ,, by, the, way, ,, is, anybody, else...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ..."
3,"[(, And, ,, by, the, way, ,, is, anybody, else...","[_, _, _, _, _, ARGM-DIS, _, _, ARG1, _, _, _,..."
4,"[(, And, ,, by, the, way, ,, is, anybody, else...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ..."


In [5]:
# Summarise the columns, non-null counts and dtypes of the test DataFrame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4799 entries, 0 to 4798
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   input_form  4799 non-null   object
 1   argument    4799 non-null   object
dtypes: object(2)
memory usage: 75.1+ KB


## Importing the model and tokenizer

In [6]:
# Name of the pretrained checkpoint; it is passed to both the tokenizer and the model loader below.
model_checkpoint = "distilbert-base-uncased"

In [7]:
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Fail early, with a readable message, if a non-fast tokenizer comes back.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast), (
    f"Expected a fast tokenizer for {model_checkpoint!r}, "
    f"got {type(tokenizer).__name__}"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Checking the sentence representation

In [8]:
# Take the test sentence stored under row label 1 as the running example.
example = test_data.input_form[1]
print(example)

['What', 'if', 'Google', 'expanded', 'on', 'its', 'search', '-', 'engine', '(', 'and', 'now', 'e-mail', ')', 'wares', 'into', 'a', 'full', '-', 'fledged', 'operating', 'system', '?', '[SEP]', 'expanded']


The sentence contains the `[SEP]` special token followed by the predicate. Therefore, the parameter `add_special_tokens` is set to True so that `[SEP]` is converted to the id 102 and is not treated as another word. \
In addition, the sentence is already split into tokens, so the parameter `is_split_into_words` is also set to True.

In [9]:
# Encode the pre-split example; the resulting encoding is shown as the cell output.
tokenizer(example,add_special_tokens=True,is_split_into_words=True)

{'input_ids': [101, 2054, 2065, 8224, 4423, 2006, 2049, 3945, 1011, 3194, 1006, 1998, 2085, 1041, 1011, 5653, 1007, 16283, 2015, 2046, 1037, 2440, 1011, 26712, 4082, 2291, 1029, 102, 4423, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Encode the example again and map the ids back to tokens to see how the
# words were split.
tokenized_input = tokenizer(
    example,
    add_special_tokens=True,
    is_split_into_words=True,
)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'what', 'if', 'google', 'expanded', 'on', 'its', 'search', '-', 'engine', '(', 'and', 'now', 'e', '-', 'mail', ')', 'ware', '##s', 'into', 'a', 'full', '-', 'fledged', 'operating', 'system', '?', '[SEP]', 'expanded', '[SEP]']


## Tokenizing and preparing input for the model

Getting the mapping of all possible arguments across all datasets to a numerical value with the `get_label_mapping` function.\
None value stays as None to be mapped to the special token in the model.

In [11]:
# Build the argument-label -> id mapping from all three splits
# (note the argument order here: train, test, dev).
label_map = get_label_mapping(train_data, test_data, dev_data)

In [12]:
# Inspect the mapping; in the recorded output '_' maps to 0 and None maps to None.
print(label_map)

{'_': 0, 'ARG0': 1, 'ARG1': 2, 'ARG1-DSP': 3, 'ARG2': 4, 'ARG3': 5, 'ARG4': 6, 'ARG5': 7, 'ARGA': 8, 'ARGM-ADJ': 9, 'ARGM-ADV': 10, 'ARGM-CAU': 11, 'ARGM-COM': 12, 'ARGM-CXN': 13, 'ARGM-DIR': 14, 'ARGM-DIS': 15, 'ARGM-EXT': 16, 'ARGM-GOL': 17, 'ARGM-LOC': 18, 'ARGM-LVB': 19, 'ARGM-MNR': 20, 'ARGM-MOD': 21, 'ARGM-NEG': 22, 'ARGM-PRD': 23, 'ARGM-PRP': 24, 'ARGM-PRR': 25, 'ARGM-REC': 26, 'ARGM-TMP': 27, 'C-ARG0': 28, 'C-ARG1': 29, 'C-ARG1-DSP': 30, 'C-ARG2': 31, 'C-ARG3': 32, 'C-ARG4': 33, 'C-ARGM-ADV': 34, 'C-ARGM-COM': 35, 'C-ARGM-CXN': 36, 'C-ARGM-DIR': 37, 'C-ARGM-EXT': 38, 'C-ARGM-GOL': 39, 'C-ARGM-LOC': 40, 'C-ARGM-MNR': 41, 'C-ARGM-PRP': 42, 'C-ARGM-PRR': 43, 'C-ARGM-TMP': 44, 'R-ARG0': 45, 'R-ARG1': 46, 'R-ARG2': 47, 'R-ARG3': 48, 'R-ARG4': 49, 'R-ARGM-ADJ': 50, 'R-ARGM-ADV': 51, 'R-ARGM-CAU': 52, 'R-ARGM-COM': 53, 'R-ARGM-DIR': 54, 'R-ARGM-GOL': 55, 'R-ARGM-LOC': 56, 'R-ARGM-MNR': 57, 'R-ARGM-TMP': 58, None: None}


Converting the labels in the df to numerical values for the language model with `map_labels_in_dataframe` function. The label_map dictionary from the function above is needed to map the arguments to their value.\
Add a new column to the df matching the arguments to label numbers. 0 stands for '_' (no argument) and the rest of the arguments are alphabetically ordered. \
*None* label will be mapped to the *[SEP]* token.


In [13]:
# Add the numeric label column to every split (train, dev, test, in that
# order), rebinding each name to the frame returned by the helper.
train_data, dev_data, test_data = (
    map_labels_in_dataframe(frame, label_map)
    for frame in (train_data, dev_data, test_data)
)

Checking the head to confirm the labels were correctly converted:

In [14]:
# Preview the test DataFrame, which now also carries the `mapped_labels` column.
test_data.head()

Unnamed: 0,input_form,argument,mapped_labels
0,"[What, if, Google, Morphed, Into, GoogleOS, ?,...","[_, _, ARG1, _, _, ARG2, _, None, None]","[0, 0, 2, 0, 0, 4, 0, None, None]"
1,"[What, if, Google, expanded, on, its, search, ...","[_, _, ARG0, _, _, _, _, _, _, _, _, _, _, _, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ..."
2,"[(, And, ,, by, the, way, ,, is, anybody, else...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[(, And, ,, by, the, way, ,, is, anybody, else...","[_, _, _, _, _, ARGM-DIS, _, _, ARG1, _, _, _,...","[0, 0, 0, 0, 0, 15, 0, 0, 2, 0, 0, 0, 0, 4, 0,..."
4,"[(, And, ,, by, the, way, ,, is, anybody, else...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Use the `tokenize_and_align_labels` function to tokenize the train, test, and dev DataFrames. Padding is applied to make sure all inputs have the same length for the model.

In [15]:
# Tokenize every split and align the labels with the resulting tokens.
# The original processing order (test, train, dev) is kept.
tokenized_test, tokenized_train, tokenized_dev = (
    tokenize_and_align_labels(tokenizer, frame, label_all_tokens=True)
    for frame in (test_data, train_data, dev_data)
)

The input for the model has the corresponding special token [CLS] followed by the tokenized sentence, the special token [SEP], the predicate and the final [SEP] token. \
The numerical labels to be fed to the model correspond to the tokenized sentence.\
The input is padded so that every vector is of the same length, including the labels and the attention mask.

In [16]:
# Show the first encoded test example, one item per line: decoded tokens,
# attention mask, input ids and labels.
print(
    tokenizer.convert_ids_to_tokens(tokenized_test["input_ids"][0]),
    tokenized_test["attention_mask"][0],
    tokenized_test["input_ids"][0],
    tokenized_test["labels"][0],
    sep="\n",
)

['[CLS]', 'what', 'if', 'google', 'mor', '##ph', '##ed', 'into', 'google', '##os', '?', '[SEP]', 'mor', '##ph', '##ed', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Confirming all the tokens contain a label and the attention mask also matches the length of the input

In [17]:
# The three lengths printed here should be equal for the first example.
print(*(len(tokenized_test[key][0]) for key in ("input_ids", "labels", "attention_mask")))

97 97 97


Converting the tokenized data to datasets format with the function `load_dataset`

In [18]:
# Wrap each tokenized split with `load_dataset` (train, dev, test, in that order).
dataset_train, dataset_dev, dataset_test = map(
    load_dataset,
    (tokenized_train, tokenized_dev, tokenized_test),
)

## Fine-tuning the model

Optionally reduce the size of the datasets for a quick test run (the full datasets are used for training below).

In [None]:
# Shuffled subsets for a quick smoke test of the pipeline.
SUBSET_SIZE = 1000  # maximum number of examples kept per split
SUBSET_SEED = 42    # fixed shuffle seed so the subsets are reproducible

# The size is clamped to the split length so `select` never asks for rows
# that do not exist.
# NOTE(review): assumes the objects returned by `load_dataset` support `len()` — confirm.
small_train_dataset = dataset_train.shuffle(seed=SUBSET_SEED).select(
    range(min(SUBSET_SIZE, len(dataset_train)))
)
small_eval_dataset = dataset_dev.shuffle(seed=SUBSET_SEED).select(
    range(min(SUBSET_SIZE, len(dataset_dev)))
)
small_test_dataset = dataset_test.shuffle(seed=SUBSET_SEED).select(
    range(min(SUBSET_SIZE, len(dataset_test)))
)

Getting the labels that will be predicted by the model with the `get_labels_from_map` function

In [19]:
# Derive the list of labels the model will predict from the label mapping.
label_list = get_labels_from_map(label_map)

Loading the model for semantic role labelling task with function `load_srl_model` to get the model, its name and the arguments necessary for training. \
The model selected is **distilbert-base-uncased**

In [20]:
# Load the token-classification model for the checkpoint; the helper returns the
# model, its name and the training arguments that are handed to the Trainer below.
model, model_name, args = load_srl_model(model_checkpoint, label_list)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Show the model name returned by `load_srl_model`.
print(model_name)

distilbert-base-uncased


Load the seqeval metric to compute the metrics from the predictions

In [22]:
# Load the seqeval metric. The loader warns that `trust_remote_code=True` will
# become mandatory, so the flag is passed explicitly to opt in to running the
# metric script.
metric = load_metric("seqeval", trust_remote_code=True)

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


metric.compute(predictions=[label_list], references=[label_list])

Passing the arguments along with the datasets to the `Trainer` class to fine-tune the model for semantic role labelling with `trainer.train()`

In [28]:
# Fine-tune on the full training split and evaluate on the dev split.
trainer = Trainer(
    model,
    args,
    train_dataset=dataset_train,
    eval_dataset=dataset_dev,
    tokenizer=tokenizer,
    # Read the prediction and label arrays by name instead of unpacking the
    # evaluation object, so the call does not depend on how many items it yields.
    compute_metrics=lambda p: compute_metrics(p.predictions, p.label_ids, label_list, metric),
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0347,0.15128,0.722451,0.753376,0.737589,0.965937
2,0.0327,0.150485,0.744207,0.741104,0.742652,0.967305
3,0.0406,0.148667,0.74503,0.754786,0.749876,0.967856


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=7593, training_loss=0.0330926868012186, metrics={'train_runtime': 1973.5235, 'train_samples_per_second': 61.538, 'train_steps_per_second': 3.847, 'total_flos': 5615118349103484.0, 'train_loss': 0.0330926868012186, 'epoch': 3.0})

Evaluate a model fine-tuned for semantic role labelling with `trainer.evaluate()`

In [29]:
# Evaluate the fine-tuned model; with no arguments this uses the evaluation
# dataset given to the Trainer (`dataset_dev`).
trainer.evaluate()

{'eval_loss': 0.1486673206090927,
 'eval_precision': 0.74502984988118,
 'eval_recall': 0.7547856723429243,
 'eval_f1': 0.7498760318525216,
 'eval_accuracy': 0.9678560363523702,
 'eval_runtime': 17.3463,
 'eval_samples_per_second': 286.921,
 'eval_steps_per_second': 17.987,
 'epoch': 3.0}

After training is finished, the precision/recall/f1 for each category can be computed. \
The same function `compute_metrics` is applied on the result of the predict method.

In [30]:
# Predict on the test split and score the predictions with the same helper
# that is used during training.
# The result fields are read by name rather than unpacked into `_`, because
# assigning to `_` shadows IPython's "last output" variable for the session.
test_output = trainer.predict(dataset_test)
predictions, labels = test_output.predictions, test_output.label_ids
results = compute_metrics(predictions, labels, label_list, metric)
results



{'precision': 0.7505595476498998,
 'recall': 0.7698767520541324,
 'f1': 0.7600954369221593,
 'accuracy': 0.968718998292951}

Writing the predictions together with the gold labels to a csv file with the function `write_predictions_to_csv` so that the metrics per class can be computed with the `compute_evaluation_metrics_from_csv` function.

In [31]:
# Write gold labels and predictions to a CSV file, then compute the per-class
# metrics from that same file.
results_file = "predictions.csv"
write_predictions_to_csv(predictions, labels, label_list, results_file)
# Reuse `results_file` so the path written and the path read cannot drift apart.
f1, classification_report = compute_evaluation_metrics_from_csv(results_file)
print(classification_report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         ADJ       0.73      0.71      0.72       225
         ADV       0.63      0.59      0.61       492
        ARG0       0.88      0.81      0.84        70
        ARG1       0.67      0.75      0.71       104
    ARG1-DSP       0.00      0.00      0.00         1
        ARG2       0.25      0.25      0.25         8
        ARG3       0.00      0.00      0.00         2
    ARGM-ADJ       0.00      0.00      0.00         1
    ARGM-ADV       0.00      0.00      0.00         1
    ARGM-CXN       0.62      1.00      0.77         5
    ARGM-DIR       0.00      0.00      0.00         1
    ARGM-LOC       0.50      0.50      0.50        10
    ARGM-MNR       0.00      0.00      0.00         8
    ARGM-TMP       0.00      0.00      0.00         2
         CAU       0.57      0.63      0.60        46
         COM       0.50      0.54      0.52        13
         CXN       0.69      0.92      0.79        12
         DIR       0.62    

Then we save the fine-tuned model.

In [32]:
# Use these codes to save model:
tokenizer.save_pretrained("tokenizer.save_pretrained.distillbert-base-uncased-finetuned-srl")
trainer.save_model("trainer.save_model.distillbert-base-uncased-finetuned-srl")
model.save_pretrained("model.save_pretrained.distillbert-base-uncased-finetuned-srl")


Here we copy the saved model to Google Drive.

In [33]:
!cp -r '/content/trainer.save_model.distillbert-base-uncased-finetuned-srl' '/content/drive/MyDrive/NLP_3_baseline_model/model'
!cp -r '/content/model.save_pretrained.distillbert-base-uncased-finetuned-srl' '/content/drive/MyDrive/NLP_3_baseline_model/model'
!cp -r '/content/tokenizer.save_pretrained.distillbert-base-uncased-finetuned-srl' '/content/drive/MyDrive/NLP_3_baseline_model/model'