Import all required libraries

In [1]:
import time
import pandas as pd
import transformers
from transformers import AutoTokenizer
from utils import read_data_as_sentence,map_labels_in_dataframe,tokenize_and_align_labels

Read the data files and save them as DataFrames.\
\
Use the `read_data_as_sentence` function from `utils.py` to read each file into a DataFrame.

`read_data_as_sentence` needs two inputs:
1. the path of the CoNLL-U file;
2. the path where the DataFrame is saved.

In [2]:
# (CoNLL-U input path, CSV path) pairs, ordered train / dev / test.
# Per the notebook text, the second path is where the DataFrame is saved.
split_paths = [
    ('data/en_ewt-up-train.conllu', 'data/en_ewt-up-train.preprocessed.csv'),
    ('data/en_ewt-up-dev.conllu', 'data/en_ewt-up-dev.preprocessed.csv'),
    ('data/en_ewt-up-test.conllu', 'data/en_ewt-up-test.preprocessed.csv'),
]

# One read_data_as_sentence call per split, in the order listed above; the
# results are bound to the names the later cells use.
train_data, dev_data, test_data = [
    read_data_as_sentence(conllu_path, csv_path)
    for conllu_path, csv_path in split_paths
]

Show the first 20 input forms and their gold label lists in the test data

In [3]:
# Print the first 20 (input form, gold argument list) pairs of the test split.
# Iterating over head(20) is positional, so this neither depends on the index
# labels being 0..N-1 nor fails when the frame holds fewer than 20 rows.
preview = test_data.head(20)
for sentence, gold_labels in zip(preview["input_form"], preview["argument"]):
    print(f"{sentence}\t:\t{gold_labels}")

What if Google Morphed Into GoogleOS ? [SEP] morph	:	['_', '_', 'ARG1', '_', '_', 'ARG2', '_']
What if Google expanded on its search - engine ( and now e-mail ) wares into a full - fledged operating system ? [SEP] expand	:	['_', '_', 'ARG0', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'ARG1', '_', '_', '_', '_', '_', '_', 'ARG4', '_']
( And , by the way , is anybody else just a little nostalgic for the days when that was a good thing ? ) [SEP] way	:	['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']
( And , by the way , is anybody else just a little nostalgic for the days when that was a good thing ? ) [SEP] be	:	['_', '_', '_', '_', '_', 'ARGM-DIS', '_', '_', 'ARG1', '_', '_', '_', '_', 'ARG2', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']
( And , by the way , is anybody else just a little nostalgic for the days when that was a good thing ? ) [SEP] be	:	['_', '_', '_', '_', '_', '_', '_', '_

Head of the test data after preprocessing

In [4]:
# Display the first five rows of the preprocessed test split.
test_data.head(n=5)

Unnamed: 0,input_form,argument
0,What if Google Morphed Into GoogleOS ? [SEP] m...,"[_, _, ARG1, _, _, ARG2, _]"
1,What if Google expanded on its search - engine...,"[_, _, ARG0, _, _, _, _, _, _, _, _, _, _, _, ..."
2,"( And , by the way , is anybody else just a li...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ..."
3,"( And , by the way , is anybody else just a li...","[_, _, _, _, _, ARGM-DIS, _, _, ARG1, _, _, _,..."
4,"( And , by the way , is anybody else just a li...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ..."


## Preprocessing data

In [5]:
# Encode the gold argument tags of the test split as integer ids; the result
# shown below carries them in a `mapped_labels` column next to `argument`.
# NOTE(review): whether map_labels_in_dataframe also modifies `test_data`
# in place is not visible from this notebook; confirm in utils.py.
mapped_df = map_labels_in_dataframe(test_data)
mapped_df.head(n=5)


Unnamed: 0,input_form,argument,mapped_labels
0,What if Google Morphed Into GoogleOS ? [SEP] m...,"[_, _, ARG1, _, _, ARG2, _]","[0, 0, 2, 0, 0, 4, 0]"
1,What if Google expanded on its search - engine...,"[_, _, ARG0, _, _, _, _, _, _, _, _, _, _, _, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ..."
2,"( And , by the way , is anybody else just a li...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"( And , by the way , is anybody else just a li...","[_, _, _, _, _, ARGM-DIS, _, _, ARG1, _, _, _,...","[0, 0, 0, 0, 0, 15, 0, 0, 2, 0, 0, 0, 0, 4, 0,..."
4,"( And , by the way , is anybody else just a li...","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
# Identifier of the pretrained checkpoint; the tokenizer in the next cell is
# loaded from it.
model_checkpoint: str = "distilbert-base-uncased"
# NOTE(review): not referenced by any cell visible in this part of the
# notebook; presumably the training/evaluation batch size. Confirm.
batch_size: int = 16

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Later cells call `word_ids()` on the encoding, which is only provided by
# fast tokenizers, so fail early, and say why, if a slow one was loaded.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast), (
    f"{model_checkpoint!r} did not load a fast tokenizer; word_ids() needs one"
)

In [8]:
# Column holding the model input text: the sentence, then "[SEP]", then the
# predicate (see the printed example below).
data = test_data['input_form']
# Row 1 is the example reused by the tokenizer cells that follow.
print(data[1])

What if Google expanded on its search - engine ( and now e-mail ) wares into a full - fledged operating system ? [SEP] expand


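The sentence already contains the `[SEP]` special token, followed by the predicate. The tokenizer recognizes this in-text `[SEP]` as a special token and converts it to id 102 instead of treating it as an ordinary word. With `add_special_tokens=True` (the default), the tokenizer also adds `[CLS]` (id 101) at the start and a second `[SEP]` (id 102) at the end, as the output below shows.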

In [9]:
# Encode example 1. The displayed ids start with 101 and contain 102 twice:
# once for the "[SEP]" inside the text and once at the very end.
tokenizer(data[1],add_special_tokens=True)

{'input_ids': [101, 2054, 2065, 8224, 4423, 2006, 2049, 3945, 1011, 3194, 1006, 1998, 2085, 1041, 1011, 5653, 1007, 16283, 2015, 2046, 1037, 2440, 1011, 26712, 4082, 2291, 1029, 102, 7818, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Same encoding as the previous cell, kept in a variable for the inspection
# cells below.
tokenized_input = tokenizer(data[1],add_special_tokens=True)
# Turn the ids back into word-piece strings to see how the text was split
# (e.g. "wares" appears as 'ware' + '##s' in the output).
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'what',
 'if',
 'google',
 'expanded',
 'on',
 'its',
 'search',
 '-',
 'engine',
 '(',
 'and',
 'now',
 'e',
 '-',
 'mail',
 ')',
 'ware',
 '##s',
 'into',
 'a',
 'full',
 '-',
 'fledged',
 'operating',
 'system',
 '?',
 '[SEP]',
 'expand',
 '[SEP]']

A new column, `mapped_labels`, was added to the DataFrame above; it maps each argument label to an integer id. 0 stands for '_' (no argument), and the remaining argument labels are numbered in alphabetical order.

In [11]:
# Integer-encoded gold labels, one list per row of mapped_df.
labels = mapped_df['mapped_labels']

In [12]:
# Compare the number of gold labels with the number of token ids for example 1.
# They differ (23 vs 30 in the saved output): the encoding also covers the
# "[SEP] predicate" suffix and the added [CLS]/[SEP], and some words are split
# into several tokens, so the labels must be aligned to tokens before training.
len(labels[1]), len(tokenized_input["input_ids"])

(23, 30)

In [13]:
# word_ids() gives, for every token, the index of the word it came from
# (None for the [CLS]/[SEP] added around the sequence).
# NOTE(review): in the saved output the ids run 0..26, while this example has
# only 23 gold labels: the in-text "[SEP]" and the predicate get ids of their
# own, and "e-mail" is spread over three ids (12, 13, 14). A word id therefore
# cannot be used directly as an index into labels[1]; confirm how
# tokenize_and_align_labels handles this.
print(tokenized_input.word_ids())

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, None]


In [14]:
# Same word-id listing as the previous cell, shown next to the full encoding
# so ids and word indices can be compared position by position.
print(tokenized_input.word_ids())
# Bare expression: displays the encoding (input_ids and attention_mask).
tokenized_input

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, None]


{'input_ids': [101, 2054, 2065, 8224, 4423, 2006, 2049, 3945, 1011, 3194, 1006, 1998, 2085, 1041, 1011, 5653, 1007, 16283, 2015, 2046, 1037, 2440, 1011, 26712, 4082, 2291, 1029, 102, 7818, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}