In [1]:
# Uncomment to install the project dependencies into the kernel's environment:
# %pip install -r requirements.txt

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from code_.process_conll import process_file, advanced_process_file
from code_.evaluation import class_report_base, class_report_advanced, shrink_predictions
from code_.bert import Tokenizer, convert_to_dataset, compute_metrics, get_labels_list_from_dataset, task, batch_size, model_checkpoint
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric

In [2]:
# Parse the dev, train and test CoNLL-U files (in that order) with the
# baseline reader; the dev file is bound to df_val.
df_val, df_train, df_test = [
    process_file(conllu_path)
    for conllu_path in (
        'data/raw/en_ewt-up-dev.conllu',
        'data/raw/en_ewt-up-train.conllu',
        'data/raw/en_ewt-up-test.conllu',
    )
]

process_file(): dataframe len: 4979
process_file(): dataframe len: 40498
process_file(): dataframe len: 4802


In [3]:
# Combine the three frames (train, validation, test order) into one dataset
# object, then collect the label inventory from it. Later cells index into
# labels_list by integer id, so its order matters.
dataset = convert_to_dataset(df_train, df_val, df_test)
labels_list = get_labels_list_from_dataset(dataset)
print(labels_list)

['ARG0', 'ARG1', 'ARG1-DSP', 'ARG2', 'ARG3', 'ARG4', 'ARG5', 'ARGA', 'ARGM-ADJ', 'ARGM-ADV', 'ARGM-CAU', 'ARGM-COM', 'ARGM-CXN', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-EXT', 'ARGM-GOL', 'ARGM-LOC', 'ARGM-LVB', 'ARGM-MNR', 'ARGM-MOD', 'ARGM-NEG', 'ARGM-PRD', 'ARGM-PRP', 'ARGM-PRR', 'ARGM-REC', 'ARGM-TMP', 'C-ARG0', 'C-ARG1', 'C-ARG1-DSP', 'C-ARG2', 'C-ARG3', 'C-ARG4', 'C-ARGM-ADV', 'C-ARGM-COM', 'C-ARGM-CXN', 'C-ARGM-DIR', 'C-ARGM-EXT', 'C-ARGM-GOL', 'C-ARGM-LOC', 'C-ARGM-MNR', 'C-ARGM-PRP', 'C-ARGM-PRR', 'C-ARGM-TMP', 'R-ARG0', 'R-ARG1', 'R-ARG2', 'R-ARG3', 'R-ARG4', 'R-ARGM-ADJ', 'R-ARGM-ADV', 'R-ARGM-CAU', 'R-ARGM-COM', 'R-ARGM-DIR', 'R-ARGM-GOL', 'R-ARGM-LOC', 'R-ARGM-MNR', 'R-ARGM-TMP', '_']


In [4]:
# Project tokenizer wrapper, built from the checkpoint name and the label inventory.
tok = Tokenizer(model_checkpoint, labels_list)
# Run the wrapper's tokenise-and-align function over the dataset in batches.
tokenized_datasets = dataset.map(tok.tokenize_and_align_labels_pred, batched=True)

Map:   0%|          | 0/40498 [00:00<?, ? examples/s]

Map:   0%|          | 0/4979 [00:00<?, ? examples/s]

Map:   0%|          | 0/4802 [00:00<?, ? examples/s]

In [5]:
# Load the checkpoint as a token-classification model with one output per
# entry in labels_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(labels_list))

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Short checkpoint name (text after the last "/"); not referenced again in
# the cells visible here.
model_name = model_checkpoint.split("/")[-1]

# Baseline run configuration: evaluate once per epoch, 5 epochs, checkpoints
# and the final model go under model_checkpoints/baseline.
args = TrainingArguments(
    output_dir="model_checkpoints/baseline",
    evaluation_strategy='epoch',
    save_steps=1000,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
)

# Collator built from the wrapper's underlying tokenizer.
data_collator = DataCollatorForTokenClassification(tok.tokenizer)
# seqeval metric object; the Trainer below receives compute_metrics instead,
# so this handle is not used by the Trainer directly.
metric = load_metric("seqeval")

# Train on the "train" split and evaluate on the "validation" split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tok.tokenizer,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
# Fine-tune the model, then save it through the trainer. No path is passed,
# so the trainer chooses the destination itself.
trainer.train()
trainer.save_model()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3468,0.156694,0.647999,0.621218,0.634326,0.958976
2,0.1321,0.129621,0.679661,0.707094,0.693106,0.964137
3,0.1075,0.119227,0.703613,0.722093,0.712734,0.966666
4,0.0845,0.115446,0.712812,0.729561,0.721089,0.967707
5,0.0785,0.113801,0.718075,0.731943,0.724943,0.968094


In [8]:
# Score the validation split. To score the test split instead, pass
# tokenized_datasets["test"] here and switch the split used when the results
# frame is built.
predictions_raw, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions_raw, axis=2)

# Convert ids to label strings sentence by sentence. A position is kept only
# when its gold id is not the -100 sentinel and its predicted id is a valid
# index into labels_list, so both output lists stay aligned.
label_count = len(labels_list)
true_predictions = []
true_labels = []
for predicted_ids, gold_ids in zip(predictions, labels):
    kept = [
        (p, l)
        for p, l in zip(predicted_ids, gold_ids)
        if l != -100 and p < label_count
    ]
    true_predictions.append([labels_list[p] for p, _l in kept])
    true_labels.append([labels_list[l] for _p, l in kept])

In [9]:
# Word ids for every validation sentence, obtained by re-tokenising the
# pre-split words with the wrapper's underlying tokenizer.
# NOTE(review): this assumes plain re-tokenisation yields the same token
# sequence as tok.tokenize_and_align_labels_pred used for the model inputs;
# confirm against code_.bert.
val_word_ids = [
    tok.tokenizer(sentence, truncation=True, is_split_into_words=True).word_ids()
    for sentence in dataset['validation']['sentence']
]

# Collect all rows first and build the frame once. Appending with
# df.loc[len(df.index)] re-allocates the frame on every row.
rows = [
    {
        'sentence': tok.tokenizer.decode(tokens),
        'prediction': prediction,
        'gold': gold,
        'word_ids': word_ids,
    }
    for tokens, prediction, gold, word_ids in zip(
        tokenized_datasets['validation']['input_ids'],
        true_predictions,
        true_labels,
        val_word_ids,
    )
]
# Explicit columns keep the schema stable even when there are no rows.
df = pd.DataFrame(rows, columns=['sentence', 'prediction', 'gold', 'word_ids'])

In [10]:
# Collapse the gold and predicted label sequences with shrink_predictions,
# one sentence at a time, then store both results next to the originals.
gold_restored = []
pred_restored = []
# Columns are read by name: integer keys such as row[0] on a string-labelled
# Series are deprecated positional lookups in pandas.
for prediction, gold, word_ids in zip(df['prediction'], df['gold'], df['word_ids']):
    # Drop the first and last word ids (presumably the special tokens at the
    # sequence boundaries; confirm against the tokenizer).
    inner_word_ids = word_ids[1:-1]
    gold_restored.append(shrink_predictions(inner_word_ids, gold))
    pred_restored.append(shrink_predictions(inner_word_ids, prediction))

df['gold_restored'] = gold_restored
df['pred_restored'] = pred_restored
df.to_csv('data/output/base_val.csv')

In [11]:
# Classification report computed from the CSV written by the previous cell.
# class_report_base is already imported in the notebook's import cell, so it
# is not re-imported here.
class_report_base('data/output/base_val.csv')

              precision    recall  f1-score   support

         'V'       0.00      0.00      0.00         0
         '_'       0.99      0.99      0.99     95623
       'C-V'       0.00      0.00      0.00         0
      'ARGA'       0.00      0.00      0.00         0
      'ARG3'       0.00      0.00      0.00        82
      'ARG2'       0.71      0.73      0.72      1298
      'ARG5'       0.00      0.00      0.00         1
      'ARG0'       0.81      0.85      0.83      1389
      'ARG4'       0.51      0.33      0.40        54
      'ARG1'       0.79      0.86      0.82      3218
    'C-ARG4'       0.00      0.00      0.00         0
    'C-ARG2'       0.00      0.00      0.00         7
    'R-ARG2'       0.00      0.00      0.00         4
    'C-ARG0'       0.00      0.00      0.00         5
    'R-ARG0'       0.79      0.85      0.82        54
    'C-ARG1'       0.50      0.33      0.40        55
    'C-ARG3'       0.00      0.00      0.00         7
    'R-ARG3'       0.00    

In [2]:
# --- Data: parse dev, train and test (in that order) with the advanced reader.
df_val, df_train, df_test = [
    advanced_process_file(conllu_path)
    for conllu_path in (
        'data/raw/en_ewt-up-dev.conllu',
        'data/raw/en_ewt-up-train.conllu',
        'data/raw/en_ewt-up-test.conllu',
    )
]

dataset = convert_to_dataset(df_train, df_val, df_test)
labels_list = get_labels_list_from_dataset(dataset)

# --- Tokenisation: this variant uses the wrapper's "context" alignment function.
tok = Tokenizer(model_checkpoint, labels_list)
tokenized_datasets = dataset.map(tok.tokenize_and_align_labels_context, batched=True)

# --- Model: fresh token-classification model with one output per label.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
)

# Short checkpoint name (text after the last "/"); not referenced again in
# the cells visible here.
model_name = model_checkpoint.split("/")[-1]

# --- Training setup: evaluate once per epoch, 5 epochs, output under
# model_checkpoints/context.
args = TrainingArguments(
    output_dir="model_checkpoints/context",
    evaluation_strategy='epoch',
    save_steps=7000,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
)
data_collator = DataCollatorForTokenClassification(tok.tokenizer)
# seqeval metric object; the evaluation cell calls metric.compute on it.
metric = load_metric("seqeval")
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tok.tokenizer,
    compute_metrics=compute_metrics,
)

# --- Run: fine-tune, then save through the trainer (no explicit path given).
trainer.train()
trainer.save_model()

advanced_process_file(): dataframe len: 4979
advanced_process_file(): dataframe len: 40498
advanced_process_file(): dataframe len: 4802


Map:   0%|          | 0/40498 [00:00<?, ? examples/s]

Map:   0%|          | 0/4979 [00:00<?, ? examples/s]

Map:   0%|          | 0/4802 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3265,0.138324,0.663766,0.643299,0.653372,0.964538
2,0.1194,0.111597,0.705745,0.71733,0.71149,0.969759
3,0.095,0.102366,0.723934,0.741084,0.732409,0.972047
4,0.0731,0.098756,0.735281,0.746878,0.741034,0.973093
5,0.0676,0.098428,0.733363,0.749839,0.741509,0.973045


In [3]:
# Predict on the validation split and take the highest-scoring label id per token.
predictions_raw, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions_raw, axis=2)

# Convert ids to label strings, skipping positions whose gold id is the -100
# sentinel so predictions and gold labels stay aligned.
list_predictions = [
    [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [labels_list[l] for l in label if l != -100]
    for label in labels
]

# Metric scores on the filtered sequences; stored in `results` but not
# displayed by this cell.
results = metric.compute(predictions=list_predictions, references=true_labels)

# Word ids for every validation sentence, obtained by re-tokenising the
# pre-split words with the wrapper's underlying tokenizer.
# NOTE(review): this assumes plain re-tokenisation yields the same token
# sequence as tok.tokenize_and_align_labels_context used for the model
# inputs; confirm against code_.bert.
val_word_ids = [
    tok.tokenizer(sentence, truncation=True, is_split_into_words=True).word_ids()
    for sentence in dataset['validation']['sentence']
]

# Collect all rows first and build the frame once. Appending with
# df.loc[len(df.index)] re-allocates the frame on every row.
rows = [
    {
        'sentence': tok.tokenizer.decode(tokens),
        'prediction': prediction,
        'gold': gold,
        'word_ids': word_ids,
    }
    for tokens, prediction, gold, word_ids in zip(
        tokenized_datasets['validation']['input_ids'],
        list_predictions,
        true_labels,
        val_word_ids,
    )
]
# Explicit columns keep the schema stable even when there are no rows.
df = pd.DataFrame(rows, columns=['sentence', 'prediction', 'gold', 'word_ids'])

# Collapse both label sequences with shrink_predictions. Columns are read by
# name: integer keys such as row[0] on a string-labelled Series are
# deprecated positional lookups in pandas.
gold_restored = []
pred_restored = []
for prediction, gold, word_ids in zip(df['prediction'], df['gold'], df['word_ids']):
    # Drop the first and last word ids (presumably the special tokens at the
    # sequence boundaries; confirm against the tokenizer).
    inner_word_ids = word_ids[1:-1]
    gold_restored.append(shrink_predictions(inner_word_ids, gold))
    pred_restored.append(shrink_predictions(inner_word_ids, prediction))

df['gold_restored'] = gold_restored
df['pred_restored'] = pred_restored
df.to_csv('data/output/context_val.csv')

# Classification report computed from the CSV written just above.
class_report_base('data/output/context_val.csv')

              precision    recall  f1-score   support

         'V'       0.00      0.00      0.00         0
         '_'       0.99      0.99      0.99     95833
       'C-V'       0.00      0.00      0.00         0
      'ARGA'       0.00      0.00      0.00         0
      'ARG3'       0.00      0.00      0.00        82
      'ARG2'       0.73      0.75      0.74      1298
      'ARG5'       0.00      0.00      0.00         1
      'ARG0'       0.81      0.86      0.83      1389
      'ARG4'       0.63      0.48      0.55        54
      'ARG1'       0.82      0.86      0.84      3218
    'C-ARG4'       0.00      0.00      0.00         0
    'C-ARG2'       0.00      0.00      0.00         7
    'R-ARG2'       0.00      0.00      0.00         4
    'C-ARG0'       0.00      0.00      0.00         5
    'R-ARG0'       0.76      0.87      0.81        54
    'C-ARG1'       0.43      0.27      0.33        55
    'C-ARG3'       0.00      0.00      0.00         7
    'R-ARG3'       0.00    