In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
import pandas as pd
from code_.bert import convert_to_dataset, get_labels_list_from_dataset, Tokenizer, compute_metrics, load_metric
from code_.process_conll import process_file, advanced_process_file

In [2]:
# --- Run configuration -------------------------------------------------------
# Hugging Face Hub id of the pretrained encoder that gets fine-tuned.
model_checkpoint = "distilbert-base-uncased"

# Per-device batch size; the same value is used for training and evaluation.
batch_size = 32

# Task tag; expected to be one of "ner", "pos" or "chunk".
# NOTE(review): no visible cell reads `task` — confirm it is still needed.
task = "ner"

In [3]:
# Parse the dev, train and test CoNLL-U files (in that order) into one
# DataFrame per split. Each call reports the length of the frame it produced.
_conllu_paths = (
    'data/raw/en_ewt-up-dev.conllu',
    'data/raw/en_ewt-up-train.conllu',
    'data/raw/en_ewt-up-test.conllu',
)
df_val, df_train, df_test = (advanced_process_file(path) for path in _conllu_paths)

# The other imported parser, `process_file`, can be swapped in by calling it on
# the same files (an earlier run read them from /content/ instead of data/raw/).

advanced_process_file(): dataframe len: 4979
advanced_process_file(): dataframe len: 40498
advanced_process_file(): dataframe len: 4802


In [4]:
# Bundle the three split DataFrames into a single dataset object; its tokenized
# copy is indexed by "train" and "validation" in later cells.
dataset = convert_to_dataset(df_train, df_val, df_test)
# Collect the label inventory. Positions in this list act as class ids when
# predictions are decoded back to label names.
# NOTE(review): the ordering is decided inside get_labels_list_from_dataset —
# confirm it is deterministic across runs, otherwise a saved checkpoint will not
# line up with a freshly built list.
labels_list = get_labels_list_from_dataset(dataset)
# Sorted for display only; labels_list itself keeps its original order.
print(sorted(labels_list))

['ARG0', 'ARG1', 'ARG1-DSP', 'ARG2', 'ARG3', 'ARG4', 'ARG5', 'ARGA', 'ARGM-ADJ', 'ARGM-ADV', 'ARGM-CAU', 'ARGM-COM', 'ARGM-CXN', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-EXT', 'ARGM-GOL', 'ARGM-LOC', 'ARGM-LVB', 'ARGM-MNR', 'ARGM-MOD', 'ARGM-NEG', 'ARGM-PRD', 'ARGM-PRP', 'ARGM-PRR', 'ARGM-REC', 'ARGM-TMP', 'C-ARG0', 'C-ARG1', 'C-ARG1-DSP', 'C-ARG2', 'C-ARG3', 'C-ARG4', 'C-ARGM-ADV', 'C-ARGM-COM', 'C-ARGM-CXN', 'C-ARGM-DIR', 'C-ARGM-EXT', 'C-ARGM-GOL', 'C-ARGM-LOC', 'C-ARGM-MNR', 'C-ARGM-PRP', 'C-ARGM-PRR', 'C-ARGM-TMP', 'C-V', 'R-ARG0', 'R-ARG1', 'R-ARG2', 'R-ARG3', 'R-ARG4', 'R-ARGM-ADJ', 'R-ARGM-ADV', 'R-ARGM-CAU', 'R-ARGM-COM', 'R-ARGM-DIR', 'R-ARGM-GOL', 'R-ARGM-LOC', 'R-ARGM-MNR', 'R-ARGM-TMP', 'V', '_']


In [5]:
# Project tokenizer wrapper built from the checkpoint name and the label
# inventory. Later cells use `tok.tokenizer` (the underlying tokenizer) and the
# `tok.tokenize_and_align_labels_*` functions that are passed to `dataset.map`.
tok = Tokenizer(model_checkpoint, labels_list)

In [10]:
# Apply the "context" tokenize-and-align function to every split; batched=True
# hands the function batches of examples instead of single rows.
# NOTE(review): presumably this re-aligns word-level labels to sub-word tokens
# and marks ignored positions with -100 (a later cell filters on that value) —
# confirm in code_.bert.
tokenized_datasets = dataset.map(tok.tokenize_and_align_labels_context, batched=True)
# Alternative alignment variant, kept for reference:
# tokenized_datasets = dataset.map(tok.tokenize_and_align_labels_pred, batched=True)

Map:   0%|          | 0/40498 [00:00<?, ? examples/s]

Map:   0%|          | 0/4979 [00:00<?, ? examples/s]

Map:   0%|          | 0/4802 [00:00<?, ? examples/s]

In [8]:
# Class-id <-> label-name mappings, in the same order as labels_list (the order
# used to decode predictions later). Storing them in the model config makes the
# saved checkpoint self-describing instead of carrying generic LABEL_<i> names.
id2label = {idx: str(label) for idx, label in enumerate(labels_list)}
label2id = {label: idx for idx, label in id2label.items()}

# Pretrained encoder plus a freshly initialised token-classification head with
# one output per label.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
)
# To start from the weights this notebook saves, load from the output directory instead:
# model = AutoModelForTokenClassification.from_pretrained('model_checkpoints/pred', num_labels=len(labels_list))

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Short checkpoint name: the text after the last "/" of the Hub id.
# NOTE(review): not read by any visible cell — confirm it is still needed.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Training hyper-parameters. Checkpoints go to model_checkpoints/pred, the
# validation set is scored after every epoch, and nothing is pushed to the Hub.
args = TrainingArguments(
    output_dir="model_checkpoints/pred",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
)

# Collator for token-classification batches, built on the wrapped tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tok.tokenizer)

# seqeval scorer; reused by the manual evaluation cell further down.
metric = load_metric("seqeval")

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tok.tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune the model with the Trainer configured above (num_train_epochs=1).
trainer.train()

In [None]:
# Score the model on the validation split given as `eval_dataset`; the result
# is shown as the cell output.
trainer.evaluate()

In [None]:
# Run inference on the validation split. The third returned value is discarded.
# NOTE(review): binding it to `_` overrides IPython's last-output variable for
# the rest of the session.
predictions_raw, labels, _ = trainer.predict(tokenized_datasets["validation"])
# Pick the highest-scoring class id at each position.
# NOTE(review): assumes predictions_raw is (examples, tokens, num_labels) so
# that axis=2 is the class axis — confirm.
predictions = np.argmax(predictions_raw, axis=2)

In [None]:
# Decode class ids to label names, keeping only the positions whose gold id is
# not -100. Prediction and gold are filtered together in one pass so the two
# sequences stay aligned.
aligned_pairs = [
    [
        (labels_list[pred_id], labels_list[gold_id])
        for pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]
true_predictions = [[pred_name for pred_name, _gold_name in pairs] for pairs in aligned_pairs]
true_labels = [[gold_name for _pred_name, gold_name in pairs] for pairs in aligned_pairs]

# NOTE(review): seqeval is designed for IOB-style tags; the labels printed
# earlier ('ARG0', 'V', '_', ...) carry no B-/I- prefix — confirm that the
# scores it reports are the ones intended.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
# Persist the fine-tuned model. With no path given, Trainer writes to the
# output directory set in TrainingArguments ("model_checkpoints/pred").
trainer.save_model()

In [None]:
# Build the per-sentence comparison table: decoded input text, predicted label
# sequence and gold label sequence for every validation example.
# Rows are collected first and the DataFrame is constructed once; appending via
# df.loc[len(df.index)] inside the loop re-grows the frame on every row, which
# is quadratic in the number of examples.
records = [
    (tok.tokenizer.decode(tokens), prediction, gold)
    for tokens, prediction, gold in zip(
        tokenized_datasets['validation']['input_ids'], true_predictions, true_labels
    )
]
# Passing `columns` keeps the expected header even if there are no rows.
df = pd.DataFrame(records, columns=['sentence', 'prediction', 'gold'])

In [None]:
# Export the comparison table to base.csv in the working directory. The
# DataFrame index is written as the first column (pandas' default).
df.to_csv('base.csv')

In [None]:
# Preview the first rows of the comparison table.
df.head()