# Prepare the unannotated corpus for doccano

In [1]:
import os
from glob import glob
import csv
import pandas as pd
import re
import json

# Define evaluation metrics

In [6]:
import evaluate

# Module-level metric object; compute_metrics() calls metric.compute(...) on it
# and reads the "overall_*" entries of the result.
metric = evaluate.load("seqeval")

In [7]:
def compute_metrics(eval_preds):
    """Score token-level predictions with the module-level ``metric``.

    Relies on two notebook globals being defined before it is called:
    ``metric`` (object exposing ``compute(predictions=..., references=...)``)
    and ``id2tag`` (mapping from integer label id to tag string).

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` is reduced with
            ``argmax`` over its last axis to obtain predicted label ids;
            ``labels`` holds the reference label ids, where ``-100`` marks
            positions that are excluded from scoring.

    Returns:
        dict: ``precision``, ``recall``, ``f1`` and ``accuracy``, copied from
        the ``overall_*`` entries returned by ``metric.compute``.
    """
    # Imported here because no visible notebook cell imports numpy; without it
    # a fresh-kernel "Run All" fails with NameError on np.argmax.
    import numpy as np

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2tag[l] for l in label if l != -100] for label in labels]
    # Filter predictions by the *label* at each position so both sequences stay
    # aligned and have the same length.
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# DeBERTa

**Assumption**: All input sequences are padded to a fixed length of 128.

In [8]:
from transformers import DebertaTokenizerFast, DebertaForTokenClassification

# Fast tokenizer for the same checkpoint the model is loaded from.
# NOTE(review): add_prefix_space=True is presumably needed because the corpus
# is fed in as pre-split words; confirm against the dataset-building code.
tokenizer = DebertaTokenizerFast.from_pretrained(
    "microsoft/deberta-base",
    add_prefix_space=True,
)

In [9]:
import pickle
from utils import dataset

In [10]:
# Restore the training and validation sets from local pickle files.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project. The objects are presumably instances of a
# class from utils.dataset (imported above so unpickling can resolve it);
# confirm.
with open('deberta_training.pkl', mode='rb') as f:
    training_set = pickle.load(f)

with open('deberta_validation.pkl', mode='rb') as f:
    val_set = pickle.load(f)

## Prepare Pretrained DeBERTa Model

In [15]:
# Label maps exposed by the training set; compute_metrics() reads id2tag to
# turn integer label ids back into tag strings.
id2tag, tag2id = training_set.id2tag, training_set.tag2id

In [16]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint for token classification, handing it the
# training set's label maps so the model config carries the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/deberta-base",
    label2id=training_set.tag2id,
    id2label=training_set.id2tag,
)

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /home/nanomineduke/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "U-G",
    "1": "I-C",
    "2": "L-G",
    "3": "U-C",
    "4": "B-S",
    "5": "L-C",
    "6": "L-S",
    "7": "B-G",
    "8": "U-P",
    "9": "B-P",
    "10": "I-S",
    "11": "L-P",
    "12": "B-C",
    "13": "O",
    "14": "I-G",
    "15": "I-P",
    "16": "U-S"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-C": 12,
    "B-G": 7,
    "B-P": 9,
    "B-S": 4,
    "I-C": 1,
    "I-G": 14,
    "I-P": 15,
    "I-S": 10,
    "L-C": 5,
    "L-G": 2,
   

In [17]:
# Sanity check: the model config's label count must match the training set's tags.
len(training_set.id2tag) == model.config.num_labels

True

In [18]:
from transformers import TrainingArguments

# Fine-tuning configuration. The first positional argument is the output
# directory; evaluation and checkpointing both happen once per epoch.
# NOTE(review): in the recorded run the validation loss was lowest at epoch 2
# while F1 kept improving through epoch 10; keep that in mind when choosing
# which checkpoint to use.
args = TrainingArguments(
    "deberta-finetuned-ner",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [19]:
from transformers import Trainer

# No data_collator is passed, so Trainer falls back to its own default.
# NOTE(review): this relies on the notebook's stated assumption that padding
# is fixed to 128; confirm the pickled datasets are already padded.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=training_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810
The following columns in the training set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.368354,0.548739,0.719549,0.622642,0.883588
2,0.479900,0.299824,0.657895,0.770677,0.709834,0.909717
3,0.204400,0.30137,0.720379,0.8,0.758105,0.916412
4,0.111300,0.361513,0.728011,0.809023,0.766382,0.918102
5,0.111300,0.386925,0.727584,0.799248,0.761734,0.920767
6,0.058700,0.43966,0.744828,0.81203,0.776978,0.921482
7,0.031600,0.437177,0.747405,0.81203,0.778378,0.924407
8,0.020900,0.465675,0.758787,0.82782,0.791802,0.926682
9,0.020900,0.470846,0.755662,0.82782,0.790097,0.926942
10,0.009500,0.472234,0.758431,0.828571,0.791951,0.928307


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to deberta-finetuned-ner/checkpoint-381
Configuration saved in deberta-finetuned-ner/checkpoint-381/config.json
Model weights saved in deberta-finetuned-ner/checkpoint-381/pytorch_model.bin
tokenizer config file saved in deberta-finetuned-ner/checkpoint-381/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner/checkpoint-381/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not ex

Saving model checkpoint to deberta-finetuned-ner/checkpoint-1524
Configuration saved in deberta-finetuned-ner/checkpoint-1524/config.json
Model weights saved in deberta-finetuned-ner/checkpoint-1524/pytorch_model.bin
tokenizer config file saved in deberta-finetuned-ner/checkpoint-1524/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner/checkpoint-1524/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to deberta-finetuned-ner/checkpoint-1905
Configuration saved in deberta-finetuned-ner/checkpoint-1905/config.json
Model weights saved in deberta-finetuned-ner/checkpoint-1905/pytorch_model.bin
tokenizer config file saved in debert

Saving model checkpoint to deberta-finetuned-ner/checkpoint-3048
Configuration saved in deberta-finetuned-ner/checkpoint-3048/config.json
Model weights saved in deberta-finetuned-ner/checkpoint-3048/pytorch_model.bin
tokenizer config file saved in deberta-finetuned-ner/checkpoint-3048/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner/checkpoint-3048/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to deberta-finetuned-ner/checkpoint-3429
Configuration saved in deberta-finetuned-ner/checkpoint-3429/config.json
Model weights saved in deberta-finetuned-ner/checkpoint-3429/pytorch_model.bin
tokenizer config file saved in debert

TrainOutput(global_step=3810, training_loss=0.1207746792340216, metrics={'train_runtime': 1739.2774, 'train_samples_per_second': 17.507, 'train_steps_per_second': 2.191, 'total_flos': 3625675982460000.0, 'train_loss': 0.1207746792340216, 'epoch': 10.0})