# Prepare the unannotated corpus for doccano

In [1]:
import os
from glob import glob
import csv
import pandas as pd
import re
import json

# Define evaluation metrics

In [6]:
import evaluate

# Load the "seqeval" metric; compute_metrics() below calls metric.compute() on
# lists of tag strings and reads its "overall_*" result entries.
metric = evaluate.load("seqeval")

In [7]:
def compute_metrics(eval_preds):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` is reduced with an
            argmax over its last axis to obtain predicted label ids; ``labels``
            holds the gold label ids, with ``-100`` marking positions to skip.
            (Presumably both are batch-major arrays as produced by ``Trainer``
            -- TODO confirm.)

    Returns:
        dict: ``precision``, ``recall``, ``f1`` and ``accuracy``, taken from the
        ``overall_*`` entries returned by the module-level ``metric``.

    Note:
        Relies on the module-level ``metric`` and ``id2tag`` objects; both must
        be defined before this function is *called* (not before it is defined).
    """
    # Imported here because no visible notebook cell imports numpy; without it
    # the function raises NameError on a fresh kernel (Restart & Run All).
    import numpy as np

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2tag[l] for l in label if l != -100] for label in labels]
    # Filter predictions by the *label* mask so both lists stay aligned.
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# DeBERTa

**Assumption**: All input sequences are padded to a fixed length of 128 tokens.

In [8]:
from transformers import DebertaTokenizerFast, DebertaForTokenClassification

# Fast DeBERTa tokenizer from the "microsoft/deberta-base" checkpoint.
# NOTE(review): add_prefix_space=True is presumably needed because the inputs
# are pre-split into words -- confirm against the dataset-building code.
tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base",add_prefix_space=True)

In [9]:
import pickle
from utils import dataset

In [10]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code; only use on files you
    # produced yourself or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-built training / validation datasets.
# NOTE(review): unpickling presumably needs the `utils.dataset` import from the
# previous cell to resolve the stored class -- confirm.
training_set = _load_pickle('deberta_training.pkl')
val_set = _load_pickle('deberta_validation.pkl')

## Prepare randomly initialized DeBERTa Model

In [15]:
# Expose the label maps stored on the training dataset as module-level names;
# compute_metrics() looks up `id2tag` when it runs.
id2tag, tag2id = training_set.id2tag, training_set.tag2id

In [20]:
from transformers import DebertaConfig, DebertaForTokenClassification
from transformers import AutoConfig, set_seed

# Seed the RNGs *before* building the model. Trainer only applies its seed when
# the Trainer object is constructed, which happens after the weights below have
# already been drawn, so without this the random baseline is not reproducible.
# 42 matches the TrainingArguments default seed.
set_seed(42)

# Reuse the architecture hyper-parameters of the published checkpoint, but
# attach this project's label maps (taken from the training dataset).
rand_init_config = AutoConfig.from_pretrained('microsoft/deberta-base',
                                              id2label=training_set.id2tag,
                                              label2id=training_set.tag2id)

# Building the model class directly from a config (instead of calling
# from_pretrained on the model) loads no pretrained weights: every parameter is
# freshly initialized.
rand_init_model = DebertaForTokenClassification(rand_init_config)

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /home/nanomineduke/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "U-G",
    "1": "I-C",
    "2": "L-G",
    "3": "U-C",
    "4": "B-S",
    "5": "L-C",
    "6": "L-S",
    "7": "B-G",
    "8": "U-P",
    "9": "B-P",
    "10": "I-S",
    "11": "L-P",
    "12": "B-C",
    "13": "O",
    "14": "I-G",
    "15": "I-P",
    "16": "U-S"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-C": 12,
    "B-G": 7,
    "B-P": 9,
    "B-S": 4,
    "I-C": 1,
    "I-G": 14,
    "I-P": 15,
    "I-S": 10,
    "L-C": 5,
    "L-G": 2,
   

In [21]:
# Display the configuration attached to the freshly built model.
rand_init_model.config

DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "U-G",
    "1": "I-C",
    "2": "L-G",
    "3": "U-C",
    "4": "B-S",
    "5": "L-C",
    "6": "L-S",
    "7": "B-G",
    "8": "U-P",
    "9": "B-P",
    "10": "I-S",
    "11": "L-P",
    "12": "B-C",
    "13": "O",
    "14": "I-G",
    "15": "I-P",
    "16": "U-S"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-C": 12,
    "B-G": 7,
    "B-P": 9,
    "B-S": 4,
    "I-C": 1,
    "I-G": 14,
    "I-P": 15,
    "I-S": 10,
    "L-C": 5,
    "L-G": 2,
    "L-P": 11,
    "L-S": 6,
    "O": 13,
    "U-C": 3,
    "U-G": 0,
    "U-P": 8,
    "U-S": 16
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler

In [22]:
from transformers import TrainingArguments

# Training hyper-parameters for the randomly initialized baseline.
# Checkpoints are written under "deberta-random-ner"; evaluation and
# checkpointing both happen once per epoch.
# (Add no_cuda=True to force CPU training.)
args = TrainingArguments(
    output_dir="deberta-random-ner",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [23]:
from transformers import Trainer

# Wire the randomly initialized model, datasets, tokenizer and metric function
# into a Trainer. No custom data_collator is supplied; this relies on the
# examples already being padded to a fixed length (see the assumption noted
# under the DeBERTa heading).
rand_init_trainer = Trainer(
    args=args,
    model=rand_init_model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=val_set,
    train_dataset=training_set,
)

# Left as the final bare expression so the notebook displays the TrainOutput.
rand_init_trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810
The following columns in the training set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.71815,0.164173,0.296992,0.211456,0.779786
2,0.843400,0.650062,0.211196,0.374436,0.270065,0.79935
3,0.629000,0.676141,0.19841,0.412782,0.268001,0.78219
4,0.533600,0.627451,0.246537,0.401504,0.305492,0.80728
5,0.533600,0.621958,0.234474,0.417293,0.300243,0.808905
6,0.477000,0.617469,0.254017,0.43985,0.322048,0.81261
7,0.424900,0.637974,0.234387,0.445865,0.307254,0.80416
8,0.393300,0.621847,0.253975,0.468421,0.329368,0.813715
9,0.393300,0.632853,0.257594,0.478195,0.334825,0.81443
10,0.362200,0.628627,0.267606,0.471429,0.34141,0.817745


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to deberta-random-ner/checkpoint-381
Configuration saved in deberta-random-ner/checkpoint-381/config.json
Model weights saved in deberta-random-ner/checkpoint-381/pytorch_model.bin
tokenizer config file saved in deberta-random-ner/checkpoint-381/tokenizer_config.json
Special tokens file saved in deberta-random-ner/checkpoint-381/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `Debe

Saving model checkpoint to deberta-random-ner/checkpoint-1524
Configuration saved in deberta-random-ner/checkpoint-1524/config.json
Model weights saved in deberta-random-ner/checkpoint-1524/pytorch_model.bin
tokenizer config file saved in deberta-random-ner/checkpoint-1524/tokenizer_config.json
Special tokens file saved in deberta-random-ner/checkpoint-1524/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to deberta-random-ner/checkpoint-1905
Configuration saved in deberta-random-ner/checkpoint-1905/config.json
Model weights saved in deberta-random-ner/checkpoint-1905/pytorch_model.bin
tokenizer config file saved in deberta-random-ner/checkpoint-

Saving model checkpoint to deberta-random-ner/checkpoint-3048
Configuration saved in deberta-random-ner/checkpoint-3048/config.json
Model weights saved in deberta-random-ner/checkpoint-3048/pytorch_model.bin
tokenizer config file saved in deberta-random-ner/checkpoint-3048/tokenizer_config.json
Special tokens file saved in deberta-random-ner/checkpoint-3048/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaForTokenClassification.forward` and have been ignored: offset_mapping. If offset_mapping are not expected by `DebertaForTokenClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to deberta-random-ner/checkpoint-3429
Configuration saved in deberta-random-ner/checkpoint-3429/config.json
Model weights saved in deberta-random-ner/checkpoint-3429/pytorch_model.bin
tokenizer config file saved in deberta-random-ner/checkpoint-

TrainOutput(global_step=3810, training_loss=0.5087095205552309, metrics={'train_runtime': 1731.6452, 'train_samples_per_second': 17.584, 'train_steps_per_second': 2.2, 'total_flos': 3625675982460000.0, 'train_loss': 0.5087095205552309, 'epoch': 10.0})