In [None]:
!pip install -r requirements.txt

In [1]:
from transformers import BertTokenizerFast, BertForPreTraining,TFBertForTokenClassification ,TFTrainer, TFTrainingArguments
import os
import tensorflow as tf
import pandas as pd
import datasets

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
from datasets import load_dataset
dataset = load_dataset("german_legal_entity_recognition","all",ignore_verifications =True , download_mode="force_redownload")

Downloading:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading and preparing dataset german_legal_entity_recognition/all (download: 4.19 MiB, generated: 37.05 MiB, post-processed: Unknown size, total: 41.24 MiB) to /root/.cache/huggingface/datasets/german_legal_entity_recognition/all/1.0.0/1d4ec33053d6b0f788de20aa9010bf6f5a16ea6682b83ddd79ab82773348658c...


Downloading:   0%|          | 0.00/4.39M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

Dataset german_legal_entity_recognition downloaded and prepared to /root/.cache/huggingface/datasets/german_legal_entity_recognition/all/1.0.0/1d4ec33053d6b0f788de20aa9010bf6f5a16ea6682b83ddd79ab82773348658c. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
dataset = dataset["train"].train_test_split(test_size=0.2)

In [5]:
labels = dataset["train"].features[f"ner_tags"].feature.names

In [6]:
len(labels) 

39

In [7]:
#Load Model and Tokenizer
from transformers import AutoTokenizer
with tf.device('GPU'):
    tokenizer = AutoTokenizer.from_pretrained('deepset/gbert-large')
    model = TFBertForTokenClassification.from_pretrained('deepset/gbert-large', num_labels=len(labels) )

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize_and_align_labels(examples):
    #print(examples.keys())
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    
    return tokenized_inputs

In [9]:
from datasets import Dataset

In [10]:
tokenized = dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/54 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

In [23]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")

tf_train_set  = tokenized["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=2,
    collate_fn=data_collator,
)

tf_test_set  = tokenized["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=2,
    collate_fn=data_collator,
)


In [24]:
from transformers import create_optimizer

batch_size = 2
num_train_epochs = 3
num_train_steps = (len(tokenized["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [29]:
import numpy as np
from datasets import load_metric
metric = load_metric("seqeval")
def compute_metrics(predictions, labels):
    #predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
      if(k not in flattened_results.keys()):
        flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

In [30]:
import tensorflow as tf
import tensorflow_addons as tfa
model.compile(optimizer=optimizer,metrics=[compute_metrics])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [31]:
model.fit(
    tf_train_set,
    
    epochs=num_train_epochs
)

Epoch 1/3


  return py_builtins.overload_of(f)(*args)


TypeError: in user code:

    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)

    TypeError: tf__compute_metrics() takes 1 positional argument but 2 were given


In [None]:
model.evaluate(tf_test_set,batch_size=2)

In [28]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.1 MB/s  eta 0:00:01
Collecting scikit-learn>=0.21.3
  Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[K     |████████████████████████████████| 26.7 MB 129 kB/s  eta 0:00:01     |████████████████████████████▋   | 23.8 MB 12.0 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.1.0
  Downloading scipy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.6 MB)
[K     |████████████████████████████████| 41.6 MB 7.9 MB/s eta 0:00:011
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16182 sha256=c1218846939ebddde1d499226ebca22d2c0978968daf3da6b8799a4bfaa286e6
  Stored in directory: /root/.cache/pip/wheel