
### Data description

**NER features:**

*'O'* = Outside (not part of named entity)  

*'ART'* = Artefacts (objects)  

*'PER'* = Time Periods  

*'MAT'* = Materials  

*'LOC'* = Locations  

*'CON'* = Contexts  

*'SPE'* = Species

**Prefixes**

*'B-'*  Beginning token of a named entity

*'I-'* = Inside token of a named entity  

<br><br>


**Examples:**

<span style="font-size: 14px;">

"Medieval JJ B-PER

and CC O

post-medieval JJ B-PER

The DT B-LOC

Blue NNP I-LOC

Boar NNP I-LOC

Inn NNP I-LOC

( ( O

John NNP O "



<br>

*Note:*  the second token in a line is the POS (part of speeach) but it is not our target
</span>



In [1]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [2]:
!pip install tensorflow==2.13.*

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.13.* (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.13.*[0m[31m
[0m

In [3]:
%pip install transformers datasets tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np
from transformers import AutoTokenizer

NER_features = []

def Data_preprocessing(path =  'val.txt'):
        
    
    with open(path, 'r') as file:
        raw_data = file.read()

    #print(raw_data)

    lines = raw_data.split('\n')

    # take the last word of each line, and take only what's after '-', take unique values
    # find NER_features and construct the NER tags
    NER_features = list(set(line.split()[-1] for line in lines if line))
    NER_features.insert(0, NER_features.pop(NER_features.index('O'))) # move 'O' to the front so the tag is 0
    ner_tags = {tag: id for id, tag in enumerate(NER_features)}

    #print(NER_features)
    #print(ner_tags)


    # Separate lines into phrases

    phrases = []
    current_phrase = []

    # Group lines into phrases based on the period (.) separator
    for line in lines:
        current_phrase.append(line)
        if line.startswith('.'):
            if current_phrase:
                phrases.append(current_phrase)
                current_phrase = []

    # Right now the first element in a phrase is an empty string 
    for phrase in phrases:
        del phrase[0]


    """
    ###Example RUN
    phrase = phrases[0]

    # construct the words, labels 
    words = list(word.split()[0]for word in phrase if word.strip())

    labels = list(label.split()[-1] for label in phrase if label.strip())

    #print(f"words NER NER_tags")
    for inp, lbl in zip(words, labels):
        print(f"{inp} {lbl} {ner_tags[lbl]}")

    model_checkpoint = "bert-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    inputs = tokenizer(words, is_split_into_words=True)

    #inputs.tokens()

    word_ids = inputs.word_ids()

    label_tag = [ner_tags[lbl] for lbl in labels]

    new_labels = align_labels_with_tokens(label_tag, word_ids)


    print(inputs.tokens())
    print(new_labels)

    ### END OF EXAMPLE RUN

    """


    model_checkpoint = "bert-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    
    all_tokenized = []

    total_phrases = len(phrases)


    for i, phrase in enumerate(phrases, start=1):

        print(f"Iteration {i} out of {total_phrases}")    

        # construct the words, labels 
        words = list(word.split()[0]for word in phrase if word.strip())

        labels = list(label.split()[-1] for label in phrase if label.strip())

        inputs = tokenizer(words, is_split_into_words=True)

        word_ids = inputs.word_ids()

        label_tag = [ner_tags[lbl] for lbl in labels]

        new_labels = align_labels_with_tokens(label_tag, word_ids)

        all_tokenized.append((inputs, new_labels))
    
    return all_tokenized





**EXAMPLE RUN results**


| Token         | Label   | Label_tag |
|--------------|-------|-------|
| modern       | B-PER | 11    |
| ploughsoil   | O     | 0     |
| was          | O     | 0     |
| 300          | O     | 0     |
| mm           | O     | 0     |
| deep         | O     | 0     |
| and          | O     | 0     |
| directly     | O     | 0     |
| overlay      | O     | 0     |
| archaeological | O   | 0     |
| deposits     | O     | 0     |
| .            | O     | 0     |

Inputs:

['[CLS]', 'modern', 'p', '##lough', '##so', '##il', 'was', '300', 'mm', 'deep', 'and', 'directly', 'over', '##lay', 'archaeological', 'deposits', '.', '[SEP]']

New Labels:

[-100, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


Note: it's not yet padded

In [5]:

# it takes quite a while to run
val_data = Data_preprocessing('val.txt')
train_data = Data_preprocessing('train.txt')
test_data = Data_preprocessing('test.txt')



Iteration 1 out of 873
Iteration 2 out of 873
Iteration 3 out of 873
Iteration 4 out of 873
Iteration 5 out of 873
Iteration 6 out of 873
Iteration 7 out of 873
Iteration 8 out of 873
Iteration 9 out of 873
Iteration 10 out of 873
Iteration 11 out of 873
Iteration 12 out of 873
Iteration 13 out of 873
Iteration 14 out of 873
Iteration 15 out of 873
Iteration 16 out of 873
Iteration 17 out of 873
Iteration 18 out of 873
Iteration 19 out of 873
Iteration 20 out of 873
Iteration 21 out of 873
Iteration 22 out of 873
Iteration 23 out of 873
Iteration 24 out of 873
Iteration 25 out of 873
Iteration 26 out of 873
Iteration 27 out of 873
Iteration 28 out of 873
Iteration 29 out of 873
Iteration 30 out of 873
Iteration 31 out of 873
Iteration 32 out of 873
Iteration 33 out of 873
Iteration 34 out of 873
Iteration 35 out of 873
Iteration 36 out of 873
Iteration 37 out of 873
Iteration 38 out of 873
Iteration 39 out of 873
Iteration 40 out of 873
Iteration 41 out of 873
Iteration 42 out of 873
I

In [6]:
# still need some more data processing



In [7]:
from transformers import DataCollatorForTokenClassification, AutoTokenizer


model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# add padding to the data
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)


print(val_data[0])

# Convert tuples to dictionaries
val_data_dicts = [{"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels} for inputs, labels in val_data]

batch = data_collator(list(val_data_dicts[i] for i in range(2)))
print(batch["labels"])



# columns=["attention_mask", "input_ids", "labels", "token_type_ids"],






2024-11-02 16:53:05.983587: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-02 16:53:05.994363: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-02 16:53:05.996293: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-02 16:53:06.001371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730562786.009766  925411 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730562786.01

({'input_ids': [101, 2030, 185, 14704, 7301, 2723, 1108, 3127, 2608, 1996, 1105, 2626, 1166, 6622, 8962, 10009, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, [-100, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100])
tf.Tensor(
[[-100    4    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 -100]
 [-100    0    0    0    0    0    0    0    0 -100 -100 -100 -100 -100
  -100 -100 -100 -100]], shape=(2, 18), dtype=int64)


W0000 00:00:1730562797.251472  925411 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [38]:
import tensorflow as tf
from datasets import Dataset

# Convert tuples to dictionaries
train_data_dicts = [{"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels} for inputs, labels in train_data]

# Convert list of dictionaries to dictionary of lists
train_data_dict = {key: [dic[key] for dic in train_data_dicts] for key in train_data_dicts[0]}

train_data_good = Dataset.from_dict(train_data_dict)

# Create a TensorFlow dataset
tf_train_dataset = train_data_good.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

# Repeat the process for the validation data

# Convert tuples to dictionaries
val_data_dicts = [{"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels} for inputs, labels in val_data]

# Convert list of dictionaries to dictionary of lists
val_data_dict = {key: [dic[key] for dic in val_data_dicts] for key in val_data_dicts[0]}

# Create a Dataset object from the dictionary
val_data_good = Dataset.from_dict(val_data_dict)


# Create a TensorFlow dataset
tf_val_dataset = val_data_good.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    collate_fn=data_collator,
    shuffle=False,  # Typically, validation data is not shuffled
    batch_size=16,
)


# Convert tuples to dictionaries
test_data_dicts = [{"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels} for inputs, labels in test_data]

# Convert list of dictionaries to dictionary of lists
test_data_dict = {key: [dic[key] for dic in test_data_dicts] for key in test_data_dicts[0]}

# Create a Dataset object from the dictionary
test_data_good = Dataset.from_dict(test_data_dict)


# Create a TensorFlow dataset
tf_test_dataset = test_data_good.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    collate_fn=data_collator,
    shuffle=False,  # Typically, validation data is not shuffled
    batch_size=16,
)


In [9]:
import tensorflow as tf

# Assuming tf_dataset is your TensorFlow dataset
for batch in tf_val_dataset:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    
    # Print the tensors
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Labels:", labels)
    
    # Check if tensors are empty
    if tf.size(input_ids) == 0:
        print("Input IDs tensor is empty.")
    if tf.size(attention_mask) == 0:
        print("Attention Mask tensor is empty.")
    if tf.size(labels) == 0:
        print("Labels tensor is empty.")
    
    break  # Remove this break to print all batches

Input IDs: tf.Tensor(
[[  101  2030   185 14704  7301  2723  1108  3127  2608  1996  1105  2626
   1166  6622  8962 10009   119   102     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [  101  1109 19374  1108  7620  1289   118 12370   119   102     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [  101 15559   117  8962  1956  2626  2195  1103  2379 22593 10879   194
   8785   120 24705 10493  3763  1105  1122  2691  1115  2030   185 14704
   1158  1144  2856 26558   155  4165  1126 10009   119   102     0     0
      0     0     0]
 [  101  1438   117  6440  2588 14541  1108   170  4240  1843   185 15581
   4999 14304 21689  1158  7172  8301  1571   119   102     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [  10

In [10]:
# Read the data from the file
with open('val.txt', 'r') as file:
    raw_data = file.read()

lines = raw_data.strip().split('\n')

# Extract NER features and construct the NER tags
NER_features = list(set(line.split()[-1] for line in lines if line))
NER_features.insert(0, NER_features.pop(NER_features.index('O')))  # Move 'O' to the front
ner_tags = {tag: id for id, tag in enumerate(NER_features)}

# Group lines into phrases
phrases = []
current_phrase = []

for line in lines:
    if line:
        current_phrase.append(line)
        if line.startswith('.'):
            phrases.append(current_phrase)
            current_phrase = []
    else:
        if current_phrase:
            phrases.append(current_phrase)
            current_phrase = []

# Remove empty first elements in phrases
for phrase in phrases:
    if phrase and phrase[0] == '':
        del phrase[0]

# Select two phrases
phrase1 = phrases[0]
phrase2 = phrases[-1]

# Function to process a phrase
def process_phrase(phrase):
    # Extract words and labels from the phrase
    words = [word.split()[0] for word in phrase if word.strip()]
    labels = [label.split()[-1] for label in phrase if label.strip()]
    return words, labels

# Process phrases
words1, labels1 = process_phrase(phrase1)
words2, labels2 = process_phrase(phrase2)

# Combine the phrases for batch processing
words_batch = [words1, words2]
labels_batch = [labels1, labels2]

# Initialize the tokenizer
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize the words with padding
inputs = tokenizer(
    words_batch,
    is_split_into_words=True,
    padding=True,
    return_tensors="tf"
)

# Get word IDs for each batch
word_ids_list = [inputs.word_ids(batch_index=i) for i in range(len(words_batch))]

# Map NER tags to IDs
label_tags = [[ner_tags[label] for label in labels] for labels in labels_batch]

# Align labels with tokens
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        else:
            new_labels.append(-100)
    return new_labels

aligned_labels = [
    align_labels_with_tokens(label_tags[i], word_ids_list[i])
    for i in range(len(label_tags))
]

# Prepare attention masks and input IDs (padding)
attention_masks = inputs['attention_mask'].numpy().tolist()
input_ids = inputs['input_ids'].numpy().tolist()

# For each phrase, print the requested information
for idx in range(len(words_batch)):
    print(f"Phrase {idx+1}:")
    print("Words:", words_batch[idx])
    print("Labels:", labels_batch[idx])
    print("Tokens:", inputs.tokens(batch_index=idx))
    print("Aligned Labels:", aligned_labels[idx])
    print("Padding (Input IDs):", input_ids[idx])
    print("Attention Mask:", attention_masks[idx])
    print()

Phrase 1:
Words: ['The', 'modern', 'ploughsoil', 'was', '300', 'mm', 'deep', 'and', 'directly', 'overlay', 'archaeological', 'deposits', '.']
Labels: ['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['[CLS]', 'The', 'modern', 'p', '##lough', '##so', '##il', 'was', '300', 'mm', 'deep', 'and', 'directly', 'over', '##lay', 'archaeological', 'deposits', '.', '[SEP]']
Aligned Labels: [-100, 0, 4, 0, -100, -100, -100, 0, 0, 0, 0, 0, 0, 0, -100, 0, 0, 0, -100]
Padding (Input IDs): [101, 1109, 2030, 185, 14704, 7301, 2723, 1108, 3127, 2608, 1996, 1105, 2626, 1166, 6622, 8962, 10009, 119, 102]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Phrase 2:
Words: ['Roman', 'walls', ',', 'surfaces', 'and', 'robber', 'trenches', '.']
Labels: ['B-PER', 'B-CON', 'O', 'O', 'O', 'B-CON', 'I-CON', 'O']
Tokens: ['[CLS]', 'Roman', 'walls', ',', 'surfaces', 'and', 'r', '##ob', '##ber', 'trenches', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

In [11]:
id2label = {i: label for i, label in enumerate(NER_features)}
label2id = {v: k for k, v in id2label.items()}

print(NER_features)

['O', 'B-CON', 'I-MAT', 'I-CON', 'B-PER', 'I-PER', 'B-ART', 'B-LOC', 'B-MAT', 'I-ART', 'I-LOC', 'B-SPE', 'I-SPE']


In [12]:

# Install tf-keras package
%pip install tf-keras

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
from transformers import TFAutoModelForTokenClassification

num_labels = len(id2label)  # Define the number of labels

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,     # Specify the number of labels
    id2label=id2label,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model.config.num_labels


13

In [15]:
#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
# tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

In [17]:

# Check if TensorFlow can detect the GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to prevent TensorFlow from allocating all GPU memory at once
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("TensorFlow has detected the GPU:")
        for gpu in gpus:
            print(f" - {gpu}")
    except RuntimeError as e:
        print("Error setting memory growth:", e)
else:
    print("TensorFlow did not detect the GPU.")

TensorFlow did not detect the GPU.


# TO DO: implement CALLBACK

In [18]:
#from transformers.keras_callbacks import PushToHubCallback


# Ensure that the data collator is correctly padding the sequences
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf", padding=True
)

model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=num_epochs,
)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7f59d4627f50>

In [19]:
!pip install seqeval


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m774.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=3b04c9e6d17c562d12f0556a03233bd7b7e5a42aca61420224785860e7617d35
  Stored in directory: /vol/home/s4422090/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
!nvidia-smi

Sat Nov  2 16:47:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060        Off | 00000000:01:00.0 Off |                  N/A |
| 32%   30C    P8              N/A / 115W |     69MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [21]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
[0mSuccessfully installed evaluate-0.4.3


In [22]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
# Select two phrases
phrase1 = phrases[0]


words1, labels1 = process_phrase(phrase1)

print(words1 labels1)

['The', 'modern', 'ploughsoil', 'was', '300', 'mm', 'deep', 'and', 'directly', 'overlay', 'archaeological', 'deposits', '.'] ['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [34]:

phrase1 = phrases[10]
words1, labels1 = process_phrase(phrase1)

for word, label in zip(words1, labels1):
    print(f"{word} {label}")

predictions = labels1.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels1])

5.1.50 O
Feature O
3503 O
was O
a O
small O
, O
circular O
, O
stone-filled O
pit B-CON
. O


{'CON': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [39]:
import numpy as np

all_predictions = []
all_labels = []


print("Evaluating the model on the test set")
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"].numpy()
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(NER_features[predicted_idx])
            all_labels.append(NER_features[label_idx])

metric.compute(predictions=[all_predictions], references=[all_labels])

Evaluating the model on the test set


{'ART': {'precision': 0.4898876404494382,
  'recall': 0.6770186335403726,
  'f1': 0.5684485006518905,
  'number': 322},
 'CON': {'precision': 0.36363636363636365,
  'recall': 0.4968944099378882,
  'f1': 0.41994750656167984,
  'number': 161},
 'LOC': {'precision': 0.7080745341614907,
  'recall': 0.4634146341463415,
  'f1': 0.5601965601965603,
  'number': 246},
 'MAT': {'precision': 0.7341463414634146,
  'recall': 0.7777777777777778,
  'f1': 0.755332496863237,
  'number': 387},
 'PER': {'precision': 0.6533333333333333,
  'recall': 0.6774193548387096,
  'f1': 0.6651583710407241,
  'number': 434},
 'SPE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2},
 'overall_precision': 0.5958579881656805,
 'overall_recall': 0.648840206185567,
 'overall_f1': 0.621221468229488,
 'overall_accuracy': 0.9500892898026466}