
### Data description

**NER features:**

*'O'* = Outside (not part of named entity)  

*'ART'* = Artefacts (objects)  

*'PER'* = Time Periods  

*'MAT'* = Materials  

*'LOC'* = Locations  

*'CON'* = Contexts  

*'SPE'* = Species

**Prefixes**

*'B-'*  Beginning token of a named entity

*'I-'* = Inside token of a named entity  

<br><br>


**Examples:**

<span style="font-size: 14px;">

"Medieval JJ B-PER

and CC O

post-medieval JJ B-PER

The DT B-LOC

Blue NNP I-LOC

Boar NNP I-LOC

Inn NNP I-LOC

( ( O

John NNP O "



<br>

*Note:*  the second token in a line is the POS (part of speeach) but it is not our target
</span>



In [None]:
import numpy as np

path = 'val.txt'
with open(path, 'r') as file:
    raw_data = file.read()

#print(raw_data)

lines = raw_data.split('\n')


# take the last word of each line, and take only what's after '-', take unique values
NER_features = list(set(line.split()[-1].split('-')[-1] for line in lines if line))

print(NER_features)

# same but without unique values

words = list(line.split()[0]for line in lines if line)

labels = list(line.split()[-1] for line in lines if line)

for inp, lbl in zip(inputs, labels):
    print(f"{inp} {lbl}")




['ART', 'O', 'LOC', 'PER', 'CON', 'SPE', 'MAT']
The O
modern B-PER
ploughsoil O
was O
300 O
mm O
deep O
and O
directly O
overlay O
archaeological O
deposits O
. O
The O
trench O
was O
extensively O
hand-cleaned O
. O
Generally O
, O
archaeological O
features O
directly O
cut O
the O
natural O
flint O
y O
clay/Chalk O
Head O
and O
it O
appears O
that O
modern B-PER
ploughing O
has O
removed O
superficial O
Rom B-PER
an I-PER
deposits O
. O
However O
, O
layer O
3511 O
was O
a O
thin O
dark O
pebbly O
deposit O
concealing O
pit O
3505 O
. O
There O
was O
no O
trace O
of O
an O
underlying O
buried O
soil O
and O
the O
interpretation O
of O
this O
layer O
is O
uncertain O
. O
It O
is O
possible O
that O
it O
was O
a O
plough-disturbed O
capping O
to O
the O
pit O
. O
5.1.49 O
Pit O
3505 O
was O
circular O
0.85 O
m O
in O
diameter O
with O
almost O
vertical O
sides O
. O
The O
water-table O
was O
reached O
at O
650 O
mm O
although O
the O
basal O
break O
of O
slope O
could O
be O
detected O

In [2]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
import numpy as np
from transformers import AutoTokenizer

def Data_preprocessing(path =  'val.txt'):
        
    
    with open(path, 'r') as file:
        raw_data = file.read()

    #print(raw_data)

    lines = raw_data.split('\n')

    # take the last word of each line, and take only what's after '-', take unique values
    # find NER_features and construct the NER tags
    NER_features = list(set(line.split()[-1] for line in lines if line))
    NER_features.insert(0, NER_features.pop(NER_features.index('O'))) # move 'O' to the front so the tag is 0
    ner_tags = {tag: id for id, tag in enumerate(NER_features)}

    #print(NER_features)
    #print(ner_tags)


    # Separate lines into phrases

    phrases = []
    current_phrase = []

    # Group lines into phrases based on the period (.) separator
    for line in lines:
        current_phrase.append(line)
        if line.startswith('.'):
            if current_phrase:
                phrases.append(current_phrase)
                current_phrase = []

    # Right now the first element in a phrase is an empty string 
    for phrase in phrases:
        del phrase[0]


    """
    ###Example RUN
    phrase = phrases[0]

    # construct the words, labels 
    words = list(word.split()[0]for word in phrase if word.strip())

    labels = list(label.split()[-1] for label in phrase if label.strip())

    #print(f"words NER NER_tags")
    for inp, lbl in zip(words, labels):
        print(f"{inp} {lbl} {ner_tags[lbl]}")

    model_checkpoint = "bert-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    inputs = tokenizer(words, is_split_into_words=True)

    #inputs.tokens()

    word_ids = inputs.word_ids()

    label_tag = [ner_tags[lbl] for lbl in labels]

    new_labels = align_labels_with_tokens(label_tag, word_ids)


    print(inputs.tokens())
    print(new_labels)

    ### END OF EXAMPLE RUN

    """


    model_checkpoint = "bert-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    
    all_tokenized = []

    total_phrases = len(phrases)


    for i, phrase in enumerate(phrases, start=1):

        print(f"Iteration {i} out of {total_phrases}")    

        # construct the words, labels 
        words = list(word.split()[0]for word in phrase if word.strip())

        labels = list(label.split()[-1] for label in phrase if label.strip())

        inputs = tokenizer(words, is_split_into_words=True)

        word_ids = inputs.word_ids()

        label_tag = [ner_tags[lbl] for lbl in labels]

        new_labels = align_labels_with_tokens(label_tag, word_ids)

        all_tokenized.append((inputs, new_labels))
    
    return all_tokenized





**EXAMPLE RUN results**


| Token         | Label   | Label_tag |
|--------------|-------|-------|
| modern       | B-PER | 11    |
| ploughsoil   | O     | 0     |
| was          | O     | 0     |
| 300          | O     | 0     |
| mm           | O     | 0     |
| deep         | O     | 0     |
| and          | O     | 0     |
| directly     | O     | 0     |
| overlay      | O     | 0     |
| archaeological | O   | 0     |
| deposits     | O     | 0     |
| .            | O     | 0     |

Inputs:

['[CLS]', 'modern', 'p', '##lough', '##so', '##il', 'was', '300', 'mm', 'deep', 'and', 'directly', 'over', '##lay', 'archaeological', 'deposits', '.', '[SEP]']

New Labels:

[-100, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


Note: it's not yet padded

In [21]:

# it takes quite a while to run
val_data = Data_preprocessing('val.txt')
train_data = Data_preprocessing('train.txt')
test_data = Data_preprocessing('test.txt')



100
Iteration 1 out of 873
Iteration 2 out of 873
Iteration 3 out of 873
Iteration 4 out of 873
Iteration 5 out of 873
Iteration 6 out of 873
Iteration 7 out of 873
Iteration 8 out of 873
Iteration 9 out of 873
Iteration 10 out of 873
Iteration 11 out of 873
Iteration 12 out of 873
Iteration 13 out of 873
Iteration 14 out of 873
Iteration 15 out of 873
Iteration 16 out of 873
Iteration 17 out of 873
Iteration 18 out of 873
Iteration 19 out of 873
Iteration 20 out of 873
Iteration 21 out of 873
Iteration 22 out of 873
Iteration 23 out of 873
Iteration 24 out of 873
Iteration 25 out of 873
Iteration 26 out of 873
Iteration 27 out of 873
Iteration 28 out of 873
Iteration 29 out of 873
Iteration 30 out of 873
Iteration 31 out of 873
Iteration 32 out of 873
Iteration 33 out of 873
Iteration 34 out of 873
Iteration 35 out of 873
Iteration 36 out of 873
Iteration 37 out of 873
Iteration 38 out of 873
Iteration 39 out of 873
Iteration 40 out of 873
Iteration 41 out of 873
Iteration 42 out of 8

In [None]:
# still need some more data processing



In [None]:
from transformers import DataCollatorForTokenClassification, AutoTokenizer


model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# add padding to the data
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)


print(val_data[0])

# Convert tuples to dictionaries
#val_data_dicts = [{"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels} for inputs, labels in val_data]

batch = data_collator(list(val_data_dicts[i] for i in range(2)))
print(batch["labels"])



# columns=["attention_mask", "input_ids", "labels", "token_type_ids"],






({'input_ids': [101, 2030, 185, 14704, 7301, 2723, 1108, 3127, 2608, 1996, 1105, 2626, 1166, 6622, 8962, 10009, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, [-100, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100])
tf.Tensor(
[[-100   11    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 -100]
 [-100    0    0    0    0    0    0    0    0 -100 -100 -100 -100 -100
  -100 -100 -100 -100]], shape=(2, 18), dtype=int64)


In [34]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.10-cp312-cp312-win_amd64.whl.metadata (7.8 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\baroi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import tensorflow as tf
from datasets import Dataset

# Convert tuples to dictionaries
train_data_dicts = [{"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels} for inputs, labels in train_data]

# Convert list of dictionaries to dictionary of lists
train_data_dict = {key: [dic[key] for dic in train_data_dicts] for key in train_data_dicts[0]}

train_data_good = Dataset.from_dict(train_data_dict)

# Create a TensorFlow dataset
tf_train_dataset = train_data_good.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

# Repeat the process for the validation data

# Convert tuples to dictionaries
val_data_dicts = [{"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels} for inputs, labels in val_data]

# Convert list of dictionaries to dictionary of lists
val_data_dict = {key: [dic[key] for dic in val_data_dicts] for key in val_data_dicts[0]}

# Create a Dataset object from the dictionary
val_data_good = Dataset.from_dict(val_data_dict)


# Create a TensorFlow dataset
tf_val_dataset = val_data_good.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    collate_fn=data_collator,
    shuffle=False,  # Typically, validation data is not shuffled
    batch_size=16,
)


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 873
})


In [42]:
import tensorflow as tf

# Assuming tf_dataset is your TensorFlow dataset
for batch in tf_val_dataset:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    
    # Print the tensors
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Labels:", labels)
    
    # Check if tensors are empty
    if tf.size(input_ids) == 0:
        print("Input IDs tensor is empty.")
    if tf.size(attention_mask) == 0:
        print("Attention Mask tensor is empty.")
    if tf.size(labels) == 0:
        print("Labels tensor is empty.")
    
    break  # Remove this break to print all batches

Input IDs: tf.Tensor(
[[  101  2030   185 14704  7301  2723  1108  3127  2608  1996  1105  2626
   1166  6622  8962 10009   119   102     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [  101  1109 19374  1108  7620  1289   118 12370   119   102     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [  101 15559   117  8962  1956  2626  2195  1103  2379 22593 10879   194
   8785   120 24705 10493  3763  1105  1122  2691  1115  2030   185 14704
   1158  1144  2856 26558   155  4165  1126 10009   119   102     0     0
      0     0     0]
 [  101  1438   117  6440  2588 14541  1108   170  4240  1843   185 15581
   4999 14304 21689  1158  7172  8301  1571   119   102     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [  10

In [45]:
id2label = {i: label for i, label in enumerate(NER_features)}
label2id = {v: k for k, v in id2label.items()}

print(NER_features)

['O', 'B-MAT', 'B-ART', 'I-CON', 'B-SPE', 'B-LOC', 'I-SPE', 'I-ART', 'I-LOC', 'I-PER', 'I-MAT', 'B-PER', 'B-CON']


In [46]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)





All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
model.config.num_labels


13

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [48]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
# tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import PushToHubCallback


model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=num_epochs,
    
)

Epoch 1/3

Epoch 2/3