In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast
import ast
import torch

Dataset: https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus/data

In [37]:
df = pd.read_csv('./Data/ner.csv')

print(df.columns, df.dtypes)

Index(['Sentence #', 'Sentence', 'POS', 'Tag'], dtype='object') Sentence #    object
Sentence      object
POS           object
Tag           object
dtype: object


In [38]:
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [39]:
# Convert the string representation of a list to a list
df['Tag'] = df['Tag'].apply(lambda x: ast.literal_eval(x))

In [40]:
# Atomize tags with explode and get unique labels
labels = set(df['Tag'].explode().unique()) 

labels

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

## Prefixes

`B` - prefix indicates the beginning of a named entity. <br>
`I` - prefix indicates that the token is inside a named entity. <br>
`O` - indicates that the token is not a named entity. <br>
<br>

## Suffixes
`art` Artifacts, e.g., books, songs, etc.<br>
`eve` Events, e.g., battles, elections, holidays, etc.<br>
`geo` Geographical entities, e.g., cities, rivers, countries, etc.<br>
`gpe` Geopolitical entities, e.g., cities, states, countries.<br>
`nat` Natural phenomena, e.g., hurricanes, earthquakes.<br>
`org` Organizations, e.g., companies, government organizations, etc.<br>
`per` Persons.<br>
`tim` Time indicators, e.g., dates, days, months, etc.

In [41]:
label_to_id = {l: i for i, l in enumerate(labels)}
id_to_label = {i: l for l, i in label_to_id.items()}

label_to_id

{'I-tim': 0,
 'B-nat': 1,
 'I-per': 2,
 'I-art': 3,
 'I-gpe': 4,
 'B-geo': 5,
 'B-org': 6,
 'I-nat': 7,
 'I-geo': 8,
 'B-eve': 9,
 'O': 10,
 'B-tim': 11,
 'I-eve': 12,
 'B-art': 13,
 'I-org': 14,
 'B-gpe': 15,
 'B-per': 16}

`padding` : If sequence length not reach maximum add **[PAD]** token

`max_length` : maximum sequence length in tokens

`truncation` : truncate sequence if it exceeds max_length

`return_tensors` : tensor return type

In [42]:
df['Sentence'][0].split()[1]

'of'

In [43]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokens = tokenizer(df['Sentence'][5], padding='max_length', truncation=True, max_length=128, return_tensors='pt')

tokens

{'input_ids': tensor([[  101,  1996,  2283,  2003,  4055,  2058,  3725,  1005,  1055,  6577,
          1999,  1996,  5712,  4736,  1998,  1996,  2506, 10813,  1997,  1022,
          1010,  3156,  2329,  3629,  1999,  2008,  2406,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

`input_ids` : numeric represnetation of tokens, where {101: **[CLS]**, 102: **[SEP]**, 0: }

`token_type_ids` : numeric representation of sequence, used in sequence classification or question answering 

`attention_mask` : Boolean for not **[PAD]** token, that is 1 for real tokens, else 0

In [44]:
tokenizer.decode(tokens['input_ids'][0])

"[CLS] the party is divided over britain's participation in the iraq conflict and the continued deployment of 8, 500 british troops in that country. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

subword tokenizer

In [45]:
word_ids = tokens.word_ids()
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"][0]))
print(word_ids)

['[CLS]', 'the', 'party', 'is', 'divided', 'over', 'britain', "'", 's', 'participation', 'in', 'the', 'iraq', 'conflict', 'and', 'the', 'continued', 'deployment', 'of', '8', ',', '500', 'british', 'troops', 'in', 'that', 'country', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

Problem: Model operates at token (sub-word) level and includes special tokens but the labels are provided at the word level with no special tokens<br>
Solution: Align labels with tokens by creating new list of labels that correspond to the tokenized inputs.<br>

In [46]:
def tokenize_and_align_labels(sentence, tags):
    tokenized_input = tokenizer(sentence, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    word_ids = tokenized_input.word_ids()
    labels = []
    j = 0
    for i in word_ids:
        if i is None: # If the word id is None (Special token)
            labels.append(-100) # Ignore the label
        elif i == j:
            labels.append(label_to_id[tags[j]])
            j += 1
        else: # If the word is a subword
            labels.append(-100) # Ignore the label
    return tokenized_input, torch.tensor(labels)

In [None]:
class DataSequence(torch.utils.data.Dataset):
    def __init__(self, df):
        raw_labels = set(df['Tag'].explode().unique()) 
        raw_texts = df['Sentence'].values.tolist()
        
        self.texts = [tokenizer(str(text), padding='max_length', truncation=True, max_length=128, return_tensors='pt') for text in raw_texts]
        self.labels = [tokenize_and_align_labels(sent, tag) for sent, tag in zip(raw_texts, raw_labels)]
    
    def __len__(self):
        return len(self.labels)
    
    def get_batch_text(self, idx):
        return self.texts[idx]
    
    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])
    
    def __getitem__(self, idx):
        return self.get_batch_text(idx), self.get_batch_labels(idx)
        

In [None]:
train, val, test = np.split(df.sample(frac=1,random_state=2002), [int(.8*len(df)), int(.9*len(df))])