In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import ast

  from .autonotebook import tqdm as notebook_tqdm


Dataset: https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus/data

In [2]:
df = pd.read_csv('./Data/ner.csv')

print(df.columns, df.dtypes)

Index(['Sentence #', 'Sentence', 'POS', 'Tag'], dtype='object') Sentence #    object
Sentence      object
POS           object
Tag           object
dtype: object


In [3]:
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [4]:
# Convert the string representation of a list to a list
df['Tag'] = df['Tag'].apply(lambda x: ast.literal_eval(x))

In [5]:
# Atomize tags with explode and get unique labels
labels = set(df['Tag'].explode().unique()) 

labels

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

## Prefixes

B - prefix indicates the beginning of a named entity. <br>
I - prefix indicates that the token is inside a named entity. <br>
O - indicates that the token is not a named entity. <br>
<br>

## Suffixes
art: Artifacts, e.g., books, songs, etc.<br>
eve: Events, e.g., battles, elections, holidays, etc.<br>
geo: Geographical entities, e.g., cities, rivers, countries, etc.<br>
gpe: Geopolitical entities, e.g., cities, states, countries.<br>
nat: Natural phenomena, e.g., hurricanes, earthquakes.<br>
org: Organizations, e.g., companies, government organizations, etc.<br>
per: Persons.<br>
tim: Time indicators, e.g., dates, days, months, etc.

In [6]:
label_to_id = {l: i for i, l in enumerate(labels)}
id_to_label = {i: l for l, i in label_to_id.items()}

label_to_id

{'I-eve': 0,
 'B-per': 1,
 'B-tim': 2,
 'B-nat': 3,
 'I-gpe': 4,
 'I-per': 5,
 'I-nat': 6,
 'I-tim': 7,
 'B-eve': 8,
 'O': 9,
 'I-org': 10,
 'B-geo': 11,
 'I-art': 12,
 'B-org': 13,
 'B-art': 14,
 'B-gpe': 15,
 'I-geo': 16}

`padding` : If sequence length not reach maximum add **[PAD]** token

`max_length` : maximum sequence length in tokens

`truncation` : truncate sequence if it exceeds max_length

`return_tensors` : tensor return type

In [7]:
df['Sentence'][0].split()[1]

'of'

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokens = tokenizer(df['Sentence'][0], padding='max_length', truncation=True, max_length=128, return_tensors='pt')

tokens

{'input_ids': tensor([[  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
          1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
          3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

`input_ids` : numeric represnetation of tokens, where {101: **[CLS]**, 102: **[SEP]**, 0: }

`token_type_ids` : numeric representation of sequence, used in sequence classification or question answering 

`attention_mask` : Boolean for not **[PAD]** token, that is 1 for real tokens, else 0

In [9]:
tokenizer.decode(tokens['input_ids'][0])

2024-02-12 12:06:43.246905: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'[CLS] thousands of demonstrators have marched through london to protest the war in iraq and demand the withdrawal of british troops from that country. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'