In [None]:
# !pip install -U datasets
!pip install seqeval
# !pip install transformers
!pip install evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=f0884418681c460553c94061ce8f2c45f6fb7da28d3880ec1670df530d027354
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.0.0-py3-none-any.whl.m

In [None]:
import evaluate
import time
import random
import numpy as np
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, ClassLabel
from functools import partial

In [None]:
import transformers

print(transformers.__version__)

4.44.2


In [None]:
SEED = 42
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(SEED)

## Dataset Ingestion

In [None]:
!unzip ner_datasetreference.csv.zip

Archive:  ner_datasetreference.csv.zip
  inflating: ner_datasetreference.csv  


In [None]:
data_path = r"ner_datasetreference.csv"
ner_df = pd.read_csv(data_path, encoding='unicode_escape')

In [None]:
ner_df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [None]:
tag_counts =ner_df['Tag'].value_counts().to_dict()

In [None]:
entity_counts = {}
for tag, freq in tag_counts.items():
  if tag!= 'O' and tag.startswith("B"):
    if tag[2:] not in entity_counts:
      entity_counts[tag[2:]] = 0
    entity_counts[tag[2:]] += freq
entity_counts

{'geo': 37644,
 'tim': 20333,
 'org': 20143,
 'per': 16990,
 'gpe': 15870,
 'art': 402,
 'eve': 308,
 'nat': 201}

In [None]:
remove_cols = ['B-art', 'I-art', 'B-eve', 'I-eve', 'B-nat', 'I-nat']
ner_df = ner_df[~ner_df['Tag'].isin(remove_cols)]
ner_df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [None]:
ner_df['Tag'].value_counts()

Unnamed: 0_level_0,count
Tag,Unnamed: 1_level_1
O,887908
B-geo,37644
B-tim,20333
B-org,20143
I-per,17251
B-per,16990
I-org,16784
B-gpe,15870
I-geo,7414
I-tim,6528


In [None]:
# ner_df['Sentence #'] = ner_df['Sentence #'].ffill()

In [None]:
len(ner_df['Sentence #'].unique())

47921

In [None]:
ner_df = ner_df.fillna(method = 'ffill')

  ner_df = ner_df.fillna(method = 'ffill')


In [None]:
ner_df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [None]:
# ner_df = ner_df.astype({'Word': str, 'Tag': str})

In [None]:
ner_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1047063 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  1047063 non-null  object
 1   Word        1047063 non-null  object
 2   POS         1047063 non-null  object
 3   Tag         1047063 non-null  object
dtypes: object(4)
memory usage: 39.9+ MB


In [None]:
gk = ner_df.groupby(by = 'Sentence #')

In [None]:
# new_ner_df = pd.DataFrame(columns = ['Sentence #', 'Sentence', 'labels'])

In [None]:
# s_time = time.time()
# sentence_no = []
# sentences = []
# labels = []
# for sno in ner_df['Sentence #'].unique():
#   g_df = gk.get_group(sno)
#   x = ' '.join(map(str, g_df['Word']))
#   sentence_no.append(sno)
#   sentences.append(x)
#   labels.append(','.join(g_df['Tag']))
# new_ner_df['Sentence #'] = sentence_no
# new_ner_df['Sentence'] = sentences
# new_ner_df['labels'] = labels
# e_time = time.time()
# print("elapsed time: ", (e_time - s_time)/60)

In [None]:
gk = ner_df.groupby('Sentence #')
s_time = time.time()
ner_df['Sentence'] = gk['Word'].transform(lambda x: '[SEP]'.join([x_dash.strip() for x_dash in x]))
ner_df['ner_tags'] = gk['Tag'].transform(lambda x: '[SEP]'.join([x_dash.strip() for x_dash in x]))
new_ner_df = ner_df[['Sentence', 'ner_tags']].drop_duplicates().reset_index(drop=True)
# print(new_ner_df)
e_time = time.time()
new_ner_df['tokens'] = new_ner_df['Sentence'].apply(lambda x: x.split('[SEP]'))
new_ner_df['Sentence'] = new_ner_df['Sentence'].apply(lambda x: x.replace("[SEP]", " "))
new_ner_df['ner_tags'] = new_ner_df['ner_tags'].apply(lambda x: x.split('[SEP]'))
print("elapsed time: ", (e_time - s_time)/60)
# new_ner_df = ['Word'].transform(lambda x: ' '.join(x))

elapsed time:  0.2149635155995687


In [None]:
new_ner_df

Unnamed: 0,Sentence,ner_tags,tokens
0,Thousands of demonstrators have marched throug...,"[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[Thousands, of, demonstrators, have, marched, ..."
1,Families of soldiers killed in the conflict jo...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Families, of, soldiers, killed, in, the, conf..."
2,They marched from the Houses of Parliament to ...,"[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo...","[They, marched, from, the, Houses, of, Parliam..."
3,"Police put the number of marchers at 10,000 wh...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[Police, put, the, number, of, marchers, at, 1..."
4,The protest comes on the eve of the annual con...,"[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,...","[The, protest, comes, on, the, eve, of, the, a..."
...,...,...,...
47566,Indian border security forces are accusing the...,"[B-gpe, O, O, O, O, O, O, B-gpe, O, O, O, O, O...","[Indian, border, security, forces, are, accusi..."
47567,Indian officials said no one was injured in Sa...,"[B-gpe, O, O, O, O, O, O, O, B-tim, O, O, O, O...","[Indian, officials, said, no, one, was, injure..."
47568,Two more landed in fields belonging to a nearb...,"[O, O, O, O, O, O, O, O, O, O, O]","[Two, more, landed, in, fields, belonging, to,..."
47569,They say not all of the rockets exploded upon ...,"[O, O, O, O, O, O, O, O, O, O, O]","[They, say, not, all, of, the, rockets, explod..."


In [None]:
new_ner_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47571 entries, 0 to 47570
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  47571 non-null  object
 1   ner_tags  47571 non-null  object
 2   tokens    47571 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [None]:
new_ner_df['len_tokens'] = new_ner_df['tokens'].apply(lambda x: len(x))
new_ner_df['len_labels'] = new_ner_df['ner_tags'].apply(lambda x: len(x))

In [None]:
new_ner_df['check'] = new_ner_df['len_tokens'] == new_ner_df['len_labels']
new_ner_df[new_ner_df['check'] == False]

Unnamed: 0,Sentence,ner_tags,tokens,len_tokens,len_labels,check


In [None]:
new_ner_df.drop(columns = ['check', 'len_tokens', 'len_labels'], axis = 1, inplace = True)

In [None]:
# new_ner_df = new_ner_df.sample(n = 2000, random_state = SEED)

In [None]:

train, test = train_test_split(new_ner_df, test_size=0.20, random_state=SEED, shuffle=True)
train, val = train_test_split(train, test_size=0.10, random_state = SEED)

In [None]:
train.shape, test.shape, val.shape

((34250, 3), (9515, 3), (3806, 3))

In [None]:
train

Unnamed: 0,Sentence,ner_tags,tokens
18147,The witnesses say the Yemeni community was ele...,"[O, O, O, O, B-gpe, O, O, O, O, O, O, O, O, B-...","[The, witnesses, say, the, Yemeni, community, ..."
14510,The Uri Party vowed to push through a bill rep...,"[O, B-per, I-per, O, O, O, O, O, O, O, O, O, O...","[The, Uri, Party, vowed, to, push, through, a,..."
13442,The release is part of a promise Israeli Prime...,"[O, O, O, O, O, O, O, B-gpe, B-per, I-per, I-p...","[The, release, is, part, of, a, promise, Israe..."
3706,Officials in Thailand say at least 47 Burmese ...,"[O, O, B-geo, O, O, O, O, B-gpe, O, O, O, O, O...","[Officials, in, Thailand, say, at, least, 47, ..."
7799,The U.S. military says a man dressed in an Afg...,"[O, B-geo, O, O, O, O, O, O, O, B-gpe, O, O, O...","[The, U.S., military, says, a, man, dressed, i..."
...,...,...,...
38224,"North of Baghdad , near Baquba , gunmen attack...","[B-org, I-org, I-org, O, O, B-geo, O, O, O, O,...","[North, of, Baghdad, ,, near, Baquba, ,, gunme..."
39066,Eight died from their injuries .,"[O, O, O, O, O, O]","[Eight, died, from, their, injuries, .]"
11361,The emergency decree authorizes a midnight-to-...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[The, emergency, decree, authorizes, a, midnig..."
39610,"But soon after signing the agreement , the Nor...","[O, O, O, O, O, O, O, O, B-geo, O, B-geo, O, O...","[But, soon, after, signing, the, agreement, ,,..."


In [None]:
x = train.loc[100].to_list()
list(zip(x[1], x[2]))

[('O', 'The'),
 ('B-gpe', 'Pakistani'),
 ('O', 'military'),
 ('O', 'launched'),
 ('O', 'its'),
 ('O', 'offensive'),
 ('O', 'in'),
 ('B-geo', 'Orakzai'),
 ('O', 'to'),
 ('O', 'hunt'),
 ('B-org', 'Taliban'),
 ('O', 'insurgents'),
 ('O', '.')]

In [None]:
ds = DatasetDict()

In [None]:
train_ner_ds = Dataset.from_pandas(train)
test_ner_ds = Dataset.from_pandas(test)
val_ner_ds = Dataset.from_pandas(val)

ds['train'] = train_ner_ds
ds['test'] = test_ner_ds
ds['val'] = val_ner_ds

In [None]:
ds['train'] = ds['train'].remove_columns('__index_level_0__')
ds['test'] = ds['test'].remove_columns('__index_level_0__')
ds['val'] = ds['val'].remove_columns('__index_level_0__')

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'ner_tags', 'tokens'],
        num_rows: 34250
    })
    test: Dataset({
        features: ['Sentence', 'ner_tags', 'tokens'],
        num_rows: 9515
    })
    val: Dataset({
        features: ['Sentence', 'ner_tags', 'tokens'],
        num_rows: 3806
    })
})

In [None]:
len(ds['train'][0]['ner_tags']), len(ds['train'][0]['tokens'])

(25, 25)

In [None]:
for tag, token in zip(ds['train'][1]['ner_tags'], ds['train'][1]['tokens']):
  print(tag, ":", token)

O : The
B-per : Uri
I-per : Party
O : vowed
O : to
O : push
O : through
O : a
O : bill
O : repealing
O : the
O : law
O : last
O : year
O : ,
O : but
O : the
O : opposition
B-org : Grand
I-org : National
I-org : Party
O : blocked
O : the
O : legislation
O : ,
O : saying
B-geo : North
I-geo : Korea
O : still
O : poses
O : a
O : threat
O : to
O : national
O : security
O : .


In [None]:
ds['train'][0]['Sentence']

'The witnesses say the Yemeni community was electing a local leader in southern Mogadishu Thursday when a grenade was thrown into the meeting hall .'

In [None]:
ds['train'].features['ner_tags']

Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

In [None]:
label_names = list(ner_df['Tag'].unique())
print(len(label_names))
label_names = set([label.replace("B-", "").replace("I-", "") for label in label_names])
new_label_names = ['O']
for label in label_names:
  if label != 'O':
    new_label_names.extend(["B-"+label, "I-"+label])
label_names = new_label_names
label_names

11


['O',
 'B-gpe',
 'I-gpe',
 'B-tim',
 'I-tim',
 'B-geo',
 'I-geo',
 'B-org',
 'I-org',
 'B-per',
 'I-per']

In [None]:
ix_to_label = {k:v for k, v in enumerate(label_names)}
label_to_ix = {k:v for v,k in ix_to_label.items()}
print(ix_to_label)
print(label_to_ix)

{0: 'O', 1: 'B-gpe', 2: 'I-gpe', 3: 'B-tim', 4: 'I-tim', 5: 'B-geo', 6: 'I-geo', 7: 'B-org', 8: 'I-org', 9: 'B-per', 10: 'I-per'}
{'O': 0, 'B-gpe': 1, 'I-gpe': 2, 'B-tim': 3, 'I-tim': 4, 'B-geo': 5, 'I-geo': 6, 'B-org': 7, 'I-org': 8, 'B-per': 9, 'I-per': 10}


In [None]:
def map_label2id(ner_tags, label_to_ix):
    return [label_to_ix[label] for label in ner_tags]

def map_label2id_batches(examples, label_to_ix):
  # print(len(examples))
  all_ner_tags = examples['ner_tags']
  labels = []
  for ner_tags in all_ner_tags:
    # print(example)
    labels.append(map_label2id(ner_tags, label_to_ix))
  examples['labels'] = labels
  return examples

# ds = ds.map(partial(map_label2id, label_to_ix = label_to_ix))
ds = ds.map(partial(map_label2id_batches, label_to_ix = label_to_ix), batched = True)
# ds['train'] = ds['train'].map(map_label2id)
# ds['test'] = ds['test'].map(map_label2id)
# ds['val'] = ds['val'].map(map_label2id)

Map:   0%|          | 0/34250 [00:00<?, ? examples/s]

Map:   0%|          | 0/9515 [00:00<?, ? examples/s]

Map:   0%|          | 0/3806 [00:00<?, ? examples/s]

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'ner_tags', 'tokens', 'labels'],
        num_rows: 34250
    })
    test: Dataset({
        features: ['Sentence', 'ner_tags', 'tokens', 'labels'],
        num_rows: 9515
    })
    val: Dataset({
        features: ['Sentence', 'ner_tags', 'tokens', 'labels'],
        num_rows: 3806
    })
})

In [None]:
for label_id, token in zip(ds['train'][0]['labels'], ds['train'][0]['tokens']):
  print(label_id, ":", ix_to_label[label_id], ":", token)

0 : O : The
0 : O : witnesses
0 : O : say
0 : O : the
1 : B-gpe : Yemeni
0 : O : community
0 : O : was
0 : O : electing
0 : O : a
0 : O : local
0 : O : leader
0 : O : in
0 : O : southern
5 : B-geo : Mogadishu
3 : B-tim : Thursday
0 : O : when
0 : O : a
0 : O : grenade
0 : O : was
0 : O : thrown
0 : O : into
0 : O : the
0 : O : meeting
0 : O : hall
0 : O : .


In [None]:
# # Creating a ClassLabel Object
# df = dataset["train"].to_pandas()
# labels = df['label'].unique().tolist()
# ClassLabels = ClassLabel(num_classes=len(labels), names=labels)

# # Mapping Labels to IDs
# def map_label2id(example):
#     example['label'] = ClassLabels.str2int(example['label'])
#     return example

# dataset = dataset.map(map_label2id, batched=True)

# # Casting label column to ClassLabel Object
# dataset = dataset.cast_column('label', ClassLabels)

In [None]:
# # Creating a ClassLabel Object
# ClassLabels = ClassLabel(num_classes=len(labels), names=labels)

# # Mapping Labels to IDs
# def map_label2id(example):
#     example['labels'] = ClassLabels.str2int(example['labels'])
#     return example

# ds['train'] = ds['train'].map(map_label2id, batched=True)

# # # Casting label column to ClassLabel Object
# # dataset = dataset.cast_column('label', ClassLabels)

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [None]:
inputs = tokenizer(ds["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'The',
 'witnesses',
 'say',
 'the',
 'Yemen',
 '##i',
 'community',
 'was',
 'elect',
 '##ing',
 'a',
 'local',
 'leader',
 'in',
 'southern',
 'Mo',
 '##ga',
 '##dis',
 '##hu',
 'Thursday',
 'when',
 'a',
 'grenade',
 'was',
 'thrown',
 'into',
 'the',
 'meeting',
 'hall',
 '.',
 '[SEP]']

In [None]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 4,
 5,
 6,
 7,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 13,
 13,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            # label_name = ix_to_label[label]
            if label % 2 == 1:
            # if label_name.startswith("B-"):
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
new_labels = ds["train"][0]["labels"]
# print(labels)
word_ids = inputs.word_ids()
print(new_labels, len(new_labels))
print(word_ids, len(word_ids))
align_label_tokens = align_labels_with_tokens(new_labels, word_ids)
print(align_label_tokens,len(align_label_tokens))
print(inputs.tokens(), len(inputs.tokens()))

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 25
[None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 13, 13, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, None] 32
[-100, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100] 32
['[CLS]', 'The', 'witnesses', 'say', 'the', 'Yemen', '##i', 'community', 'was', 'elect', '##ing', 'a', 'local', 'leader', 'in', 'southern', 'Mo', '##ga', '##dis', '##hu', 'Thursday', 'when', 'a', 'grenade', 'was', 'thrown', 'into', 'the', 'meeting', 'hall', '.', '[SEP]'] 32


In [None]:
align_label_name_tokens = [ix_to_label[label_id] if label_id != -100 else None for label_id in align_label_tokens]

In [None]:
# align_label_name_tokens

In [None]:
list(zip(word_ids, inputs.tokens(), align_label_tokens, align_label_name_tokens))

[(None, '[CLS]', -100, None),
 (0, 'The', 0, 'O'),
 (1, 'witnesses', 0, 'O'),
 (2, 'say', 0, 'O'),
 (3, 'the', 0, 'O'),
 (4, 'Yemen', 1, 'B-gpe'),
 (4, '##i', 2, 'I-gpe'),
 (5, 'community', 0, 'O'),
 (6, 'was', 0, 'O'),
 (7, 'elect', 0, 'O'),
 (7, '##ing', 0, 'O'),
 (8, 'a', 0, 'O'),
 (9, 'local', 0, 'O'),
 (10, 'leader', 0, 'O'),
 (11, 'in', 0, 'O'),
 (12, 'southern', 0, 'O'),
 (13, 'Mo', 5, 'B-geo'),
 (13, '##ga', 6, 'I-geo'),
 (13, '##dis', 6, 'I-geo'),
 (13, '##hu', 6, 'I-geo'),
 (14, 'Thursday', 3, 'B-tim'),
 (15, 'when', 0, 'O'),
 (16, 'a', 0, 'O'),
 (17, 'grenade', 0, 'O'),
 (18, 'was', 0, 'O'),
 (19, 'thrown', 0, 'O'),
 (20, 'into', 0, 'O'),
 (21, 'the', 0, 'O'),
 (22, 'meeting', 0, 'O'),
 (23, 'hall', 0, 'O'),
 (24, '.', 0, 'O'),
 (None, '[SEP]', -100, None)]

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
tokenized_datasets = ds.map(tokenize_and_align_labels,
    batched=True,
    remove_columns=ds["train"].column_names,
)

Map:   0%|          | 0/34250 [00:00<?, ? examples/s]

Map:   0%|          | 0/9515 [00:00<?, ? examples/s]

Map:   0%|          | 0/3806 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 34250
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9515
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3806
    })
})

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding = "longest")

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    1,    2,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    5,    6,    6,    6,    3,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0, -100, -100, -100, -100, -100,
         -100, -100, -100, -100],
        [-100,    0,    9,   10,   10,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    7,    8,    8,
            0,    0,    0,    0,    0,    5,    6,    0,    0,    0,    0,    0,
            0,    0,    0, -100]])

In [None]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 9, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [None]:
# labels = ds["train"][0]["labels"]
# print(labels)
# labels = [label_names[i] for i in labels]
# labels

In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
# predictions = labels.copy()
# predictions[4] = "O"
# metric.compute(predictions=[predictions], references=[labels])

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [None]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
id2label

{0: 'O',
 1: 'B-gpe',
 2: 'I-gpe',
 3: 'B-tim',
 4: 'I-tim',
 5: 'B-geo',
 6: 'I-geo',
 7: 'B-org',
 8: 'I-org',
 9: 'B-per',
 10: 'I-per'}

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.num_labels

11

In [None]:
# !rm -r /content/model/

In [None]:
trained_model_path = r"/content/model"

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    # "bert-finetuned-ner",
    output_dir = trained_model_path,
    gradient_accumulation_steps = 8,
    load_best_model_at_end = True,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    weight_decay=0.01,
    greater_is_better = True,
    overwrite_output_dir = True,
    save_total_limit = 1,
    logging_strategy = "epoch",
    metric_for_best_model = "f1", #f1
    # push_to_hub=True,
)



In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    # test_dataset = tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,0.2699,0.131671,0.782186,0.815062,0.798286,0.959694
1,0.1293,0.118247,0.794586,0.828075,0.810985,0.962911
2,0.1138,0.114419,0.808204,0.832681,0.82026,0.964608
3,0.1056,0.113813,0.809131,0.836711,0.82269,0.965178


TrainOutput(global_step=2140, training_loss=0.15462612972081263, metrics={'train_runtime': 1268.4197, 'train_samples_per_second': 108.008, 'train_steps_per_second': 1.687, 'total_flos': 2926766906005140.0, 'train_loss': 0.15462612972081263, 'epoch': 3.998131714152265})

In [None]:
!zip -r /content/model.zip /content/model

  adding: content/model/ (stored 0%)
  adding: content/model/runs/ (stored 0%)
  adding: content/model/runs/Sep18_19-46-06_2e86667e9070/ (stored 0%)
  adding: content/model/runs/Sep18_19-46-06_2e86667e9070/events.out.tfevents.1726688773.2e86667e9070.1285.7 (deflated 61%)
  adding: content/model/checkpoint-2140/ (stored 0%)
  adding: content/model/checkpoint-2140/special_tokens_map.json (deflated 42%)
  adding: content/model/checkpoint-2140/optimizer.pt (deflated 15%)
  adding: content/model/checkpoint-2140/tokenizer_config.json (deflated 76%)
  adding: content/model/checkpoint-2140/model.safetensors (deflated 7%)
  adding: content/model/checkpoint-2140/training_args.bin (deflated 51%)
  adding: content/model/checkpoint-2140/vocab.txt (deflated 49%)
  adding: content/model/checkpoint-2140/rng_state.pth (deflated 25%)
  adding: content/model/checkpoint-2140/scheduler.pt (deflated 56%)
  adding: content/model/checkpoint-2140/trainer_state.json (deflated 68%)
  adding: content/model/checkp

In [None]:
from google.colab import files
files.download("/content/model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from torch.utils.data import DataLoader

# train_dataloader = DataLoader(
#     tokenized_datasets["train"],
#     shuffle=True,
#     collate_fn=data_collator,
#     batch_size=8,
# )

#load model
def load_saved_model(model_checkpoint, device):
  model = AutoModelForTokenClassification.from_pretrained(
      model_checkpoint,
  )
  model.to(device)
  return model

def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

def evaluate(model, test_dataloader, device):
  for i, batch in enumerate(test_dataloader):
    batch = {k:v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    # print("outputs: ", outputs.logits.shape)
    prob = torch.nn.functional.softmax(outputs.logits, dim = -1)
    # print("prob: ", prob.shape)
    predictions = torch.argmax(prob, dim = -1)
    labels = batch["labels"]
    # print("predictions: ", predictions.shape)
    # print("labels: ", labels.shape)
    true_predictions, true_labels = postprocess(predictions, labels)
    # print("true_predictions: ", true_predictions)
    # print("true_labels: ", true_labels)
    metric.add_batch(predictions=true_predictions, references=true_labels)
  results = metric.compute()
  print(
      {
          key: results[f"overall_{key}"]
          for key in ["precision", "recall", "f1", "accuracy"]
      })





In [None]:
model_checkpoint = r"/content/model/checkpoint-2140"
device = torch.device('cuda') if torch.cuda.is_available else torch.device('cpu')
test_dataloader = DataLoader(
    tokenized_datasets["test"], collate_fn=data_collator, batch_size=8
)
model = load_saved_model(model_checkpoint, device)

In [None]:
evaluate(model, test_dataloader, device)

{'precision': 0.8369706475464934, 'recall': 0.8066773204336372, 'f1': 0.8215448227324712, 'accuracy': 0.9619527962077432}


In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
# model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'per',
  'score': 0.9507495,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'geo',
  'score': 0.9831645,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]