In [1]:
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'conllpp' dataset; the result is a DatasetDict with train,
# validation and test splits (displayed below).
# NOTE(review): trust_remote_code=True presumably permits running the
# dataset's own loading script locally -- confirm the source is trusted.
data = load_dataset('conllpp', trust_remote_code=True)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [4]:
# Preview the tokens and integer NER tags of the first training example.
# Slicing a single row ([:1]) avoids copying the entire train split into a
# DataFrame just to display row 0; the displayed Series is unchanged.
pd.DataFrame(data['train'][:1])[['tokens', 'ner_tags']].iloc[0]

tokens      [EU, rejects, German, call, to, boycott, Briti...
ner_tags                          [3, 0, 7, 0, 0, 0, 7, 0, 0]
Name: 0, dtype: object

In [5]:
# ClassLabel feature describing the NER tag vocabulary of the train split.
tags = data['train'].features['ner_tags'].feature

# Lookup tables between integer tag ids and their string names.
index2tag = dict(enumerate(tags.names))
tag2index = {name: position for position, name in index2tag.items()}

In [6]:
tags.int2str(3)

'B-ORG'

In [7]:
def create_tag_names(batch):
    """Return a new 'ner_tags_str' column for one example.

    Each integer id in ``batch['ner_tags']`` is converted to its string
    name with ``tags.int2str`` (``tags`` is the notebook-level ClassLabel).
    """
    tag_strings = list(map(tags.int2str, batch['ner_tags']))
    return {'ner_tags_str': tag_strings}

In [8]:
data=data.map(create_tag_names)

In [9]:
# Show the first training example together with the new string-tag column.
# Slicing a single row ([:1]) avoids copying the entire train split into a
# DataFrame just to display row 0; the displayed Series is unchanged.
pd.DataFrame(data['train'][:1])[['tokens', 'ner_tags', 'ner_tags_str']].iloc[0]

tokens          [EU, rejects, German, call, to, boycott, Briti...
ner_tags                              [3, 0, 7, 0, 0, 0, 7, 0, 0]
ner_tags_str            [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]
Name: 0, dtype: object

# Model Building

In [10]:
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and for the model loaded later.
model_checkpoint = "distilbert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
tokenizer.is_fast

True

In [12]:
# Tokenize the first training sentence (already split into words) and show
# the tokens the tokenizer produced, including the special tokens.
inputs = tokenizer(data['train'][0]['tokens'], is_split_into_words=True)
print(inputs.tokens())

['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']


In [13]:
print(data['train'][0]['ner_tags_str'])
print(data['train'][0]['tokens'])

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']


In [14]:
print(inputs.word_ids())

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [15]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level labels to one label per token.

  Args:
    labels: sequence of integer labels, one per word.
    word_ids: for each token, the index of the word it belongs to, or
      None when the token belongs to no word (e.g. special tokens).

  Returns:
    A list with one label per token: -100 for tokens without a word, the
    word's label for its first token, and for every further token of the
    same word the word's label with odd ids bumped by one (in this
    notebook's tag list that turns a B-XXX id into the matching I-XXX id).
  """
  aligned = []
  previous_word = None
  for word_id in word_ids:
    # Tokens that map to no word are given the label -100.
    if word_id is None:
      aligned.append(-100)
      previous_word = None
      continue

    tag = labels[word_id]
    if word_id == previous_word:
      # Continuation of the same word: odd label ids move to the next id.
      if tag % 2 == 1:
        tag += 1
    else:
      # First token of a new word keeps the word's own label.
      previous_word = word_id
    aligned.append(tag)

  return aligned

In [16]:
labels=data['train'][0]['ner_tags']
word_ids=inputs.word_ids()
print(labels, word_ids)

[3, 0, 7, 0, 0, 0, 7, 0, 0] [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [17]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Args:
    examples: batch mapping with a 'tokens' entry (one word list per
      sentence) and an 'ner_tags' entry (one integer label list per sentence).

  Returns:
    The tokenizer output for the batch with an extra 'labels' entry that
    holds, per sentence, the result of ``align_labels_with_tokens``.
  """
  tokenized_inputs = tokenizer(
      examples['tokens'], truncation=True, is_split_into_words=True
  )

  # word_ids(i) gives the token -> word mapping of the i-th sentence.
  tokenized_inputs['labels'] = [
      align_labels_with_tokens(sentence_labels, tokenized_inputs.word_ids(row))
      for row, sentence_labels in enumerate(examples['ner_tags'])
  ]

  return tokenized_inputs

In [19]:
# Tokenize every split in batches. The original columns are removed, so the
# result only holds 'input_ids', 'attention_mask' and 'labels'.
tokenized_datasets = data.map(tokenize_and_align_labels, remove_columns=data['train'].column_names, batched=True)

Map: 100%|██████████| 14041/14041 [00:01<00:00, 8632.04 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 4102.75 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 5523.09 examples/s]


In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

## Data Collation and Metrics

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [27]:

batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,  7327, 19164,  2446,  2655,  2000, 17757,  2329, 12559,  1012,
           102],
        [  101,  2848, 13934,   102,     0,     0,     0,     0,     0,     0,
             0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100]])}

## Metrics

In [28]:
import evaluate
metric=evaluate.load('seqeval')

Downloading builder script: 100%|██████████| 6.34k/6.34k [00:00<00:00, 12.1MB/s]


In [29]:
ner_features = data['train'].features['ner_tags']
ner_features

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [30]:
label_names = ner_features.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [31]:
# Translate the first training example's integer tags into their names.
labels = [label_names[tag_id] for tag_id in data['train'][0]['ner_tags']]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [32]:
# Sanity-check the metric: start from a copy of the reference labels and
# replace one entity tag (index 2) with "O" to simulate a missed entity.
predictions = labels.copy()
predictions[2] = "O"

# Each argument is passed as a list containing a single tag sequence.
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': np.float64(1.0),
  'recall': np.float64(0.5),
  'f1': np.float64(0.6666666666666666),
  'number': np.int64(2)},
 'ORG': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(0.6666666666666666),
 'overall_f1': np.float64(0.8),
 'overall_accuracy': 0.8888888888888888}

In [33]:
import numpy as np

def compute_metrics(eval_preds):
  """Compute overall precision, recall, F1 and accuracy for an evaluation run.

  Args:
    eval_preds: pair ``(logits, labels)``; the predicted class is the
      argmax of ``logits`` over the last axis.

  Returns:
    Dict with 'precision', 'recall', 'f1' and 'accuracy', taken from the
    'overall_*' entries of ``metric.compute``.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # Reference tags as strings, skipping positions labelled -100.
  true_labels = []
  for label_row in labels:
    true_labels.append(
        [label_names[tag_id] for tag_id in label_row if tag_id != -100]
    )

  # Predicted tags as strings, kept only where the reference is not -100.
  true_predictions = []
  for prediction_row, label_row in zip(predictions, labels):
    kept = [
        label_names[predicted_id]
        for predicted_id, tag_id in zip(prediction_row, label_row)
        if tag_id != -100
    ]
    true_predictions.append(kept)

  all_metrics = metric.compute(
      predictions=true_predictions, references=true_labels
  )

  summary = {}
  summary["precision"] = all_metrics['overall_precision']
  summary["recall"] = all_metrics['overall_recall']
  summary["f1"] = all_metrics['overall_f1']
  summary["accuracy"] = all_metrics['overall_accuracy']
  return summary

# Model Training

In [34]:
# Id <-> name mappings for the NER tags, passed to the model when it is loaded.
id2label = dict(enumerate(label_names))
label2id = {name: position for position, name in id2label.items()}

In [35]:

print(id2label)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [36]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint for token classification, passing both label
# mappings; model.config.num_labels reports 9 afterwards.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label=id2label, label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
model.config.num_labels

9

In [38]:
from transformers import TrainingArguments

# Training configuration: outputs go to "distilbert-finetuned-ner", with an
# evaluation and a checkpoint at the end of every epoch.
# `eval_strategy` is the current name of the deprecated
# `evaluation_strategy` argument.
args = TrainingArguments("distilbert-finetuned-ner",
                         eval_strategy="epoch",
                         save_strategy="epoch",
                         learning_rate=2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)



In [39]:
from transformers import Trainer
# Wire the model, data, collator and metric function into a Trainer.
# `processing_class` is the current name of Trainer's deprecated `tokenizer`
# argument; passing the tokenizer here avoids the deprecation warning.
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=tokenized_datasets['train'],
                  eval_dataset=tokenized_datasets['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  processing_class=tokenizer)

# Fine-tune; validation metrics are reported after every epoch.
trainer.train()

  trainer = Trainer(model=model,
 10%|▉         | 501/5268 [01:16<11:49,  6.72it/s]

{'loss': 0.2706, 'grad_norm': 3.0364344120025635, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}


 19%|█▉        | 1001/5268 [02:31<11:12,  6.35it/s]

{'loss': 0.093, 'grad_norm': 0.1904825121164322, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}


 28%|██▊       | 1501/5268 [03:47<08:39,  7.25it/s]

{'loss': 0.0756, 'grad_norm': 2.2823235988616943, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}


                                                   
 33%|███▎      | 1756/5268 [04:44<08:42,  6.72it/s]

{'eval_loss': 0.05920746177434921, 'eval_precision': 0.9054031860732469, 'eval_recall': 0.927802086839448, 'eval_f1': 0.9164657966918793, 'eval_accuracy': 0.9830492318934977, 'eval_runtime': 18.8458, 'eval_samples_per_second': 172.452, 'eval_steps_per_second': 21.596, 'epoch': 1.0}


 38%|███▊      | 2001/5268 [05:26<07:43,  7.05it/s]  

{'loss': 0.0577, 'grad_norm': 2.2862045764923096, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}


 47%|████▋     | 2501/5268 [06:41<06:23,  7.21it/s]

{'loss': 0.0415, 'grad_norm': 1.307584524154663, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}


 57%|█████▋    | 3001/5268 [07:55<05:32,  6.83it/s]

{'loss': 0.0419, 'grad_norm': 0.04176468402147293, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}


 66%|██████▋   | 3501/5268 [09:10<04:32,  6.49it/s]

{'loss': 0.0356, 'grad_norm': 3.192499876022339, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}


                                                   
 67%|██████▋   | 3512/5268 [09:30<04:02,  7.23it/s]

{'eval_loss': 0.05554986745119095, 'eval_precision': 0.923305785123967, 'eval_recall': 0.9400875126220128, 'eval_f1': 0.9316210807204804, 'eval_accuracy': 0.985209779655901, 'eval_runtime': 18.6093, 'eval_samples_per_second': 174.644, 'eval_steps_per_second': 21.871, 'epoch': 2.0}


 76%|███████▌  | 4001/5268 [10:49<03:16,  6.45it/s]  

{'loss': 0.022, 'grad_norm': 0.04176066815853119, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}


 85%|████████▌ | 4501/5268 [12:03<01:35,  8.03it/s]

{'loss': 0.0206, 'grad_norm': 2.5300686359405518, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}


 95%|█████████▍| 5001/5268 [13:16<00:43,  6.19it/s]

{'loss': 0.0236, 'grad_norm': 4.52792501449585, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


                                                   
100%|██████████| 5268/5268 [14:29<00:00,  6.29it/s]

{'eval_loss': 0.0586426667869091, 'eval_precision': 0.9278008298755187, 'eval_recall': 0.9407606866374958, 'eval_f1': 0.9342358151583522, 'eval_accuracy': 0.9856387119322605, 'eval_runtime': 22.9091, 'eval_samples_per_second': 141.865, 'eval_steps_per_second': 17.766, 'epoch': 3.0}


100%|██████████| 5268/5268 [14:33<00:00,  6.03it/s]

{'train_runtime': 873.3111, 'train_samples_per_second': 48.234, 'train_steps_per_second': 6.032, 'train_loss': 0.06571457054729071, 'epoch': 3.0}





TrainOutput(global_step=5268, training_loss=0.06571457054729071, metrics={'train_runtime': 873.3111, 'train_samples_per_second': 48.234, 'train_steps_per_second': 6.032, 'total_flos': 445994355589020.0, 'train_loss': 0.06571457054729071, 'epoch': 3.0})

In [41]:
from transformers import pipeline

# NOTE(review): the step number in this path (5268) is the final global_step
# reported by the training run above; update it if the number of training
# steps changes (e.g. different epochs or batch size).
checkpoint = "distilbert-finetuned-ner/checkpoint-5268"
token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="simple"
)

# Try the fine-tuned checkpoint on a hand-written sentence.
token_classifier("My name is Laxmi Kant Tiwari. I work at KGP Talkie and live in Mumbai")

Device set to use cuda:0


[{'entity_group': 'PER',
  'score': np.float32(0.99818516),
  'word': 'laxmi kant tiwari',
  'start': 11,
  'end': 28},
 {'entity_group': 'ORG',
  'score': np.float32(0.97350466),
  'word': 'kgp talkie',
  'start': 40,
  'end': 50},
 {'entity_group': 'LOC',
  'score': np.float32(0.9980684),
  'word': 'mumbai',
  'start': 63,
  'end': 69}]