In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
!pip install transformers datasets



In [3]:
import nltk
from nltk.corpus import brown

In [4]:
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [5]:
corpus = brown.tagged_sents(tagset='universal')
corpus

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [6]:
inputs = []
targets = []

for sentence_tag_pairs in corpus:
  tokens = []
  target = []
  for token, tag in sentence_tag_pairs:
    tokens.append(token)
    target.append(tag)
  inputs.append(tokens)
  targets.append(target)

In [7]:
# save data to json format
import json

with open('data.json', 'w') as f:
  for x, y in zip(inputs, targets):
    j = {'inputs': x, 'targets': y}
    s = json.dumps(j)
    f.write(f"{s}\n")

In [8]:
from datasets import load_dataset

In [9]:
data = load_dataset("json", data_files='data.json')

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [11]:
small = data["train"].shuffle(seed=42).select(range(20_000))
small

Dataset({
    features: ['inputs', 'targets'],
    num_rows: 20000
})

In [12]:
data = small.train_test_split(seed=42)

In [13]:
data["train"][0]

{'inputs': ['Ulyate',
  'and',
  'Kearton',
  'climbed',
  'on',
  'toward',
  'the',
  'sound',
  'of',
  'the',
  'barking',
  'of',
  'the',
  'dogs',
  'and',
  'the',
  'sporadic',
  'roaring',
  'of',
  'the',
  'lion',
  ',',
  'till',
  'they',
  'came',
  ',',
  'out',
  'of',
  'breath',
  ',',
  'to',
  'the',
  'crest',
  ',',
  'and',
  'peering',
  'through',
  'the',
  'branches',
  'of',
  'a',
  'bush',
  ',',
  'this',
  'is',
  'what',
  'Ulyate',
  'saw',
  ':',
  'Jones',
  'who',
  'had',
  'apparently',
  '(',
  'and',
  'actually',
  'had',
  ')',
  'ridden',
  'up',
  'the',
  'nearly',
  'impassable',
  'hillside',
  ',',
  'sitting',
  'calmly',
  'on',
  'his',
  'horse',
  'within',
  'forty',
  'feet',
  'of',
  'a',
  'full-grown',
  'young',
  'lioness',
  ',',
  'who',
  'was',
  'crouched',
  'on',
  'a',
  'flat',
  'rock',
  'and',
  'seemed',
  'just',
  'about',
  'to',
  'charge',
  'him',
  ',',
  'while',
  'the',
  'dogs',
  'whirled',
  'aroun

In [14]:
data["train"].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [15]:
# map targets to ints
target_set = set()
for target in targets:
  target_set = target_set.union(target)
target_set

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [16]:
target_list = list(target_set)
id2label = {k: v for k, v in enumerate(target_list)}
label2id = {v: k for k, v in id2label.items()}

In [17]:
from transformers import AutoTokenizer

# also try using bert
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [18]:
idx = 0
t = tokenizer(data["train"][idx]["inputs"], is_split_into_words=True)
t

{'input_ids': [101, 158, 25928, 1566, 1105, 26835, 9349, 1320, 5998, 1113, 1755, 1103, 1839, 1104, 1103, 26635, 1104, 1103, 6363, 1105, 1103, 188, 27695, 23041, 1104, 1103, 11160, 117, 6174, 1152, 1338, 117, 1149, 1104, 2184, 117, 1106, 1103, 13468, 117, 1105, 19205, 1194, 1103, 5020, 1104, 170, 13771, 117, 1142, 1110, 1184, 158, 25928, 1566, 1486, 131, 2690, 1150, 1125, 4547, 113, 1105, 2140, 1125, 114, 17698, 1146, 1103, 2212, 24034, 11192, 1895, 25068, 117, 2807, 13285, 1113, 1117, 3241, 1439, 5808, 1623, 1104, 170, 1554, 118, 4215, 1685, 11160, 5800, 117, 1150, 1108, 15062, 1113, 170, 3596, 2067, 1105, 1882, 1198, 1164, 1106, 2965, 1140, 117, 1229, 1103, 6363, 18370, 1213, 1123, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [19]:
type(t)

transformers.tokenization_utils_base.BatchEncoding

In [20]:
t.tokens()

['[CLS]',
 'U',
 '##lya',
 '##te',
 'and',
 'Ke',
 '##art',
 '##on',
 'climbed',
 'on',
 'toward',
 'the',
 'sound',
 'of',
 'the',
 'barking',
 'of',
 'the',
 'dogs',
 'and',
 'the',
 's',
 '##poradic',
 'roaring',
 'of',
 'the',
 'lion',
 ',',
 'till',
 'they',
 'came',
 ',',
 'out',
 'of',
 'breath',
 ',',
 'to',
 'the',
 'crest',
 ',',
 'and',
 'peering',
 'through',
 'the',
 'branches',
 'of',
 'a',
 'bush',
 ',',
 'this',
 'is',
 'what',
 'U',
 '##lya',
 '##te',
 'saw',
 ':',
 'Jones',
 'who',
 'had',
 'apparently',
 '(',
 'and',
 'actually',
 'had',
 ')',
 'ridden',
 'up',
 'the',
 'nearly',
 'imp',
 '##ass',
 '##able',
 'hillside',
 ',',
 'sitting',
 'calmly',
 'on',
 'his',
 'horse',
 'within',
 'forty',
 'feet',
 'of',
 'a',
 'full',
 '-',
 'grown',
 'young',
 'lion',
 '##ess',
 ',',
 'who',
 'was',
 'crouched',
 'on',
 'a',
 'flat',
 'rock',
 'and',
 'seemed',
 'just',
 'about',
 'to',
 'charge',
 'him',
 ',',
 'while',
 'the',
 'dogs',
 'whirled',
 'around',
 'her',
 '.',
 

In [21]:
# value of i indicates it is the i'th word
# in the input sentence (counting from 0)
t.word_ids()


[None,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 46,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 62,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 75,
 75,
 76,
 77,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 None]

In [22]:
def align_targets(labels, word_ids):
  aligned_labels = []
  for word in word_ids:
    if word is None:
      # it's a token like [CLS]
      label = -100
    else:
      # it's a real word
      label = label2id[labels[word]]

    # add the label 
    aligned_labels.append(label)

  return aligned_labels

In [23]:
# try our function
labels = data['train'][idx]['targets']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

[-100,
 2,
 2,
 2,
 8,
 2,
 2,
 2,
 9,
 5,
 10,
 4,
 2,
 10,
 4,
 2,
 10,
 4,
 2,
 8,
 4,
 0,
 0,
 2,
 10,
 4,
 2,
 11,
 10,
 6,
 9,
 11,
 5,
 10,
 2,
 11,
 10,
 4,
 2,
 11,
 8,
 9,
 10,
 4,
 2,
 10,
 4,
 2,
 11,
 4,
 9,
 4,
 2,
 2,
 2,
 9,
 11,
 2,
 6,
 9,
 1,
 11,
 8,
 1,
 9,
 11,
 9,
 10,
 4,
 1,
 0,
 0,
 0,
 2,
 11,
 9,
 1,
 10,
 4,
 2,
 10,
 3,
 2,
 10,
 4,
 0,
 0,
 0,
 0,
 2,
 2,
 11,
 6,
 9,
 9,
 10,
 4,
 0,
 2,
 8,
 9,
 1,
 1,
 5,
 9,
 6,
 11,
 10,
 4,
 2,
 9,
 10,
 6,
 11,
 -100]

In [24]:
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
  print(f"{x}\t{y}")

[CLS]	None
U	NOUN
##lya	NOUN
##te	NOUN
and	CONJ
Ke	NOUN
##art	NOUN
##on	NOUN
climbed	VERB
on	PRT
toward	ADP
the	DET
sound	NOUN
of	ADP
the	DET
barking	NOUN
of	ADP
the	DET
dogs	NOUN
and	CONJ
the	DET
s	ADJ
##poradic	ADJ
roaring	NOUN
of	ADP
the	DET
lion	NOUN
,	.
till	ADP
they	PRON
came	VERB
,	.
out	PRT
of	ADP
breath	NOUN
,	.
to	ADP
the	DET
crest	NOUN
,	.
and	CONJ
peering	VERB
through	ADP
the	DET
branches	NOUN
of	ADP
a	DET
bush	NOUN
,	.
this	DET
is	VERB
what	DET
U	NOUN
##lya	NOUN
##te	NOUN
saw	VERB
:	.
Jones	NOUN
who	PRON
had	VERB
apparently	ADV
(	.
and	CONJ
actually	ADV
had	VERB
)	.
ridden	VERB
up	ADP
the	DET
nearly	ADV
imp	ADJ
##ass	ADJ
##able	ADJ
hillside	NOUN
,	.
sitting	VERB
calmly	ADV
on	ADP
his	DET
horse	NOUN
within	ADP
forty	NUM
feet	NOUN
of	ADP
a	DET
full	ADJ
-	ADJ
grown	ADJ
young	ADJ
lion	NOUN
##ess	NOUN
,	.
who	PRON
was	VERB
crouched	VERB
on	ADP
a	DET
flat	ADJ
rock	NOUN
and	CONJ
seemed	VERB
just	ADV
about	ADV
to	PRT
charge	VERB
him	PRON
,	.
while	ADP
the	DET
dogs	NOUN
whirled	VER

In [25]:
# tokenize both inputs and targets
def tokenize_fn(batch):
  # tokenize the input sequence first
  # this populates input_ids, attention_mask, etc.
  tokenized_inputs = tokenizer(
    batch['inputs'], truncation=True, is_split_into_words=True
  )

  labels_batch = batch['targets'] # original targets
  aligned_labels_batch = []
  for i, labels in enumerate(labels_batch):
    word_ids = tokenized_inputs.word_ids(i)
    aligned_labels_batch.append(align_targets(labels, word_ids))
  
  # recall: the 'target' must be stored in key called 'labels'
  tokenized_inputs['labels'] = aligned_labels_batch

  return tokenized_inputs

In [26]:
# want to remove these from model inputs - they are neither inputs nor targets
data["train"].column_names

['inputs', 'targets']

In [27]:
tokenized_datasets = data.map(
  tokenize_fn,
  batched=True,
  remove_columns=data["train"].column_names,
)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [28]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [29]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [30]:
# https://stackoverflow.com/questions/11264684/flatten-list-of-lists
def flatten(list_of_lists):
  flattened = [val for sublist in list_of_lists for val in sublist]
  return flattened

In [31]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  preds = np.argmax(logits, axis=-1)

  # remove -100 from labels and predictions
  labels_jagged = [[t for t in label if t != -100] for label in labels]

  # do the same for predictions whenever true label is -100
  preds_jagged = [[p for p, t in zip(ps, ts) if t != -100] \
      for ps, ts in zip(preds, labels)
  ]

  # flatten labels and preds
  labels_flat = flatten(labels_jagged)
  preds_flat = flatten(preds_jagged)

  acc = accuracy_score(labels_flat, preds_flat)
  f1 = f1_score(labels_flat, preds_flat, average='macro')

  return {
    'f1': f1,
    'accuracy': acc,
  }

In [32]:
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
  [0.8, 0.1, 0.1],
  [0.8, 0.1, 0.1],
  [0.8, 0.1, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
  [0.1, 0.8, 0.1],
]])
compute_metrics((logits, labels))

{'f1': 0.6, 'accuracy': 0.8}

In [33]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    "distilbert-finetuned-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [35]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0692,0.059947,0.943797,0.982335
2,0.0279,0.053,0.956641,0.985624


TrainOutput(global_step=3750, training_loss=0.06696873194376628, metrics={'train_runtime': 160.9021, 'train_samples_per_second': 186.449, 'train_steps_per_second': 23.306, 'total_flos': 387922021634304.0, 'train_loss': 0.06696873194376628, 'epoch': 2.0})

In [36]:
trainer.save_model('my_saved_model')

In [37]:
from transformers import pipeline

pipe = pipeline(
  "token-classification",
  model='my_saved_model',
  device=0,
)

In [38]:
s = "Bill Gates was the CEO of Microsoft in Seattle, Washington."
pipe(s)

[{'entity': 'NOUN',
  'score': 0.999678,
  'index': 1,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity': 'NOUN',
  'score': 0.9995586,
  'index': 2,
  'word': 'Gates',
  'start': 5,
  'end': 10},
 {'entity': 'VERB',
  'score': 0.99980694,
  'index': 3,
  'word': 'was',
  'start': 11,
  'end': 14},
 {'entity': 'DET',
  'score': 0.99988675,
  'index': 4,
  'word': 'the',
  'start': 15,
  'end': 18},
 {'entity': 'NOUN',
  'score': 0.9996425,
  'index': 5,
  'word': 'CEO',
  'start': 19,
  'end': 22},
 {'entity': 'ADP',
  'score': 0.99985826,
  'index': 6,
  'word': 'of',
  'start': 23,
  'end': 25},
 {'entity': 'NOUN',
  'score': 0.999671,
  'index': 7,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity': 'ADP',
  'score': 0.9997894,
  'index': 8,
  'word': 'in',
  'start': 36,
  'end': 38},
 {'entity': 'NOUN',
  'score': 0.9996666,
  'index': 9,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity': '.',
  'score': 0.999821,
  'index': 10,
  'word': ',',
  'start