In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load the dataset
from datasets import load_dataset
# Load the CoNLL-2003 splits. `datasets` warns that passing
# trust_remote_code=True will become mandatory for this dataset, so opt in
# explicitly rather than relying on the deprecated implicit behaviour.
# NOTE(review): this flag lets the dataset's own loading code run locally --
# confirm the dataset source is trusted before re-running.
raw_dataset = load_dataset("conll2003", trust_remote_code=True)
print(raw_dataset)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [2]:
# Peek at the first training example: its words, then their integer NER tags.
item = raw_dataset["train"][0]
for field in ("tokens", "ner_tags"):
    print(item[field])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]


+ The tokens key of each element in the dataset returns a list of tokens
+ ner_tags key of each element returns a list of each token's NER tag
    + NER: Named Entity Recognition, used to identify specific entities in a piece of text
+ POS tagging: Parts of Speech tagging, classifies each token as the correct part of speech
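+ IOB tagging: Inside–Outside–Beginning, assigns labels to tokens that are a part of a specific "chunk" (a group of words that corresponds to a specific class); B- marks the first token of a chunk, I- a token inside it, and O a token outside any chunk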

In [3]:
# List every NER label name; a label's position in this list is its integer id.
ner_tags_feature = raw_dataset["train"].features["ner_tags"]
print(ner_tags_feature.feature.names)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [4]:
# creating an AutoTokenizer object using the BERT pre-trained model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# The dataset is already split into words, so tell the tokenizer not to
# re-split on whitespace; it may still break a word into several sub-tokens.
first_sentence_words = raw_dataset["train"][0]["tokens"]
inputs = tokenizer(first_sentence_words, is_split_into_words=True)
print(inputs.tokens())



['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']


BERT: pre-trained on large amounts of unlabeled text data, from which it learns syntactic and semantic structures
+ has a fixed vocabulary; words not found in it are represented by subtokens and characters
+ this results in a length mismatch between tokens and labels, which can be resolved by: tokenize_and_align_labels

+ [CLS] --> token that indicates start of new input
+ [SEP] --> token that indicates end of input

In [5]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER labels to tokens.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a "ner_tags" entry (one list of per-word label ids
            per sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry: one
        list per sentence, holding the word's label on the first token of each
        word and -100 on special tokens and on every later sub-token of a word.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        # For each token, the index of the word it came from (None for special tokens).
        token_word_ids = encoded.word_ids(batch_index=row)
        # The same sequence shifted right by one, so each token can be compared
        # with the token before it (the first token has no predecessor -> None).
        preceding_word_ids = [None, *token_word_ids[:-1]]
        aligned_labels.append(
            [
                -100 if (current is None or current == previous) else word_labels[current]
                for current, previous in zip(token_word_ids, preceding_word_ids)
            ]
        )

    encoded["labels"] = aligned_labels
    return encoded

# Run tokenize_and_align_labels() over every split in batches, dropping the
# original columns so only the tokenizer outputs and aligned labels remain.
original_columns = raw_dataset["train"].column_names
tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    remove_columns=original_columns,
    batched=True,
)

print(tokenized_dataset)

Map: 100%|██████████| 14041/14041 [00:00<00:00, 19185.97 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 23979.38 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 27783.19 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})





+ input_ids: numerical representations of tokens
+ labels: contains correct class for each token
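+ attention_mask: marks which positions are real tokens (1) and which are padding (0), so sequences of different lengths can be batched together
+ token_type_ids: segment ids that tell the two sentences of a sentence-pair input apart (used in next sentence prediction tasks)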

Data Collator: using DataCollatorForTokenClassification, pads text and labels to length of longest in its batch, gives us uniform length samples

In [6]:
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification
# Pads inputs and labels per batch, using the tokenizer's padding settings.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Build the id <-> tag-name mappings with *integer* ids. Keying id2label by
# str(i) made the inverted label2id map every tag name to a string ("0", "1",
# ...) instead of the integer class ids that ner_tags actually contain.
label_names = raw_dataset["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Attach the mappings to the model so it is configured with one output class
# per tag name and carries the readable tag names alongside the class ids.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now: create a TrainingArguments object that contains training parameters and configurations
+ passed into Trainer object, train() method called on object
+ can save model in directory

In [7]:
# Training configuration: where results are written, optimisation
# hyperparameters, batch sizes, and the evaluation schedule.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
)

# Wire the model, data splits, tokenizer and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

trainer.train()
trainer.save_model("./saved_model")  # For reuse

  2%|▏         | 46/2634 [00:53<1:33:12,  2.16s/it]

KeyboardInterrupt: 