# Fine Tune BERT model NER dataset

## 1. Installs

In [None]:
%pip install transformers[torch] accelerate tokenizers seqeval evaluate datasets

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=280ea8cae5c89e01a0cef6e23d4569703dbc1c95a8d6cc293bde4bd963b0d031
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.6 seqeval-1.2.2


## Imports

In [3]:
import numpy as np
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification
# BertTokenizerFast: For tokenizing the data
# DataCollatorForTokenClassification: 
# AutoModelForTokenClassification: will used to load pre-trained model

In [6]:
## Disable the w and b
import os
os.environ["WANDB_DISABLED"] = "true"

## Datasets
- CoNELL NER

In [None]:
%pip install -q kagglehub

In [15]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("juliangarratt/conll2003-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/juliangarratt/conll2003-dataset?dataset_version_number=1...


100%|██████████| 960k/960k [00:00<00:00, 128MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1





In [None]:
import glob
glob.glob(f"{path}/*/*")

['/root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1/conll2003/eng.testa',
 '/root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1/conll2003/eng.train',
 '/root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1/conll2003/eng.testb']

In [None]:
data_files = dict(enumerate(glob.glob(f"{path}/*/*")))

In [38]:
data_files

{0: '/root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1/conll2003/eng.testa',
 1: '/root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1/conll2003/eng.train',
 2: '/root/.cache/kagglehub/datasets/juliangarratt/conll2003-dataset/versions/1/conll2003/eng.testb'}

In [39]:
from datasets import load_dataset

conll2023 = load_dataset("text", data_files=data_files)

Generating 0 split: 0 examples [00:00, ? examples/s]

Generating 1 split: 0 examples [00:00, ? examples/s]

Generating 2 split: 0 examples [00:00, ? examples/s]

In [46]:
raw_datasets = load_dataset("eriktks/conll2003",revision="convert/parquet")

conll2003/train/0000.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/312k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [47]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [53]:
raw_datasets['train'].features

{'id': Value('string'),
 'tokens': List(Value('string')),
 'pos_tags': List(ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'])),
 'chunk_tags': List(ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'])),
 'ner_tags': List(ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']))}

In [54]:
raw_datasets['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [57]:
raw_datasets['train'].features['pos_tags']

List(ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']))

In [58]:
raw_datasets['train'].features['chunk_tags']

List(ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP']))

## Tokenizers

In [59]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [61]:
ex1 = raw_datasets['train'][0]
ex1

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [62]:
ex1['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [65]:
ex1_tokens = tokenizer(ex1['tokens'], is_split_into_words=True)

In [66]:
ex1_tokens.keys()

KeysView({'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [72]:
len(ex1_tokens['input_ids']), len(ex1['tokens'])
# with this we can confirm that each has got thier individual numbers where 101 and 102 are start and end of token default

(11, 9)

In [73]:
## convert back to token
tokens = tokenizer.convert_ids_to_tokens(ex1_tokens['input_ids'])
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']