# Fine Tune bert-base-uncased for Name entity recognition

In [None]:
# Dataset Structure 

""" I have my data in .json format in like below type structure.."""

# tokens,nertags
# [When, are, the, semester, breaks, in, the, un...   ,[O, O, O, B-SEMESTER, O, O, O, B-INSTITUTE, O] 

In [10]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [5]:
from datasets import Dataset

In [4]:
import pandas as pd

# Load Dataset

In [6]:
json_pd = pd.read_json('/content/eng_ner_tags.json')

In [1]:
from sklearn.preprocessing import LabelEncoder


In [7]:


# Flatten the list of all NER tags across rows to find unique tags
all_tags = [tag for sublist in json_pd['ner_tags'] for tag in sublist]

# Initialize LabelEncoder and fit on all unique tags
label_encoder = LabelEncoder()
label_encoder.fit(all_tags)

# Apply the label encoding to each list of NER tags
json_pd['encoded_ner_tags'] = json_pd['ner_tags'].apply(lambda tags: label_encoder.transform(tags))

print(json_pd)

                                               tokens  \
0   [When, are, the, semester, breaks, in, the, un...   
1   [iam, from, third, semester, what, is, the, my...   
2   [All, subject, list, of, ITI, depart, 3rd, sem...   
3   [All, subject, name, of, ITI, depart, third, s...   
4     [subject, list, of, iti, cse, 3rd, semester, ?]   
..                                                ...   
85   [Is, there, a, minor, in, economics, in, bba, ?]   
86  [electives, select, karne, ki, date, kab, hai,...   
87  [2nd, semester, me, kaunsa, course, easy, rahe...   
88  [summer, courses, ke, registration, kab, hoti,...   
89  [industrial, training, ke, marks, kaise, distr...   

                                             ner_tags  \
0      [O, O, O, B-SEMESTER, O, O, O, B-INSTITUTE, O]   
1   [O, O, B-TERM, B-SEMESTER, O, O, O, O, O, O, O...   
2   [O, O, O, O, B-DEPARTMENT, O, B-SEMESTER, I-SE...   
3   [O, O, B-TYPE, O, B-DEPARTMENT, O, B-SEMESTER,...   
4   [O, B-TYPE, O, B-DEPARTMEN

In [None]:
json_pd['encoded_ner_tags'].iloc[0]

array([187, 187, 187, 129, 187, 187, 187,  93, 187])

In [8]:
import json

# Save the label mapping
label_mapping = {label: idx for idx, label in enumerate(label_encoder.classes_)}
with open('ner_label_mapping.json', 'w') as f:
    json.dump(label_mapping, f)

# To load it back
with open('ner_label_mapping.json', 'r') as f:
    loaded_mapping = json.load(f)

print(loaded_mapping)

{'B-ACTION': 0, 'B-BRANCH': 1, 'B-DEGREE': 2, 'B-DEPARTMENT': 3, 'B-ELECTIVE': 4, 'B-INSTITUTE': 5, 'B-LAST_NAME': 6, 'B-MIDDLE_NAME': 7, 'B-MODE': 8, 'B-NAME': 9, 'B-PROJECT': 10, 'B-SEMESTER': 11, 'B-SUBJECT': 12, 'B-SUMMER': 13, 'B-TERM': 14, 'B-TYPE': 15, 'B-YEAR': 16, 'I-BRANCH': 17, 'I-DEGREE': 18, 'I-DEPARTMENT': 19, 'I-SEMESTER': 20, 'I-SUBJECT': 21, 'I-YEAR': 22, 'O': 23}


In [9]:
hf_data = Dataset.from_pandas(json_pd)

In [10]:
def add_token_length(example):
    # Add a new column that contains the length of each 'ner_tags' list
    example['features_name'] = list(loaded_mapping.keys())
    return example

# Use map to apply the function and add the new column
dataset = hf_data.map(add_token_length)


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
json_pd['ner_tags'].iloc[0]

['O', 'O', 'O', 'B-SEMESTER', 'O', 'O', 'O', 'B-ORG', 'O']

In [None]:
# from transformers import AutoTokenizer


# tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [13]:
example =  dataset['tokens'][0]
tokenized_input = tokenizer(example, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'When',
 'are',
 'the',
 'semester',
 'breaks',
 'in',
 'the',
 'university',
 '?',
 '[SEP]']

In [14]:
def tokenize_and_align_labels(examples):

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"encoded_ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [40]:
tokenized_wnut = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [41]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [17]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [18]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=b2ce4f51a3e5e29ea5145d0c3a721d282960db6c567bc5cb1c732f678d8a4cde
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [19]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [20]:
import numpy as np
import tensorflow as tf

In [42]:
id2label =  loaded_mapping
label2id = {value: key for key, value in loaded_mapping.items()}


In [43]:
from transformers import create_optimizer

batch_size = 8
num_train_epochs = 3
num_train_steps = (len(tokenized_wnut["tokens"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=0.01,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [39]:
from transformers import TFAutoModelForTokenClassification

checkpoint = "bert-base-cased"

model = TFAutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)



from transformers import AutoTokenizer

checkpoint = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [44]:
# Split the dataset into training and test sets
split_dataset = tokenized_wnut.train_test_split(test_size=0.2)
# Access the split datasets
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [None]:
ORG_DATA = {'input_ids':train_dataset['input_ids'],'attention_mask':train_dataset['attention_mask'],'labels':train_dataset['labels']}

TypeError: '_PrefetchDataset' object is not subscriptable

In [None]:
test_tf = {'input_ids':test_dataset['input_ids'],'attention_mask':test_dataset['attention_mask'],'labels':test_dataset['labels']}

In [None]:
import tensorflow as tf

train_tf = tf.data.Dataset.from(train_tf)


ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [None]:
# train_dataset['labels'][0]

[-100, 187, 187, 187, 93, -100, -100, 176, 187, -100]

In [45]:
columns = ["attention_mask", "input_ids", "labels", "token_type_ids"]
batch_size = 8

tf_train_dataset = train_dataset.to_tf_dataset(
    columns=columns,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

tf_eval_dataset = test_dataset.to_tf_dataset(
    columns=columns,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

In [35]:
data = {}

for i in tf_train_set.take(3):
  # Your existing dataset
   print(i)

({'input_ids': <tf.Tensor: shape=(8, 15), dtype=int64, numpy=
array([[  101,  4841,  1271,  1104, 27378,  8231,  3048, 24821,  2036,
          126, 14516,  1306,   136,   102,     0],
       [  101,  2548,  1104, 27378,  8231,  3048,  2987,  4035,  1403,
         3049, 14594,   136,   102,     0,     0],
       [  101,  4841,  2666,  1104, 27378,  8231,  3048, 24821,  2036,
          125, 14516,  1306,   136,   102,     0],
       [  101,  4841,  1104, 27378,  8231,  3048, 24821,  2036,   122,
        14516,  1306,   136,   102,     0,     0],
       [  101,   188, 22948,  7441,  2666,  1104,  6676,  4035,  1403,
          123, 14594,   123,  1214,   136,   102],
       [  101,  2548,  1271,  1104, 27378,  8231,  3048,  4252,  2853,
         1148, 14594,   136,   102,     0,     0],
       [  101,   188,  7777,  1742,  7441,  1104,  2987,  3752,  1248,
        14594,  1248,  1214,   136,   102,     0],
       [  101,  1143,  1732,   180,  1162,  2548,  2190,  3952, 24181,
         1197

In [None]:
dataset_TF = tf.data.Dataset.from_tensor_slices(data)


In [None]:
dataset_TF = dataset_TF.shuffle(5).batch(2)  # Adjust the batch size as needed


In [None]:
for j in dataset_TF.take(10):
  print(j)

{'input_ids': <tf.Tensor: shape=(2, 19), dtype=int64, numpy=
array([[  101,  1038,  1012,  4012,  2033,  3622,  9634, 15030, 18712,
         2050,   102,     0,     0,     0,     0,     0,     0,     0,
            0],
       [  101,  2942,  3330,  2033, 11073, 17710,  5622,  6672,  4563,
         3316, 10556,  4609,  9033,  2980,  2072, 15030,  2078,  1029,
          102]])>, 'attention_mask': <tf.Tensor: shape=(2, 19), dtype=int64, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>, 'labels': <tf.Tensor: shape=(2, 19), dtype=int64, numpy=
array([[-100,   33, -100, -100,  187,  187,    3,  187,  187, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100],
       [-100,   34,   57,  187,  101,  187,  187, -100,  147,   25,  110,
        -100,  187,  187, -100,  187, -100,  187, -100]])>}
{'input_ids': <tf.Tensor: shape=(2, 19), dtype=int64, numpy=
array([[  101,  4897,  2193,  4638, 1

In [63]:
from transformers import create_optimizer
import tensorflow as tf

num_epochs = 3
num_train_steps = len(train_dataset['tokens']) * num_epochs

# set up optimizer with learning rate decay
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01
)

# compile
model.compile(optimizer=optimizer)

In [None]:
# Define a custom loss function that uses SparseCategoricalCrossentropy
def custom_sparse_categorical_crossentropy(y_true, y_pred):
    # Create a mask to ignore -100 values
    mask = tf.not_equal(y_true, -100)

    # Replace -100 with a valid class (optional, TensorFlow will ignore these)
    y_true = tf.where(mask, y_true, tf.zeros_like(y_true))

    # Compute sparse categorical crossentropy
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Apply the mask to the loss function using sample weights
    loss = loss_fn(y_true, y_pred, sample_weight=tf.cast(mask, dtype=tf.float32))

    return loss

In [None]:
 [-100, 187, 187, 187, 187, 1, 187, 66, 66, 187, -100, -100, -100, -100, -1]

{'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}


In [59]:
# from transformers.keras_callbacks import KerasMetricCallback

# metric_callback = KerasMetricCallback(metric_fn=compute_metrics_fn, eval_dataset=tf_validation_set)

In [None]:
tf.keras.callbacks.metr

In [60]:
callbacks = [metric_callback]

In [65]:
ES_CALLLBACKS = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=2,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

In [58]:
def compute_metrics_fn(all):

  ytrue,y_pred = all

  print(all)

In [66]:
model.fit(x=tf_train_set,validation_data = tf_validation_set, epochs=30,callbacks=[ES_CALLLBACKS])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


<tf_keras.src.callbacks.History at 0x7962cc9a51e0>

In [1]:
# Save the  model...

In [2]:
# Evaluate the model...

In [None]:
from seqeval.metrics import accuracy_score

In [74]:
text = "civil eng tech online"


In [68]:
# Update the model configuration
model.config.id2label = label2id
model.config.label2id = id2label

In [72]:
import time
from transformers import pipeline
classifier = pipeline("ner", model=model,tokenizer=tokenizer)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [75]:

classifier(text)

[{'entity': 'B-BRANCH',
  'score': 0.9667848,
  'index': 1,
  'word': 'civil',
  'start': 0,
  'end': 5},
 {'entity': 'I-DEPARTMENT',
  'score': 0.5313637,
  'index': 2,
  'word': 'en',
  'start': 6,
  'end': 8},
 {'entity': 'B-BRANCH',
  'score': 0.39473745,
  'index': 3,
  'word': '##g',
  'start': 8,
  'end': 9},
 {'entity': 'I-DEGREE',
  'score': 0.6446946,
  'index': 4,
  'word': 'tech',
  'start': 10,
  'end': 14}]

In [None]:
model.config.id2label

{'B-ACADEMIC': 0,
 'B-ACTION': 1,
 'B-ADDRESS': 2,
 'B-ADMISSION': 3,
 'B-ADMISSION_MODE': 4,
 'B-ADMIT': 5,
 'B-ADMIT_CARD': 6,
 'B-AGE': 7,
 'B-AGE_LIMIT': 8,
 'B-AMOUNT': 9,
 'B-ANNUAL': 10,
 'B-APPLICATION': 11,
 'B-AVAILABILITY': 12,
 'B-AVERAGE': 13,
 'B-BACKLOG': 14,
 'B-BEST': 15,
 'B-BREAKDOWN': 16,
 'B-CARD': 17,
 'B-CENTER': 18,
 'B-CGPA': 19,
 'B-CHANCE': 20,
 'B-CHARGE': 21,
 'B-CLASSES': 22,
 'B-CODE': 23,
 'B-CODING': 24,
 'B-COMPANY': 25,
 'B-COMPARISON': 26,
 'B-CONFIRMATION': 27,
 'B-COUNSELING': 28,
 'B-COURSE': 29,
 'B-CUTOFF': 30,
 'B-DATE': 31,
 'B-DEADLINE': 32,
 'B-DEGREE': 33,
 'B-DEPARTMENT': 34,
 'B-DEPOSIT': 35,
 'B-DEPT': 36,
 'B-DIFFICULTY': 37,
 'B-DIPLOMA': 38,
 'B-DISCUSSION': 39,
 'B-DOCUMENT': 40,
 'B-DOCUMENTS': 41,
 'B-DOMAIN': 42,
 'B-DOWNLOAD': 43,
 'B-DRESS': 44,
 'B-DURATION': 45,
 'B-ELECTIVE': 46,
 'B-ELIGIBILITY': 47,
 'B-EMAIL': 48,
 'B-ENTRANCE': 49,
 'B-ENTRY': 50,
 'B-EVENT': 51,
 'B-EXAM': 52,
 'B-FACILITY': 53,
 'B-FACULTY': 54,
 'B-FEE