# Fine Tuning bert-base-cased for Name entity recognition task in university dataset

**Insatll necessary libraries**

In [2]:
# @title
!pip install datasets
!pip install gdown

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

**Import libraries**

In [176]:
from datasets import Dataset
import pandas as pd
import gdown
import json
import tensorflow as tf
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
import ast

# Load Dataset

In [122]:

ds = load_dataset("AkashPrasadMishra/ner_university")

# Convert to Pandas
df = pd.DataFrame(ds['train'])

In [123]:
df

Unnamed: 0,index,tokens,ner_tags
0,0,"['When', 'are', 'the', 'semester', 'breaks', '...","['O', 'O', 'O', 'B-SEMESTER', 'O', 'O', 'O', '..."
1,1,"['iam', 'from', 'third', 'semester', 'what', '...","['O', 'O', 'B-TERM', 'B-SEMESTER', 'O', 'O', '..."
2,2,"['All', 'subject', 'list', 'of', 'ITI', 'depar...","['O', 'O', 'O', 'O', 'B-DEPARTMENT', 'O', 'B-S..."
3,3,"['All', 'subject', 'name', 'of', 'ITI', 'depar...","['O', 'O', 'B-TYPE', 'O', 'B-DEPARTMENT', 'O',..."
4,4,"['subject', 'list', 'of', 'iti', 'cse', '3rd',...","['O', 'B-TYPE', 'O', 'B-DEPARTMENT', 'B-DEPART..."
...,...,...,...
206,208,"['sylabus', 'names', 'of', 'mechanical', 'eng'...","['O', 'B-TYPE', 'O', 'B-BRANCH', 'B-DEPARTMENT..."
207,209,"['syllabus', 'of', 'electrical', 'eng', 'secon...","['O', 'O', 'B-BRANCH', 'B-DEPARTMENT', 'B-SEME..."
208,210,"['sylabus', 'names', 'of', 'mech', 'eng', '2',...","['O', 'B-TYPE', 'O', 'B-BRANCH', 'B-DEPARTMENT..."
209,211,"['syllabus', 'of', 'ex', 'eng', 'second', 'sem...","['O', 'O', 'B-BRANCH', 'B-DEPARTMENT', 'B-SEME..."


#Load named dataset entities

In [124]:

file_id = '1IRpWXGuXoRQITBcGjuYgfYucqtG-uIkP'
label_map = gdown.download(f'https://drive.google.com/uc?id={file_id}', 'label_map.json', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1IRpWXGuXoRQITBcGjuYgfYucqtG-uIkP
To: /content/label_map.json
100%|██████████| 3.26k/3.26k [00:00<00:00, 8.70MB/s]


In [125]:
df = df.drop(['index'],axis=1)

In [127]:

df['ner_tags'] = df['ner_tags'].apply(lambda x: ast.literal_eval(x))

df['tokens'] = df['tokens'].apply(lambda x: ast.literal_eval(x))

In [128]:


# Flatten the list of all NER tags across rows to find unique tags
all_tags = [tag for sublist in df['ner_tags'] for tag in sublist]

# Initialize LabelEncoder and fit on all unique tags
label_encoder = LabelEncoder()
label_encoder.fit(all_tags)

# Apply the label encoding to each list of NER tags
df['encoded_ner_tags'] = df['ner_tags'].apply(lambda tags: label_encoder.transform(tags))

print(df)

                                               tokens  \
0   [When, are, the, semester, breaks, in, the, un...   
1   [iam, from, third, semester, what, is, the, my...   
2   [All, subject, list, of, ITI, depart, 3rd, sem...   
3   [All, subject, name, of, ITI, depart, third, s...   
4     [subject, list, of, iti, cse, 3rd, semester, ?]   
..                                                ...   
85   [Is, there, a, minor, in, economics, in, bba, ?]   
86  [electives, select, karne, ki, date, kab, hai,...   
87  [2nd, semester, me, kaunsa, course, easy, rahe...   
88  [summer, courses, ke, registration, kab, hoti,...   
89  [industrial, training, ke, marks, kaise, distr...   

                                             ner_tags  \
0      [O, O, O, B-SEMESTER, O, O, O, B-INSTITUTE, O]   
1   [O, O, B-TERM, B-SEMESTER, O, O, O, O, O, O, O...   
2   [O, O, O, O, B-DEPARTMENT, O, B-SEMESTER, I-SE...   
3   [O, O, B-TYPE, O, B-DEPARTMENT, O, B-SEMESTER,...   
4   [O, B-TYPE, O, B-DEPARTMEN

In [129]:
df['encoded_ner_tags'].iloc[0]

array([25, 25, 25, 12, 25, 25, 25,  5, 25])

In [131]:

# Save the label mapping
label_mapping = {label: idx for idx, label in enumerate(label_encoder.classes_)}
with open('/content/label_map.json', 'w') as f:
    json.dump(label_mapping, f)

# To load it back
with open('/content/label_map.json', 'r') as f:
    loaded_mapping = json.load(f)

print(loaded_mapping)

{'B-ACTION': 0, 'B-BRANCH': 1, 'B-DEGREE': 2, 'B-DEPARTMENT': 3, 'B-ELECTIVE': 4, 'B-INSTITUTE': 5, 'B-LAST_NAME': 6, 'B-MIDDLE_NAME': 7, 'B-MODE': 8, 'B-NAME': 9, 'B-PROFESSION': 10, 'B-PROJECT': 11, 'B-SEMESTER': 12, 'B-SUBJECT': 13, 'B-SUMMER': 14, 'B-TERM': 15, 'B-TYPE': 16, 'B-YEAR': 17, 'I-BRANCH': 18, 'I-DEGREE': 19, 'I-DEPARTMENT': 20, 'I-PROFESSION': 21, 'I-SEMESTER': 22, 'I-SUBJECT': 23, 'I-YEAR': 24, 'O': 25}


In [136]:
id2label =  loaded_mapping
label2id = {value: key for key, value in loaded_mapping.items()}


In [134]:
df['ner_tags'].iloc[0]

['O', 'O', 'O', 'B-SEMESTER', 'O', 'O', 'O', 'B-INSTITUTE', 'O']

In [137]:
from transformers import TFAutoModelForTokenClassification

checkpoint = "bert-base-cased"

model = TFAutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)



from transformers import AutoTokenizer

checkpoint = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [132]:
hf_data = Dataset.from_pandas(df)

In [138]:
def add_token_length(example):
    # Add a new column that contains the length of each 'ner_tags' list
    example['features_name'] = list(loaded_mapping.keys())
    return example

# Use map to apply the function
dataset = hf_data.map(add_token_length)


Map:   0%|          | 0/211 [00:00<?, ? examples/s]

**Tokenize each tokens**

In [140]:
def tokenize_and_align_labels(examples):

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"encoded_ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [141]:
tokenized_wnut = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

**Train test Split**

In [145]:
# Split the dataset into training and test sets
split_dataset = tokenized_wnut.train_test_split(test_size=0.2)
# Access the split datasets
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [142]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [143]:
import numpy as np
import tensorflow as tf

In [155]:
columns = ["attention_mask", "input_ids", "labels", "token_type_ids"]
batch_size = 8

tf_train_dataset = train_dataset.to_tf_dataset(
    columns=columns,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

tf_val_dataset = test_dataset.to_tf_dataset(
    columns=columns,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

**Compile**

In [168]:
from transformers import create_optimizer
import tensorflow as tf

num_epochs = 3
num_train_steps = len(train_dataset['tokens']) * num_epochs

# set up optimizer with learning rate decay
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01
)

# compile
model.compile(optimizer=optimizer)

**Define Callbacks**

In [169]:
ES_CALLLBACKS = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=2,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

**Train The Model**

In [170]:
model.fit(x=tf_train_dataset,validation_data=tf_val_dataset, epochs=num_epochs,callbacks=[ES_CALLLBACKS])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7f4e46961ea0>

**Save The Model**

In [159]:
# save the model

model.save("/content/content/fine_tuned_ner")

**Evaluation**

In [173]:
# Evaluate the model...

model.evaluate(tf_val_dataset)



0.1664261519908905

**Model inference**

In [161]:
text = "civil eng tech online"

In [162]:
# Update the model configuration
model.config.id2label = label2id
model.config.label2id = id2label

In [163]:
import time
from transformers import pipeline
classifier = pipeline("ner", model=model,tokenizer=tokenizer)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [164]:

classifier(text)

[{'entity': 'B-BRANCH',
  'score': 0.9878621,
  'index': 1,
  'word': 'civil',
  'start': 0,
  'end': 5},
 {'entity': 'B-DEPARTMENT',
  'score': 0.8406503,
  'index': 2,
  'word': 'en',
  'start': 6,
  'end': 8},
 {'entity': 'B-BRANCH',
  'score': 0.78395194,
  'index': 3,
  'word': '##g',
  'start': 8,
  'end': 9},
 {'entity': 'B-BRANCH',
  'score': 0.25856835,
  'index': 4,
  'word': 'tech',
  'start': 10,
  'end': 14}]

In [165]:
model.config.id2label

{0: 'B-ACTION',
 1: 'B-BRANCH',
 2: 'B-DEGREE',
 3: 'B-DEPARTMENT',
 4: 'B-ELECTIVE',
 5: 'B-INSTITUTE',
 6: 'B-LAST_NAME',
 7: 'B-MIDDLE_NAME',
 8: 'B-MODE',
 9: 'B-NAME',
 10: 'B-PROFESSION',
 11: 'B-PROJECT',
 12: 'B-SEMESTER',
 13: 'B-SUBJECT',
 14: 'B-SUMMER',
 15: 'B-TERM',
 16: 'B-TYPE',
 17: 'B-YEAR',
 18: 'I-BRANCH',
 19: 'I-DEGREE',
 20: 'I-DEPARTMENT',
 21: 'I-PROFESSION',
 22: 'I-SEMESTER',
 23: 'I-SUBJECT',
 24: 'I-YEAR',
 25: 'O'}

**Load Saved Model**

In [None]:
saved_model = tf.saved_model.load("/content/content/fine_tuned_ner")

## **Thank You !**

---

