# **Import libraries**

In [16]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import torch

# **Defining Entity Types and Labels**

In [141]:
entity_types = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
# entity_types = [
#     "O",         # Outside any named entity
#     "B-PER",     # Beginning of a person name
#     "I-PER",     # Inside a person name
#     "B-ORG",     # Beginning of an organization
#     "I-ORG",     # Inside an organization
#     "B-LOC",     # Beginning of a location
#     "I-LOC",     # Inside a location
#     "B-MISC",    # Beginning of a miscellaneous entity (e.g., event, product)
#     "I-MISC",    # Inside a miscellaneous entity
#     "B-GPE",     # Beginning of a geopolitical entity (countries, cities, states)
#     "I-GPE",     # Inside a geopolitical entity
#     "B-DATE",    # Beginning of a date
#     "I-DATE",    # Inside a date
#     "B-TIME",    # Beginning of a time
#     "I-TIME",    # Inside a time
#     "B-MONEY",   # Beginning of a monetary amount
#     "I-MONEY",   # Inside a monetary amount
#     "B-PERCENT", # Beginning of a percentage
#     "I-PERCENT", # Inside a percentage
#     "B-EVENT",   # Beginning of an event (e.g., conference, tournament)
#     "I-EVENT",   # Inside an event
#     "B-PRODUCT", # Beginning of a product (e.g., technology, food)
#     "I-PRODUCT", # Inside a product
#     "B-LAW",     # Beginning of a law (e.g., legal cases, statutes)
#     "I-LAW",     # Inside a law
#     "B-WORK_OF_ART", # Beginning of a work of art (e.g., books, paintings)
#     "I-WORK_OF_ART", # Inside a work of art
#     "B-LANGUAGE", # Beginning of a language
#     "I-LANGUAGE"  # Inside a language
# ]
num_labels = len(entity_types)
# label_map = {label: i for i, label in enumerate(entity_types)}

# **Loading the Pre-Trained BERT Model and Tokenizer**

In [142]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
# model = BertModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Sample Training Data**

In [143]:
train_dataset_sample = [
    {"text": "John works at Google in New York.", "labels": {"entities": [(0, 4, "PER"), (17, 22, "ORG"), (26, 34, "GPE")]}},
    {"text": "Apple Inc. is a technology company.", "labels": {"entities": [(0, 10, "ORG")]}},
]

# **Tokenizing and Formatting Data**

In [144]:
def tokenize_and_format_data(dataset, tokenizer):
    tokenized_data = []
    # Ensure entity_types is defined and accessible
    for sample in dataset:
        text = sample["text"]
        entities = sample["labels"]["entities"]

        # Add special tokens
        tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]
        labels = ['O'] * len(tokens)

        for start, end, entity_type in entities:
            prefix_tokens = tokenizer.tokenize(text[:start])
            # print(f"prefix_tokens{prefix_tokens}")
            # Account for [CLS] token
            start_token = len(prefix_tokens) + 1
            entity_tokens = tokenizer.tokenize(text[start:end])
            end_token = start_token + len(entity_tokens) - 1

            # Check if the start and end tokens are within the valid range
            if start_token < len(labels):
                labels[start_token] = f"B-{entity_type}"
            if end_token < len(labels):
                for i in range(start_token + 1, end_token + 1):
                    if i < len(labels):
                        labels[i] = f"I-{entity_type}"

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Ensure entity_types is defined and accessible
        label_ids = [entity_types.index(label) if label in entity_types else entity_types.index('O') for label in labels]
        padding_length = tokenizer.model_max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        label_ids += [entity_types.index('O')] * padding_length
        tokenized_data.append({'input_ids': input_ids, 'labels': label_ids})

    dataset = TensorDataset(
        torch.tensor([item['input_ids'] for item in tokenized_data]),
        torch.tensor([item['labels'] for item in tokenized_data])
    )
    return dataset

# **Show Dataset**

In [145]:
import pandas as pd
# Call the function and get the dataset
dataset = tokenize_and_format_data(train_dataset_sample, tokenizer)

# Convert to DataFrame
df = pd.DataFrame(
    [
        {"input_ids": input_ids.tolist(), "label_ids": label_ids.tolist()}
        for input_ids, label_ids in dataset
    ]
)

# Display the DataFrame
display(df)

Unnamed: 0,input_ids,label_ids
0,"[101, 2198, 2573, 2012, 8224, 1999, 2047, 2259...","[0, 1, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[101, 6207, 4297, 1012, 2003, 1037, 2974, 2194...","[0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# **Preparing the Data for Training**

In [148]:
# Define batch_size
batch_size = 32
# Define learning rate
learning_rate = 5e-5
# Define number of epochs
num_epochs = 3
# Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)
train_data = tokenize_and_format_data(train_dataset_sample, tokenizer)
train_dataloader = DataLoader(train_data, batch_size=batch_size)




# **Fine-Tuning the Model**

In [150]:
def train_model(model, train_dataloader, optimizer, num_epochs):
  for epoch in range(num_epochs):
      model.train()
      for batch in tqdm(train_dataloader, desc="Training"):
          inputs, labels = batch
          outputs = model(inputs, labels=labels)
          loss = outputs.loss
          loss.backward()
          optimizer.step()
          optimizer.zero_grad()


In [151]:
train_model(model, train_dataloader, optimizer, num_epochs)

Training: 100%|██████████| 1/1 [00:13<00:00, 13.75s/it]
Training: 100%|██████████| 1/1 [00:15<00:00, 15.12s/it]
Training: 100%|██████████| 1/1 [00:15<00:00, 15.76s/it]


# **Saving the Fine-Tuned Model**

In [152]:
model.save_pretrained('fine_tuned_ner_model')

# **THANKS**