# Training a Model

This workbook takes annotated examples and uses them to train a small model for NER tasks.

Prior to running this workbook:
- define all expected labels under the `entity_types` variable
- have an input file with annotated training data

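The training data should be a JSONL file (one JSON object per line). Each entry in `label` is a `[start, end, entity_type]` character span into `text` (for example `[0, 20, "ACT"]`), and the list is empty when the text contains no entities. Lines look like this: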
```json
{"id":26,"text":"repealed 12 [ repealed 2023 - 13 - 11. ]","meta":{"ActId":"Civil Forfeiture Act","identity":73955,"sectionId":"12","sectionName":"Repealed"},"label":[]}
```

Variables to edit to alter the training strategy:
- running_locally
- batch_size
- num_epochs
- learning_rate

See the README file in this folder for more information on what these parameters change.

In [None]:
%pip install torch
%pip install tqdm
%pip install intel_extension_for_pytorch
%pip install transformers==4.45.0 # intel extension requirement

In [3]:
# Import necessary libraries
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader 
from tqdm import tqdm 
import torch
import intel_extension_for_pytorch as ipex

In [None]:
# Labels the model can predict. Training labels are looked up in this list,
# so every label that appears in the training data must be listed here.
entity_types = ["O", "B_ACT", "I_ACT", "B_REF_IN", "I_REF_IN", "B_REF_EX", "I_REF_EX"]
num_labels = len(entity_types)

# Pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# True: train the plain model. False: wrap it with ipex.fast_bert in bfloat16.
running_locally = True

# Pre-trained BERT model with a token-classification head sized to our labels
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    attn_implementation="eager",
)
if running_locally:
    i_model = model
else:
    i_model = ipex.fast_bert(model, dtype=torch.bfloat16)

# Training hyperparameters -- adjust as needed
batch_size = 32
learning_rate = 5e-5

In [6]:
def _tokenize_with_special_tokens(tokenizer, text):
    """Tokenize *text* via an encode -> decode -> tokenize round trip.

    The round trip is what puts the tokenizer's opening/closing special
    tokens into the returned token list; the offset arithmetic in
    ``tokenize_and_format_data`` relies on exactly one token being added at
    each end.
    """
    return tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))


def tokenize_and_format_data(dataset, tokenizer, label_names=None):
    """Convert annotated samples into a padded ``TensorDataset`` for training.

    Every sample yields exactly one row, whether or not it has entities.

    Args:
        dataset: Iterable of dicts with a ``"text"`` string and a ``"label"``
            list of ``(start, end, entity_type)`` character spans into the
            text.
        tokenizer: Tokenizer providing ``encode``, ``decode``, ``tokenize``,
            ``convert_tokens_to_ids``, ``model_max_length`` and
            ``pad_token_id``.
        label_names: Ordered label list; a label's position is its ID.
            Defaults to the notebook-level ``entity_types``.

    Returns:
        ``TensorDataset`` of two tensors, ``(input_ids, label_ids)``, each row
        padded (or truncated) to ``tokenizer.model_max_length``.

    Raises:
        ValueError: If a sample produces a label that is not in
            ``label_names``.
    """
    import warnings

    if label_names is None:
        label_names = entity_types
    # First occurrence wins, matching list.index() semantics.
    label_to_id = {}
    for idx, name in enumerate(label_names):
        label_to_id.setdefault(name, idx)

    max_length = tokenizer.model_max_length
    all_input_ids = []
    all_label_ids = []
    for sample in dataset:
        text = sample["text"]
        tokens = _tokenize_with_special_tokens(tokenizer, text)
        # Every token starts as 'O' (Outside); entity spans overwrite below.
        labels = ['O'] * len(tokens)
        for start, end, entity_type in sample["label"]:
            # The prefix token list ends with the closing special token, so
            # its last index is where the entity's first token sits in
            # `tokens`.
            prefix_tokens = _tokenize_with_special_tokens(tokenizer, text[:start])
            start_token = len(prefix_tokens) - 1
            # Exclusive end index: discount the two special tokens that wrap
            # the entity's own tokenization.
            entity_tokens = _tokenize_with_special_tokens(tokenizer, text[start:end])
            end_token = start_token + len(entity_tokens) - 2
            labels[start_token] = f"B_{entity_type}"
            for i in range(start_token + 1, end_token):
                labels[i] = f"I_{entity_type}"

        if len(tokens) > max_length:
            # Rows must all have the same length to form a tensor, so cut
            # long samples down while keeping the closing token in place.
            warnings.warn(
                f"Sample with {len(tokens)} tokens truncated to {max_length}; "
                "labels past the cut are dropped."
            )
            tokens = tokens[:max_length - 1] + tokens[-1:]
            labels = labels[:max_length - 1] + labels[-1:]

        unknown = sorted(set(labels).difference(label_to_id))
        if unknown:
            raise ValueError(f"Labels {unknown} are not present in label_names")

        # Convert once per sample (not once per entity token) so each sample
        # contributes exactly one row.
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [label_to_id[label] for label in labels]
        padding_length = max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        # NOTE(review): padding positions are labelled 'O', so downstream
        # consumers see them as ordinary 'O' targets -- confirm this is wanted.
        label_ids += [label_to_id['O']] * padding_length
        all_input_ids.append(input_ids)
        all_label_ids.append(label_ids)

    return TensorDataset(torch.tensor(all_input_ids), torch.tensor(all_label_ids))

In [None]:
import json

# Load the annotated examples: one JSON object per line (JSONL).
train_dataset = []
# Explicit UTF-8 so the read does not depend on the platform's default
# encoding; the handle is not called `input` to avoid shadowing the builtin.
with open("doccano_export.jsonl", "r", encoding="utf-8") as jsonl_file:
    for line in jsonl_file:
        # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
        if line.strip():
            train_dataset.append(json.loads(line))

# Prepare data for fine-tuning
train_data = tokenize_and_format_data(train_dataset, tokenizer)
# NOTE(review): batches are served in file order (no shuffle=True) -- confirm
# this is intended for training.
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [None]:
# Fine-tune the model
optimizer = torch.optim.AdamW(i_model.parameters(), lr=learning_rate)
num_epochs = 15  # Adjust as needed
for epoch in range(num_epochs):
    print("Epoch:", epoch)
    i_model.train()
    epoch_loss = 0.0
    progress = tqdm(train_dataloader, desc="Training")
    for inputs, labels in progress:
        # Rows are padded with pad_token_id; mask those positions so the
        # model does not attend to padding.
        attention_mask = (inputs != tokenizer.pad_token_id).long()
        outputs = i_model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # .item() yields a plain float instead of printing the tensor repr;
        # showing it on the bar avoids breaking tqdm's output with print().
        batch_loss = loss.item()
        epoch_loss += batch_loss
        progress.set_postfix(loss=f"{batch_loss:.4f}")
    # max(..., 1) guards against an empty dataloader.
    print("Mean loss:", epoch_loss / max(len(train_dataloader), 1))


In [None]:
# Write the fine-tuned model to disk so it can be loaded again later
export_dir = 'exported_models/fine_tuned_ner_model'
i_model.save_pretrained(export_dir)
