# Data pre-processing

## data_preprocessing(dataset_json):

This function processes a dataset of lease agreements for training a token classification (NER) model. Here's what it does step by step:

### 1. Loop through each annotated data entry
Each item in `dataset_json` is expected to be a dictionary with:
- `"file_path"`: path to a `.docx` lease agreement
- `"entities"`: a dictionary of labeled entity spans (e.g., Lessor, Lessee, Address, Rent amount, Start date, End date and Security deposit)

---

### 2. Read the DOCX file
The lease agreement text is extracted from the `.docx` file using `python-docx`.

---

### 3. Tokenize the text (whitespace-based)
The text is split into tokens using simple `.split()`.
Character spans for each token are tracked (start and end positions in the full string) to later align with entity spans.

---

### 4. Initialize all token labels to 0 ("O" = no entity)
We create a label list (`labels`) with the same length as tokens. All labels start as 0, indicating "outside any entity".

---

### 5. Align annotated entity spans to tokens
For each labeled entity in the annotation:
- Get its character start/end position from the JSON
- Convert the entity name to an integer ID using `label2id`
- For each token, check if the token is **fully inside** the entity span.
- If yes, assign the corresponding label ID

---

### 6. Store the result
For each document, a dictionary is created with:
- `file_name`: name of the document
- `tokens`: list of token strings
- `labels`: list of corresponding integer labels

This dictionary is added to the `records` list — final output.

---

### Result
You get a list of dictionaries (`records`) that can be used directly for training a transformer-based NER model.


In [3]:
from pathlib import Path
from docx import Document
import pandas as pd
import os

# Label map for your entity types
label2id = {
    "LESSOR_NAME": 1,
    "LESSEE_NAME": 2,
    "PROPERTY_ADDRESS": 3,
    "LEASE_START_DATE": 4,
    "LEASE_END_DATE": 5,
    "RENT_AMOUNT": 6,
    "SECURITY_DEPOSIT_AMOUNT": 7
}
records = []
def data_preprocessing(dataset_json):
    for data_dic in dataset_json:
        doc_path = data_dic["file_path"]  # full path to the DOCX file

        # Read the DOCX file
        doc = Document(doc_path)
        text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()]).strip()

        # Tokenize by whitespace, and track char positions
        tokens = []
        token_spans = []
        start = 0
        for word in text.split():
            start = text.find(word, start)
            end = start + len(word)
            tokens.append(word)
            token_spans.append((start, end))
            start = end

        # Initialize all labels to 0 ("O")
        labels = [0] * len(tokens)

        # Map character spans to token indices
        for entity_name, span in data_dic["entities"].items():
            ent_start = span["start"]
            ent_end = span["end"]
            label_id = label2id[entity_name]

            for idx, (tok_start, tok_end) in enumerate(token_spans):
                # Check if token overlaps with entity span
                if tok_start >= ent_start and tok_end <= ent_end:
                    labels[idx] = label_id

        # Create a record of data
        records.append({
            "file_name": doc_path,
            "tokens": tokens,
            "labels": labels
        })




In [4]:
file_path = "./tagged_dataset.json"

import json

# Load JSON from file
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
    data_preprocessing(data)

dataset = pd.DataFrame(records)
print("Dataset shape: ", dataset.shape)


Dataset shape:  (72, 3)


# Output of Data pre-processing:

In [26]:
dataset.head()

Unnamed: 0,file_name,tokens,labels
0,./datasets/dataset-master/Lease_Agreement_1.docx,"[RESIDENTIAL, LEASE, AGREEMENT, This, Lease, A...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,./datasets/dataset-master/Lease_Agreement_2.docx,"[RESIDENTIAL, LEASE, AGREEMENT, This, Lease, A...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,./datasets/dataset-master/Lease_Agreement_3.docx,"[RESIDENTIAL, LEASE, AGREEMENT, This, Lease, A...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,./datasets/dataset-master/Lease_Agreement_4.docx,"[RESIDENTIAL, LEASE, AGREEMENT, This, Lease, A...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,./datasets/dataset-master/Lease_Agreement_5.docx,"[RESIDENTIAL, LEASE, AGREEMENT, This, Lease, A...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
# Saving the prepared data as csv file
dataset.to_csv("lease_token_labels.csv", index=False)

In [6]:
import pandas as pd

# Load data to run model
df = pd.read_csv("./lease_token_labels.csv")


In [7]:
import ast

# Converting the data to model suitable format.
def safe_parse_list(x):
    if isinstance(x, list):
        return x
    return ast.literal_eval(x)

df["labels"] = df["labels"].apply(safe_parse_list)
df["tokens"] = df["tokens"].apply(safe_parse_list)


In [8]:
from datasets import Dataset

dataset = Dataset.from_pandas(df[["tokens", "labels"]])
print("Total length of Dataset: ", len(dataset))

Total length of Dataset:  72


# LegalBERT Model

###  Tokenizing and Label Alignment for LegalBERT (NER)

We use the `nlpaueb/legal-bert-base-uncased` tokenizer to split words into subwords.

However, since our labels are at the **word level**, we must align them with the **tokenized output**, which may split one word into multiple tokens.

Key rules:
- Only the **first subword** gets the label.
- Other subwords get `-100` so that the loss function ignores them.

This is handled using `word_ids()` which maps each token back to its source word.

We use `.map()` to apply this logic across the entire dataset.




In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)
    word_ids = tokenized.word_ids()
    aligned_labels = []
    previous_word = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word:
            aligned_labels.append(example["labels"][word_idx])
        else:
            aligned_labels.append(example["labels"][word_idx] if example["labels"][word_idx] != 0 else -100)
        previous_word = word_idx
    tokenized["labels"] = aligned_labels
    return tokenized

tokenized_dataset = dataset.map(tokenize_and_align_labels)


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

In [23]:
tokenized_dataset

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 72
})

In [24]:
#Splitting dataset into training and validation
# 90% train, 10% validation
dataset_split = tokenized_dataset.train_test_split(test_size=0.1)
print(dataset_split.shape)


{'train': (64, 5), 'test': (8, 5)}


### Load and Configure LegalBERT for NER

We use `AutoModelForTokenClassification` to load LegalBERT for a Named Entity Recognition task.

Steps:

1. **Count unique labels** from the dataset and set `num_labels`
2. **Load LegalBERT** model with a custom classification head sized to the number of entity types
3.  **Define label mappings**:
   - `id2label`: maps label indices to names (e.g., `1 → LESSOR_NAME`)
   - `label2id`: reverse mapping for training
4.  **Inject label maps into the model config** so it can display correct entity names during evaluation


In [25]:
from transformers import AutoModelForTokenClassification

label_list = list(set(label for example in dataset["labels"] for label in example))
num_labels = len(set(label_list))
id2label = {i: str(i) for i in range(num_labels)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    trust_remote_code=False, use_safetensors=True)
model.to("cuda")

# Define your label maps
id2label = {
    0: "O",
    1: "LESSOR_NAME",
    2: "LESSEE_NAME",
    3: "PROPERTY_ADDRESS",
    4: "LEASE_START_DATE",
    5: "LEASE_END_DATE",
    6: "RENT_AMOUNT",
    7: "SECURITY_DEPOSIT_AMOUNT"
}
label2id = {v: k for k, v in id2label.items()}


# Inject into model config
model.config.id2label = id2label
model.config.label2id = label2id

Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### compute_metrics(p): Evaluation Function for Token Classification

This function is used by the Hugging Face `Trainer` to evaluate the model's predictions during validation.

#### Steps:

1.  Converts logits into predicted label IDs using `argmax`.
2.  Filters out tokens with label `-100` (ignored tokens like padding and subwords).
3.  Calculates:
   - **Precision**, **Recall**, **F1-score** (weighted average)
   - **Accuracy**

These metrics are returned as a dictionary and logged during training and evaluation.


In [26]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_preds, true_labels = [], []
    for pred, label in zip(predictions, labels):
        for p_i, l_i in zip(pred, label):
            if l_i != -100:
                true_preds.append(p_i)
                true_labels.append(l_i)

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_preds, average='weighted')
    acc = accuracy_score(true_labels, true_preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


### TrainingArguments and Data Collator Setup

We configure training using Hugging Face’s `TrainingArguments` for LegalBERT fine-tuning.

**Key Settings**:
- `eval_strategy="epoch"` → evaluate after every epoch
- `save_strategy="epoch"` → save checkpoints after every epoch
- `load_best_model_at_end=True` → restore the model with the best validation performance
- `fp16=True` → enables mixed precision training for speed/memory efficiency
- `save_total_limit=2` → keeps the last 2 checkpoints only

We also use `DataCollatorForTokenClassification`, which handles dynamic padding for each batch.


In [27]:
training_args = TrainingArguments(
    output_dir="./legalbert-ner-30",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=2,
    fp16=True,
    load_best_model_at_end=True
)

data_collator = DataCollatorForTokenClassification(tokenizer)


### Trainer Setup and Training

We now initialize Hugging Face's `Trainer` with:

- Our `LegalBERT` token classification model
- The pre-tokenized training and evaluation datasets
- A `data_collator` for dynamic padding
- The `compute_metrics()` function for custom evaluation
- `TrainingArguments` for logging, saving, and training behavior


In [28]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.809053,0.936896,0.879367,0.936896,0.907221
2,No log,0.214261,0.936896,0.877774,0.936896,0.906372
3,No log,0.14794,0.964674,0.931091,0.964674,0.947467
4,No log,0.096714,0.965882,0.939992,0.965882,0.950717
5,No log,0.078332,0.975242,0.970691,0.975242,0.97195
6,No log,0.062715,0.983998,0.982442,0.983998,0.983194
7,0.483100,0.044564,0.989734,0.989553,0.989734,0.989518
8,0.483100,0.036253,0.990942,0.988953,0.990942,0.989533
9,0.483100,0.027879,0.99215,0.992816,0.99215,0.990322
10,0.483100,0.022957,0.993357,0.994669,0.993357,0.99376


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return fo

TrainOutput(global_step=240, training_loss=0.1251371572415034, metrics={'train_runtime': 148.0594, 'train_samples_per_second': 12.968, 'train_steps_per_second': 1.621, 'total_flos': 501716987412480.0, 'train_loss': 0.1251371572415034, 'epoch': 30.0})

In [29]:
#Saving the trained model for future use
trainer.save_model("legalbert-ner-model-30")
tokenizer.save_pretrained("legalbert-ner-model-30")


('legalbert-ner-model-30\\tokenizer_config.json',
 'legalbert-ner-model-30\\special_tokens_map.json',
 'legalbert-ner-model-30\\vocab.txt',
 'legalbert-ner-model-30\\added_tokens.json',
 'legalbert-ner-model-30\\tokenizer.json')

### Sample testing:

In [30]:
from transformers import pipeline

ner_pipe = pipeline("ner", model="legalbert-ner-model-30", tokenizer="legalbert-ner-model-30",aggregation_strategy="simple")

sample_text = "This Lease Agreement (\"Agreement\") is entered into on January 1, 2005, by and between: LESSOR: Union Pacific Railroad Company (\"Landlord\") LESSEE: CXT Incorporated (\"Tenant\")PROPERTY: The Landlord hereby leases to the Tenant the residential property located at: Grand Island, Nebraska 1. TERM OF LEASE The term of this lease shall commence on January 1, 2005 and shall terminate on December 31, 2009. This Agreement shall be considered a fixed-term lease. 2. RENT The Tenant agrees to pay the Landlord a monthly rent of $1,378. Rent is due on the 1st day of each month. If rent is not received by the 5th day of the month, a late fee of $50.00 will be assessed. 3. SECURITY DEPOSIT Upon execution of this Agreement, Tenant shall deposit with Landlord the sum of $5,000 as a security deposit. This deposit shall be held by the Landlord as security for the faithful performance by the Tenant of all terms, covenants, and conditions of this Agreement."
result = ner_pipe(sample_text)
print(result)


Device set to use cuda:0


[{'entity_group': 'LEASE_START_DATE', 'score': np.float32(0.74842185), 'word': 'january 1', 'start': 54, 'end': 63}, {'entity_group': 'LEASE_END_DATE', 'score': np.float32(0.38523662), 'word': ',', 'start': 63, 'end': 64}, {'entity_group': 'LEASE_END_DATE', 'score': np.float32(0.40854865), 'word': ',', 'start': 69, 'end': 70}, {'entity_group': 'LESSOR_NAME', 'score': np.float32(0.87431693), 'word': 'union pacific railroad company', 'start': 95, 'end': 125}, {'entity_group': 'LESSEE_NAME', 'score': np.float32(0.9378921), 'word': 'cxt incorporated', 'start': 147, 'end': 163}, {'entity_group': 'PROPERTY_ADDRESS', 'score': np.float32(0.92097855), 'word': ': grand island, nebraska', 'start': 260, 'end': 284}, {'entity_group': 'PROPERTY_ADDRESS', 'score': np.float32(0.5732186), 'word': '.', 'start': 286, 'end': 287}, {'entity_group': 'LEASE_START_DATE', 'score': np.float32(0.8060597), 'word': 'january 1, 2005', 'start': 343, 'end': 358}, {'entity_group': 'LEASE_END_DATE', 'score': np.float32

  return forward_call(*args, **kwargs)


In [31]:
# Beautifying the sample test result
def convert_output(ner_results):
    entities = []
    for ent in ner_results:
        if ent["entity_group"] != "O":
            entities.append({
                "label": ent["entity_group"],
                "text": ent["word"],
                "start": ent["start"],
                "end": ent["end"],
                "score": float(ent["score"])
            })
    return entities

# Example usage
cleaned_entities = convert_output(result)
for ent in cleaned_entities:
    print(f"{ent['label']:25} -> '{ent['text']}' ({ent['start']}-{ent['end']}) [{ent['score']:.2f}]")


LEASE_START_DATE          -> 'january 1' (54-63) [0.75]
LEASE_END_DATE            -> ',' (63-64) [0.39]
LEASE_END_DATE            -> ',' (69-70) [0.41]
LESSOR_NAME               -> 'union pacific railroad company' (95-125) [0.87]
LESSEE_NAME               -> 'cxt incorporated' (147-163) [0.94]
PROPERTY_ADDRESS          -> ': grand island, nebraska' (260-284) [0.92]
PROPERTY_ADDRESS          -> '.' (286-287) [0.57]
LEASE_START_DATE          -> 'january 1, 2005' (343-358) [0.81]
LEASE_END_DATE            -> 'december 31, 2009.' (382-400) [0.73]
SECURITY_DEPOSIT_AMOUNT   -> '1, 378' (521-526) [0.44]
LEASE_END_DATE            -> ',' (621-622) [0.41]
SECURITY_DEPOSIT_AMOUNT   -> '50. 00' (638-643) [0.47]
LEASE_END_DATE            -> ',' (714-715) [0.41]
SECURITY_DEPOSIT_AMOUNT   -> '$ 5, 000' (762-768) [0.72]


In [None]:
from transformers import pipeline

# Loading saved model for testing
ner_pipe = pipeline("ner", model="legalbert-ner-model-100", tokenizer="legalbert-ner-model-100",
                    aggregation_strategy="simple")


# Testing the model

In [None]:
import os
from docx import Document
import pandas as pd

dataframe = []


def read_docx_directory(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):
            file_path = os.path.join(folder_path, filename)
            try:
                doc = Document(file_path)
                text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
                doc_res = ner_pipe(text)
                desired_path = "." + file_path[file_path.find(r"\datasets"):]
                desired_path = desired_path.replace("\\", "/")
                res = {'FILE_PATH': desired_path, }
                for ent in doc_res:
                    if ent["entity_group"] != "O":
                        res[ent["entity_group"]] = ent["word"]
                        res["score"] = float(ent["score"])
                dataframe.append(res)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    return pd.DataFrame(dataframe)


results = read_docx_directory("./datasets/dataset-master/testing")
results.to_csv("bert_results.csv")