# Named Entity Recognition (NER) Training for Lease Documents

Train a custom Named Entity Recognition (NER) model using spaCy for extracting entities from lease documents.


## Import Required Libraries

In [1]:
import random
import spacy
from spacy.util import minibatch
from spacy.training.example import Example
import json
import docx
from pathlib import Path
import os

## Data Preparation

### Load and Process Training Data

In [2]:
train_data = []

# read tagged dataset json file
with open('../datasets/tagged_dataset.json', 'r') as f:
    tagged_data = json.load(f)

print(f"Loaded {len(tagged_data)} lease documents from tagged dataset.")

# Process each lease document in the dataset
for single_lease in tagged_data:
    file_path = single_lease['file_path']
    entities = single_lease['entities']
    
    # Read the text content based on file type
    file_extension = Path(file_path).suffix.lower()
    
    if file_extension == '.docx':
        # Read DOCX file using python-docx
        doc = docx.Document(file_path)
        file_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
    elif file_extension == '.txt':
        # Read plain text file
        with open(file_path, 'r', encoding='utf-8') as file:
            file_text = file.read()
    else:
        print(f"Unsupported file type: {file_extension} for {file_path}")
        continue
    
    # Convert entity annotations to spaCy format (start, end, label)
    ent = []
    for entity, details in entities.items():
        start = details['start']
        end = details['end']
        # Validate entity positions against extracted text
        if start < len(file_text) and end <= len(file_text):
            # print(f"Entity: {entity}, Start: {start}, End: {end}, Text: {file_text[start:end]}")
            ent.append((start, end, entity))
        else:
            print(f"Warning: Entity position out of bounds for {entity} in {file_path}")
    
    # Add formatted training example to our dataset
    train_data.append((file_text, {"entities": ent}))

print(train_data)

Loaded 72 lease documents from tagged dataset.
[('\nRESIDENTIAL LEASE AGREEMENT\n\nThis Lease Agreement ("Agreement") is entered into on May 26, 2025, by and between:\n\nLESSOR: Ashley Martinez ("Landlord")\nLESSEE: Sarah Williams ("Tenant")\n\nPROPERTY: The Landlord hereby leases to the Tenant the residential property located at:\n5316 Pine Rd, Franklin, CA 70457\n\n1. TERM OF LEASE\nThe term of this lease shall commence on May 26, 2025 and shall terminate on May 26, 2026. This Agreement shall be considered a fixed-term lease.\n\n2. RENT\nThe Tenant agrees to pay the Landlord a monthly rent of $1038. Rent is due on the 1st day of each month. If rent is not received by the 5th day of the month, a late fee of $50.00 will be assessed.\n\n3. SECURITY DEPOSIT\nUpon execution of this Agreement, Tenant shall deposit with Landlord the sum of $1245 as a security deposit. This deposit shall be held by the Landlord as security for the faithful performance by the Tenant of all terms, covenants, a

## Model Setup

### Load Base Model and Configure NER Pipeline

In [3]:
# Load English language model and create a blank model for NER
nlp = spacy.load("en_core_web_md")

# Add NER pipe if it doesn't exist
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

### Add Custom Entity Labels

In [4]:
# Add labels to the NER component
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        if ent[2] not in ner.labels:
            ner.add_label(ent[2])

## Model Training

### Training Loop Configuration

In [None]:
# Disable other pipes and train only NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    # Initialize the optimizer with default settings
    optimizer = nlp.create_optimizer()
    
    # Training loop
    for itn in range(100):
        random.shuffle(train_data)
        losses = {}
        
        # Batch the examples
        losses = {}
        batches = minibatch(train_data, size=2)
        for batch in batches:
            examples = []
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                examples.append(example)
            
            # Update the model
            nlp.update(examples, drop=0.5, sgd=optimizer, losses=losses)
            
        print(f"Iteration {itn}, Losses: {losses}")

RESIDENTIAL LEASE AGREEMENT

This Lease Agreement..." with entities "[(120, 135, 'LESSOR_NAME'), (157, 172, 'LESSEE_NAM...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
 This Lease Agreement ..." with entities "[(125, 144, 'LESSOR_NAME'), (168, 198, 'LESSEE_NAM...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
RESIDENTIAL LEASE AGREEMENT

This Lease Agreement..." with entities "[(125, 137, 'LESSOR_NAME'), (159, 171, 'LESSEE_NAM...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
RESIDENTIAL LEASE AGREEMENT

This Lease Agreement..." with entities "[(123, 134, 'LESSOR_NAME'), (156, 168, 'LESSEE_NAM...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text

Iteration 0, Losses: {'ner': 1647.1835082282869}
Iteration 1, Losses: {'ner': 883.9675123983732}
Iteration 2, Losses: {'ner': 818.9494681369949}
Iteration 3, Losses: {'ner': 1036.4101664314396}
Iteration 4, Losses: {'ner': 1419.7191039515806}
Iteration 5, Losses: {'ner': 519.2446788843343}
Iteration 6, Losses: {'ner': 1151.887768724582}
Iteration 7, Losses: {'ner': 577.8010658372609}
Iteration 8, Losses: {'ner': 778.6897859671438}
Iteration 9, Losses: {'ner': 1120.4652351737288}


## Model Persistence

### Save the Trained Model

In [10]:
# Save the trained model
nlp.to_disk("./lease_ner_model")

## Model Testing

### Load and Test the Trained Model

In [None]:
nlp_lease = spacy.load("./lease_ner_model")

files_directory = "../datasets/dataset-master/testing"
test_files = [f for f in os.listdir(files_directory) if f.endswith(".docx")]

# Initialize results list
all_results = []

for file_path in test_files:
    full_file_path = files_directory + "/" + file_path
    
    # Read the document text
    if Path(full_file_path).suffix.lower() == '.docx':
        doc = docx.Document(full_file_path)
        test_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
    elif Path(full_file_path).suffix.lower() == '.txt':
        with open(full_file_path, 'r', encoding='utf-8') as file:
            test_text = file.read()
    else:
        print(f"Unsupported file type: {Path(full_file_path).suffix} for {full_file_path}")
        continue
    
    # Process with NER model
    doc = nlp_lease(test_text)
    print(f"Entities in {full_file_path}:")
    
    # Initialize entities dictionary for this file
    file_entities = {
            "LESSOR_NAME": {
                "start": -1,
                "end": -1
            },
            "LESSEE_NAME": {
                "start": -1,
                "end": -1
            },
            "PROPERTY_ADDRESS": {
                "start": -1,
                "end": -1
            },
            "LEASE_START_DATE": {
                "start": -1,
                "end": -1
            },
            "LEASE_END_DATE": {
                "start": -1,
                "end": -1
            },
            "RENT_AMOUNT": {
                "start": -1,
                "end": -1
            },
            "SECURITY_DEPOSIT_AMOUNT": {
                "start": -1,
                "end": -1
            }
        }
    
    for ent in doc.ents:
        print(f"  {ent.text} -> {ent.label_}")
        # Store entity with start and end positions
        file_entities[ent.label_] = {
            "start": ent.start_char,
            "end": ent.end_char
        }
    
    # Add result for this file
    result_entry = {
        "file_path": full_file_path,
        "entities": file_entities
    }
    all_results.append(result_entry)
    
    print("\n" + "="*50 + "\n")

results_file_name = "fine_tuned_spacy_testing_results.json"
# Save all results to JSON file
with open(results_file_name, "w") as f:
    json.dump(all_results, f, indent=4)

print(f"Results saved to {results_file_name} with {len(all_results)} files processed.") 

FileNotFoundError: [Errno 2] No such file or directory: './datasets/dataset-master/testing'

## Notes

- The model is trained for 100 iterations with a batch size of 2
- Dropout rate of 0.5 is used for regularization to prevent overfitting
- Only the NER component is trained while other pipeline components are disabled
- The training data should be in the format: `[(text, {"entities": [(start, end, label), ...]}), ...]`
- Make sure your tagged dataset JSON contains file paths and entity annotations with start/end positions