In [12]:
import random
import spacy
from spacy.util import minibatch
from spacy.training.example import Example

In [26]:
import json


# Build spaCy-style NER training data: one (text, {"entities": [...]}) tuple per lease.
train_data = []

# Each record in the tagged dataset names a lease text file ('file_path') and maps
# entity labels to their offsets in that text ('entities': {label: {'start', 'end'}}).
with open('tagged_dataset.json', 'r', encoding='utf-8') as f:
    tagged_data = json.load(f)

for single_lease in tagged_data:
    file_path = single_lease['file_path']
    entities = single_lease['entities']

    # Close each lease file as soon as it has been read instead of leaving the
    # handle open; the explicit encoding keeps the offsets platform-independent.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_text = file.read()

    # spaCy expects entity annotations as (start, end, label) triples
    ent = [
        (details['start'], details['end'], entity)
        for entity, details in entities.items()
    ]
    train_data.append((file_text, {"entities": ent}))

print(train_data)

[('\nRESIDENTIAL LEASE AGREEMENT\n\nThis Lease Agreement ("Agreement") is entered into on May 26, 2025, by and between:\n\nLESSOR: Ashley Martinez ("Landlord")\nLESSEE: Sarah Williams ("Tenant")\n\nPROPERTY: The Landlord hereby leases to the Tenant the residential property located at:\n5316 Pine Rd, Franklin, CA 70457\n\n1. TERM OF LEASE\nThe term of this lease shall commence on May 26, 2025 and shall terminate on May 26, 2026. This Agreement shall be considered a fixed-term lease.\n\n2. RENT\nThe Tenant agrees to pay the Landlord a monthly rent of $1038. Rent is due on the 1st day of each month. If rent is not received by the 5th day of the month, a late fee of $50.00 will be assessed.\n\n3. SECURITY DEPOSIT\nUpon execution of this Agreement, Tenant shall deposit with Landlord the sum of $1245 as a security deposit. This deposit shall be held by the Landlord as security for the faithful performance by the Tenant of all terms, covenants, and conditions of this Agreement.\n\n4. USE OF P

In [None]:
# Fine-tune the NER component of the pretrained English pipeline on the lease
# annotations in `train_data`, then save the pipeline and load it back.
# Note: this starts from the pretrained "en_core_web_md" pipeline, not a blank model.
N_ITERATIONS = 100  # number of passes over the training examples
BATCH_SIZE = 2      # examples per update step
DROPOUT = 0.5       # dropout rate passed to nlp.update

nlp = spacy.load("en_core_web_md")

# Reuse the pipeline's NER component, adding one only if it is missing
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Register every entity label that appears in the annotations
for _, annotations in train_data:
    for _start, _end, label in annotations.get("entities", []):
        if label not in ner.labels:
            ner.add_label(label)

# Build the Example objects once: the texts and annotations do not change between
# iterations, so there is no need to re-tokenize every document on each pass.
# This also leaves `train_data` itself unshuffled for other cells.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data
]

# Train only NER; every other pipe is disabled for the duration of the block
# (select_pipes is the spaCy v3 replacement for the deprecated disable_pipes).
with nlp.select_pipes(enable="ner"):
    # Initialize the optimizer with default settings
    optimizer = nlp.create_optimizer()

    for itn in range(N_ITERATIONS):
        random.shuffle(train_examples)
        losses = {}  # accumulated across all batches of this iteration

        for batch in minibatch(train_examples, size=BATCH_SIZE):
            nlp.update(batch, drop=DROPOUT, sgd=optimizer, losses=losses)

        print(f"Iteration {itn}, Losses: {losses}")

# Save the trained model
nlp.to_disk("lease_ner_model")

# Load the saved model back from disk for use in the following cells
nlp_lease = spacy.load("lease_ner_model")

RESIDENTIAL LEASE AGREEMENT

This Lease Agreement..." with entities "[(126, 141, 'LESSOR_NAME'), (163, 175, 'LESSEE_NAM...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Iteration 0, Losses: {'ner': np.float32(512.9102)}
Iteration 1, Losses: {'ner': np.float32(280.051)}
Iteration 1, Losses: {'ner': np.float32(280.051)}
Iteration 2, Losses: {'ner': np.float32(620.42816)}
Iteration 2, Losses: {'ner': np.float32(620.42816)}
Iteration 3, Losses: {'ner': np.float32(327.36417)}
Iteration 3, Losses: {'ner': np.float32(327.36417)}
Iteration 4, Losses: {'ner': np.float32(116.17943)}
Iteration 4, Losses: {'ner': np.float32(116.17943)}
Iteration 5, Losses: {'ner': np.float32(105.21931)}
Iteration 5, Losses: {'ner': np.float32(105.21931)}
Iteration 6, Losses: {'ner': np.float32(137.71495)}
Iteration 6, Losses: {'ner': np.float32(137.71495)}
Iteration 7, Losses: {'ner': np.float32(329.6546)}
Iteration 7, Losses: {'ner': np.float32(329.6546)}
Iteration 8, Losses: {'ner': np.float32(254.05553)}
Iteration 8, Losses: {'ner': np.float32(254.05553)}
Iteration 9, Losses: {'ner': np.float32(258.8409)}
Iteration 9, Losses: {'ner': np.float32(258.8409)}
Iteration 10, Losses:

In [39]:
# Run the saved lease NER model on a test lease document and print each
# predicted entity's text together with its label.
# NOTE(review): the path below is absolute and machine-specific; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
with open(
    "/Users/akshaychavan/Documents/college/NLP/Lease_Documents_Text/test.txt",
    encoding="utf-8",
) as f:
    # The context manager closes the file handle once the text is read
    test_text = f.read()

doc = nlp_lease(test_text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Cynthia Anderson LESSEE_NAME
066 Moody Mission Apt. PROPERTY_ADDRESS
North Davidland LESSOR_NAME
02 June LESSEE_NAME
02 June 2026 LEASE_END_DATE
$2492 RENT_AMOUNT
$4984 SECURITY_DEPOSIT_AMOUNT
