In [1]:
from spacy import Language
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
from pathlib import Path

spacy.require_gpu()
# Load a blank English pipeline with transformer
nlp = spacy.blank("en")
print(nlp.use_gpu)
# Add transformer + NER to the pipeline
transformer_config = {
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v3",
        "name": "nlpaueb/legal-bert-base-uncased",
        "tokenizer_config": {"use_fast": True},
        "transformer_config": {"output_attentions": False},
    }
}
nlp.add_pipe("transformer", config=transformer_config)
nlp.add_pipe("ner", last=True)


<spacy.pipeline.ner.EntityRecognizer at 0x18ecb954580>

In [20]:
import json
from docx import Document

def load_docx_text(docx_path):
    doc = Document(docx_path)
    return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])

file_path = ["D:\\NEU\Summer2025\\NLP\FinalProject\Project 1\\Lease-Buddy\\backend\\tagged_dataset3.json" ,"D:\\NEU\Summer2025\\NLP\FinalProject\Project 1\\Lease-Buddy\\backend\\tagged_dataset.json","D:\\NEU\Summer2025\\NLP\FinalProject\Project 1\\Lease-Buddy\\backend\\tagged_dataset4.json"]
# Load your JSON object
TRAIN_DATA = []
for file in file_path:
    with open(file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    print(file)

    for entry in raw_data:
        file_name = entry["file_path"].split("/")[-1].split(".")[0]
        doc_path = f"D:\\NEU\\Summer2025\\NLP\\FinalProject\\Lease-Buddy\\backend\\models\\NER\\Master Dataset\\{file_name}.docx"
        doc = Document(doc_path)
        text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()]).strip()
        ents = []

        for label, span in entry["entities"].items():
            start = span["start"]
            end = span["end"]
            entity_text = text[start:end]

            # Optional: basic sanity check
            if text[start:end].strip():
                ents.append((start, end, label))

        TRAIN_DATA.append((text, {"entities": ents}))


D:\NEU\Summer2025\NLP\FinalProject\Project 1\Lease-Buddy\backend\tagged_dataset3.json
D:\NEU\Summer2025\NLP\FinalProject\Project 1\Lease-Buddy\backend\tagged_dataset.json
D:\NEU\Summer2025\NLP\FinalProject\Project 1\Lease-Buddy\backend\tagged_dataset4.json


In [21]:
print(len(TRAIN_DATA))

72


In [23]:
ner = nlp.get_pipe("ner")
ner.add_label("LESSOR_NAME")
ner.add_label("LESSEE_NAME")
ner.add_label("PROPERTY_ADDRESS")
ner.add_label("LEASE_START_DATE")
ner.add_label("LEASE_END_DATE")
ner.add_label("RENT_AMOUNT")
ner.add_label("SECURITY_DEPOSIT_AMOUNT")

# Convert training data to spaCy examples
examples = []
for text, annot in TRAIN_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annot)
    examples.append(example)


In [24]:
from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm

optimizer = nlp.initialize()

for i in range(100):  # epochs
    losses = {}
    batches = minibatch(examples, size=2)
    for batch in tqdm(batches):
        nlp.update(batch, losses=losses)
    print(f"Epoch {i + 1}, Losses: {losses}")


36it [01:11,  2.00s/it]


Epoch 1, Losses: {'transformer': 0.0, 'ner': np.float32(10602.237)}


36it [01:11,  1.98s/it]


Epoch 2, Losses: {'transformer': 0.0, 'ner': np.float32(1517.2496)}


36it [01:10,  1.95s/it]


Epoch 3, Losses: {'transformer': 0.0, 'ner': np.float32(2752.5017)}


36it [03:09,  5.25s/it]


Epoch 4, Losses: {'transformer': 0.0, 'ner': np.float32(2716.4065)}


36it [03:53,  6.49s/it]


Epoch 5, Losses: {'transformer': 0.0, 'ner': np.float32(1579.731)}


36it [03:23,  5.66s/it]


Epoch 6, Losses: {'transformer': 0.0, 'ner': np.float32(909.9778)}


36it [04:14,  7.06s/it]


Epoch 7, Losses: {'transformer': 0.0, 'ner': np.float32(739.0276)}


36it [04:05,  6.81s/it]


Epoch 8, Losses: {'transformer': 0.0, 'ner': np.float32(1156.2622)}


36it [04:02,  6.75s/it]


Epoch 9, Losses: {'transformer': 0.0, 'ner': np.float32(366.6578)}


36it [04:02,  6.73s/it]


Epoch 10, Losses: {'transformer': 0.0, 'ner': np.float32(193.83557)}


36it [04:02,  6.74s/it]


Epoch 11, Losses: {'transformer': 0.0, 'ner': np.float32(161.51445)}


36it [04:02,  6.74s/it]


Epoch 12, Losses: {'transformer': 0.0, 'ner': np.float32(223.89786)}


36it [04:19,  7.20s/it]


Epoch 13, Losses: {'transformer': 0.0, 'ner': np.float32(121.466034)}


36it [04:00,  6.69s/it]


Epoch 14, Losses: {'transformer': 0.0, 'ner': np.float32(192.64685)}


36it [04:03,  6.77s/it]


Epoch 15, Losses: {'transformer': 0.0, 'ner': np.float32(124.91169)}


36it [01:27,  2.43s/it]


Epoch 16, Losses: {'transformer': 0.0, 'ner': np.float32(116.54868)}


36it [01:11,  1.98s/it]


Epoch 17, Losses: {'transformer': 0.0, 'ner': np.float32(138.6576)}


36it [01:10,  1.96s/it]


Epoch 18, Losses: {'transformer': 0.0, 'ner': np.float32(111.30799)}


36it [01:22,  2.29s/it]


Epoch 19, Losses: {'transformer': 0.0, 'ner': np.float32(162.73682)}


36it [01:37,  2.71s/it]


Epoch 20, Losses: {'transformer': 0.0, 'ner': np.float32(93.528305)}


36it [01:36,  2.69s/it]


Epoch 21, Losses: {'transformer': 0.0, 'ner': np.float32(120.12452)}


36it [01:26,  2.41s/it]


Epoch 22, Losses: {'transformer': 0.0, 'ner': np.float32(130.47559)}


36it [03:18,  5.51s/it]


Epoch 23, Losses: {'transformer': 0.0, 'ner': np.float32(98.93853)}


36it [04:00,  6.68s/it]


Epoch 24, Losses: {'transformer': 0.0, 'ner': np.float32(148.98027)}


36it [04:07,  6.87s/it]


Epoch 25, Losses: {'transformer': 0.0, 'ner': np.float32(101.09844)}


36it [04:02,  6.74s/it]


Epoch 26, Losses: {'transformer': 0.0, 'ner': np.float32(93.462)}


36it [03:03,  5.09s/it]


Epoch 27, Losses: {'transformer': 0.0, 'ner': np.float32(95.067245)}


36it [01:22,  2.28s/it]


Epoch 28, Losses: {'transformer': 0.0, 'ner': np.float32(82.39818)}


36it [02:16,  3.80s/it]


Epoch 29, Losses: {'transformer': 0.0, 'ner': np.float32(88.090065)}


36it [05:26,  9.07s/it]


Epoch 30, Losses: {'transformer': 0.0, 'ner': np.float32(59.977867)}


36it [03:34,  5.95s/it]


Epoch 31, Losses: {'transformer': 0.0, 'ner': np.float32(78.000496)}


36it [06:15, 10.42s/it]


Epoch 32, Losses: {'transformer': 0.0, 'ner': np.float32(63.50784)}


36it [03:15,  5.44s/it]


Epoch 33, Losses: {'transformer': 0.0, 'ner': np.float32(79.390724)}


36it [01:20,  2.24s/it]


Epoch 34, Losses: {'transformer': 0.0, 'ner': np.float32(58.23883)}


36it [01:13,  2.04s/it]


Epoch 35, Losses: {'transformer': 0.0, 'ner': np.float32(40.265583)}


36it [01:10,  1.96s/it]


Epoch 36, Losses: {'transformer': 0.0, 'ner': np.float32(34.757736)}


36it [01:35,  2.66s/it]


Epoch 37, Losses: {'transformer': 0.0, 'ner': np.float32(16.307724)}


36it [01:36,  2.69s/it]


Epoch 38, Losses: {'transformer': 0.0, 'ner': np.float32(21.808502)}


36it [01:37,  2.70s/it]


Epoch 39, Losses: {'transformer': 0.0, 'ner': np.float32(17.456564)}


36it [01:37,  2.70s/it]


Epoch 40, Losses: {'transformer': 0.0, 'ner': np.float32(16.023615)}


36it [01:37,  2.70s/it]


Epoch 41, Losses: {'transformer': 0.0, 'ner': np.float32(13.696202)}


36it [01:36,  2.67s/it]


Epoch 42, Losses: {'transformer': 0.0, 'ner': np.float32(16.778448)}


36it [01:35,  2.66s/it]


Epoch 43, Losses: {'transformer': 0.0, 'ner': np.float32(13.429127)}


36it [01:36,  2.69s/it]


Epoch 44, Losses: {'transformer': 0.0, 'ner': np.float32(14.114061)}


36it [01:36,  2.69s/it]


Epoch 45, Losses: {'transformer': 0.0, 'ner': np.float32(13.203835)}


36it [01:38,  2.75s/it]


Epoch 46, Losses: {'transformer': 0.0, 'ner': np.float32(13.329561)}


36it [01:36,  2.69s/it]


Epoch 47, Losses: {'transformer': 0.0, 'ner': np.float32(12.1308975)}


36it [01:36,  2.69s/it]


Epoch 48, Losses: {'transformer': 0.0, 'ner': np.float32(12.336259)}


36it [01:37,  2.72s/it]


Epoch 49, Losses: {'transformer': 0.0, 'ner': np.float32(10.778618)}


36it [01:37,  2.71s/it]


Epoch 50, Losses: {'transformer': 0.0, 'ner': np.float32(9.993546)}


36it [01:36,  2.68s/it]


Epoch 51, Losses: {'transformer': 0.0, 'ner': np.float32(10.374078)}


36it [01:35,  2.66s/it]


Epoch 52, Losses: {'transformer': 0.0, 'ner': np.float32(10.584973)}


36it [02:11,  3.66s/it]


Epoch 53, Losses: {'transformer': 0.0, 'ner': np.float32(9.937037)}


36it [04:06,  6.85s/it]


Epoch 54, Losses: {'transformer': 0.0, 'ner': np.float32(9.672998)}


36it [04:12,  7.02s/it]


Epoch 55, Losses: {'transformer': 0.0, 'ner': np.float32(9.88815)}


36it [02:12,  3.68s/it]


Epoch 56, Losses: {'transformer': 0.0, 'ner': np.float32(9.948992)}


36it [01:11,  2.00s/it]


Epoch 57, Losses: {'transformer': 0.0, 'ner': np.float32(9.842823)}


36it [01:12,  2.02s/it]


Epoch 58, Losses: {'transformer': 0.0, 'ner': np.float32(10.435325)}


36it [01:09,  1.94s/it]


Epoch 59, Losses: {'transformer': 0.0, 'ner': np.float32(12.61137)}


36it [01:10,  1.95s/it]


Epoch 60, Losses: {'transformer': 0.0, 'ner': np.float32(17.121765)}


36it [01:09,  1.94s/it]


Epoch 61, Losses: {'transformer': 0.0, 'ner': np.float32(64.0304)}


36it [01:08,  1.91s/it]


Epoch 62, Losses: {'transformer': 0.0, 'ner': np.float32(10.817339)}


36it [01:08,  1.91s/it]


Epoch 63, Losses: {'transformer': 0.0, 'ner': np.float32(34.111546)}


36it [01:09,  1.93s/it]


Epoch 64, Losses: {'transformer': 0.0, 'ner': np.float32(30.854439)}


36it [01:16,  2.13s/it]


Epoch 65, Losses: {'transformer': 0.0, 'ner': np.float32(24.057842)}


36it [01:35,  2.66s/it]


Epoch 66, Losses: {'transformer': 0.0, 'ner': np.float32(17.17992)}


36it [01:19,  2.22s/it]


Epoch 67, Losses: {'transformer': 0.0, 'ner': np.float32(15.526397)}


36it [01:11,  1.99s/it]


Epoch 68, Losses: {'transformer': 0.0, 'ner': np.float32(28.801867)}


36it [01:09,  1.94s/it]


Epoch 69, Losses: {'transformer': 0.0, 'ner': np.float32(19.41264)}


36it [01:10,  1.96s/it]


Epoch 70, Losses: {'transformer': 0.0, 'ner': np.float32(20.220097)}


36it [02:52,  4.81s/it]


Epoch 71, Losses: {'transformer': 0.0, 'ner': np.float32(31.407755)}


36it [04:06,  6.84s/it]


Epoch 72, Losses: {'transformer': 0.0, 'ner': np.float32(29.61544)}


36it [04:07,  6.87s/it]


Epoch 73, Losses: {'transformer': 0.0, 'ner': np.float32(41.861652)}


36it [04:02,  6.74s/it]


Epoch 74, Losses: {'transformer': 0.0, 'ner': np.float32(153.71074)}


36it [04:03,  6.77s/it]


Epoch 75, Losses: {'transformer': 0.0, 'ner': np.float32(42.313744)}


36it [01:19,  2.20s/it]


Epoch 76, Losses: {'transformer': 0.0, 'ner': np.float32(15.790389)}


36it [01:11,  1.99s/it]


Epoch 77, Losses: {'transformer': 0.0, 'ner': np.float32(113.2761)}


36it [01:13,  2.03s/it]


Epoch 78, Losses: {'transformer': 0.0, 'ner': np.float32(33.948845)}


36it [01:30,  2.52s/it]


Epoch 79, Losses: {'transformer': 0.0, 'ner': np.float32(30.592773)}


36it [01:37,  2.70s/it]


Epoch 80, Losses: {'transformer': 0.0, 'ner': np.float32(14.9271145)}


36it [01:37,  2.70s/it]


Epoch 81, Losses: {'transformer': 0.0, 'ner': np.float32(12.069328)}


36it [01:37,  2.70s/it]


Epoch 82, Losses: {'transformer': 0.0, 'ner': np.float32(20.716698)}


36it [01:36,  2.67s/it]


Epoch 83, Losses: {'transformer': 0.0, 'ner': np.float32(14.504044)}


36it [01:35,  2.64s/it]


Epoch 84, Losses: {'transformer': 0.0, 'ner': np.float32(10.120674)}


36it [01:35,  2.66s/it]


Epoch 85, Losses: {'transformer': 0.0, 'ner': np.float32(9.6668)}


36it [01:36,  2.69s/it]


Epoch 86, Losses: {'transformer': 0.0, 'ner': np.float32(13.116548)}


36it [01:35,  2.66s/it]


Epoch 87, Losses: {'transformer': 0.0, 'ner': np.float32(15.872009)}


36it [01:36,  2.67s/it]


Epoch 88, Losses: {'transformer': 0.0, 'ner': np.float32(13.183422)}


36it [01:36,  2.68s/it]


Epoch 89, Losses: {'transformer': 0.0, 'ner': np.float32(8.464504)}


36it [01:36,  2.68s/it]


Epoch 90, Losses: {'transformer': 0.0, 'ner': np.float32(13.635391)}


36it [01:36,  2.68s/it]


Epoch 91, Losses: {'transformer': 0.0, 'ner': np.float32(10.635466)}


36it [01:37,  2.71s/it]


Epoch 92, Losses: {'transformer': 0.0, 'ner': np.float32(52.894005)}


36it [01:37,  2.71s/it]


Epoch 93, Losses: {'transformer': 0.0, 'ner': np.float32(9.243723)}


36it [03:13,  5.36s/it]


Epoch 94, Losses: {'transformer': 0.0, 'ner': np.float32(8.920497)}


36it [04:11,  6.98s/it]


Epoch 95, Losses: {'transformer': 0.0, 'ner': np.float32(11.594985)}


36it [04:06,  6.83s/it]


Epoch 96, Losses: {'transformer': 0.0, 'ner': np.float32(12.791665)}


36it [04:09,  6.94s/it]


Epoch 97, Losses: {'transformer': 0.0, 'ner': np.float32(12.308514)}


36it [05:30,  9.18s/it]


Epoch 98, Losses: {'transformer': 0.0, 'ner': np.float32(13.290091)}


36it [05:35,  9.33s/it]


Epoch 99, Losses: {'transformer': 0.0, 'ner': np.float32(15.444285)}


36it [02:44,  4.58s/it]

Epoch 100, Losses: {'transformer': 0.0, 'ner': np.float32(11.513182)}





In [25]:
nlp.to_disk("custom_legal_ner_spacy_100")
# Later
nlp = spacy.load("custom_legal_ner_spacy_100")

In [26]:
doc = nlp("This Lease Agreement (\"Agreement\") is entered into on January 1, 2005, by and between: LESSOR: Union Pacific Railroad Company (\"Landlord\") LESSEE: CXT Incorporated (\"Tenant\")PROPERTY: The Landlord hereby leases to the Tenant the residential property located at: Grand Island, Nebraska 1. TERM OF LEASE The term of this lease shall commence on January 1, 2005 and shall terminate on December 31, 2009. This Agreement shall be considered a fixed-term lease. 2. RENT The Tenant agrees to pay the Landlord a monthly rent of $1,378. Rent is due on the 1st day of each month. If rent is not received by the 5th day of the month, a late fee of $50.00 will be assessed. 3. SECURITY DEPOSIT Upon execution of this Agreement, Tenant shall deposit with Landlord the sum of $5,000 as a security deposit. This deposit shall be held by the Landlord as security for the faithful performance by the Tenant of all terms, covenants, and conditions of this Agreement.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Union Pacific Railroad Company LESSOR_NAME
CXT Incorporated LESSEE_NAME
December 31, 2009 LEASE_END_DATE
$1,378 RENT_AMOUNT
$5,000 SECURITY_DEPOSIT_AMOUNT


In [27]:
print(doc.ents)

(Union Pacific Railroad Company, CXT Incorporated, December 31, 2009, $1,378, $5,000)


In [2]:
import spacy
# Later
nlp = spacy.load("custom_legal_ner_spacy_100")

  self._model.load_state_dict(torch.load(filelike, map_location=device))


In [15]:
import os
from docx import Document
import pandas as pd

dataframe = []
def read_docx_directory(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):
            file_path = os.path.join(folder_path, filename)
            try:
                doc = Document(file_path)
                text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
                doc_res = nlp(text)
                desired_path = file_path.replace("\\", "/")
                res = {'FILE_PATH' : desired_path,}
                for ent in doc_res.ents:
                    res[ent.label_] = ent.text
                dataframe.append(res)

            except Exception as e:
                print(f"Error reading {filename}: {e}")
    return pd.DataFrame(dataframe)


In [16]:
results = read_docx_directory("../datasets/dataset-master/testing")

In [17]:
results.to_csv("spacy_bert_result.csv")