The cell below creates a golden-standard CSV file, which is used to check whether the model's NER predictions are correct.

Format:
| FILE_PATH                                               | LESSOR_NAME      | LESSEE_NAME      | PROPERTY_ADDRESS                                 | LEASE_START_DATE | LEASE_END_DATE   | RENT_AMOUNT | SECURITY_DEPOSIT_AMOUNT |
|----------------------------------------------------------|------------------|------------------|--------------------------------------------------|------------------|------------------|-------------|--------------------------|
| ./datasets/dataset-master/Lease_Agreement_1.docx   | Ashley Martinez  | Sarah Williams   | 5316 Pine Rd, Franklin, CA 70457                 | May 26, 2025     | May 26, 2026     | $1038       |  $1245                    |
| ./datasets/dataset-master/Lease_Agreement_2.docx  | Ashley Jones     | Jessica Miller   | 538 Spruce Ct, Springfield, NY 82660             | December 16, 2024| December 16, 2025| $1746       |  $2095                    |
| ./datasets/dataset-master/Lease_Agreement_3.docx   | Brian Miller     | Amanda Garcia    | 1807 Chestnut Blvd, Fairview, CA 68967           | September 20, 2024| September 20, 2025| $2611     |  $3133                    |
| ./datasets/dataset-master/Lease_Agreement_4.docx   | David Hernandez  | Michael Johnson  | 2658 Elm St, Franklin, GA 71686                  | May 28, 2025     | May 28, 2026     | $3330       |  $3996                   |


In [464]:
from docx import Document
import pandas as pd

# Empty golden-standard table: one column for the document path followed by
# one column per extracted lease entity.
dataframe = pd.DataFrame(
    columns=[
        'FILE_PATH',
        'LESSOR_NAME',
        'LESSEE_NAME',
        'PROPERTY_ADDRESS',
        'LEASE_START_DATE',
        'LEASE_END_DATE',
        'RENT_AMOUNT',
        'SECURITY_DEPOSIT_AMOUNT',
    ]
)

def data_preprocessing(data):
    """Build the golden-standard CSV from tagged lease documents.

    For each record in ``data`` the referenced .docx file is opened, its
    non-blank paragraphs are joined with newlines, and every entity's
    ``start``/``end`` character offsets are used to slice the entity text out
    of that joined string. One row per document is appended to the
    module-level ``dataframe``, which is then written to ``test_data.csv``.

    Args:
        data: Iterable of records. Each record provides ``"file_path"`` (the
            .docx to read) and ``"entities"``, a mapping whose values carry
            ``"start"`` and ``"end"`` offsets into the joined paragraph text.

    Notes:
        * Entity values are placed positionally after FILE_PATH, in the
          iteration order of ``record["entities"]``.
          NOTE(review): this assumes the entities are stored in the same
          order as the DataFrame columns -- confirm against the tagging tool.
        * Rows accumulate in the global ``dataframe``; calling this function
          again without recreating that DataFrame appends the rows again.
    """
    for record in data:
        file_name = record["file_path"]
        doc = Document(file_name)

        # Offsets in the tagged data index into this exact string, so the
        # join/strip logic must not change.
        text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()]).strip()

        # FILE_PATH first, then one sliced value per tagged entity.
        row = [file_name]
        row.extend(
            text[span["start"]:span["end"]]
            for span in record["entities"].values()
        )
        dataframe.loc[len(dataframe)] = row

    # Save to CSV
    dataframe.to_csv("test_data.csv", index=False)





In [465]:
import json

# Tagged dataset, relative to the working directory. A forward slash is used
# because it works on Windows as well as POSIX, whereas ".\\" only resolves
# on Windows.
file_path = "./tagged_dataset.json"

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, which may not be UTF-8 (e.g. on Windows).
with open(file_path, encoding="utf-8") as f:
    data = json.load(f)

# The file handle is no longer needed once the JSON is parsed, so the
# preprocessing runs after it has been closed.
data_preprocessing(data)

compare_results(pred_file, golden_file, result_file)

pred_file: Prediction result csv file path

golden_file: Original golden standard csv file path

result_file: File path where the result CSV is saved. Defaults to 'result.csv'.

What it does:

1. Loads both CSVs into DataFrames.

2. Iterates through each row in the pred_file.

3. Finds the corresponding row in the golden_file by matching FILE_PATH.

4. Compares each field in the row.

5. Appends a new row to the result:

    1 if prediction matches the golden truth.

    0 if it doesn't.

6. Saves the result as a CSV file.


In [466]:
import pandas as pd

def _fields_match(pred_value, gold_value):
    """Return True when a predicted field equals the golden field.

    Two missing values (blank CSV cells, read by pandas as NaN) count as a
    match; a plain ``==`` would report them as different because NaN != NaN.
    """
    if pd.isna(pred_value) and pd.isna(gold_value):
        return True
    return bool(pred_value == gold_value)


def compare_results(pred_file, golden_file, result_file='result.csv'):
    """Score a prediction CSV against the golden-standard CSV, field by field.

    Every row of ``pred_file`` is matched to the first row of ``golden_file``
    with the same FILE_PATH. For each matched row a result row is written
    containing the FILE_PATH and, for every entity column, 1 if the predicted
    value equals the golden value and 0 otherwise. Prediction rows whose
    FILE_PATH does not occur in the golden file are skipped.

    Args:
        pred_file: Path of the prediction CSV.
        golden_file: Path of the golden-standard CSV.
        result_file: Path the comparison CSV is written to.

    Returns:
        None. The comparison is saved to ``result_file``.
    """
    columns = [
        'FILE_PATH', 'LESSOR_NAME', 'LESSEE_NAME',
        'PROPERTY_ADDRESS', 'LEASE_START_DATE', 'LEASE_END_DATE',
        'RENT_AMOUNT', 'SECURITY_DEPOSIT_AMOUNT',
    ]
    entity_columns = columns[1:]

    # Load prediction and original CSVs
    pred_df = pd.read_csv(pred_file)
    orig_df = pd.read_csv(golden_file)

    # Rows are collected in a list and turned into a DataFrame once, instead
    # of concatenating onto an (initially empty) DataFrame on every iteration.
    comparisons = []

    for _, pred_row in pred_df.iterrows():
        file_path = pred_row['FILE_PATH']
        golden_matches = orig_df[orig_df['FILE_PATH'] == file_path]

        # Skip if no match in original
        if golden_matches.empty:
            continue

        orig_row = golden_matches.iloc[0]

        # Score the row currently being iterated; looking the prediction up
        # again by FILE_PATH would always score the first duplicate.
        comparison = {'FILE_PATH': file_path}
        for column in entity_columns:
            comparison[column] = int(
                _fields_match(pred_row[column], orig_row[column])
            )
        comparisons.append(comparison)

    # Passing ``columns`` keeps the header even when nothing matched.
    result_dataframe = pd.DataFrame(comparisons, columns=columns)

    # Save to CSV
    result_dataframe.to_csv(result_file, index=False)

