The cell below is used to create a Golden Standard csv file, by reading the 'tagged_dataset.json.  Used to check if the model's NER prediction is correct.

Format:
| FILE_PATH                                               | LESSOR_NAME      | LESSEE_NAME      | PROPERTY_ADDRESS                                 | LEASE_START_DATE | LEASE_END_DATE   | RENT_AMOUNT | SECURITY_DEPOSIT_AMOUNT |
|----------------------------------------------------------|------------------|------------------|--------------------------------------------------|------------------|------------------|-------------|--------------------------|
| ./datasets/dataset-master/Lease_Agreement_1.docx   | Ashley Martinez  | Sarah Williams   | 5316 Pine Rd, Franklin, CA 70457                 | May 26, 2025     | May 26, 2026     | $1038       |  $1245                    |
| ./datasets/dataset-master/Lease_Agreement_2.docx  | Ashley Jones     | Jessica Miller   | 538 Spruce Ct, Springfield, NY 82660             | December 16, 2024| December 16, 2025| $1746       |  $2095                    |
| ./datasets/dataset-master/Lease_Agreement_3.docx   | Brian Miller     | Amanda Garcia    | 1807 Chestnut Blvd, Fairview, CA 68967           | September 20, 2024| September 20, 2025| $2611     |  $3133                    |
| ./datasets/dataset-master/Lease_Agreement_4.docx   | David Hernandez  | Michael Johnson  | 2658 Elm St, Franklin, GA 71686                  | May 28, 2025     | May 28, 2026     | $3330       |  $3996                   |


In [None]:
from docx import Document
import pandas as pd

dataframe = pd.DataFrame(columns=['FILE_PATH', 'LESSOR_NAME', 'LESSEE_NAME', 'PROPERTY_ADDRESS', 'LEASE_START_DATE', 'LEASE_END_DATE', 'RENT_AMOUNT', 'SECURITY_DEPOSIT_AMOUNT'])

def data_preprocessing(data):
    for f in data:
        file_name = f["file_path"]
        doc_path = file_name
        doc = Document(doc_path)
        text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()]).strip()

        # Tokenize by whitespace, and track char positions
        tokens = []
        token_spans = []
        start = 0
        for word in text.split():
            start = text.find(word, start)  # Find next occurrence
            end = start + len(word)
            tokens.append(word)
            token_spans.append((start, end))
            start = end

        # Initialize all labels to 0 ("O")
        labels = [0] * len(tokens)
        row =[file_name]
        # Map character spans to token indices
        for entity_name, span in f["entities"].items():
            ent_start = span["start"]
            ent_end = span["end"]
            row.append(text[ent_start:ent_end])
        dataframe.loc[len(dataframe)] = row

    # Save to CSV
    dataframe.to_csv("golden_data.csv", index=False)





In [None]:
import json

file_path = "./tagged_testing_dataset.json"

with open(file_path) as f:
    data = json.load(f)
    data_preprocessing(data)

compare_result(pred_file, golden_file, result_file)

pred_file: Prediction result csv file path

golden_file: Original golden standard csv file path

result_file: File path to save the result file. By default, it is './result.csv'

What it does:

1. Loads both CSVs into DataFrames.

2. Iterates through each row in the pred_file.

3. Finds the corresponding row in the golden_file by matching FILE_PATH.

4. Compares each field in the row.

5. Appends a new row to the result:

    1 if prediction matches the golden truth.

    0 if it doesn't.

6. Saves the result as a CSV file.


In [3]:
import pandas as pd

def exact_match_results(pred_file, golden_file, result_file = 'result.csv'):
    # Load prediction and original CSVs
    pred_df = pd.read_csv(pred_file)
    orig_df = pd.read_csv(golden_file)

    # Output comparison dataframe
    result_dataframe = pd.DataFrame(columns=[
        'FILE_PATH', 'LESSOR_NAME', 'LESSEE_NAME',
        'PROPERTY_ADDRESS', 'LEASE_START_DATE', 'LEASE_END_DATE',
        'RENT_AMOUNT', 'SECURITY_DEPOSIT_AMOUNT'
    ])

    # Iterate through prediction file
    for i in range(len(pred_df)):
        file_path = pred_df.iloc[i]['FILE_PATH']
        orig = orig_df[orig_df['FILE_PATH'] == file_path]

        # Skip if no match in original
        if orig.empty:
            continue

        pred_row = pred_df[pred_df['FILE_PATH'] == file_path].iloc[0]
        orig_row = orig.iloc[0]
        print(pred_row['LESSOR_NAME'], orig_row['LESSEE_NAME'])
        comparison = {
            'FILE_PATH': file_path,
            'LESSOR_NAME': int(pred_row['LESSOR_NAME'] == orig_row['LESSOR_NAME']),
            'LESSEE_NAME': int(pred_row['LESSEE_NAME'] == orig_row['LESSEE_NAME']),
            'PROPERTY_ADDRESS': int(pred_row['PROPERTY_ADDRESS'] == orig_row['PROPERTY_ADDRESS']),
            'LEASE_START_DATE': int(pred_row['LEASE_START_DATE'] == orig_row['LEASE_START_DATE']),
            'LEASE_END_DATE': int(pred_row['LEASE_END_DATE'] == orig_row['LEASE_END_DATE']),
            #'RENT_AMOUNT': int(pred_row['RENT_AMOUNT'] == orig_row['RENT_AMOUNT']),
            'SECURITY_DEPOSIT_AMOUNT': int(pred_row['SECURITY_DEPOSIT_AMOUNT'] == orig_row['SECURITY_DEPOSIT_AMOUNT']),
        }

        result_dataframe = pd.concat([result_dataframe, pd.DataFrame([comparison])], ignore_index=True)

    # Save to CSV
    result_dataframe.to_csv(result_file, index=False)



In [4]:
from fuzzywuzzy import fuzz

def partial_match_results(pred_file, golden_file, result_file='partial_result.csv'):
    pred_df = pd.read_csv(pred_file)
    orig_df = pd.read_csv(golden_file)

    columns = [
        'LESSOR_NAME', 'LESSEE_NAME', 'PROPERTY_ADDRESS',
        'LEASE_START_DATE', 'LEASE_END_DATE',
         'SECURITY_DEPOSIT_AMOUNT'
    ]#'RENT_AMOUNT',

    result_dataframe = pd.DataFrame(columns=['FILE_PATH'] + columns)

    def fuzzy_score(pred, gold):
        if pd.isna(pred) or pd.isna(gold):
            return 0.0
        pred = str(pred).strip()
        gold = str(gold).strip()
        score = fuzz.token_sort_ratio(pred, gold)
        if score >= 95:
            return 1.0
        elif score >= 85:
            return 0.75
        elif score >= 70:
            return 0.5
        elif score >= 50:
            return 0.25
        else:
            return 0.0

    for i in range(len(pred_df)):
        file_path = pred_df.iloc[i]['FILE_PATH']
        orig = orig_df[orig_df['FILE_PATH'] == file_path]

        if orig.empty:
            continue

        pred_row = pred_df[pred_df['FILE_PATH'] == file_path].iloc[0]
        orig_row = orig.iloc[0]

        comparison = {'FILE_PATH': file_path}
        for col in columns:
            comparison[col] = fuzzy_score(pred_row[col], orig_row[col])

        result_dataframe = pd.concat([result_dataframe, pd.DataFrame([comparison])], ignore_index=True)

    result_dataframe.to_csv(result_file, index=False)


In [21]:
exact_match_results('fine_tuned_spacy_testing_results.csv', 'golden_data.csv', 'fine_tuned_spacy_exact_result.csv')
partial_match_results('fine_tuned_spacy_testing_results.csv', 'golden_data.csv', 'fine_tuned_spacy_partial_result.csv')

MEPC MILTON PARK NO. ADAPTIMMUNE THERAPEUTICS PLC
YOUHAN YOUXINPAI (BEIJING)
UCARSHOW HK LIMITED UCARBUY HOLDING LIMITED
UXIN LIMITED GLORYFIN INTERNATIONAL GROUP HOLDING COMPANY LIMITED
MEPC MILTON PARK NO. OXFORD IMMUNOTEC LIMITED
George Feldenkreis Supreme International, Inc
nan MICROSOFT CHINA CO., LTD
nan JINKO SOLAR CO., LTD
nan CHINA TIAN YUAN SPECIALTY CROP TECHNOLOGIES CO., LTD
BOYU FINANCING LEASE KAIFENG FINANCING LEASE
SICHUAN YOUXINPAI YOUXINPAI (BEIJING)
nan YOUXIN HONG KONG LIMITED
YOUXIN LIMITED PERFECT HARMONY GROUP LIMITED
HENZHEN NEW TAOYUAN INDUSTRIAL CO., LTD. SHENZHEN GUANGRONG ELECTRONIC CO., LTD


  result_dataframe = pd.concat([result_dataframe, pd.DataFrame([comparison])], ignore_index=True)


In [15]:
def evaluate_result_csv(result_file, threshold=0.5):
    df = pd.read_csv(result_file)

    if 'FILE_PATH' in df.columns:
        df = df.drop(columns=['FILE_PATH'])

    metrics = {}

    for col in df.columns:
        values = df[col].fillna(0).astype(float)

        tp = (values >= threshold).sum()
        total_pred = len(values)
        total_gold = len(values)  # assumes gold data is one row per file

        precision = tp / total_pred if total_pred else 0
        recall = tp / total_gold if total_gold else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

        metrics[col] = {
            'total': total_pred,
            'true_positives': int(tp),
            'precision': round(precision, 3),
            'recall': round(recall, 3),
            'f1_score': round(f1, 3)
        }

    return metrics


In [23]:
file_name = "fine_tuned_spacy_partial_result"
evaluation_results = evaluate_result_csv(f"{file_name}.csv")

# save evaluation results to a json file
with open("fine_tuned_spacy_evaluation_results.json", "w") as f:
    json.dump(evaluation_results, f)