<a href="https://colab.research.google.com/github/asantos2000/master-degree-santos-anderson/blob/main/code/src/2_b_semantic_annotation-v2-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2.B. SEMANTIC ANNOTATION

Google Colab version.

In [1]:
# Detect Google Colab: true when the `google.colab` module is already loaded in this interpreter
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  # Mount Google Drive under /content/drive so the project files become reachable
  from google.colab import drive
  drive.mount('/content/drive')
  # Copy the project's `configuration` package from Google Drive into the working
  # directory (a later cell imports `configuration.main` from it)
  !cp -r /content/drive/MyDrive/cfr2sbvr/modules/configuration configuration

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [2]:
# Install the project dependencies listed in the requirements file stored on Google Drive.
# NOTE(review): this runs even when IN_COLAB is False, although the path is under the Colab Drive mount point — confirm.
!pip install -r /content/drive/MyDrive/cfr2sbvr/requirements.txt



In [3]:
# Standard library imports
import json
import os
import time
import re
import glob
import yaml
from collections import Counter, defaultdict
from datetime import datetime
from decimal import Decimal
from pathlib import Path
import logging
from logging.handlers import TimedRotatingFileHandler

# Third-party libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pydantic import BaseModel, Field
from sklearn.metrics import confusion_matrix, classification_report
from typing import List, Dict, Optional, Any, Tuple, Set

# OpenAI and instructor libraries
from openai import OpenAI
import instructor

# Franz AllegroGraph (AG) imports
from franz.openrdf.connect import ag_connect
from franz.openrdf.repository.repository import Repository, RepositoryConnection
from franz.openrdf.query.query import QueryLanguage
from franz.openrdf.sail.allegrographserver import AllegroGraphServer, Catalog

# inflect library
import inflect

## Settings

Default settings; check them before running the notebook.

### General functions (commented)

In [4]:
# def get_next_filename(file_dir: str, file_prefix: str, extension: str) -> str:
#     """
#     Generates the next filename in a sequence based on existing files in a directory,
#     considering the file extension.

#     The filename format is: `{file_prefix}-{YYYY-MM-DD}-{N}.{extension}`,
#     where `N` is an incrementing integer for files with the same date.

#     Args:
#         file_dir (str): The directory where the files are stored.
#         file_prefix (str): The prefix used in the filenames.
#         extension (str): The file extension (e.g., 'json', 'txt').

#     Returns:
#         str: The full path to the next filename in the sequence.

#     Example:
#         next_file = get_next_filename('../checkpoints', 'documents', 'json')
#         print(next_file)
#         # Output might be: ../checkpoints/documents-2024-10-19-5.json
#     """
#     today_str: str = datetime.today().strftime('%Y-%m-%d')
#     path: str = file_dir

#     # Ensure the directory exists
#     if not os.path.exists(path):
#         os.makedirs(path)

#     files = os.listdir(path)

#     # Create the pattern dynamically using file_prefix and extension
#     pattern = re.compile(
#         r'^' + re.escape(file_prefix) + r'-(\d{4}-\d{2}-\d{2})-(\d+)\.' + re.escape(extension) + r'$'
#     )

#     file_info_list = []

#     for filename in files:
#         match = pattern.match(filename)
#         if match:
#             date_str: str = match.group(1)
#             number: int = int(match.group(2))
#             file_info_list.append({'filename': filename, 'date': date_str, 'number': number})

#     if file_info_list:
#         # Sort by date and number in descending order
#         sorted_files = sorted(
#             file_info_list,
#             key=lambda x: (x['date'], x['number']),
#             reverse=True
#         )

#         latest_file_info = sorted_files[0]
#         latest_date: str = latest_file_info['date']
#         latest_number: int = latest_file_info['number']

#         if latest_date == today_str:
#             new_number: int = latest_number + 1
#         else:
#             new_number = 1
#     else:
#         new_number = 1

#     new_filename: str = f'{file_prefix}-{today_str}-{new_number}.{extension}'
#     new_filepath: str = os.path.join(path, new_filename)

#     return new_filepath


In [5]:
# # Load the YAML config file
# def load_config(config_file="/content/drive/MyDrive/cfr2sbvr/config.yaml"):
#     try:
#         with open(config_file, "r") as file:
#             config = yaml.safe_load(file)
#     except FileNotFoundError:
#         raise FileNotFoundError(f"Configuration file {config_file} not found.")
#     except yaml.YAMLError as exc:
#         raise ValueError(f"Error parsing YAML file {config_file}: {exc}")

#     # Ensure config structure is correct
#     if "LLM" not in config or "DEFAULT_CHECKPOINT_DIR" not in config:
#         raise ValueError("Required configuration keys are missing in the config file.")

#     # Set the OpenAI API key from environment variable if it's not set in config
#     config["LLM"]["OPENAI_API_KEY"] = os.getenv(
#         "OPENAI_API_KEY", config["LLM"].get("OPENAI_API_KEY")
#     )

#     # Dynamically set checkpoint and report files using the get_next_filename function
#     config["DEFAULT_CHECKPOINT_FILE"] = get_next_filename(
#         config["DEFAULT_CHECKPOINT_DIR"], "documents", "json"
#     )
#     config["DEFAULT_EXTRACTION_REPORT_FILE"] = get_next_filename(
#         config["DEFAULT_OUTPUT_DIR"], "extraction_report", "html"
#     )

#     return config

### Get configuration

In [6]:
# Project-local configuration module, bound to the short name `configuration`
import configuration.main as configuration

# Development mode: re-import the module so that edits made to it since the
# first import are picked up without restarting the kernel
import importlib
importlib.reload(configuration)

# Load the settings; `config` is indexed like a dict by the rest of the notebook
config = configuration.load_config()

Generated files for analysis in this run

In [7]:
# Show the checkpoint, extraction-report and Excel file paths taken from the configuration
print(config["DEFAULT_CHECKPOINT_FILE"],
config["DEFAULT_EXTRACTION_REPORT_FILE"],
config["DEFAULT_EXCEL_FILE"])

/content/drive/MyDrive/cfr2sbvr/checkpoints/documents-2024-10-25-1.json /content/drive/MyDrive/cfr2sbvr/outputs/extraction_report-2024-10-25-1.html /content/drive/MyDrive/cfr2sbvr/outputs/compare_items_metrics.xlsx


### Logging configuration

In [8]:
# Resolve the configured log directory against the current working directory
# and create it if it does not exist yet
log_directory = os.path.join(os.getcwd(), config["DEFAULT_LOG_DIR"])
os.makedirs(log_directory, exist_ok=True)

# Path for the log file
log_file_path = os.path.join(log_directory, 'application.log')

# Rotate the log file every midnight; backupCount=0 means rotated files are never deleted
file_handler = TimedRotatingFileHandler(
    log_file_path, when="midnight", interval=1, backupCount=0
)

# Set the file handler's log format
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))

# Set up logging configuration.
# force=True is required because basicConfig() silently does nothing when the
# root logger already has handlers (the notebook's later output is printed in
# the default "LEVEL:name:message" format, which indicates that was the case).
# It also makes re-running this cell safe: the previous handlers are removed
# and closed instead of being kept alongside the new ones.
logging.basicConfig(
    level=config["LOG_LEVEL"],  # Set to the desired log level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Console log format
    datefmt='%Y-%m-%d %H:%M:%S',  # Custom date format
    handlers=[
        file_handler,  # Log to the rotating file in the log directory
        logging.StreamHandler()  # Log to console
    ],
    force=True,
)

# Module-level logger used by the rest of the notebook
logger = logging.getLogger(__name__)

# Log a test message to verify
logger.info("Logging is set up with daily rotation.")


In [9]:
def measure_time(func):
    """
    Decorator to measure the execution time of a function.

    The elapsed wall-clock time is logged at INFO level after the wrapped
    function returns. If the wrapped function raises, the exception propagates
    and no timing is logged.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same call signature and return value as ``func``,
        carrying its name and docstring.
    """
    import functools  # local import so the decorator does not depend on the import cell

    # functools.wraps copies __name__, __doc__, etc. so decorated functions
    # stay identifiable in logs, tracebacks and help()
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        logger.info(f"Execution time for {func.__name__}: {elapsed_time:.2f} seconds")
        return result
    return wrapper

## Checkpoints

Documents, annotated datasets, statistics, and metrics about the execution of the notebook are stored in checkpoints.

There are several checkpoints in the directory `../checkpoints`. We will restore the checkpoint pointed to by the configuration `DEFAULT_CHECKPOINT_FILE`.

During execution, the checkpoint is restored at the beginning of each section and saved at the end. We can run and restore the checkpoint several times. If a run fails, find the closest checkpoint and restore it.

### General functions

In [10]:
def convert_set_to_list(data: Any) -> Any:
    """
    Recursively converts sets to lists in the data structure.

    Dicts, lists and tuples are traversed; every set or frozenset found at any
    depth is replaced by a list, and the elements of that set are converted as
    well. Tuples stay tuples (namedtuples keep their type). Any other value is
    returned unchanged.

    Args:
        data (Any): The data structure to process, which can be a dict, list, tuple, set, or other types.

    Returns:
        Any: The data structure with all sets converted to lists. The order of
        the elements of a converted set is the set's iteration order.
    """
    if isinstance(data, dict):
        return {key: convert_set_to_list(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_set_to_list(item) for item in data]
    if isinstance(data, tuple):
        # A tuple can hold a set, so it has to be traversed too
        items = [convert_set_to_list(item) for item in data]
        # Rebuild namedtuples through their own constructor so the type is preserved
        return type(data)(*items) if hasattr(data, "_fields") else tuple(items)
    if isinstance(data, (set, frozenset)):
        # Set elements may themselves be frozensets or tuples: convert them as well
        return [convert_set_to_list(item) for item in data]
    return data

In [11]:

# Define a model for the Document
# A single stored item; DocumentManager keys documents by the (id, type) pair.
class Document(BaseModel):
    id: str  # Identifier of the document (e.g. "§ 275.0-2" for a CFR section)
    type: str  # New field to represent the type of the document (e.g. "section")
    content: Any  # Content can be any data type: list, dict, string, etc.

# Define the DocumentManager class
class DocumentManager(BaseModel):
    """
    Registry of Document objects keyed by (id, type), with JSON save/restore.

    On disk each key is stored as the string "<id>|<type>", because JSON
    object keys cannot be tuples.
    """
    documents: Dict[Tuple[str, str], Document] = Field(default_factory=dict)  # Keys are tuples (id, type)

    def add_document(self, doc: Document) -> None:
        """
        Adds a document to the manager.

        A document already stored under the same (id, type) is replaced.

        Args:
            doc (Document): The document to add.
        """
        self.documents[(doc.id, doc.type)] = doc

    def retrieve_document(self, doc_id: str, doc_type: str) -> Optional[Document]:
        """
        Retrieves a document by its id and type.

        Args:
            doc_id (str): The ID of the document.
            doc_type (str): The type of the document.

        Returns:
            Optional[Document]: The retrieved document, or None if not found.
        """
        return self.documents.get((doc_id, doc_type))

    def list_document_ids(self, doc_type: Optional[str] = None) -> List[str]:
        """
        Lists all document ids, optionally filtered by type.

        Args:
            doc_type (Optional[str], optional): The type of documents to list.
                None (or an empty string) lists the ids of every type. Defaults to None.

        Returns:
            List[str]: A list of document ids, in insertion order.
        """
        return [
            doc_id
            for (doc_id, d_type) in self.documents
            if not doc_type or d_type == doc_type
        ]

    def exclude_document(self, doc_id: str, doc_type: str) -> None:
        """
        Excludes a document by its id and type.

        Does nothing when no such document is stored.

        Args:
            doc_id (str): The ID of the document to exclude.
            doc_type (str): The type of the document.
        """
        self.documents.pop((doc_id, doc_type), None)

    def persist_to_file(self, filename: str) -> None:
        """
        Persists the current state to a file, converting tuple keys to strings and sets to lists.

        Args:
            filename (str): The filename to save the documents.
        """
        # model_dump() is the pydantic v2 replacement for the deprecated .dict();
        # the notebook already relies on v2 (Document.model_validate).
        serializable_documents = {
            f"{doc_id}|{doc_type}": convert_set_to_list(doc.model_dump())
            for (doc_id, doc_type), doc in self.documents.items()
        }
        with open(filename, 'w', encoding='utf-8') as file:
            json.dump(serializable_documents, file, indent=4)

    @classmethod
    def restore_from_file(cls, filename: str) -> 'DocumentManager':
        """
        Restores the state from a file, converting string keys back to tuples.

        Args:
            filename (str): The filename to restore the documents from.

        Returns:
            DocumentManager: The restored DocumentManager instance.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If a key in the file does not have the "<id>|<type>" form.
        """
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)

        documents = {}
        for key, doc_data in data.items():
            # Split on the LAST '|' so that an id containing '|' is kept whole
            # instead of being truncated. Assumes type names never contain '|'
            # — TODO confirm.
            doc_id, separator, doc_type = key.rpartition('|')
            if not separator:
                raise ValueError(
                    f"Invalid document key '{key}' in '{filename}': expected '<id>|<type>'."
                )
            documents[(doc_id, doc_type)] = Document(**doc_data)
        return cls(documents=documents)


In [12]:
def restore_checkpoint(filename: Optional[str] = None) -> DocumentManager:
    """
    Load a DocumentManager from a checkpoint file, or start with an empty one.

    Args:
        filename (str, optional): Path of the checkpoint file. When omitted,
            config["DEFAULT_CHECKPOINT_FILE"] is used.

    Returns:
        DocumentManager: The manager read from the file. If the file does not
        exist, the problem is logged as an error and a new, empty
        DocumentManager is returned instead (no exception is raised for it).

    Notes:
        - Start from scratch: delete the checkpoint file and run manager = DocumentManager()
        - Restore from a given file: DocumentManager.restore_from_file("documents.json")
        - Exclude a document: manager.exclude_document(doc_id="§ 275.0-2", doc_type="section")
        - List documents: manager.list_document_ids(doc_type="section")
        - Get a document: manager.retrieve_document(doc_id=doc, doc_type="section")
    """
    if filename is None:
        filename = config["DEFAULT_CHECKPOINT_FILE"]

    try:
        from_disk = DocumentManager.restore_from_file(filename)
    except FileNotFoundError:
        # No checkpoint yet: fall back to an empty manager and report it
        fresh_manager = DocumentManager()
        logger.error(f"Checkpoint file '{filename}' not found, initializing new checkpoint.")
        return fresh_manager
    else:
        logger.info("Checkpoint restored.")
        return from_disk

In [13]:
def save_checkpoint(manager: DocumentManager) -> None:
    """
    Write the DocumentManager to the file named by config["DEFAULT_CHECKPOINT_FILE"].

    Args:
        manager (DocumentManager): The DocumentManager instance to save.

    Notes:
        A FileNotFoundError raised while writing is logged as an error and not
        re-raised; any other exception propagates to the caller.
    """
    checkpoint_path = config["DEFAULT_CHECKPOINT_FILE"]
    try:
        manager.persist_to_file(filename=checkpoint_path)
    except FileNotFoundError:
        logger.error("Error saving checkpoint. Check the directory path and permissions.")
    else:
        logger.info("Checkpoint saved.")

### Restore the checkpoint

In [14]:
# Restore the checkpoint from config["DEFAULT_CHECKPOINT_FILE"];
# if that file does not exist, `manager` starts as an empty DocumentManager
manager = restore_checkpoint()

ERROR:__main__:Checkpoint file '/content/drive/MyDrive/cfr2sbvr/checkpoints/documents-2024-10-25-1.json' not found, initializing new checkpoint.


## Datasets

Datasets used in the notebook. They are divided into sections and true tables. The sections are the documents from the CFR, and the true tables are annotated or "golden" datasets.

### General functions and data structures

In [15]:
def basic_text_stats(text: str) -> Tuple[int, int, int]:
    """
    Computes basic text statistics: number of lines, words, and average words per line.

    Lines are the pieces obtained by splitting on "\\n", so an empty text counts
    as one line. Words are the whitespace-separated tokens of the text.

    Args:
        text (str): The text to analyze.

    Returns:
        Tuple[int, int, int]: A tuple containing the number of lines, total words,
        and average words per line (rounded to the nearest integer).
    """
    lines = len(text.split("\n"))  # always >= 1, so the division below is safe
    # split() without arguments breaks on any run of whitespace: words separated
    # only by a newline are counted individually and repeated spaces do not
    # produce empty "words" (split(" ") got both cases wrong)
    words = len(text.split())
    avg_words_per_line = round(words / lines)
    return lines, words, avg_words_per_line

In [16]:
def count_elements(data: Dict[str, Any]) -> Dict[str, int]:
    """
    Tally the facts/fact types in ``data`` and the terms and fact symbols they hold.

    Args:
        data (dict): A dictionary whose optional 'facts_and_fact_types' entry is a
            list of dictionaries, each with optional 'terms' and 'fact_symbols' lists.

    Returns:
        dict: The totals under the keys 'facts_and_fact_types_count',
        'terms_count' and 'fact_symbols_count'. Missing entries count as zero.
    """
    facts = data.get('facts_and_fact_types', [])

    return {
        'facts_and_fact_types_count': len(facts),
        # Sum of the lengths of each fact's 'terms' list
        'terms_count': sum(len(entry.get('terms', [])) for entry in facts),
        # Sum of the lengths of each fact's 'fact_symbols' list
        'fact_symbols_count': sum(len(entry.get('fact_symbols', [])) for entry in facts),
    }

In [17]:
# Count the occurrences of each entity by category
def count_entities(source_list: List[Dict[str, Any]], group_by: str) -> Counter:
    """
    Count how many items of ``source_list`` share each value of the ``group_by`` key.

    Args:
        source_list (List[Dict[str, Any]]): A list of dictionaries representing entities.
        group_by (str): The key whose value is used to group the items. Every
            item must contain it, otherwise a KeyError is raised.

    Returns:
        Counter: Maps each distinct value found under ``group_by`` to its number of occurrences.
    """
    tally = Counter()
    for entry in source_list:
        tally[entry[group_by]] += 1
    return tally

In [18]:
def get_section_from_kg(conn: Any, section_num: str) -> str:
    """
    Retrieves a section from the Knowledge Graph based on the section number.

    Runs a SPARQL query for the CFR section whose section number equals
    ``section_num`` and renders the rows as text: one header (section number,
    subject, id, citations, notes) followed by one entry per paragraph, in the
    order returned by the query (ORDER BY section number, section, divide sequence).

    Args:
        conn: The connection object to the Knowledge Graph. It must provide
            ``prepareTupleQuery``.
        section_num (str): The section number to query.

    Returns:
        str: The retrieved section content as a string. An empty string is
        returned when the query yields no rows.

    Raises:
        Exception: Errors raised while preparing or evaluating the query are
        not caught here and propagate to the caller.
    """
    # Query section number from KG
    # NOTE(review): section_num is interpolated into the query text unescaped;
    # a value containing a double quote would break the query.
    query = """
    PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>
    PREFIX fro-leg-ref: <http://finregont.com/fro/ref/LegalReference.ttl#>

    SELECT ?section ?section_seq ?section_num ?section_subject ?section_citation ?section_notes ?divide ?divide_seq ?paragraph_enum ?paragraph_text
    WHERE {
      ?section a fro-cfr:CFR_Section ;
        fro-leg-ref:hasSequenceNumber ?section_seq ;
        fro-cfr:hasSectionNumber ?section_num ;
        fro-cfr:hasSectionSubject ?section_subject .
      OPTIONAL {?section fro-leg-ref:refers_toNote ?section_notes} .
      OPTIONAL {?section fro-cfr:hasSectionCitation ?section_citation} .

      ?divide fro-leg-ref:divides ?section ; # rdf:type fro-cfr:CFR_Parapraph
        fro-leg-ref:hasSequenceNumber ?divide_seq ;
        fro-cfr:hasParagraphText ?paragraph_text ;
        fro-leg-ref:hasSequenceNumber ?paragraph_seq .
      OPTIONAL {?divide fro-cfr:hasParagraphEnumText ?paragraph_enum} .
    """ + f"""
      FILTER("{section_num}" = ?section_num)
    """ + """
    }
    ORDER BY ?section_num ?section ?divide_seq
    """
    tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query)
    result = tuple_query.evaluate()

    logger.debug(f"result.metadata: {result.metadata}")
    logger.debug(f"result.variable_names: {result.variable_names}")

    def _plain(binding_set: Any, name: str) -> str:
        # String form of a bound value with every double quote removed.
        # An unbound OPTIONAL variable comes out as the text "None".
        return str(binding_set.getValue(name)).replace('"', '')

    # Initialised up front so that an empty result set returns "" instead of
    # failing with UnboundLocalError on the final concatenation
    header = ""
    body_text = ""
    previous_section = None
    with result:
        for binding_set in result:
            section = binding_set.getValue("section")
            # Named section_number so the loop does not overwrite the section_num parameter
            section_number = _plain(binding_set, "section_num")
            section_subject = _plain(binding_set, "section_subject")
            section_citation = _plain(binding_set, "section_citation")
            section_notes = _plain(binding_set, "section_notes")
            paragraph_enum = _plain(binding_set, "paragraph_enum")
            paragraph_text = _plain(binding_set, "paragraph_text")
            # Header: (re)built each time the row belongs to a different section
            if previous_section != section:
                previous_section = section
                header = f"""
    section_number: {section_number}
    section_subject: {section_subject}
    section_id: {section}
    citations: {section_citation}
    notes: {section_notes}
            """
            # Body: the enumeration line is emitted only when the paragraph has one
            if paragraph_enum != "None":
                body_text += f"""
    paragraph_enumeration: {paragraph_enum}
    paragraph_text: {paragraph_text}
    """
            else:
                body_text += f"""
    paragraph_text: {paragraph_text}
    """

    return header + body_text


In [19]:
def calculate_content_quantities_p1(doc_id, content_data, filename):
    """
    Summarise the elements of one document's content as a row of counts.

    Args:
        doc_id: Identifier stored under "document_id" in the result.
        content_data (dict): Content with an optional "elements" list; each element
            may carry a "classification" plus "verb_symbols" and "terms" lists.
        filename: Value stored under "filename" in the result.

    Returns:
        dict: Number of elements, number of elements classified as "Fact",
        "Fact Type" and "Rule", and the total number of verb symbols and terms.
    """
    elements = content_data.get("elements", [])
    logger.debug(elements)

    # Classification of every element; elements without one count as "Unknown"
    labels = [item.get("classification", "Unknown") for item in elements]

    return {
        "document_id": doc_id,
        "quantity_of_elements": len(elements),
        "quantity_of_facts": labels.count("Fact"),
        "quantity_of_fact_types": labels.count("Fact Type"),
        "quantity_of_rules": labels.count("Rule"),
        "quantity_of_verbs": sum(len(item.get("verb_symbols", [])) for item in elements),
        "quantity_of_terms": sum(len(item.get("terms", [])) for item in elements),
        "filename": filename,
    }

In [20]:
def process_documents_p1(file_path, file_name, doc_ids):
    """
    Build the part-1 quantity rows for the selected documents of a JSON file.

    Args:
        file_path: Path of the JSON file; its top level maps document ids to document data.
        file_name: Name recorded in the "filename" field of every row.
        doc_ids: Container of the document ids to include.

    Returns:
        list: One row (see calculate_content_quantities_p1) for each document
        whose id is in ``doc_ids`` and that has a 'content' entry, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = json.load(file)

    table_data = []

    # Iterate over each document in the file
    for doc_id, content_data in content.items():
        # Lazy %-style arguments: the previous call passed content_data as a
        # format argument for a message without placeholders, so it was never logged
        logger.debug("%s: %s", doc_id, content_data)
        # `and` short-circuits, so the content is inspected only for selected ids
        if doc_id in doc_ids and 'content' in content_data:
            table_data.append(calculate_content_quantities_p1(doc_id, content_data['content'], file_name))

    return table_data


In [21]:
def calculate_content_quantities_p2(doc_id, content_data, filename):
    """
    Summarise the terms of one document as a row of counts.

    Args:
        doc_id: Identifier stored under "document_id" in the result.
        content_data (dict): Document data whose 'content' entry holds a 'terms'
            list and, optionally, a 'terms_relationship' list.
        filename: Value stored under "filename" in the result.

    Returns:
        dict: Total number of terms, how many have a truthy 'definition', how
        many do not, and the number of term relationships.
    """
    body = content_data['content']

    terms_relationship = body.get('terms_relationship', [])
    logger.debug(f"terms_relationship: {terms_relationship}")
    terms = body['terms']
    logger.debug(f"terms: {terms}")

    # A term counts as defined when its 'definition' is present and truthy
    defined_terms = len([entry for entry in terms if entry.get('definition')])

    return {
        "document_id": doc_id,
        "count_of_terms": len(terms),
        "terms_with_definition": defined_terms,
        "terms_without_definition": len(terms) - defined_terms,
        "terms_relationship_count": len(terms_relationship),
        "filename": filename
    }

In [22]:
def process_documents_p2(file_path, file_name, doc_ids):
    """
    Build the part-2 quantity rows for the selected documents of a JSON file.

    Args:
        file_path: Path of the JSON file; its top level maps document ids to document data.
        file_name: Name recorded in the "filename" field of every row.
        doc_ids: Container of the document ids to include.

    Returns:
        list: One row (see calculate_content_quantities_p2) for each document
        whose id is in ``doc_ids`` and whose 'content' contains 'terms', in
        file order. Documents without 'content' or without 'terms' are skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = json.load(file)

    table_data = []

    # Iterate over each document in the file
    for doc_id, doc_data in content.items():
        # The checks must short-circuit: building a list for all() evaluated
        # doc_data['content'] even for documents without a 'content' key and
        # raised KeyError instead of skipping them
        if doc_id in doc_ids and 'content' in doc_data and 'terms' in doc_data['content']:
            table_data.append(calculate_content_quantities_p2(doc_id, doc_data, file_name))
    return table_data

### Get section from KG CFR
Due to mistakes in the original dataset, we need to correct it. This function will not be used in the final version. Instead, we will use variables (document_02, document_05, document_07) from the original dataset.

#### lab

Connection issue with AllegroGraph Cloud.

In [23]:
# import ssl

# from urllib3 import PoolManager
# from urllib3.util import create_urllib3_context

# ctx = create_urllib3_context()
# ctx.load_default_certs()
# ctx.set_ciphers("AES256-GCM-SHA384")

# with PoolManager(ssl_context=ctx) as pool:
#     print(pool.request("GET", f'https://{config["ALLEGROGRAPH"]["HOST"]}').headers)

HTTPHeaderDict({'Date': 'Fri, 25 Oct 2024 01:54:21 GMT', 'Connection': 'Keep-Alive', 'Keep-Alive': 'timeout=10', 'Server': 'AllegroGraph/8.2.1 (AllegroServe/1.3.87)', 'Content-Type': 'text/html', 'Content-Length': '819', 'cache-control': 'no-store', 'last-modified': 'Thu, 25 Jul 2024 15:13:36 GMT'})


In [24]:
# import requests

# #requests.get("https://google.com")
# requests.get(f'https://{config["ALLEGROGRAPH"]["HOST"]}')

SSLError: HTTPSConnectionPool(host='ag1eawvuu0p3zv35.allegrograph.cloud', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1007)')))

In [None]:
# from franz.openrdf.sail.allegrographserver import AllegroGraphServer

# print("Connecting to AllegroGraph server --",
#       "host:'%s' port:%s" % (config["ALLEGROGRAPH"]["HOST"], config["ALLEGROGRAPH"]["PORT"]))
# server = AllegroGraphServer(config["ALLEGROGRAPH"]["HOST"], config["ALLEGROGRAPH"]["PORT"],
#                             config["ALLEGROGRAPH"]["USER"], config["ALLEGROGRAPH"]["PASSWORD"])
# print("Available catalogs:")
# for cat_name in server.listCatalogs():
#     if cat_name is None:
#         print('  - <root catalog>')
#     else:
#         print('  - ' + str(cat_name))

In [25]:
# Display the AllegroGraph base URL built from the configured host (debugging aid)
f'https://{config["ALLEGROGRAPH"]["HOST"]}'

'https://ag1eawvuu0p3zv35.allegrograph.cloud'

#### Access allegrograph

In [None]:
# conn = ag_connect(repo=config["ALLEGROGRAPH"]["REPO"], catalog=config["ALLEGROGRAPH"]["CATALOG"],
#                 host=f'https://{config["ALLEGROGRAPH"]["HOST"]}:443',
#                 user=config["ALLEGROGRAPH"]["USER"], password=config["ALLEGROGRAPH"]["PASSWORD"])

In [None]:
# section_num = "§ 275.0-7"
# logger.info(get_section_from_kg(conn, section_num=section_num))

Print results formatted

In [None]:
# conn.close()

### Texts to extract the elements

CFR Sections 275.0-2, 275.0-5, 275.0-7

Section 275.0-2

In [26]:
# Register the text of CFR § 275.0-2 as a "section" document.
# Documents are keyed by (id, type), so re-running this cell replaces the entry.
manager.add_document(
    Document(
        id="§ 275.0-2",
        type="section",
content = """
§ 275.0-2 General procedures for serving non-residents.
(a) General procedures for serving process, pleadings, or other papers on non-resident investment advisers, general partners and managing agents.  Under Forms ADV and ADV-NR [17 CFR 279.1 and 279.4], a person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents:
  (1) A person may serve a non-resident investment adviser, non-resident general partner, or non-resident managing agent by furnishing the Commission with one copy of the process, pleadings, or papers, for each named party, and one additional copy for the Commission's records.
  (2) If process, pleadings, or other papers are served on the Commission as described in this section, the Secretary of the Commission (Secretary) will promptly forward a copy to each named party by registered or certified mail at that party's last address filed with the Commission.
  (3) If the Secretary certifies that the Commission was served with process, pleadings, or other papers pursuant to paragraph (a)(1) of this section and forwarded these documents to a named party pursuant to paragraph (a)(2) of this section, this certification constitutes evidence of service upon that party.
(b) Definitions.  For purposes of this section:
  (1) Managing agent  means any person, including a trustee, who directs or manages, or who participates in directing or managing, the affairs of any unincorporated organization or association other than a partnership.
  (2) Non-resident  means:
    (i) An individual who resides in any place not subject to the jurisdiction of the United States;
    (ii) A corporation that is incorporated in or that has its principal office and place of business in any place not subject to the jurisdiction of the United States; and
    (iii) A partnership or other unincorporated organization or association that has its principal office and place of business in any place not subject to the jurisdiction of the United States.
  (3) Principal office and place of business  has the same meaning as in § 275.203A-3(c) of this chapter.
"""
    )
)

Section 275.0-5

In [27]:
# Register the text of CFR § 275.0-5 as a "section" document.
# Documents are keyed by (id, type), so re-running this cell replaces the entry.
manager.add_document(
    Document(
        id="§ 275.0-5",
        type="section",
content = """
§ 275.0-5 Procedure with respect to applications and other matters.
The procedure hereinbelow set forth will be followed with respect to any proceeding initiated by the filing of an application, or upon the Commission's own motion, pursuant to any section of the Act or any rule or regulation thereunder, unless in the particular case a different procedure is provided:
(a) Notice of the initiation of the proceeding will be published in the Federal Register and will indicate the earliest date upon which an order disposing of the matter may be entered. The notice will also provide that any interested person may, within the period of time specified therein, submit to the Commission in writing any facts bearing upon the desirability of a hearing on the matter and may request that a hearing be held, stating his reasons therefor and the nature of his interest in the matter.
(b) An order disposing of the matter will be issued as of course following the expiration of the period of time referred to in paragraph (a) of this section, unless the Commission thereafter orders a hearing on the matter.
(c) The Commission will order a hearing on the matter, if it appears that a hearing is necessary or appropriate in the public interest or for the protection of investors,
  (1) upon the request of any interested person or
  (2) upon its own motion.
(d) Definition of application. For purposes of this rule, an “application” means any application for an order of the Commission under the Act other than an application for registration as an investment adviser.
"""
    )
)

Section 275.0-7

In [28]:
# Register the text of CFR § 275.0-7 as a "section" document.
# Documents are keyed by (id, type), so re-running this cell replaces the entry.
manager.add_document(
    Document(
        id="§ 275.0-7",
        type="section",
content = """
§ 275.0-7 Small entities under the Investment Advisers Act for purposes of the Regulatory Flexibility Act.
(a) For purposes of Commission rulemaking in accordance with the provisions of Chapter Six of the Administrative Procedure Act (5 U.S.C. 601 et seq.) and unless otherwise defined for purposes of a particular rulemaking proceeding, the term small business or small organization for purposes of the Investment Advisers Act of 1940 shall mean an investment adviser that:
  (1) Has assets under management, as defined under Section 203A(a)(3) of the Act (15 U.S.C. 80b-3a(a)(2)) and reported on its annual updating amendment to Form ADV (17 CFR 279.1), of less than $25 million, or such higher amount as the Commission may by rule deem appropriate under Section 203A(a)(1)(A) of the Act (15 U.S.C. 80b-3a(a)(1)(A));
  (2) Did not have total assets of $5 million or more on the last day of the most recent fiscal year; and
  (3) Does not control, is not controlled by, and is not under common control with another investment adviser that has assets under management of $25 million or more (or such higher amount as the Commission may deem appropriate), or any person (other than a natural person) that had total assets of $5 million or more on the last day of the most recent fiscal year.
(b) For purposes of this section:
  (1) Control  means the power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise.
    (i) A person is presumed to control a corporation if the person:
      (A) Directly or indirectly has the right to vote 25 percent or more of a class of the corporation's voting securities; or
      (B) Has the power to sell or direct the sale of 25 percent or more of a class of the corporation's voting securities.
    (ii) A person is presumed to control a partnership if the person has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the partnership.
    (iii) A person is presumed to control a limited liability company (LLC) if the person:
      (A) Directly or indirectly has the right to vote 25 percent or more of a class of the interests of the LLC;
      (B) Has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the LLC; or
      (C) Is an elected manager of the LLC.
    (iv) A person is presumed to control a trust if the person is a trustee or managing agent of the trust.
  (2) Total assets  means the total assets as shown on the balance sheet of the investment adviser or other person described above under paragraph (a)(3) of this section, or the balance sheet of the investment adviser or such other person with its subsidiaries consolidated, whichever is larger.
"""
    )
)

### True tables

True tables are annotated or "golden" datasets in which entities have been manually identified and labeled within the original source data.

True tables for sections 275.0-2, 275.0-5 and 275.0-7

Load true table for part 1.

In [29]:
# Load the manually annotated ("golden") P1 true tables and register each one with the manager.
# The file is opened explicitly as UTF-8: the document keys contain the non-ASCII "§" sign,
# so the platform's default encoding must not be relied upon.
with open(f"{config['DEFAULT_DATA_DIR']}/p1_true_table.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

    # One true-table document per annotated section, added in the same order as before.
    for true_table_key in (
        "§ 275.0-2_P1|true_table",
        "§ 275.0-5_P1|true_table",
        "§ 275.0-7_P1|true_table",
    ):
        manager.add_document(Document.model_validate(data[true_table_key]))

Load true table for part 2.

In [30]:
# Load the manually annotated ("golden") P2 true tables and register each one with the manager.
# The file is opened explicitly as UTF-8: the document keys contain the non-ASCII "§" sign,
# so the platform's default encoding must not be relied upon.
with open(f"{config['DEFAULT_DATA_DIR']}/p2_true_table.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

    # One true-table document per annotated section, added in the same order as before.
    for true_table_key in (
        "§ 275.0-2_P2|true_table",
        "§ 275.0-5_P2|true_table",
        "§ 275.0-7_P2|true_table",
    ):
        manager.add_document(Document.model_validate(data[true_table_key]))

### Elements classification taxonomy

Prompt

In [None]:
# System prompt asking the model to attach a taxonomy classification to every element and
# showing the expected answer layout (grouped by doc_id, one entry per element id).
# NOTE(review): every `classification":` line in the sample carries a stray double quote, and
# the three samples use the same template text under different template ids (T7, T8, T9) —
# confirm whether this is intentional before relying on the sample as a format specification.
system_prompt_taxonomy_classification = """
Classify each element using the provided taxonomy. Use the example of the class to help you.

Answer adding taxonomy classification to each element in the following format:


elements:
  - doc_id: § 275.0-2_P1
    - id: 1
      expression: "A person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents."
      template_id: T7
      template: |
      {A|An|The|} <definitional rule statement subject>
      {<qualifying clause>|}
      <verb phrase> by definition
      <definition>
      classification": Formal intensional definitions
    - id: 2
      expression: "A person may serve a non-resident investment adviser, non-resident general partner, or non-resident managing agent by furnishing the Commission with one copy of the process, pleadings, or papers, for each named party, and one additional copy for the Commission's records."
      template_id: T8
      template: |
      {A|An|The|} <definitional rule statement subject>
      {<qualifying clause>|}
      <verb phrase> by definition
      <definition>
      classification": Formal extensional definitions
  - doc_id: § 275.0-5_P1
    - id: 1
      expression: "An order disposing of the matter will be issued as of course following the expiration of the period of time referred to in paragraph (a) of this section, unless the Commission thereafter orders a hearing on the matter."
      template_id: T9
      template: |
      {A|An|The|} <definitional rule statement subject>
      {<qualifying clause>|}
      <verb phrase> by definition
      <definition>
      classification": Formal extensional definitions
    ...
  ...
"""

In [47]:
# Hand-written sample of a classified document: a single section entry keyed by its id, whose
# "elements" hold one fact type (with template and taxonomy classification) and empty
# "rules" / "terms" lists.
# NOTE(review): this cell only defines the sample; nothing visible here consumes it — confirm it is still needed.
example = {
  "§ 275.0-2_P1": {
    "id": "§ 275.0-2_P1",
    "type": "section",
    "elements": {
      "fact_types": [
        {
          "id": 1,
          "expression": "A person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents.",
          "template_id": "T7",
          "template": "{A|An|The|} <definitional rule statement subject> {<qualifying clause>|} <verb phrase> by definition <definition>",
          "classification": "Formal intensional definitions"
        }
      ],
      "rules": [],
      "terms": []
    }
  }
}

In [38]:
# Read the classification taxonomy (nested sections with optional 'subsections') and dump it
# so that its structure can be inspected.
with open(f"{config['DEFAULT_DATA_DIR']}/classify_subtypes.json", 'r') as file:
    data = json.load(file)

print(data)

[{'section_id': '9.2', 'section_title': 'Definitional rules', 'section_definition': 'Definitional rules constrains how we define a construct created or used by the organization or the industry within which it operates. Definitional rules can in turn be categorized as:', 'subsections': [{'section_id': '9.2.1', 'section_title': 'Formal term definitions', 'section_definition': 'A formal term definition defines a particular business term in a formal manner. They are categorized as:', 'subsections': [{'section_id': '9.2.1.1', 'section_title': 'Formal intensional definitions', 'section_definition': 'A formal intensional definition defines the subject business term using an intensional definition: one that cites both a hypernym (a term that refers to a superset of the set referred to by the original term) and the characteristics that distinguish members of the set referred to by the original term.', 'templates': ['T7']}, {'section_id': '9.2.1.2', 'section_title': 'Formal extensional definitio

In [46]:
def find_sections_by_title(data, title):
    """
    Collects every section whose 'section_title' equals `title`, at any nesting depth.

    Args:
        data: A list of section dicts; a section may hold nested sections under 'subsections'.
        title: The exact title to look for.

    Returns:
        list: The matching section dicts (the original objects, not copies), in depth-first
        order with a parent visited before its subsections.
    """
    def walk(nodes):
        # Pre-order traversal: yield the node itself, then everything below it.
        for node in nodes:
            yield node
            if 'subsections' in node:
                yield from walk(node['subsections'])

    return [node for node in walk(data) if node['section_title'] == title]

In [45]:
# Example: look up the top-level section titled 'Definitional rules' in the loaded taxonomy.
filtered_sections = find_sections_by_title(data, 'Definitional rules')

# Pretty-print the matching sections as indented JSON.
print(json.dumps(filtered_sections, indent=2))

[
  {
    "section_id": "9.2",
    "section_title": "Definitional rules",
    "section_definition": "Definitional rules constrains how we define a construct created or used by the organization or the industry within which it operates. Definitional rules can in turn be categorized as:",
    "subsections": [
      {
        "section_id": "9.2.1",
        "section_title": "Formal term definitions",
        "section_definition": "A formal term definition defines a particular business term in a formal manner. They are categorized as:",
        "subsections": [
          {
            "section_id": "9.2.1.1",
            "section_title": "Formal intensional definitions",
            "section_definition": "A formal intensional definition defines the subject business term using an intensional definition: one that cites both a hypernym (a term that refers to a superset of the set referred to by the original term) and the characteristics that distinguish members of the set referred to by the ori

In [43]:
# Example: find sections with the title 'Formal term definitions' (a nested subsection).
# NOTE(review): the saved output of this cell is `[]` even though the recursive helper descends
# into 'subsections'; the execution counts are out of order, so that output is presumably
# stale — re-run the cell to confirm.
filtered_sections = find_sections_by_title(data, 'Formal term definitions')

# Pretty-print the matching sections as indented JSON.
print(json.dumps(filtered_sections, indent=2))

[]


### Save checkpoint

In [31]:
# Persist the manager's current state (sections and true tables added so far) via save_checkpoint.
save_checkpoint(manager)

### Check the content of datasets

In [32]:
# Sanity-check the loaded datasets: log basic text statistics for every stored section, then
# summarise each true-table document into a P1 or P2 record and export both summaries.
logger.info("SECTIONS:")
# Log the ids of all documents of type "section"
logger.info(f"section docs: {manager.list_document_ids(doc_type='section')}")

# Retrieve each section by id and log its line / word statistics
for doc in manager.list_document_ids(doc_type="section"):
    retrieved_doc = manager.retrieve_document(doc_id=doc, doc_type="section")
    logger.debug(retrieved_doc)
    lines, words, avg_words_per_line = basic_text_stats(retrieved_doc.content)
    logger.info(
        f"{doc}: Total number of lines: {lines}, total number of words: {words}, and average words per line: {avg_words_per_line}"
    )

# One summary record per true-table document, split by the id suffix (_P1 / _P2)
retrieved_true_table_p1 = []
retrieved_true_table_p2 = []

for doc in manager.list_document_ids(doc_type="true_table"):
    logger.info(f"Processing document: {doc} ...")
    # True-table documents for part 1
    # NOTE(review): P1 passes only the "content" entry of the dumped document, whereas P2
    # passes the whole dump — confirm this matches what the two helpers expect.
    if doc.endswith("_P1"):
        retrieved_true_table_p1.append(
            calculate_content_quantities_p1(
                doc,
                manager.retrieve_document(
                    doc_id=doc, doc_type="true_table"
                ).model_dump()["content"],
                filename="p1_true_table.json",
            )
        )
        logger.info("retrieve P1")
    # True-table documents for part 2
    elif doc.endswith("_P2"):
        retrieved_true_table_p2.append(
            calculate_content_quantities_p2(
                doc,
                manager.retrieve_document(
                    doc_id=doc, doc_type="true_table"
                ).model_dump(),
                filename="p2_true_table.json",
            )
        )
        logger.info("retrieve P2")

# Convert the collected records to DataFrames
table_true_df_p1 = pd.DataFrame(retrieved_true_table_p1)
table_true_df_p2 = pd.DataFrame(retrieved_true_table_p2)

# Export both summaries as Excel workbooks in the configured output directory
table_true_df_p1.to_excel(f"{config['DEFAULT_OUTPUT_DIR']}/P1_summary_true_table.xlsx", index=False)
table_true_df_p2.to_excel(f"{config['DEFAULT_OUTPUT_DIR']}/P2_summary_true_table.xlsx", index=False)

In [33]:
# Display the per-document summary of the P1 true tables.
table_true_df_p1

Unnamed: 0,document_id,quantity_of_elements,quantity_of_facts,quantity_of_fact_types,quantity_of_rules,quantity_of_verbs,quantity_of_terms,filename
0,§ 275.0-2_P1,9,0,7,2,28,73,p1_true_table.json
1,§ 275.0-5_P1,5,0,4,1,21,41,p1_true_table.json
2,§ 275.0-7_P1,9,0,5,4,21,46,p1_true_table.json


Count of all runs in the checkpoints for P1.

In [34]:
# Show the last six P1 summary rows (the frame currently holds three, so all are shown).
table_true_df_p1.tail(6)

Unnamed: 0,document_id,quantity_of_elements,quantity_of_facts,quantity_of_fact_types,quantity_of_rules,quantity_of_verbs,quantity_of_terms,filename
0,§ 275.0-2_P1,9,0,7,2,28,73,p1_true_table.json
1,§ 275.0-5_P1,5,0,4,1,21,41,p1_true_table.json
2,§ 275.0-7_P1,9,0,5,4,21,46,p1_true_table.json


In [35]:
# Descriptive statistics of every P1 quantity per document. With a single row per document
# (as in the output shown), count is 1 and std is NaN.
table_true_p1 = table_true_df_p1.groupby('document_id').describe()

table_true_p1

Unnamed: 0_level_0,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_facts,quantity_of_facts,...,quantity_of_verbs,quantity_of_verbs,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
document_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
§ 275.0-2_P1,1.0,9.0,,9.0,9.0,9.0,9.0,9.0,1.0,0.0,...,28.0,28.0,1.0,73.0,,73.0,73.0,73.0,73.0,73.0
§ 275.0-5_P1,1.0,5.0,,5.0,5.0,5.0,5.0,5.0,1.0,0.0,...,21.0,21.0,1.0,41.0,,41.0,41.0,41.0,41.0,41.0
§ 275.0-7_P1,1.0,9.0,,9.0,9.0,9.0,9.0,9.0,1.0,0.0,...,21.0,21.0,1.0,46.0,,46.0,46.0,46.0,46.0,46.0


Count of all runs in the checkpoints for P2.

In [36]:
# Show the last six P2 summary rows (the frame currently holds three, so all are shown).
table_true_df_p2.tail(6)

Unnamed: 0,document_id,count_of_terms,terms_with_definition,terms_without_definition,terms_relationship_count,filename
0,§ 275.0-2_P2,37,34,3,11,p2_true_table.json
1,§ 275.0-5_P2,29,29,0,8,p2_true_table.json
2,§ 275.0-7_P2,26,26,0,2,p2_true_table.json


In [37]:
# Descriptive statistics of every P2 quantity per document. With a single row per document
# (as in the output shown), count is 1 and std is NaN.
table_true_p2 = table_true_df_p2.groupby('document_id').describe()

table_true_p2

Unnamed: 0_level_0,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,terms_with_definition,terms_with_definition,...,terms_without_definition,terms_without_definition,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
document_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
§ 275.0-2_P2,1.0,37.0,,37.0,37.0,37.0,37.0,37.0,1.0,34.0,...,3.0,3.0,1.0,11.0,,11.0,11.0,11.0,11.0,11.0
§ 275.0-5_P2,1.0,29.0,,29.0,29.0,29.0,29.0,29.0,1.0,29.0,...,0.0,0.0,1.0,8.0,,8.0,8.0,8.0,8.0,8.0
§ 275.0-7_P2,1.0,26.0,,26.0,26.0,26.0,26.0,26.0,1.0,26.0,...,0.0,0.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0


## Processes

The execution part of the notebook. This code is in charge of the annotation process.

### extract / classify elements

#### General functions and data structures

Functions and data structures used in the notebook.

In [None]:
# One term mentioned by an element, together with its noun classification.
class Term(BaseModel):
    term: str = Field(
        ...,
        description="The term is a word or a group of words that represents a specific concept, entity, or subject in a particular context",
    )
    classification: str = Field(
        ...,
        description="The classification of the term, either 'Common Noun' or 'Proper Noun'.",
    )

# One extracted fact, fact type, or rule, with the terms and verb symbols it is built from.
# All fields are required; the descriptions are part of the schema given to the LLM.
class Element(BaseModel):
    id: int = Field(..., description="A unique numeric identifier for each fact, fact type, or rule.")
    expression: str = Field(..., description="The full sentence or phrase representing the fact, fact type, or rule.")
    terms: List[Term] = Field(..., description="A list of terms involved in the fact, fact type, or rule.")
    # Fixed typo in the description: "vers" -> "verbs".
    verb_symbols: List[str] = Field(..., description="A list of verbs, verb phrases or prepositions connecting the terms.")
    classification: str = Field(..., description="Indicates whether the expression is classified as 'Fact', 'Fact Type', or 'Rule'.")
    source: str = Field(..., description="The paragraph ID of the document where the fact, fact type, or rule is located (e.g., '(a)', '(b)(2)').")

# Response model for element extraction: the section id, a summary, and the extracted elements.
class ElementsDocumentModel(BaseModel):
    section: str = Field(
        ...,
        description="The section ID of the document.",
    )
    summary: str = Field(
        ...,
        description="The summary of the document.",
    )
    elements: List[Element] = Field(
        ...,
        description="A list of facts, fact types, and rules extracted from the document.",
    )

In [None]:
# A term with an optional definition (the definition defaults to None when absent).
class Item(BaseModel):
    term: str = Field(
        ...,
        description="The term is a word or a group of words that represents a specific concept, entity, or subject in a particular context",
    )
    definition: Optional[str] = Field(
        None,
        description="Definition is a explanation or description of the meaning of the term.",
    )

# A relationship between two terms. All fields are required; the descriptions are part of
# the schema given to the LLM.
class TermsRelationship(BaseModel):
    term_1: str = Field(..., description="First term in the relationship.")
    term_2: str = Field(..., description="Second term in the relationship.")
    # Fixed garbled description: "typrelationship" -> "type of relationship".
    relation: str = Field(..., description="The type of relationship between the terms.")

# Response model for term extraction: the terms plus the relationships found between them.
class TermsDocumentModel(BaseModel):
    terms: List[Item] = Field(
        ...,
        description="A list of terms.",
    )
    terms_relationship: List[TermsRelationship] = Field(
        ...,
        description="A list of relationships between terms.",
    )

In [None]:
@measure_time
def query_instruct_llm(system_prompt: str, user_prompt: str, document_model: Any) -> Any:
    """
    Sends a system/user prompt pair to the configured LLM and asks for a structured answer.

    Args:
        system_prompt (str): Text sent as the "system" message.
        user_prompt (str): Text sent as the "user" message.
        document_model (Any): Passed to the client as `response_model`.

    Returns:
        Any: Whatever the instructor-wrapped client returns for the request.

    Raises:
        Exception: Nothing is caught here, so any error raised by the client call propagates.
    """
    # Wrap the OpenAI client with instructor; mode=instructor.Mode.TOOLS_STRICT is not passed.
    structured_client = instructor.from_openai(OpenAI())

    # Model name, temperature and token limit all come from the "LLM" section of the config.
    llm_settings = config["LLM"]
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    return structured_client.chat.completions.create(
        model=llm_settings["MODEL"],
        response_model=document_model,
        temperature=llm_settings["TEMPERATURE"],
        max_tokens=llm_settings["MAX_TOKENS"],
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        messages=conversation,
    )

In [None]:
def save_compare_items_metrics(
    section_id: str,
    correct_items_len: int,
    predicted_items_len: int,
    common_items_len: int,
    missed_items_len: int,
    extra_items_len: int,
    precision: float,
    recall: float,
    f1: float,
    file_name: str = 'section_validation_metrics.xlsx'
) -> None:
    """
    Writes one row of comparison metrics to an Excel workbook, creating or extending it.

    Args:
        section_id (str): The ID of the section being analyzed.
        correct_items_len (int): Number of correct items (C).
        predicted_items_len (int): Number of predicted items (P).
        common_items_len (int): Number of items in both C and P.
        missed_items_len (int): Number of items in C but not in P.
        extra_items_len (int): Number of items in P but not in C.
        precision (float): Precision metric.
        recall (float): Recall metric.
        f1 (float): F1 score.
        file_name (str, optional): Workbook path. Defaults to 'section_validation_metrics.xlsx'.

    Returns:
        None

    Raises:
        Exception: Errors from reading or writing the workbook are not caught here.
    """
    # Single-row frame; the timestamp records when the row was produced.
    metrics_row = pd.DataFrame({
        'section': [section_id],
        'correct_C': [correct_items_len],
        'predicted_P': [predicted_items_len],
        'C_intersec_P': [common_items_len],
        'C_less_P': [missed_items_len],
        'P_less_C': [extra_items_len],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1],
        'timestamp': [datetime.now()]
    })

    if os.path.isfile(file_name):
        # Workbook already exists: write below the last used row of 'Sheet1', without a header.
        with pd.ExcelWriter(file_name, mode='a', if_sheet_exists='overlay', engine='openpyxl') as writer:
            first_free_row = writer.sheets['Sheet1'].max_row
            metrics_row.to_excel(writer, index=False, header=False, startrow=first_free_row)
    else:
        # First call: create the workbook, header included.
        metrics_row.to_excel(file_name, index=False)

    logger.info(f"Data appended to {file_name}")

In [None]:
def compare_items(
    doc: str,
    llm_response: List[Dict[str, Any]],
    true_table: List[Dict[str, Any]],
    item_name: str,
    item_category: str
) -> Tuple[Set[str], Dict[str, str], Set[str], Dict[str, str], Set[str], Set[str], Set[str]]:
    """
    Compares the LLM response with the true table to identify matches, misses, and extras.

    Item names are compared case-insensitively (they are lower-cased before matching).

    Args:
        doc (str): The document identifier (only used in the log message).
        llm_response (List[Dict[str, Any]]): The items returned by the LLM.
        true_table (List[Dict[str, Any]]): The true table containing the correct items.
        item_name (str): Key under which each item stores its name.
        item_category (str): Key under which each item stores its category.

    Returns:
        Tuple containing:
            - predicted_items (Set[str]): Set of predicted item names.
            - predicted_dict (Dict[str, str]): Dict mapping predicted item names to categories.
            - correct_items (Set[str]): Set of correct item names.
            - correct_dict (Dict[str, str]): Dict mapping correct item names to categories.
            - common_items (Set[str]): Names present in both predicted and correct items.
            - missed_items (Set[str]): Names in correct but not in predicted.
            - extra_items (Set[str]): Names in predicted but not in correct.
    """
    # Index both inputs by lower-cased item name
    correct_dict = {item[item_name].lower(): item[item_category] for item in true_table}
    predicted_dict = {item[item_name].lower(): item[item_category] for item in llm_response}

    logger.debug(f"correct_dict: {correct_dict}")
    logger.debug(f"predicted_dict: {predicted_dict}")

    correct_items = set(correct_dict.keys())
    predicted_items = set(predicted_dict.keys())

    # True positives, false negatives, and false positives
    common_items = correct_items & predicted_items  # Matched terms
    missed_items = correct_items - predicted_items  # Terms missed in predictions
    extra_items = predicted_items - correct_items   # Additional terms in predictions

    logger.debug(f"common_items: {common_items}")
    logger.debug(f"missed_items: {missed_items}")
    logger.debug(f"extra_items: {extra_items}")

    # Precision is measured against the predictions, recall against the true table.
    # An empty denominator yields 0.0 instead of raising ZeroDivisionError.
    n_correct, n_predicted, n_common = len(correct_items), len(predicted_items), len(common_items)
    precision = n_common / n_predicted if n_predicted else 0.0
    recall = n_common / n_correct if n_correct else 0.0
    f1 = 2 * n_common / (n_correct + n_predicted) if (n_correct + n_predicted) else 0.0

    logger.info(f"""
Document: {doc}
Correct items; Predicted items; Common items; Missed items; Extra items; Precision; Recall; F1
{n_correct}; {n_predicted}; {n_common}; {len(missed_items)}; {len(extra_items)}; {precision}; {recall}; {f1}
    """
    )

    return predicted_items, predicted_dict, correct_items, correct_dict, common_items, missed_items, extra_items

In [None]:
def plot_confusion_matrix(
    predicted_items: Set[str],
    predicted_dict: Dict[str, str],
    correct_items: Set[str],
    correct_dict: Dict[str, str]
) -> Dict[str, Any]:
    """
    Plots a confusion matrix of category predictions and generates a classification report.

    Only items that have a category in BOTH dictionaries take part in the comparison;
    items found on one side only are ignored here.

    Args:
        predicted_items (Set[str]): Set of predicted item names.
        predicted_dict (Dict[str, str]): Dict mapping predicted item names to categories.
        correct_items (Set[str]): Set of correct item names.
        correct_dict (Dict[str, str]): Dict mapping correct item names to categories.

    Returns:
        Dict[str, Any]: The classification report as a dictionary. An empty dict is returned
        (and nothing is plotted) when no item has both a correct and a predicted category.
    """
    comparison_results = []
    for item in set(correct_items) | set(predicted_items):
        correct_category = correct_dict.get(item)
        predicted_category = predicted_dict.get(item)
        comparison_results.append({
            'Item': item,
            # A missing category is recorded as the string 'None' so it can be filtered below
            'Correct category': 'None' if correct_category is None else correct_category,
            'Predicted category': 'None' if predicted_category is None else predicted_category,
        })

    # Explicit columns keep the frame well-formed even when there are no items at all
    df = pd.DataFrame(comparison_results, columns=['Item', 'Correct category', 'Predicted category'])

    # Keep only rows where both the correct and the predicted category are known
    df_filtered = df[(df['Correct category'] != 'None') & (df['Predicted category'] != 'None')]

    # Nothing to compare: neither a matrix nor a report can be built
    if df_filtered.empty:
        return {}

    # Named so that it does not shadow sklearn's confusion_matrix imported by the notebook
    category_crosstab = pd.crosstab(
        df_filtered['Correct category'],
        df_filtered['Predicted category'],
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True
    )

    # Drop the 'All' margins row/column before plotting
    cm = category_crosstab.iloc[:-1, :-1]

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion matrix of category predictions')
    plt.ylabel('Actual category')
    plt.xlabel('Predicted category')
    plt.xticks(rotation=90)
    plt.show()

    # Encode the categories as integers for the classification report
    types = sorted(set(df_filtered['Correct category']) | set(df_filtered['Predicted category']))
    type_to_int = {t: i for i, t in enumerate(types)}

    y_true = df_filtered['Correct category'].map(type_to_int)
    y_pred = df_filtered['Predicted category'].map(type_to_int)

    report = classification_report(y_true, y_pred, target_names=types, output_dict=True)
    return report


In [None]:
def _html_text(value: Any) -> str:
    """Return `value` as text that can be embedded safely inside an HTML element."""
    from html import escape  # stdlib; imported locally so this cell does not depend on the import cell

    # quote=False: the result is only used as element content, never inside an attribute value.
    return escape(str(value), quote=False)


def _items_table(title: str, rows: List[Dict[str, Any]]) -> str:
    """Render an <h3> heading followed by a signifier/definition/classification/source table."""
    parts = [
        f"<h3>{title}</h3>\n",
        "<table>\n<tr><th>Signifier</th><th>Definition</th><th>Concept classification</th><th>Source</th></tr>\n",
    ]
    for row in rows:
        # Only the first source is reported; a missing or empty 'sources' falls back to 'N/A'.
        sources = row.get('sources') or []
        first_source = sources[0] if sources else {}
        source_text = f"Section {first_source.get('section', 'N/A')} Paragraph {first_source.get('paragraph', 'N/A')}"
        cells = (
            row.get('signifier', 'N/A'),
            row.get('definition', 'N/A'),
            row.get('concept_classification', 'N/A'),
            source_text,
        )
        parts.append("<tr>" + "".join(f"<td>{_html_text(cell)}</td>" for cell in cells) + "</tr>\n")
    parts.append("</table>\n")
    return "".join(parts)


def _bullet_list(title: str, items: Any) -> str:
    """Render an <h4> heading followed by a bullet list with one entry per item."""
    parts = [f"<h4>{title}</h4>\n<ul>\n"]
    parts.extend(f"<li>{_html_text(item)}</li>\n" for item in items)
    parts.append("</ul>\n")
    return "".join(parts)


def _statistics_block(stats: Dict[str, Any]) -> str:
    """Render the common/missed/extra item lists, the category mismatches, and the totals."""
    common_items = stats.get('common_items', [])
    missed_items = stats.get('missed_items', [])
    extra_items = stats.get('extra_items', [])
    predicted_dict = stats.get('predicted_dict', {})
    correct_dict = stats.get('correct_dict', {})

    # Common items whose predicted category differs from the correct one
    mismatches = [
        (signifier, correct_dict.get(signifier), predicted_dict.get(signifier))
        for signifier in common_items
        if predicted_dict.get(signifier) != correct_dict.get(signifier)
    ]
    correctly_classified = len(common_items) - len(mismatches)

    parts = [
        "<h3>Comparison Statistics</h3>\n",
        "<div class='statistics'>\n",
        _bullet_list("Common Items", common_items),
        _bullet_list("Missed Items", missed_items),
        _bullet_list("Extra Items", extra_items),
        "<h4>Type Mismatches</h4>\n<ul>\n",
    ]
    for signifier, correct_cat, predicted_cat in mismatches:
        parts.append(
            f"<li>Type mismatch for '{_html_text(signifier)}': "
            f"Correct concept_classification='{_html_text(correct_cat)}', "
            f"Predicted concept_classification='{_html_text(predicted_cat)}'</li>\n"
        )
    parts.append("</ul>\n")
    parts.append(
        f"<p>Total matched: {len(common_items)}, Correctly classification matched: {correctly_classified}, "
        f"Missed: {len(missed_items)}, Extra: {len(extra_items)}</p>\n"
    )
    parts.append("</div>\n")
    return "".join(parts)


def _classification_block(report: Dict[str, Any]) -> str:
    """Render the per-category rows of a classification report as an HTML table."""
    parts = [
        "<h3>Classification Report</h3>\n",
        "<div class='classification-report'>\n",
        "<table>\n<tr><th>concept_classification</th><th>Precision</th><th>Recall</th><th>F1-Score</th><th>Support</th></tr>\n",
    ]
    for concept_classification, metrics in report.items():
        # Aggregate entries are skipped; only per-category rows are listed
        if concept_classification in ['accuracy', 'macro avg', 'weighted avg']:
            continue

        precision = metrics.get('precision', 0.0)
        recall = metrics.get('recall', 0.0)
        f1_score = metrics.get('f1-score', 0.0)
        support = metrics.get('support', 0)

        parts.append(
            f"<tr><td>{_html_text(concept_classification)}</td><td>{precision:.2f}</td>"
            f"<td>{recall:.2f}</td><td>{f1_score:.2f}</td><td>{support}</td></tr>\n"
        )
    parts.append("</table>\n")
    parts.append("</div>\n")
    return "".join(parts)


def generate_report(checkpoint_file: str, output_file: str) -> None:
    """
    Generates an HTML report from the checkpoint data and saves it to a file.

    For every checkpoint entry of type 'section' the report shows the section text and, when
    the matching '<section id>|true_table', '|llm_response', '|statistics' and
    '|classification_report' entries exist, a block for each of them. All values taken from
    the checkpoint are HTML-escaped before being embedded.

    Args:
        checkpoint_file (str): The path to the checkpoint (JSON) file.
        output_file (str): The path to save the HTML report (written as UTF-8).

    Returns:
        None

    Raises:
        Exception: Errors from reading the checkpoint or writing the report are not caught here.
    """
    # Context manager so the checkpoint file handle is always closed
    with open(checkpoint_file) as checkpoint:
        data = json.load(checkpoint)

    parts = ["""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Term Extraction Report</title>
            <style>
                body { font-family: Arial, sans-serif; margin: 20px; }
                h1 { color: #333; }
                h2 { color: #555; }
                table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }
                th, td { border: 1px solid #ccc; padding: 8px; text-align: left; }
                th { background-color: #f5f5f5; }
                .section { margin-bottom: 60px; }
                .statistics, .classification-report { margin-bottom: 40px; }
                pre { background-color: #f5f5f5; padding: 10px; }
                ul { list-style-type: disc; margin-left: 20px; }
            </style>
        </head>
        <body>
            <h1>Term Extraction Report</h1>
            <p>The algorithm extracted entities from the sections. For each section, there is a table with the true values, followed by the LLM's response, and an analysis comparing the LLM's findings with the expected entities.</p>
        """]

    for item in data.values():
        if item['type'] != 'section':
            continue

        section_id = item['id']
        parts.append(f"<div class='section'>\n<h2>Section: {_html_text(section_id)}</h2>\n")
        parts.append(f"<pre>{_html_text(item['content'])}</pre>\n")

        true_table_key = f"{section_id}|true_table"
        if true_table_key in data:
            parts.append(_items_table("True Values", data[true_table_key]['content']))

        llm_response_key = f"{section_id}|llm_response"
        if llm_response_key in data:
            parts.append(_items_table("LLM Extracted Entities", data[llm_response_key]['content']))

        statistics_key = f"{section_id}|statistics"
        if statistics_key in data:
            parts.append(_statistics_block(data[statistics_key]['content']))

        classification_key = f"{section_id}|classification_report"
        if classification_key in data:
            parts.append(_classification_block(data[classification_key]['content']))

        parts.append("</div>\n")

    parts.append("""
        </body>
        </html>
        """)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    logger.info(f"Report generated and saved to {output_file}")

In [None]:
def normalize_words(text: str) -> str:
    """
    Lowercases the text and converts it to its singular noun form.

    Args:
        text (str): The word or phrase to normalize.

    Returns:
        str: The singular form of the lowercased text. If the text is empty, or
        inflect does not produce a singular form for it (e.g. it is already singular),
        the lowercased text is returned unchanged.
    """
    lowered = text.lower()

    # Nothing to singularize; some inflect releases may also reject empty input.
    if not lowered:
        return lowered

    p = inflect.engine()

    # singular_noun() returns False (not the word) when it finds no singular form,
    # which happens for words that are already singular. Fall back to the lowercased
    # text so the declared ``str`` return type always holds.
    singular = p.singular_noun(lowered)
    return singular if singular else lowered

In [None]:
def extract_unique_terms(document: ElementsDocumentModel) -> List[str]:
    """
    Extracts unique terms from the 'terms' attribute of elements within an ElementsDocumentModel instance.

    Args:
        document (ElementsDocumentModel): The document containing elements, each with a list of terms.

    Returns:
        List[str]: The unique terms found across all elements in the document, in the order
        in which each term is first encountered.

    This function iterates through each element of the document, accesses the terms list in each element, and collects
    the value of the 'term' attribute of every entry. Duplicates are dropped while the first-seen order is kept, so
    the result is identical across runs (a plain set would yield an order that depends on string hashing).
    """

    # dict keys are unique and keep insertion order, so they act as an ordered set.
    ordered_terms = dict.fromkeys(
        term_info.term
        for element in document.elements
        for term_info in element.terms
    )

    # Convert the ordered keys to a list and return it
    return list(ordered_terms)

#### Prompt to extract / classify elements.

Prompt structure is based on [1]. It is a zero-shot prompt following the chain-of-thought concept.

The following approaches were taken.

##### 1. facts and fact types
Try to extract all facts and fact types from a given document.

This approach was successful. It focuses on extracting the elements and achieves the best results, similar to approach 3.

In [None]:
# System prompt for approach 1: asks the model to extract facts and fact types (with their
# terms, fact symbols, classification and source) plus synonym / hypernym-hyponym
# relationships between terms, and to answer as JSON. The prompt ends with a
# "Start of the document" marker.
# NOTE(review): step 3 asks for term definitions, but the structure described in step 7 has
# no definition field, and the example encodes each term as {"<term>": "<classification>"}
# -- confirm whether that is intended.
system_prompt_facts = """

You are tasked with extracting **facts**, **fact types**, and their **relationships** from a given document. Follow these steps carefully:

#### Steps to Perform:

1. **Identify Facts and Fact Types**:
   - A **fact** is a specific instance or statement that describes an event or condition.
   - A **fact type** is a general template or relationship that defines how entities interact.
   - For each fact or fact type:
     - Extract the **expression** that represents the fact or fact type.
     - List the **terms** (Nouns or Proper nouns) involved in the fact or fact type.
     - Identify the **fact symbols** (verbs, verb phrases, or prepositions) connecting the terms.
     - Classify the expression as either a **Fact** or **Fact Type**.
     - Note the section or paragraph where the fact or fact type appears as the **source**.

2. **Classify Terms**:
   - For each fact or fact type, classify all **terms**:
     - Label each term as either a **Noun** or **Proper Noun**.
   - Ensure that the terms are extracted accurately and classified correctly.

3. **Define term**:
   - For each term look in the document for the term definition. If the term definition is not found, use "missing".:

4. **Identify Fact Symbols**:
   - Extract the verbs or prepositions that define the relationships between the terms. These are referred to as **fact symbols**.
   - Each fact or fact type should have a list of fact symbols.

5. **Source Information**:
   - Record the paragraph or section of the document where each fact or fact type is found as **source** information (e.g., “(a)(1)”, “(b)”).

6. **Recognize Term Relationships**:
   - Identify relationships between terms:
     - **Synonyms**: Terms that can be used interchangeably without changing the meaning.
     - **Hypernym-Hyponym**: A broader term (hypernym) that includes a more specific term (hyponym).
   - For each pair of terms:
     - Identify the relationship (either "Synonym" or "Hypernym-Hyponym").
     - Ensure that both terms involved in the relationship are valid terms from the document.

7. **Structure the Output in JSON Format**:
   - Create a JSON object with the following structure:
     - **facts_and_fact_types**: A list of dictionaries, where each dictionary contains:
       - **id**: A unique identifier for the fact or fact type.
       - **expression**: The extracted fact or fact type.
       - **terms**: A list of dictionaries, where each dictionary has a term and its classification (either "Noun" or "Proper Noun").
       - **fact_symbols**: A list of verb phrases or prepositions connecting the terms.
       - **classification**: Either "Fact" or "Fact Type".
       - **source**: The section or paragraph where the fact or fact type appears.
     - **terms_relationship**: A list of dictionaries, where each dictionary contains:
       - **terms**: A list of two related terms.
       - **relation**: Either "Synonym" or "Hypernym-Hyponym".

#### Example Output:

```json
{
  "facts_and_fact_types": [
    {
      "id": 1,
      "expression": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {"Person": "Noun"},
        {"Non-resident investment adviser": "Noun"},
        {"Commission": "Proper Noun"},
        {"Process": "Noun"},
        {"Pleadings": "Noun"},
        {"Papers": "Noun"}
      ],
      "fact_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    }
  ]
}
```

#### Guidelines:
- Be precise in identifying **terms** and **fact symbols**.
- Classify the relationships between terms accurately as **Synonym** or **Hypernym-Hyponym**.
- Ensure the final output adheres to the specified JSON structure.

#### Start of the document
"""

##### 2. facts, fact types, rules, and terms with definitions

Try to extract all facts, fact types, rules, and terms with definitions from a given document. Try to extract the relationships for each term as well.

**Results**

The results are fairly consistent, but the prompt failed to extract the terms' definitions, even when the definition was clear in the text, as in document 275.0-7, in the fragment "... the **term** small business or small organization for purposes of the Investment Advisers Act of 1940 shall **mean** an investment adviser that: ...". The prompt failed to define small business and small organization, which are the main purpose of the document. It also failed to recognize that small business and small organization are synonyms.

In [None]:
# System prompt v1 for approach 2: asks the model to summarize the document, extract facts,
# fact types and rules (with terms, verb symbols, classification, source and term
# definitions) plus term relationships, and to answer only with the JSON structure shown
# in <output_example>.
# NOTE(review): the step number 4 is used twice ("Define term" and "Identify Verb Symbols"),
# the rule definition starts with "A **rule** rule", and the example classifies a term as
# "Noun" while step 3 names "Common Noun" / "Proper Noun" -- confirm whether these should
# be aligned.
system_prompt_v1 = """
You are tasked with extracting **facts**, **fact types**, **rules**, and their **relationships** from a given document. Follow these steps carefully:

<steps>

1. Summarize the document. Use the summary to verify if all important facts, fact types, and rules are present.

2. **Identify Facts, Fact Types, and Rules**:
   - A **fact** is a specific instance or statement that describes an event or condition. Facts are statements of truth without any directive element. They are often associated with relationships between terms or entities. e.g., "John works for X Inc.".
   - A **fact type** is a general, abstract template that describes the potential relationships between terms or entities. It serves as a model for generating specific facts. e.g., "Person works for Company".
   - A **rule** rule is generally defined as a statement that governs or constrains some aspect of the business. It specifies what must be done or what is not allowed, often guiding actions, decisions, and behaviors within an organization. Rules enforce compliance, limit possibilities, or prescribe specific behaviors in response to business situations. e.g., "A customer must provide identification before opening an account.".
   - For each fact, fact type, or rule:
     - Extract the **expression** that represents the fact, fact type, or rule.
     - List the **terms** involved in the fact, fact type, or rule.
     - Identify the **verb symbols** (verbs, verb phrases, or prepositions) connecting the terms.
     - Classify the expression as either a **Fact**, **Fact Type**, or **Rule**.
     - Note the section or paragraph where the fact, fact type, or rule appears as the **source**.
     - For each term look in the document for the term definition. If the term definition is not found, use "missing".:

3. Classify Terms:
   - For each fact, fact type, or rule classify all **terms**:
     - Label each term as either a **Common Noun** or **Proper Noun**.
   - Ensure that the terms are extracted accurately and classified correctly.

4. Define term:
   - For each term look in the document for the term definition, explaining, or meaning. If the term definition is not found, use "missing".:

4. Identify Verb Symbols:
   - Extract the verbs or prepositions that define the relationships between the terms. These are referred to as **verb symbols**.
   - Each fact, fact type, or rule should have a list of verb symbols.

5. Source Information:
   - Record the paragraph or section of the document where each fact, fact type, or rule is found as **source** information (e.g., "(a)(1)", "(b)").

6. Recognize term relationships:
   - Identify relationships between terms:
     - **Synonyms**: Terms that can be used interchangeably without changing the meaning.
     - **Hypernym-Hyponym**: A broader term (hypernym) that includes a more specific term (hyponym).
   - For each pair of terms:
     - Identify the relationship (either "Synonym" or "Hypernym-Hyponym").
     - Ensure that both terms involved in the relationship are valid terms from the document.

7. Answer only with the output example structure in JSON format. All the values are optional.

<output_example>

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "expression": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
            "term": "Person",
            "classification": "Noun",
            "definition": "missing"
        },
      ...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    }
  ]
},
...
```
</output_example>

</steps>
"""

The v2 is a variation of v1, with a more concise description of the steps and a different organization of the text. The results are the same, but some expressions were misclassified.

In [None]:
# System prompt v2 for approach 2: a condensed variant organised into "Steps",
# "Output Format", "Examples" and "Notes" sections; it requests the same kind of output
# (elements with terms, verb symbols, classification, source, plus term relationships).
# NOTE(review): the example classifies a term as "Noun" while step 3 names
# "Common Noun" / "Proper Noun", and the JSON example contains a `//` comment, which is
# not valid JSON -- confirm whether these should be aligned.
system_prompt_v2 = """
Extract facts, fact types, and their relationships from a given document, and structure the output in a specified JSON format.

Follow the steps to identify and classify expressions, using document details to find definitions and source information.

# Steps

1. **Summarize the Document:**
   - Provide a summary to ensure the completeness of identified facts, fact types, and rules.

2. **Identify Facts, Fact Types, and Rules:**
   - Define and extract each:
     - **Fact:** Instance or statement of event/condition, e.g., "John works for X Inc."
     - **Fact Type:** Template for relationships, e.g., "Person works for Company."
     - **Rule:** Governing statement, e.g., "A customer must provide identification before opening an account."
   - For each, document:
     - **Expression**
     - **Terms** involved
     - **Verb Symbols** connecting the terms
     - **Classification** as Fact, Fact Type, or Rule
     - **Source** paragraph or section in the document

3. **Classify Terms:**
   - Classify each term as **Common Noun** or **Proper Noun**.

4. **Define Term:**
   - Locate definitions for terms in the document, or mark as "missing."

5. **Identify Verb Symbols:**
   - Extract verbs or prepositions (verb symbols) that define term relationships.

6. **Source Information:**
   - Note the document source (section/paragraph) for each expression.

7. **Recognize Term Relationships:**
   - Identify pairs of terms with relationships:
     - **Synonyms:** interchangeable terms.
     - **Hypernym-Hyponym:** broader (hypernym) includes more specific (hyponym).
   - Ensure relationship validity using document terms.

# Output Format

Produce a structured JSON format based on the specified template. Ensure all necessary fields are populated accurately, even if some fields are optional or marked as "missing".

# Examples

**Example JSON Structure:**

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "expression": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
            "term": "Person",
            "classification": "Noun",
            "definition": "missing"
        },
        // Additional terms...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    }
  ]
}
```

# Notes

- Ensure extracted expressions are fully detailed and clearly classified.
- Pay careful attention to identifying and classifying terms accurately.
- Follow the precise JSON format for all outputs, populating fields as required.
"""


The v3 goes back to v1, with a different organization of the text.

**Results**

The results are the same as those of v1 and v2: 5 elements were extracted, and 16 terms were extracted with 2 definitions.

In [None]:
# System prompt v3 for approach 2: asks the model to summarize a legal document, extract
# facts, fact types and rules (expression, terms, verb symbols, classification, source),
# classify and define the terms, identify synonym / hypernym-hyponym relationships, and
# answer with JSON only.
# NOTE(review): step 4 says to use **None** for a missing definition while the example
# shows `null` -- confirm which spelling the model is expected to emit.
system_prompt_v3 = """
You are tasked with extracting elements and **relationships** from a given legal document. Please follow these steps carefully and ensure all instructions are adhered to:

**Steps**:

1. **Summarize the document**:
   - Summarize the document to understand its purpose and use it to verify if all important terms, term definitions, facts, fact types, and rules are identified in subsequent steps.

2. **Identify Facts, Fact Types, and Rules**:
   - **Definitions**:
     - **Fact**: A specific instance or statement that describes an event or condition without any directive element. Facts often involve relationships between terms or entities. Example: "John works for X Inc."
     - **Fact Type**: A general, abstract template that describes potential relationships between terms or entities, serving as a model for generating specific facts. Example: "Person works for Company."
     - **Rule**: A statement that governs or constrains some aspect of the business, specifying what must be done or what is not allowed. Rules enforce compliance, limit possibilities, or prescribe specific behaviors in response to business situations. Example: "A customer must provide identification before opening an account."
   - **For each fact, fact type, or rule**:
     - **Extract the Expression**: Identify the exact sentence or phrase from the document representing the fact, fact type, or rule.
     - **Extract Terms**: List all the terms involved in the expression.
     - **Extract Verb Symbols**: Identify verbs, verb phrases, or prepositions that connect the terms in the expression.
     - **Classification**: Classify the expression as either a **Fact**, **Fact Type**, or **Rule**.
     - **Source**: Note the specific paragraph or section of the document where the expression is found (e.g., "(a)(1)", "(b)").

3. **Classify Terms**:
   - For each term extracted classify it as either a **Common Noun** or a **Proper Noun**.

4. **Define Terms**:
   - For each term:
     - Search the entire document for the term's definition, explanation, or meaning. Also, look in the document summary.
     - If the definition is found, include it.
     - If the definition is not found in the document, use **None**.

5. **Identify Relationships Between Terms**:
   - **Types of Relationships**:
     - **Synonym**: Terms that can be used interchangeably without changing the meaning.
     - **Hypernym-Hyponym**: A broader term (hypernym) that includes a more specific term (hyponym).
   - **For each pair of terms in the document**:
     - Identify if a relationship exists as either "Synonym" or "Hypernym-Hyponym".
     - Only include relationships where both terms are present in the document.

6. **Provide JSON Output**:
   - Format your answer as per the output example below.
   - **All values are optional**: Include as much information as is available based on the document.
   - **Do not include any additional text or explanation outside the JSON structure**.

**Output Example**:

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "expression": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun",
          "definition": "An individual or legal entity."
        },
        {
          "term": "Non-resident investment adviser",
          "classification": "Common Noun",
          "definition": null
        },
        ...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    },
    ...
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    },
    {
      "terms": [
        "Person",
        "Individual"
      ],
      "relation": "Synonym"
    },
    ...
  ]
}
```
"""

In [None]:
# Hard-coded sample output for section "§ 275.0-7" in the structure requested by
# system_prompt_v3: 5 elements (each with terms, verb symbols, classification and source)
# and 2 term relationships. Only the term "Control" carries a definition; all other
# definitions are None.
# NOTE(review): presumably a model response recorded with system_prompt_v3 -- confirm
# the provenance.
response_prompt_v3 = {
  "section": "§ 275.0-7",
  "elements": [
    {
      "id": 1,
      "expression": "An investment adviser that has assets under management of less than $25 million is considered a small business for the purposes of the Investment Advisers Act of 1940.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Assets under management",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "$25 million",
          "classification": "Proper Noun",
          "definition": None
        },
        {
          "term": "Small business",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["has", "is considered"],
      "classification": "Fact Type",
      "source": "(a)(1)"
    },
    {
      "id": 2,
      "expression": "An investment adviser is considered a small organization if it did not have total assets of $5 million or more on the last day of the most recent fiscal year.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Total assets",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "$5 million",
          "classification": "Proper Noun",
          "definition": None
        },
        {
          "term": "Small organization",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Fiscal year",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["did not have", "is considered"],
      "classification": "Fact Type",
      "source": "(a)(2)"
    },
    {
      "id": 3,
      "expression": "An investment adviser is not considered a small business if it controls, is controlled by, or is under common control with another investment adviser that has assets under management of $25 million or more.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Control",
          "classification": "Common Noun",
          "definition": "The power, directly or indirectly, to direct the management or policies of a person."
        },
        {
          "term": "Common control",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "$25 million",
          "classification": "Proper Noun",
          "definition": None
        },
        {
          "term": "Small business",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["controls", "is controlled by", "is under"],
      "classification": "Rule",
      "source": "(a)(3)"
    },
    {
      "id": 4,
      "expression": "Control means the power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise.",
      "terms": [
        {
          "term": "Control",
          "classification": "Common Noun",
          "definition": "The power, directly or indirectly, to direct the management or policies of a person."
        },
        {
          "term": "Person",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Securities",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Contract",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["means", "to direct", "whether through"],
      "classification": "Fact",
      "source": "(b)(1)"
    },
    {
      "id": 5,
      "expression": "A person is presumed to control a corporation if the person has the right to vote 25 percent or more of a class of the corporation's voting securities.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Corporation",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Voting securities",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "25 percent",
          "classification": "Proper Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["is presumed", "to control", "has the right to vote"],
      "classification": "Fact Type",
      "source": "(b)(1)(i)(A)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Investment adviser",
        "Small business"
      ],
      "relation": "Hypernym-Hyponym"
    },
    {
      "terms": [
        "Investment adviser",
        "Small organization"
      ],
      "relation": "Hypernym-Hyponym"
    }
  ]
}


In [None]:
# Number of extracted elements and of term relationships in the v3 sample response.
tuple(len(response_prompt_v3[key]) for key in ("elements", "terms_relationship"))

##### 3. facts, fact types, rules, and terms

Try to extract all facts, fact types, rules, and terms without definitions from a given document, and do not try to extract the relationships for each term.

This approach is very similar to the previous one, but it is more focused on extracting the elements. It is divided into two parts:
- Extract the elements
- Extract the definitions and relationships

**Results**

The results are consistent: 7 elements and 21 terms with definitions are extracted. In contrast, with the previous approach, 5 elements and 16 terms with 2 definitions were extracted. This is an improvement of 40% in extracting facts and rules, 31% in extracting terms, and 1050% in extracting definitions.

The elements are extracted in the first part. For the second part, the results are much better than with the previous approach: more definitions and relationships are extracted.

The prompt for the first part is similar to the previous one, but without the steps 4 and 5. The definition and relationships elements are removed from the output json.

> The summary of the document was added to the output json.

In [None]:
# System prompt v4.1 (first part of approach 3): asks the model to summarize a legal
# document and extract facts, fact types and rules with their terms (classified as
# Common Noun / Proper Noun, compound terms split), verb symbols, classification and
# source. Unlike v3, it requests no term definitions and no term relationships.
# NOTE(review): the bullet "**Extract all the terms involved in the expression." has an
# unbalanced bold marker, step 1 reads "terms,facts" without a space, and the example
# output has no "summary" field -- confirm whether these should be fixed.
system_prompt_v4_1 = """
You are tasked with extracting elements from a given legal document. Please follow these steps carefully and ensure all instructions are adhered to:

# Steps

1. **Summarize the document** to understand its purpose and use it to verify if all important terms,facts, fact types, and rules are identified in subsequent steps.

2. **Identify elements**:
   - **About the elements**:
     - **Fact**: A specific instance or statement that describes an event or condition without any directive element. Facts often involve relationships between terms or entities. Example: "John works for X Inc."
     - **Fact Type**: A general, abstract template that describes potential relationships between terms or entities, serving as a model for generating specific facts. Example: "Person works for Company."
     - **Rule**: A statement that governs or constrains some aspect of the business, specifying what must be done or what is not allowed. Rules enforce compliance, limit possibilities, or prescribe specific behaviors in response to business situations. Example: "A customer must provide identification before opening an account."
     - **Term**: A word or a group of words that represents a specific concept, entity, or subject in a particular context.
   - **For each fact, fact type, or rule**:
     - **Extract the Expression**: Identify the exact sentence or phrase from the document representing the fact, fact type, or rule.
     - **Extract and classify Terms**:
       - **Extract all the terms involved in the expression.
       - **Classify each term** as either **Common Noun** or **Proper Noun**.
       - If a Term contains nouns separated by "and," ",", or "or," split it into two or more terms. For example, "Principal office and place of business" should be split into "Principal office" and "Place of business".
     - **Extract Verb Symbols**: Identify verbs, verb phrases, or prepositions that connect the terms in the expression.
     - **Classification**: Classify the expression as either a **Fact**, **Fact Type**, or **Rule**.
     - **Source**: Note the specific paragraph or section of the document where the expression is found (e.g., "(a)(1)", "(b)").

3. **Provide JSON Output**:
   - Format your answer as per the output example below.
   - **All values are optional**: Include as much information as is available based on the document.
   - **Do not include any additional text or explanation outside the JSON structure**.

**Output Example**:

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "expression": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun"
        },
        {
          "term": "Non-resident investment adviser",
          "classification": "Common Noun"
        },
        ...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    },
    ...
  ]
}
```
"""

In [None]:
# Hard-coded sample output for section "§ 275.0-7" in the structure requested by
# system_prompt_v4_1, plus a top-level "summary": 7 elements, each with terms (term and
# classification only), verb symbols, classification and source.
# NOTE(review): presumably a model response recorded with system_prompt_v4_1 -- confirm
# the provenance.
response_prompt_v4_1 = {
  "section": "§ 275.0-7",
  "summary": "The definition of small entities under the Investment Advisers Act for the purposes of the Regulatory Flexibility Act. It details criteria for qualifying as a small business or organization and provides definitions for 'control' and 'total assets' within this context.",
  "elements": [
    {
      "id": 1,
      "expression": "The term small business or small organization for purposes of the Investment Advisers Act of 1940 shall mean an investment adviser that has assets under management of less than $25 million.",
      "terms": [
        {
          "term": "Small business",
          "classification": "Common Noun"
        },
        {
          "term": "Small organization",
          "classification": "Common Noun"
        },
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Assets under management",
          "classification": "Common Noun"
        },
        {
          "term": "$25 million",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["mean", "has"],
      "classification": "Fact",
      "source": "(a)(1)"
    },
    {
      "id": 2,
      "expression": "An investment adviser did not have total assets of $5 million or more on the last day of the most recent fiscal year.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Total assets",
          "classification": "Common Noun"
        },
        {
          "term": "$5 million",
          "classification": "Common Noun"
        },
        {
          "term": "Fiscal year",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["did not have"],
      "classification": "Fact",
      "source": "(a)(2)"
    },
    {
      "id": 3,
      "expression": "An investment adviser does not control, is not controlled by, and is not under common control with another investment adviser that has assets under management of $25 million or more.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Control",
          "classification": "Common Noun"
        },
        {
          "term": "$25 million",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["does not control", "is not controlled by", "is not under common control with"],
      "classification": "Fact",
      "source": "(a)(3)"
    },
    {
      "id": 4,
      "expression": "Control means the power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise.",
      "terms": [
        {
          "term": "Control",
          "classification": "Common Noun"
        },
        {
          "term": "Power",
          "classification": "Common Noun"
        },
        {
          "term": "Management",
          "classification": "Common Noun"
        },
        {
          "term": "Policies",
          "classification": "Common Noun"
        },
        {
          "term": "Person",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["means", "to direct"],
      "classification": "Fact Type",
      "source": "(b)(1)"
    },
    {
      "id": 5,
      "expression": "A person is presumed to control a corporation if the person directly or indirectly has the right to vote 25 percent or more of a class of the corporation's voting securities.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun"
        },
        {
          "term": "Corporation",
          "classification": "Common Noun"
        },
        {
          "term": "Voting securities",
          "classification": "Common Noun"
        },
        {
          "term": "25 percent",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["is presumed to control", "has the right to vote"],
      "classification": "Rule",
      "source": "(b)(1)(i)(A)"
    },
    {
      "id": 6,
      "expression": "A person is presumed to control a partnership if the person has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the partnership.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun"
        },
        {
          "term": "Partnership",
          "classification": "Common Noun"
        },
        {
          "term": "Dissolution",
          "classification": "Common Noun"
        },
        {
          "term": "Capital",
          "classification": "Common Noun"
        },
        {
          "term": "25 percent",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["is presumed to control", "has the right to receive", "has contributed"],
      "classification": "Rule",
      "source": "(b)(1)(ii)"
    },
    {
      "id": 7,
      "expression": "Total assets means the total assets as shown on the balance sheet of the investment adviser or other person with its subsidiaries consolidated, whichever is larger.",
      "terms": [
        {
          "term": "Total assets",
          "classification": "Common Noun"
        },
        {
          "term": "Balance sheet",
          "classification": "Common Noun"
        },
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Subsidiaries",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["means", "shown on"],
      "classification": "Fact Type",
      "source": "(b)(2)"
    }
  ]
}

In [None]:
# Number of elements in the part-1 example response.
len(response_prompt_v4_1["elements"])

The steps 4 and 5 are adapted from the previous approach. The system prompt for the second part is:

In [None]:
# System prompt for the second part (P2): given a terms list and a document, it
# asks the model to summarize the document, define each term (null when no
# definition is found) and list synonym pairs, answering with JSON only.
# NOTE(review): the ```json fence opened in the output example is never closed
# before the end of the string -- confirm whether that is intentional.
system_prompt_v4_2 = """
You are tasked with extracting definitions and **relationships** of terms in the terms list searching a given legal document. Please follow these steps carefully and ensure all instructions are adhered to:

# Steps

1. **Summarize the document** to understand its purpose and use it to verify if all important terms, term definitions, facts, fact types, and rules are identified in subsequent steps.

2. **Define terms**:
  - For each term:
    - Search the entire document for the term's definition, explanation, or meaning. Also, look in the document summary.
    - If the definition is found, include it.
    - If the definition is not found in the document, use null.

3. **Identify synonym relationships between terms**:
  - For each term in the terms list:
    - Compare it against other terms in the text to find synonyms.
    - Ensure both terms exist within the same document context.
  - List all valid synonym pairs identified.

4. **Provide JSON Output**:
  - Format your answer as per the output example below.
  - **All values are optional**: Include as much information as is available based on the document.
  - **Do not include any additional text or explanation outside the JSON structure**.

**Output Example**:

```json
{
  "terms": [
    {
      "term": "Person",
      "definition": "A person is a person."
    },
    {
      "term": "Capital",
      "definition": "The total assets of a person."
    },
    ...
  ],
  "relationships": [
    {
      "term_1": "Person",
      "term_2": "Capital",
      "relationship": "Synonym"
    },
    {
      "term_1": "Capital",
      "term_2": "Person",
      "relationship": "Synonym"
    },
    ...
  ]
}
"""

In the "user prompt", a unique list of terms taken from the result of the previous part is provided along with the document. The drawback of this approach is that the document needs to be provided again, which means spending more tokens.

As commented above, the output is better than with the previous approach: 21 terms are extracted with definitions, and 6 relationships are identified. More importantly, the terms "small business" and "small organization" are extracted.

In [None]:
# Example part-2 (P2) response kept for illustration: 21 term definitions and
# 6 relationships (1 "Synonym" and 5 "Hypernym-Hyponym").
response_prompt_v4_2 = {
  "terms": [
    {
      "term": "$5 million",
      "definition": "An amount referenced as a threshold for total assets of an investment adviser or other entity on the last day of the most recent fiscal year."
    },
    {
      "term": "Control",
      "definition": "The power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise."
    },
    {
      "term": "Capital",
      "definition": "The amount of financial contribution or investment in a partnership or LLC, particularly relevant to the right to receive upon dissolution or contribution of 25 percent or more."
    },
    {
      "term": "Dissolution",
      "definition": "The act of formally ending a partnership or LLC, at which point capital contributions may be distributed."
    },
    {
      "term": "25 percent",
      "definition": "A threshold used to presume control over a corporation, partnership, or LLC, based on ownership, voting rights, or capital contribution."
    },
    {
      "term": "Subsidiaries",
      "definition": "Companies that are controlled by another company, typically through ownership of more than 50% of the subsidiary’s voting stock."
    },
    {
      "term": "Management",
      "definition": "The act of overseeing and controlling the policies or operations of an entity."
    },
    {
      "term": "Corporation",
      "definition": "A legal entity that is presumed to be controlled if a person has the right to vote or sell 25 percent or more of its voting securities."
    },
    {
      "term": "Balance sheet",
      "definition": "A financial statement that reports total assets, used to determine control and asset thresholds for investment advisers."
    },
    {
      "term": "Assets under management",
      "definition": "The total market value of investments that an investment adviser manages on behalf of clients."
    },
    {
      "term": "$25 million",
      "definition": "An amount referenced as a threshold for assets under management to determine whether an entity qualifies as a small business or small organization under the Investment Advisers Act."
    },
    {
      "term": "Fiscal year",
      "definition": "A one-year period used for accounting purposes and preparing financial statements, relevant to determining total assets."
    },
    {
      "term": "Voting securities",
      "definition": "Securities that give the holder the right to vote on matters of corporate policy or management, used to determine control."
    },
    {
      "term": "Power",
      "definition": "The ability to influence or direct the management or policies of a person or entity, often associated with control."
    },
    {
      "term": "Total assets",
      "definition": "The total value of all assets as shown on an entity's balance sheet, including those of subsidiaries, used to assess financial thresholds."
    },
    {
      "term": "Investment adviser",
      "definition": "An individual or firm that manages the investments of clients, subject to regulations under the Investment Advisers Act of 1940."
    },
    {
      "term": "Person",
      "definition": "An individual, corporation, partnership, LLC, trust, or other entity, potentially subject to control rules under the Investment Advisers Act."
    },
    {
      "term": "Small business",
      "definition": "An investment adviser with less than $25 million in assets under management and less than $5 million in total assets, or as otherwise defined by the Commission."
    },
    {
      "term": "Partnership",
      "definition": "A business structure where control is presumed if a person owns or contributes 25 percent or more of the partnership's capital."
    },
    {
      "term": "Small organization",
      "definition": "An entity, such as an investment adviser, that qualifies as a small business under the Investment Advisers Act by meeting specific asset thresholds."
    },
    {
      "term": "Policies",
      "definition": "The principles or rules governing the management and control of an entity, relevant to determining control under the Investment Advisers Act."
    }
  ],
  "relationships": [
    {
      "term_1": "Small business",
      "term_2": "Small organization",
      "relationship": "Synonym"
    },
    {
      "term_1": "$5 million",
      "term_2": "Total assets",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "$25 million",
      "term_2": "Assets under management",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "Person",
      "term_2": "Corporation",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "Person",
      "term_2": "Partnership",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "Person",
      "term_2": "Investment adviser",
      "relationship": "Hypernym-Hyponym"
    }
  ]
}


In [None]:
# (number of terms, number of relationships) in the part-2 example response.
len(response_prompt_v4_2["terms"]), len(response_prompt_v4_2["relationships"])

##### Save checkpoint

Define which prompt will be used in the experiment.

In [None]:
# Prompts selected for this experiment (used by the LLM query loop).
system_prompt_part_1 = system_prompt_v4_1
system_prompt_part_2 = system_prompt_v4_2

# Store each selected prompt in the manager as a "prompt" document. The stored
# content is the prompt wrapped in a leading newline and a trailing
# newline-plus-indent, exactly as before.
for prompt_id, prompt_text in (
    ("prompt-v4-P1", system_prompt_part_1),
    ("prompt-v4-P2", system_prompt_part_2),
):
    manager.add_document(
        Document(
            id=prompt_id,
            type="prompt",
            content=f"\n{prompt_text}\n        ",
        )
    )

In [None]:
# Persist the state to a file
save_checkpoint(manager)

#### Query LLM with documents

In [None]:
manager = restore_checkpoint()

In [None]:
for doc in manager.list_document_ids(doc_type="section"):
    logger.info(f"Processing document: {doc}")
    # Retrieve the section once and reuse its text in both user prompts below
    # (previously it was fetched three times and the first result was unused).
    retrieved_doc = manager.retrieve_document(doc_id=doc, doc_type="section")
    section_content = retrieved_doc.content

    # Part 1 - Extraction of elements
    user_prompt = f"""
# Document

{section_content}
    """

    logger.info("P1. Extracting elements...")
    response_part_1 = query_instruct_llm(system_prompt_part_1, user_prompt, ElementsDocumentModel)

    logger.debug(response_part_1)

    doc_1 = Document(
        id=f"{doc}_P1",
        type="llm_response",
        content=response_part_1
    )
    manager.add_document(doc_1)

    # Part 2 - Definition of terms and relationships
    # The unique terms found in part 1 are sent back together with the document.
    terms_list_part_1 = extract_unique_terms(response_part_1)

    user_prompt = f"""
# Terms list

{terms_list_part_1}

# Document
{section_content}
    """

    logger.info("P2. Extracting terms and relationships...")
    response_part_2 = query_instruct_llm(system_prompt_part_2, user_prompt, TermsDocumentModel)

    logger.debug(response_part_2)

    doc_2 = Document(
        id=f"{doc}_P2",
        type="llm_response",
        content=response_part_2
    )
    manager.add_document(doc_2)

    logger.info("Saving llm_response to checkpoint...")

    # Checkpoint after every document so responses already paid for are not
    # lost if a later document fails.
    save_checkpoint(manager)

logger.info("Finished processing documents.")

Average execution time: 32 s per document.

#### Restore checkpoint

In [None]:
# Restore checkpoint
manager = restore_checkpoint()

#### Check content of llm_responses

Create P1 dataframe

In [None]:
# Build `elements_p1_df`: one row per (element, term) for every
# '*_P1|llm_response' entry found in the checkpoint JSON files.

# Define the path where your JSON files are located
json_files_path = f"{config['DEFAULT_CHECKPOINT_DIR']}/*.json"

# List of all JSON files in the directory (sorted so the row order of the
# final dataframe is reproducible between runs)
all_files = sorted(glob.glob(json_files_path))

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each file
for file_path in all_files:
    # JSON text is UTF-8; be explicit instead of relying on the platform default
    with open(file_path, "r", encoding="utf-8") as file:
        # Load JSON content
        try:
            data = json.load(file)
        except json.JSONDecodeError:
            print(f"Error reading {file_path}, skipping this file.")
            continue

    # Filter keys that end with '_P1|llm_response'
    filtered_data = {
        key: value
        for key, value in data.items()
        if key.endswith("_P1|llm_response")
    }

    # Check if filtered_data is empty
    if not filtered_data:
        print(f"No matching keys in {file_path}, skipping this file.")
        continue

    try:
        # Normalize the JSON data: one row per term, with the owning element
        # and response carried along as meta_* columns
        df = pd.json_normalize(
            filtered_data.values(),
            sep="_",
            record_path=[
                "content",
                "elements",
                "terms",
            ],  # Expands elements -> terms
            meta=[
                "id",
                "type",
                ["content", "section"],
                ["content", "summary"],
                ["content", "elements", "id"],
                ["content", "elements", "expression"],
                ["content", "elements", "classification"],
                ["content", "elements", "source"],
            ],
            meta_prefix="meta_",
        )

        # verb_symbols is a per-element list while `df` has one row per term.
        # Repeat each element's joined verb symbols once per term, walking the
        # data in the same order json_normalize does, so every term row gets
        # the verb symbols of its own element. (Assigning an element-level
        # column by index would pair rows with the wrong elements.)
        df["verb_symbols"] = [
            ", ".join(symbols) if isinstance(symbols, list) else symbols
            for response in filtered_data.values()
            for element in response["content"]["elements"]
            for symbols in [element.get("verb_symbols")]
            for _ in element["terms"]
        ]

    except KeyError as e:
        print(f"Error normalizing data from {file_path}: {e}")
        continue

    # Add a column for the filename (without directory)
    df["filename"] = os.path.basename(file_path)

    # Append the DataFrame to the list
    dataframes.append(df)

# Check if there are any dataframes to concatenate
if dataframes:
    # Concatenate all DataFrames into one
    elements_p1_df = pd.concat(dataframes, ignore_index=True)
else:
    # Keep the name defined so the following cells do not fail with NameError
    # (or silently show a stale frame from a previous run)
    elements_p1_df = pd.DataFrame()
    print("No valid dataframes to concatenate.")

In [None]:
elements_p1_df

In [None]:
elements_p1_df.describe()

Create dataframe for P2

In [None]:
# Build `terms_p2_df` (term definitions) and `relationship_p2_df` (term
# relationships) from every '*_P2|llm_response' entry in the checkpoint files.
# os / json / pandas / glob come from the notebook's import cell.

# Define the path where your JSON files are located
json_files_path = f"{config['DEFAULT_CHECKPOINT_DIR']}/*.json"

# List of all JSON files in the directory (sorted for a reproducible row order)
all_files = sorted(glob.glob(json_files_path))

# Initialize an empty list to store DataFrames for terms and relationships
terms_dataframes = []
terms_relationship_dataframes = []

# Loop through each file
for file_path in all_files:
    # JSON text is UTF-8; be explicit instead of relying on the platform default
    with open(file_path, "r", encoding="utf-8") as file:
        # Load JSON content
        try:
            data = json.load(file)
        except json.JSONDecodeError:
            print(f"Error reading {file_path}, skipping this file.")
            continue

    # Filter keys that end with '_P2|llm_response'
    filtered_data = {
        key: value
        for key, value in data.items()
        if key.endswith("_P2|llm_response")
    }

    # Check if filtered_data is empty
    if not filtered_data:
        print(f"No matching keys in {file_path}, skipping this file.")
        continue

    # Process each matched entry in filtered_data
    for key, value in filtered_data.items():
        try:
            content = value['content']

            # Extract the terms
            terms_df = pd.DataFrame(content['terms'])
            terms_df['response_id'] = value['id']  # Add response ID to track origin
            terms_dataframes.append(terms_df)

            # Extract the relationships if available. The part-2 prompt and its
            # example output name this list "relationships"; the previous key
            # "terms_relationship" is still accepted.
            terms_relationship = (
                content.get('terms_relationship')
                or content.get('relationships')
                or []
            )
            if terms_relationship:
                terms_relationship_df = pd.DataFrame(terms_relationship)
                terms_relationship_df['response_id'] = value['id']  # Add response ID to track origin
                terms_relationship_dataframes.append(terms_relationship_df)

        except KeyError as e:
            print(f"Error processing data from {file_path}: {e}")
            continue

# Concatenate all the terms DataFrames
if terms_dataframes:
    terms_p2_df = pd.concat(terms_dataframes, ignore_index=True)
    print("Terms DataFrame:")
else:
    # Keep the name defined so later cells do not raise NameError
    terms_p2_df = pd.DataFrame()
    print("No valid terms dataframes to concatenate.")

# Concatenate all the terms_relationship DataFrames
if terms_relationship_dataframes:
    relationship_p2_df = pd.concat(terms_relationship_dataframes, ignore_index=True)
    print("Terms Relationship DataFrame:")
else:
    relationship_p2_df = pd.DataFrame()
    print("No valid terms relationship dataframes to concatenate.")


In [None]:
terms_p2_df

In [None]:
# Terms of the stored LLM response "§ 275.0-2_P2" as a DataFrame.
# The document id is hard-coded; change it to inspect another section.
terms_pred_df = pd.DataFrame(manager.retrieve_document("§ 275.0-2_P2", doc_type="llm_response").model_dump()["content"]["terms"])

terms_pred_df

In [None]:
# Terms of the "true_table" document with the same id ("§ 275.0-2_P2"), for
# side-by-side inspection against the predicted terms.
terms_true_df = pd.DataFrame(manager.retrieve_document("§ 275.0-2_P2", doc_type="true_table").model_dump()["content"]["terms"])

terms_true_df

In [None]:
# Checkpoint keys to summarise, per prompt part
document_ids_to_process_p1 = ['§ 275.0-2_P1|llm_response', '§ 275.0-5_P1|llm_response', '§ 275.0-7_P1|llm_response']
document_ids_to_process_p2 = ['§ 275.0-2_P2|llm_response', '§ 275.0-5_P2|llm_response', '§ 275.0-7_P2|llm_response']

table_pred_data_p1 = []
table_pred_data_p2 = []
# Walk the checkpoint directory and collect the rows produced for each JSON file
for filename in os.listdir(config['DEFAULT_CHECKPOINT_DIR']):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(config['DEFAULT_CHECKPOINT_DIR'], filename)
    table_pred_data_p1.extend(process_documents_p1(file_path, filename, document_ids_to_process_p1))
    table_pred_data_p2.extend(process_documents_p2(file_path, filename, document_ids_to_process_p2))

# One DataFrame per part from the collected rows
table_pred_df_p1 = pd.DataFrame(table_pred_data_p1)
table_pred_df_p2 = pd.DataFrame(table_pred_data_p2)

# Write both summary tables to Excel workbooks in the output directory
table_pred_df_p1.to_excel(f"{config['DEFAULT_OUTPUT_DIR']}/P1_summary_table.xlsx", index=False)
table_pred_df_p2.to_excel(f"{config['DEFAULT_OUTPUT_DIR']}/P2_summary_table.xlsx", index=False)


Count of all runs in the checkpoints for P1.

In [None]:
table_pred_df_p1.tail(6)

In [None]:
# Per-document minimum, maximum and sum of every P1 summary column, taken over
# all rows collected from the checkpoint files.
table_pred_p1 = table_pred_df_p1.groupby('document_id').aggregate(["min", "max", "sum"])
table_pred_p1

Count of all runs in the checkpoints for P2.

In [None]:
table_pred_df_p2.tail(6)

In [None]:
# Per-document descriptive statistics (pandas `describe`) of the P2 summary
# columns, taken over all rows collected from the checkpoint files.
table_pred_p2 = table_pred_df_p2.groupby('document_id').describe()

table_pred_p2

Compare true tables with predicted tables.

In [None]:
# Per-document descriptive statistics for the true and the predicted P1 tables.
# The "|true_table" / "|llm_response" suffix is stripped from every index label
# so both frames are labelled by the bare document id before being compared.
true_table = (
    table_true_df_p1.groupby('document_id')
    .describe()
    .rename(index=lambda doc_id: doc_id.replace("|true_table", ""))
)
pred_table = (
    table_pred_df_p1.groupby('document_id')
    .describe()
    .rename(index=lambda doc_id: doc_id.replace("|llm_response", ""))
)

true_table.compare(pred_table)

In [None]:
# Stop here. Next sections still in progress.
raise SystemExit("Stop here. Next sections still in progress.")

#### Evaluate documents

In [None]:
# For every section: compare the LLM response with the true table, store the
# comparison as a "statistics" document and append the metrics to the Excel file.
for doc in manager.list_document_ids(doc_type="section"):
    logger.info(f"Processing document: {doc}")
    retrieved_llm_response = manager.retrieve_document(doc_id=doc, doc_type="llm_response")
    true_table_doc = manager.retrieve_document(doc_id=doc, doc_type="true_table")

    logger.debug(retrieved_llm_response)

    (
        predicted_items,
        predicted_dict,
        correct_items,
        correct_dict,
        common_items,
        missed_items,
        extra_items,
    ) = compare_items(doc, retrieved_llm_response.content, true_table_doc.content, "signifier", "concept_classification")

    content = {
        "predicted_items": predicted_items,
        "predicted_dict": predicted_dict,
        "correct_items": correct_items,
        "correct_dict": correct_dict,
        "common_items": common_items,
        "missed_items": missed_items,
        "extra_items": extra_items
    }

    # Save statistics
    document = Document(
        id=doc,
        type="statistics",
        content=content
    )

    logger.debug(document)

    manager.add_document(document)

    logger.info("Saving statistics to checkpoint...")
    save_checkpoint(manager)

    # Ratios are reported as 0.0 when their denominator is empty, instead of
    # aborting the whole loop with ZeroDivisionError on an empty true table or
    # an empty prediction.
    n_correct = len(correct_items)
    n_predicted = len(predicted_items)
    n_common = len(common_items)
    common_over_correct = n_common / n_correct if n_correct else 0.0
    common_over_predicted = n_common / n_predicted if n_predicted else 0.0
    harmonic_score = (
        2 * n_common / (n_correct + n_predicted) if (n_correct + n_predicted) else 0.0
    )

    # Save metrics to excel for further analysis (argument order unchanged)
    save_compare_items_metrics(
        doc,
        n_correct,
        n_predicted,
        n_common,
        len(missed_items),
        len(extra_items),
        common_over_correct,
        common_over_predicted,
        harmonic_score,
        file_name=config["DEFAULT_EXCEL_FILE"]
    )

In [None]:
# Restore checkpoint
manager = restore_checkpoint()

#### Confusion matrix

In [None]:
# Plot a confusion matrix for each stored "statistics" document and keep the
# returned report as a "classification_report" document.
for doc in manager.list_document_ids(doc_type="statistics"):
    logger.info(f"Processing document: {doc}")
    retrieved_statistics = manager.retrieve_document(doc_id=doc, doc_type="statistics")

    logger.debug(retrieved_statistics)

    # Pull the four inputs of the plot out of the statistics content
    predicted_dict, predicted_items, correct_items, correct_dict = (
        retrieved_statistics.content[field]
        for field in ("predicted_dict", "predicted_items", "correct_items", "correct_dict")
    )

    report = plot_confusion_matrix(predicted_items, predicted_dict, correct_items, correct_dict)

    # Wrap the report in a document and register it with the manager
    document = Document(id=doc, type="classification_report", content=report)
    logger.debug(document)
    manager.add_document(document)

# The checkpoint is written once, after all reports were added
logger.info("Saving statistics to checkpoint...")
save_checkpoint(manager)

#### Generate report

Generate a report with the content of a checkpoint.

In [None]:
generate_report(config["DEFAULT_CHECKPOINT_FILE"], config["DEFAULT_EXTRACTION_REPORT_FILE"])

### define vocabulary

#### General functions

In [None]:
def remove_section_symbol(input_string: str) -> str:
    """
    Strip every '§' character from a string, then trim the result.

    Args:
        input_string (str): Text that may contain one or more '§' symbols.

    Returns:
        str: The text with all '§' symbols removed and without leading or
            trailing whitespace.

    Raises:
        TypeError: If 'input_string' is not a string.
    """
    if isinstance(input_string, str):
        # Splitting on the symbol and re-joining drops every occurrence
        without_symbol = "".join(input_string.split("§"))
        return without_symbol.strip()
    raise TypeError("input_string must be a string")

In [None]:
def define_vocabulary(section_id: str, source_section: Optional[str]) -> str:
    """
    Determines the vocabulary section ID based on the term's source section.

    The source section decides the vocabulary whenever one is given; the
    section of the current document is only used as a fallback when the source
    section is missing (None) or blank. Both values are returned without the
    '§' symbol and surrounding whitespace.

    Args:
        section_id (str): The section ID of the current document.
        source_section (Optional[str]): The section ID recorded as the term's
            source, or None when the term has no source section.

    Returns:
        str: The appropriate vocabulary section ID.

    Raises:
        TypeError: If 'section_id' is not a string, or if 'source_section' is
            neither a string nor None.
    """
    section_id = remove_section_symbol(section_id)

    # No source recorded: the term belongs to the current document's vocabulary.
    # (The old `except KeyError` fallback could never trigger, because
    # remove_section_symbol raises TypeError, not KeyError.)
    if source_section is None:
        return section_id

    term_section_id = remove_section_symbol(source_section)

    # A blank source carries no information either; otherwise the source wins.
    return term_section_id or section_id


#### Validation


In [None]:

# signifier -> section of its first listed source, accumulated over all
# documents: once from the true tables and once from the LLM responses.
correct_dict = {}
predicted_dict = {}
for doc in manager.list_document_ids(doc_type="llm_response"):
    logger.info(f"Processing document: {doc} ...")
    retrieved_llm_response = manager.retrieve_document(doc_id=doc, doc_type="llm_response")
    retrieved_true_table = manager.retrieve_document(doc_id=doc, doc_type="true_table")

    # True table first, then the prediction; later documents overwrite
    # earlier entries that share a signifier.
    for target, table in (
        (correct_dict, retrieved_true_table),
        (predicted_dict, retrieved_llm_response),
    ):
        target.update(
            {entry['signifier']: entry.get("sources")[0]["section"] for entry in table.content}
        )

In [None]:
# Signifier sets of the true tables and of the predictions; common and unique
# signifiers are derived from these two sets.
correct = set(correct_dict)
predicted = set(predicted_dict)

In [None]:
# Compare, term by term, the source section in the true tables with the source
# section predicted by the LLM. Assumes correct_dict / predicted_dict and the
# `correct` / `predicted` signifier sets are already defined.
comparison_results = []

# Sorted so the row order is reproducible between runs (set order is not)
for term in sorted(correct.union(predicted), key=str):
    correct_section = correct_dict.get(term)
    predicted_section = predicted_dict.get(term)

    # Determine if the sections match
    type_match = correct_section == predicted_section

    # Append to comparison_results
    comparison_results.append({
        'Term': term,
        'Correct source': correct_section,
        'Predicted source': predicted_section,
        'Section Match': type_match
    })

# Create the DataFrame
df = pd.DataFrame(comparison_results)

# Terms present on only one side have no section on the other: label them
# 'Unknown' BEFORE building the matrix, so the matrix and the classification
# report below describe the same rows. Plain assignment is used because a
# chained `fillna(..., inplace=True)` on a column is not guaranteed to modify
# the parent frame.
df['Correct source'] = df['Correct source'].fillna('Unknown')
df['Predicted source'] = df['Predicted source'].fillna('Unknown')

# Create the confusion matrix. It deliberately does not reuse the name
# `confusion_matrix`, which is the function imported from sklearn.metrics.
source_confusion_matrix = pd.crosstab(
    df['Correct source'],
    df['Predicted source'],
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True
)

print("Confusion Matrix:")
print(source_confusion_matrix)

# Visualize the confusion matrix without the 'All' margin row/column
cm = (
    source_confusion_matrix.iloc[:-1, :-1]
    if 'All' in source_confusion_matrix.index
    else source_confusion_matrix
)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix of source predictions')
plt.ylabel('Actual source')
plt.xlabel('Predicted source')
plt.show()

# Prepare data for classification report: map every section label to an int
types = sorted(set(df['Correct source']) | set(df['Predicted source']))
type_to_int = {t: i for i, t in enumerate(types)}

y_true = df['Correct source'].map(type_to_int)
y_pred = df['Predicted source'].map(type_to_int)

# Generate classification report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=types))

### similarity search (P5)

Try a similarity search to find the entity in the graph. If not found, create a new entity and corresponding embedding. If exists, create a link between the two.

Use similarity search to find similar terms in the graph

#### General functions

In [None]:
def signifier_sources(sources: list) -> list:
    """
    Build one reference string per source of a designation.

    Each reference is the source's "section" immediately followed by its
    "paragraph", both converted with str(). A missing key therefore
    contributes the text "None".

    Args:
        sources (list): Mappings that may hold "section" and "paragraph" keys.

    Returns:
        list: The concatenated "section" + "paragraph" strings, in input order.
    """
    return [
        str(entry.get("section")) + str(entry.get("paragraph"))
        for entry in sources
    ]

In [None]:
def transform_title_cased(input_string: str) -> str:
    """
    Title-case a string and squeeze out its spaces.

    Every word is capitalized with str.title() and then all space characters
    are removed, e.g. "investment adviser" becomes "InvestmentAdviser".

    Args:
        input_string (str): The string to transform.

    Returns:
        str: The title-cased string without spaces.
    """
    # Only the space character is removed; other whitespace is kept as is
    words = input_string.title().split(" ")
    return "".join(words)

In [None]:
def normalize_ns_string(input_string: str) -> str:
    """
    Normalize a string into an identifier-like form.

    The '§' symbol and surrounding whitespace are removed first (via
    remove_section_symbol); then every space is dropped and every hyphen and
    point is replaced by an underscore.

    Args:
        input_string (str): The string to normalize.

    Returns:
        str: The normalized string.
    """
    without_symbol = remove_section_symbol(input_string)

    # One pass over the text: drop spaces, turn '-' and '.' into '_'
    replacements = str.maketrans({" ": None, "-": "_", ".": "_"})
    return without_symbol.translate(replacements)


In [None]:
def upsert_fact_to_kg(conn, fact):
    """
    Add a fact to the knowledge graph. If exists, replace it.

    NOTE: Not implemented yet. The body is a placeholder, so the function
    currently does nothing and returns None for any arguments.

    Context:
        Facts build on concepts: Facts are statements or assertions about the relationships
        between these concepts. They describe how terms relate to each other in specific ways.
        Example "A customer places an order.".

    Args:
        conn (Connection): The connection to the knowledge graph database.
        fact (str): The fact to add to the knowledge graph.

    Returns:
        Intended contract: True if the fact was added successfully, False
        otherwise. Until the function is implemented it returns None.
    """
    pass

In [None]:
def upsert_rule_to_kg(conn: RepositoryConnection, fact:Dict[str, Any]) -> bool:
    """
    Add a rule to the knowledge graph. If exists, replace it.

    NOTE: Not implemented yet. The body is a placeholder, so the function
    currently does nothing and returns None despite the `bool` annotation.

    Context:
        Rules build on facts: Rules are constructed based on these facts to enforce
        certain conditions, constraints, or actions within the business.
        Rules dictate what must or must not happen under certain circumstances by referencing
        the relationships described by facts
        Example "A customer must not place more than one order at a time."
    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        fact (Dict[str, Any]): The rule to add to the knowledge graph.
            NOTE(review): the parameter is named `fact` although it carries a
            rule; its expected keys are not defined yet.

    Returns:
        Intended contract: True if the rule was added successfully, False
        otherwise. Until the function is implemented it returns None.
    """
    pass

In [None]:
class Designation(BaseModel):
    """
    Input record for `upsert_designation_to_kg`.

    The three Optional fields default to None so they can be omitted when
    building a designation; without a default, pydantic v2 treats an
    `Optional[...]` field as required.
    """

    # Written as the sbvr:signifier literal; its title-cased, space-free form
    # names the resource in the graph.
    signifier: str
    # Written as the sbvr:expression literal.
    expression: str
    # Used as an sbvr class name; "IndividualNounConcept" makes the designation
    # an sbvr:Name, any other value an sbvr:Term.
    concept_type: str
    # Objects of sbvr:closeMatch, inserted into the query as given.
    closeMatch: Optional[List[str]] = None
    # Objects of sbvr:exactMatch, inserted into the query as given.
    exactMatch: Optional[List[str]] = None
    # Completes the namespace name cfr-sbvr:CFR_SBVR_<vocabulary_name>_NS.
    vocabulary_name: str
    # Each entry is written as an sbvr:referenceSupportsMeaning literal.
    sources: Optional[List[str]] = None

def _escape_sparql_literal(text: str) -> str:
    """Escape text for use inside a double-quoted SPARQL string literal."""
    return (
        str(text)
        .replace("\\", "\\\\")  # backslash first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )


def _cfr_sbvr_iri(local_name: str) -> str:
    """Return the full <http://cfr2sbvr.com/cfr#...> IRI for a local name, percent-encoding unsafe characters."""
    from urllib.parse import quote

    # Letters and digits stay untouched (quote also leaves "_", ".", "-", "~"
    # alone), so names that were valid as `cfr-sbvr:<name>` keep the same IRI.
    encoded = "".join(
        ch if ch.isalnum() else quote(ch, safe="") for ch in local_name
    )
    return f"<http://cfr2sbvr.com/cfr#{encoded}>"


def upsert_designation_to_kg(conn: RepositoryConnection, designation: Designation) -> bool:
    """
    Add a designation to the knowledge graph. If exists, replace it.

    All existing triples whose subject is the designation resource are deleted
    and the new description is inserted, in the cfr-sbvr:CFR_SBVR graph.

    The signifier, expression and sources are escaped before being embedded as
    string literals, and the resource is addressed by its full IRI, so text
    containing quotes, line breaks or symbols such as "$" or "'" no longer
    produces an invalid query.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        designation (Designation): The designation to add to the knowledge graph.
            `closeMatch` / `exactMatch` entries are inserted as given and must
            already be valid SPARQL terms.

    Returns:
        True if the designation was upserted successfully, False otherwise
        (the failure is logged).
    """
    signifier = designation.signifier
    expression = designation.expression
    designation_class = transform_title_cased(signifier)
    designation_iri = _cfr_sbvr_iri(designation_class)
    concept_type = designation.concept_type
    vocabulary_namespace = f"cfr-sbvr:CFR_SBVR_{designation.vocabulary_name}_NS"

    # Individual noun concepts are designated by a Name, everything else by a Term
    if concept_type == "IndividualNounConcept":
        designation_type = "Name"
    else:
        designation_type = "Term"

    logger.info(f"Format {signifier} to {designation_class}.")

    # Constructing closeMatch triples
    close_matches_triples = ""
    if designation.closeMatch:
        for close_match in designation.closeMatch:
            close_matches_triples += f"sbvr:closeMatch {close_match} ;\n"

    # Construct exactMatch triple if exactMatch is provided
    exact_match_triples = ""
    if designation.exactMatch:
        for exact_match in designation.exactMatch:
            exact_match_triples += f"sbvr:exactMatch {exact_match} ;\n"

    # Construct sources triple if sources is provided
    sources_triples = ""
    if designation.sources:
        for source in designation.sources:
            sources_triples += (
                f'sbvr:referenceSupportsMeaning "{_escape_sparql_literal(source)}" ;\n'
            )

    designation_upsert_query = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    {designation_iri} ?p ?o .
}}
INSERT {{
    {designation_iri} a sbvr:{designation_type},
            sbvr:IntensionalDefinition,
            sbvr:{concept_type} ;
        sbvr:signifier "{_escape_sparql_literal(signifier)}" ;
        {exact_match_triples}
        {close_matches_triples}
        {sources_triples}
        sbvr:isImplicitlyUnderstood "false"^^xsd:boolean ;
        sbvr:expression "{_escape_sparql_literal(expression)}" ;
        sbvr:designationIsInNamespace {vocabulary_namespace} .
}}
WHERE {{
    # Match all existing triples of the designation (none on first insert)
    OPTIONAL {{ {designation_iri} ?p ?o . }}
}}
    """

    logger.debug(f"SPARQL Query: {designation_upsert_query}")

    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, designation_upsert_query).evaluate()
        logger.info(f"Designation '{signifier}' upserted successfully.")
        return True
    except Exception as e:
        logger.error(f"Failed to upsert designation {signifier}: {e}")
        return False


In [None]:
def create_vocabulary(conn: RepositoryConnection, vocabulary_name: str) -> bool:
    """
    Create (upsert) a vocabulary in the knowledge graph and attach it to the root vocabulary.

    Three SPARQL updates are issued, in this order, against the graph cfr-sbvr:CFR_SBVR:
      1. remove the association between cfr-sbvr:CFR_SBVR_VOC and the vocabulary;
      2. delete every triple whose subject is the vocabulary and re-insert its type triples;
      3. re-add the association removed in step 1.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        vocabulary_name (str): The name of the vocabulary to create. It is interpolated
            verbatim into the prefixed name cfr-sbvr:CFR_SBVR_<vocabulary_name>_VOC.

    Returns:
        True if the vocabulary was created and associated successfully, False otherwise.
    """

    # Prefixed name of the vocabulary individual, shared by the three updates below.
    vocabulary_iri = f"cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC"

    # owl: is declared explicitly because the insert below uses owl:Class.
    prefixes = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
"""

    query_remove_association = f"""{prefixes}
DELETE DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_VOC sbvr:vocabulary1IncorporatesVocabulary2 {vocabulary_iri} .
}}
}}
    """

    query_add_triples = f"""{prefixes}
WITH cfr-sbvr:CFR_SBVR
DELETE {{
    {vocabulary_iri} ?p ?o .
}}

INSERT {{
    {vocabulary_iri}
        a owl:Class, sbvr:Vocabulary .
}}
WHERE {{
    # Match all existing triples related to {vocabulary_iri}
    OPTIONAL {{ {vocabulary_iri} ?p ?o . }}
}}
    """

    query_add_association = f"""{prefixes}
INSERT DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_VOC sbvr:vocabulary1IncorporatesVocabulary2 {vocabulary_iri} .
}}
}}
    """

    logger.debug(f"SPARQL Query: {query_remove_association}")
    logger.debug(f"SPARQL Query: {query_add_triples}")
    logger.debug(f"SPARQL Query: {query_add_association}")
    logger.debug(f"Vocabulary name: {vocabulary_iri}")

    # Step 1 - remove the association. Best effort: a failure is logged and the
    # remaining steps are still attempted.
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_remove_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} delete associated successfully.")
    except Exception as e:
        logger.error(f"Failed to delete associated vocabulary {vocabulary_name}: {e}")

    # Step 2 - (re)create the vocabulary. A failure is remembered so that the
    # function does not report success for a vocabulary that was never written.
    created = True
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_triples).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} created successfully.")
    except Exception as e:
        logger.error(f"Failed to create vocabulary {vocabulary_name}: {e}")
        created = False

    # Step 3 - restore the association. Attempted even when step 2 failed so the
    # link removed in step 1 is put back.
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} associated successfully.")
    except Exception as e:
        logger.error(f"Failed to associate vocabulary {vocabulary_name}: {e}")
        return False

    return created


In [None]:
def create_vocabulary_namespace(conn: RepositoryConnection, vocabulary_name: str) -> bool:
    """
    Create (upsert) a vocabulary namespace in the knowledge graph and attach it to the root namespace.

    Three SPARQL updates are issued, in this order, against the graph cfr-sbvr:CFR_SBVR:
      1. remove the association between cfr-sbvr:CFR_SBVR_NS and the namespace;
      2. delete every triple whose subject is the namespace and re-insert its description
         (types, URI, language, source vocabulary, title, definition, source);
      3. re-add the association removed in step 1.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        vocabulary_name (str): The name of the vocabulary the namespace belongs to. It is
            interpolated verbatim into cfr-sbvr:CFR_SBVR_<vocabulary_name>_NS and into the
            reference to cfr-sbvr:CFR_SBVR_<vocabulary_name>_VOC.

    Returns:
        True if the vocabulary namespace was created and associated successfully, False otherwise.
    """

    # Prefixed name of the namespace individual, shared by the three updates below.
    namespace_iri = f"cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS"

    # owl: is declared explicitly because the insert below uses owl:Class.
    prefixes = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX dct: <http://purl.org/dc/terms/>
"""

    query_remove_association = f"""{prefixes}
DELETE DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_NS sbvr:namespace1IncorporatesNamespace2 {namespace_iri} .
}}
}}
    """

    query_add_triples = f"""{prefixes}
WITH cfr-sbvr:CFR_SBVR
DELETE {{
    {namespace_iri} ?p ?o .
}}

INSERT {{
{namespace_iri}
        a owl:Class, sbvr:VocabularyNamespace;
    sbvr:namespaceHasURI <http://cfr2sbvr.com/cfr/CFR_SBVR_{vocabulary_name}_NS#> ;
    sbvr:vocabularyIsExpressedInLanguage cfr-sbvr:EnglishLanguage ;
    sbvr:vocabularyNamespaceIsDerivedFromVocabulary cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC ;
    dct:title "Semantics of Business Vocabulary and Business Rules (SBVR) for Code of Federal Regulations (CFR)" ;
    skos:definition "SBVR-CFR is an adopted standard of the Object Management Group (OMG) intended to be the basis for formal and detailed natural language declarative description of CFR regulations" ;
    dct:source <https://github.com/asantos2000/dissertacao-santos-anderson-2024> .
}}
WHERE {{
    # Match all existing triples related to {namespace_iri}
    OPTIONAL {{ {namespace_iri} ?p ?o . }}
}}
    """

    query_add_association = f"""{prefixes}
INSERT DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_NS sbvr:namespace1IncorporatesNamespace2 {namespace_iri} .
}}
}}
    """

    logger.debug(f"SPARQL Query: {query_remove_association}")
    logger.debug(f"SPARQL Query: {query_add_triples}")
    logger.debug(f"SPARQL Query: {query_add_association}")

    # Step 1 - remove the association. Best effort: a failure is logged and the
    # remaining steps are still attempted.
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_remove_association).evaluate()
        logger.info(f"Vocabulary namespace {vocabulary_name} delete associated successfully.")
    except Exception as e:
        logger.error(f"Failed to delete associated vocabulary namespace {vocabulary_name}: {e}")

    # Step 2 - (re)create the namespace. A failure is remembered so that the
    # function does not report success for a namespace that was never written.
    created = True
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_triples).evaluate()
        logger.info(f"Vocabulary namespace {vocabulary_name} created successfully.")
    except Exception as e:
        logger.error(f"Failed to create vocabulary namespace {vocabulary_name}: {e}")
        created = False

    # Step 3 - restore the association. Attempted even when step 2 failed so the
    # link removed in step 1 is put back.
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_association).evaluate()
        logger.info(f"Vocabulary namespace {vocabulary_name} associated successfully.")
    except Exception as e:
        logger.error(f"Failed to associate vocabulary namespace {vocabulary_name}: {e}")
        return False

    return created


In [None]:
def get_from_kg(conn: RepositoryConnection, signifier: str, kg: str, vector_db: str) -> List[Dict[str, Any]]:
    """
    Queries the knowledge graph to retrieve terms similar to the given signifier.

    The lookup uses AllegroGraph's llm:nearestNeighbor magic predicate against the
    given vector store, asking for up to 5 neighbours with a minimum score of 0.8.

    Args:
        conn (RepositoryConnection): The AllegroGraph repository connection.
        signifier (str): The text to search similar terms for.
        kg (str): The graph to query; must be config["FIBO_GRAPH"] or
            config["CFR_SBVR_GRAPH"]. It is placed verbatim after FROM.
        vector_db (str): The name of the vector store passed to llm:nearestNeighbor.

    Returns:
        List[Dict[str, Any]]: One dictionary per result row with the keys "uri",
        "score_percent" (Decimal), "located_signifier_uri",
        "located_signifier_uri_local_name", "located_signifier_predicate" and "definition".

    Raises:
        ValueError: If kg is not one of the supported graphs.
        Exception: Any error raised while preparing or evaluating the query is logged and re-raised.
    """

    if kg not in {config["FIBO_GRAPH"], config["CFR_SBVR_GRAPH"]}:
        raise ValueError(f"Unsupported knowledge graph: {kg}")

    # The signifier ends up inside a double-quoted SPARQL literal, so backslashes,
    # quotes and line breaks must be escaped or the query becomes malformed.
    escaped_signifier = (
        signifier.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    # xsd: is declared explicitly because the projection uses xsd:decimal.
    query_string = f"""
PREFIX llm: <http://franz.com/ns/allegrograph/8.0.0/llm/>
PREFIX fibo: <https://spec.edmcouncil.org/fibo/ontology/master/2024Q2/QuickFIBOProd#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>

SELECT ?uri (xsd:decimal(?score) as ?score_percent) ?s ?p ?definition
FROM {kg}
WHERE {{
    (?uri ?score ?originalText ?p) llm:nearestNeighbor ("{escaped_signifier}" "{vector_db}" 5 0.8) .
    ?s ?p ?originalText .

    OPTIONAL {{ ?s skos:definition ?definition . }}
    OPTIONAL {{ ?s sbvr:expression ?definition . }}
}}
ORDER BY DESC(?score)
    """

    logger.debug(f"SPARQL Query: {query_string}")

    try:
        tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query_string)
        result = tuple_query.evaluate()

        # Everything that touches the result happens inside the context manager
        # so the result is closed even if reading it fails.
        with result:
            logger.debug(f"Result metadata: {result.metadata}")

            similar_signifiers = [
                {
                    "uri": str(binding.getValue("uri")),
                    "score_percent": Decimal(binding.getValue("score_percent").getLabel()),
                    "located_signifier_uri": str(binding.getValue("s")),
                    "located_signifier_uri_local_name": binding.getValue("s").getLocalName(),
                    "located_signifier_predicate": str(binding.getValue("p")),
                    # NOTE(review): ?definition is OPTIONAL; when it is unbound this
                    # presumably becomes the string "None" - confirm against callers.
                    "definition": str(binding.getValue("definition"))
                }
                for binding in result
            ]
    except Exception as e:
        logger.error(f"Error evaluating SPARQL query: {e}")
        raise

    logger.info(f"Found {len(similar_signifiers)} similar signifier(s) for '{signifier}' on {kg}.")

    return similar_signifiers


In [None]:
def get_similar_signifiers(conn: RepositoryConnection, signifier: str) -> Tuple[List[str], List[str]]:
    """
    Get similar signifiers for a given signifier.

    Both the FIBO graph and the CFR-SBVR graph are searched. Every hit whose score is
    strictly greater than config["SIMILARITY_THRESHOLD"] is reported as an exact match;
    every other hit is reported as a close match. FIBO hits come first in each list.

    Args:
        conn (RepositoryConnection): An AllegroGraph connection object.
        signifier (str): The signifier to search for.

    Returns:
        Tuple[List[str], List[str]]: The URIs of the exact matches and of the close
        matches for the signifier, in that order.
    """
    # Read the threshold from config for both graphs so the two classifications
    # cannot drift apart.
    threshold = config["SIMILARITY_THRESHOLD"]

    # (graph, vector store) pairs to search, in result order.
    graph_sources = (
        (config["FIBO_GRAPH"], config["FIBO_GRAPH_VECTOR_STORE"]),
        (config["CFR_SBVR_GRAPH"], config["CFR_SBVR_GRAPH_VECTOR_STORE"]),
    )

    exact_match = []
    close_match = []

    for kg, vector_db in graph_sources:
        for item in get_from_kg(conn, signifier, kg, vector_db):
            bucket = exact_match if item["score_percent"] > threshold else close_match
            bucket.append(item.get("located_signifier_uri"))

    logger.info(f"Found {len(exact_match)} exact matche(s) and {len(close_match)} close matche(s) for '{signifier}'.")

    return exact_match, close_match



In [None]:
# Restore checkpoint. The returned `manager` is the object the main loop reads
# the "llm_response" documents from (list_document_ids / retrieve_document).
manager = restore_checkpoint()

### Main

Orchestrates the semantic annotation process.

Processes terms, names, vocabularies, and vocabulary namespaces.

In [None]:
# Connect to AllegroGraph.
# REPO, CATALOG, HOST, PORT, USER and PASSWORD are not defined in this cell; they
# must already exist in the kernel. The connection is closed by conn.close() in the
# last code cell of the notebook.
conn = ag_connect(repo=REPO, catalog=CATALOG, host=HOST, port=PORT,
                user=USER, password=PASSWORD)

In [None]:
# Vocabularies whose vocabulary AND namespace were already upserted during this run.
# Many signifiers share a vocabulary, so this avoids re-issuing the same six SPARQL
# updates for each of them. Only successful upserts are remembered, so a failed
# one is retried on the next signifier.
upserted_vocabularies = set()

for doc in manager.list_document_ids(doc_type="llm_response"):
    logger.info(f"Processing document: {doc} ...")
    retrieved_llm_response = manager.retrieve_document(doc_id=doc, doc_type="llm_response")

    for response in retrieved_llm_response.content:

        logger.debug(response)

        signifier = response['signifier']
        expression = response['definition']
        concept_type = response['concept_classification']
        sources = response.get("sources")

        logger.info(f"Processing '{signifier}' ...")

        # The vocabulary is derived from the first source, so a response without
        # sources cannot be placed in any vocabulary. Skip it instead of aborting
        # the whole run.
        if not sources:
            logger.warning(f"Skipping '{signifier}': response has no sources.")
            continue

        # Define vocabulary.
        # Assume the first occurrence of section is the correct section.
        # In case there is no section, use section_id.
        # TODO: Improve this
        vocabulary = define_vocabulary(doc, sources[0]["section"])
        vocabulary = normalize_ns_string(vocabulary)

        logger.info(f"Processing vocabulary {vocabulary}")

        if vocabulary in upserted_vocabularies:
            logger.debug(f"Vocabulary {vocabulary} already upserted in this run")
        elif create_vocabulary(conn, vocabulary):
            logger.info(f"Vocabulary {vocabulary} upserted")
            if create_vocabulary_namespace(conn, vocabulary):
                logger.info(f"Vocabulary namespace {vocabulary} upserted")
                upserted_vocabularies.add(vocabulary)
            else:
                logger.warning(f"Vocabulary namespace {vocabulary} not upserted")
        else:
            logger.warning(f"Vocabulary {vocabulary} not upserted")

        # Similarity search
        exact_match, close_match = get_similar_signifiers(conn, signifier)

        # Create designation
        designation = Designation(
            signifier=signifier,
            expression=expression,
            concept_type=concept_type,
            closeMatch=close_match,
            exactMatch=exact_match,
            vocabulary_name=vocabulary,
            sources=signifier_sources(sources) # Associate designations with their sources
        )

        # upsert_designation_to_kg reports failure through its return value, so
        # only claim success when it returns True.
        if upsert_designation_to_kg(conn, designation):
            logger.debug(f"Processed {designation}")
            logger.info(f"Signifier '{signifier}' done.")
        else:
            logger.warning(f"Signifier '{signifier}' was not upserted.")

    logger.info(f"{doc} done.")


In [None]:
# Close the AllegroGraph connection created by ag_connect before the main loop.
conn.close()

## Notes

- Kernel conda environment: ipt-cfr2sbvr - Python version 3.11.9