<a href="https://colab.research.google.com/github/asantos2000/master-degree-santos-anderson/blob/main/code/src/chap_6_semantic_annotation_elements_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Semantic Annotation - Elements extraction

Extract and identify elements.

Chapter 6. Ferramentas de suporte
- Section 6.2 Implementação dos principais componentes
  - Section 6.2.3 Anotações semânticas
    - Section Algoritmo "extract / classify elements"

## Google colab

In [2]:
%load_ext autoreload
%autoreload 2

import sys
import os

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  !rm -rf cfr2sbvr configuration checkpoint
  !git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
  %pip install -r cfr2sbvr/code/requirements.txt
  !cp -r cfr2sbvr/code/src/configuration .
  !cp -r cfr2sbvr/code/src/checkpoint .
  !cp -r cfr2sbvr/code/config.colab.yaml config.yaml
  DEFAULT_CONFIG_FILE="config.yaml"
else:
  DEFAULT_CONFIG_FILE="../config.yaml"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [3]:
# Standard library imports
import json
import glob
from datetime import datetime
from decimal import Decimal

# Third-party libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pydantic import BaseModel, Field
from sklearn.metrics import confusion_matrix, classification_report
from typing import List, Dict, Optional, Any, Tuple, Set

# Franz AllegroGraph (AG) imports
from franz.openrdf.connect import ag_connect
from franz.openrdf.repository.repository import RepositoryConnection
from franz.openrdf.query.query import QueryLanguage

# inflect library
import inflect

# Local application/library-specific imports
import checkpoint.main as checkpoint
from checkpoint.main import restore_checkpoint, save_checkpoint, Document, DocumentProcessor
import configuration.main as configuration
import logging_setup.main as logging_setup
import token_estimator.main as token_estimator
from token_estimator.main import estimate_tokens
import rules_taxonomy_provider.main as rules_taxonomy_provider
from rules_taxonomy_provider.main import RuleInformationProvider, RulesTemplateProvider
import llm_query.main as llm_query
from llm_query.main import query_instruct_llm

DEV_MODE = True

if DEV_MODE:
    # Development mode
    import importlib
    importlib.reload(configuration)
    importlib.reload(logging_setup)
    importlib.reload(checkpoint)
    importlib.reload(token_estimator)
    importlib.reload(rules_taxonomy_provider)
    importlib.reload(llm_query)

## Settings

Default settings, check them before run the notebook.

### Get configuration

In [4]:
# load config
DEFAULT_CONFIG_FILE = "../config.yaml"
config = configuration.load_config(DEFAULT_CONFIG_FILE)

Generated files for analysis in this run

In [5]:
print(config["DEFAULT_CHECKPOINT_FILE"],
config["DEFAULT_EXTRACTION_REPORT_FILE"],
config["DEFAULT_EXCEL_FILE"])

../data/checkpoints/documents-2024-11-07-1.json ../outputs/extraction_report-2024-11-07-1.html ../outputs/compare_items_metrics.xlsx


### Logging configuration

In [6]:
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])

2024-11-07 17:05:45 - INFO - Logging is set up with daily rotation.


## Checkpoints

Documents, annoted datasets, statistics and metrics about the execution of the notebook are stored by checkpoint module.

Checkpoints are stored / retrieved at the directory `DEFAULT_CHECKPOINT_FILE` in the configuration file.

During the execution, it will restore the checkpoint at the beginning of the section and saved at the end. We can run and restore the checkpoint several times. If the run fails, check the closest checkpoint and restore it.

### Restore the checkpoint

In [7]:
# Restore the checkpoint

# For development only
config["DEFAULT_CHECKPOINT_FILE"] = "../data/checkpoints/documents-2024-11-01-3.json"

manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

2024-11-07 17:06:34 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-11-01-3.json
2024-11-07 17:06:34 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-11-01-3.json.


## Datasets

Datasets used in the notebook. They are divided into sections and true tables. The sections are the documents from CFR and true tables are annoted  or "golden" datasets.

### General functions and data structures

In [8]:
def basic_text_stats(text: str) -> Tuple[int, int, int]:
    """
    Computes basic text statistics: number of lines, words, and average words per line.

    Args:
        text (str): The text to analyze.

    Returns:
        Tuple[int, int, int]: A tuple containing the number of lines, total words, and average words per line.
    """
    lines=len(text.split("\n"))
    words=len(text.split(" "))
    avg_words_per_line=round(words/lines)
    return lines, words, avg_words_per_line

In [9]:
def get_section_from_kg(conn: Any, section_num: str) -> str:
    """
    Retrieves a section from the Knowledge Graph based on the section number.

    Args:
        conn: The connection object to the Knowledge Graph.
        section_num (str): The section number to query.

    Returns:
        str: The retrieved section content as a string.

    Raises:
        Exception: If there is an error executing the query.
    """
    # Query section number from KG
    query = """
    PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>
    PREFIX fro-leg-ref: <http://finregont.com/fro/ref/LegalReference.ttl#>

    SELECT ?section ?section_seq ?section_num ?section_subject ?section_citation ?section_notes ?divide ?divide_seq ?paragraph_enum ?paragraph_text
    WHERE {
      ?section a fro-cfr:CFR_Section ;
        fro-leg-ref:hasSequenceNumber ?section_seq ;
        fro-cfr:hasSectionNumber ?section_num ;
        fro-cfr:hasSectionSubject ?section_subject .
      OPTIONAL {?section fro-leg-ref:refers_toNote ?section_notes} .
      OPTIONAL {?section fro-cfr:hasSectionCitation ?section_citation} .

      ?divide fro-leg-ref:divides ?section ; # rdf:type fro-cfr:CFR_Parapraph
        fro-leg-ref:hasSequenceNumber ?divide_seq ;
        fro-cfr:hasParagraphText ?paragraph_text ;
        fro-leg-ref:hasSequenceNumber ?paragraph_seq .
      OPTIONAL {?divide fro-cfr:hasParagraphEnumText ?paragraph_enum} .
    """ + f"""
      FILTER("{section_num}" = ?section_num)
    """ + """
    }
    ORDER BY ?section_num ?section ?divide_seq
    """
    tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query)
    result = tuple_query.evaluate()

    logger.debug(f"result.metadata: {result.metadata}")
    logger.debug(f"result.variable_names: {result.variable_names}")

    body_text = ""
    previous_section = None
    previous_paragraph_id = None
    with result:
      for binding_set in result:
          section = binding_set.getValue("section")
          section_seq = str(binding_set.getValue("section_seq")).replace('"', '')
          section_num = str(binding_set.getValue("section_num")).replace('"', '')
          section_subject = str(binding_set.getValue("section_subject")).replace('"', '')
          section_citation = str(binding_set.getValue("section_citation")).replace('"', '')
          section_notes = str(binding_set.getValue("section_notes")).replace('"', '')
          divide = binding_set.getValue("divide")
          divide_seq = str(binding_set.getValue("divide_seq")).replace('"', '')
          paragraph_enum = str(binding_set.getValue("paragraph_enum")).replace('"', '')
          paragraph_text = str(binding_set.getValue("paragraph_text")).replace('"', '')
          # Header
          if previous_section != section:
            previous_section = section
            header = f"""
    section_number: {section_num}
    section_subject: {section_subject}
    section_id: {section}
    citations: {section_citation}
    notes: {section_notes}
            """
          # Body
          if paragraph_enum != "None":
            body_text += f"""
    paragraph_enumeration: {paragraph_enum}
    paragraph_text: {paragraph_text}
    """
          else:
            body_text += f"""
    paragraph_text: {paragraph_text}
    """

    return header + body_text


In [10]:
def calculate_content_quantities_p1(doc_id, content_data, filename):
    elements = content_data.get("elements", [])
    logger.debug(elements)

    # Collect statistics
    num_elements = len(elements)
    fact_count = 0
    fact_type_count = 0
    rule_count = 0
    verb_count = 0
    term_count = 0

    # Process each element within the document
    for element in elements:
        classification = element.get("classification", "Unknown")
        if classification == "Fact":
            fact_count += 1
        elif classification == "Fact Type":
            fact_type_count += 1
        elif classification == "Rule":
            rule_count += 1
        verb_count += len(element.get("verb_symbols", []))
        term_count += len(element.get("terms", []))

    return {
        "document_id": doc_id,
        "quantity_of_elements": num_elements,
        "quantity_of_facts": fact_count,
        "quantity_of_fact_types": fact_type_count,
        "quantity_of_rules": rule_count,
        "quantity_of_verbs": verb_count,
        "quantity_of_terms": term_count,
        "filename": filename,
    }

In [11]:
def process_documents_p1(file_path, file_name, doc_ids):
    # Initialize data containers for the two tables
    table_data = []

    with open(file_path, 'r') as file:
        content = json.load(file)

        # Iterate over each document in the file
        for doc_id, content_data in content.items():
            logger.debug(doc_id, content_data)
            # Check if the document ID is in the list to process
            #if doc_id in doc_ids and 'content' in doc_data:
            if all([doc_id in doc_ids, 'content' in content_data]):
                table_data.append(calculate_content_quantities_p1(doc_id, content_data['content'], file_name))

    return table_data


In [12]:
def calculate_content_quantities_p2(doc_id, content_data, filename):
    terms_relationship = content_data['content'].get('terms_relationship', [])
    logger.debug(f"terms_relationship: {terms_relationship}")
    terms = content_data['content']['terms']
    logger.debug(f"terms: {terms}")

    # Count terms with and without definitions
    total_terms = len(terms)
    terms_with_definition = sum(1 for term in terms if term.get('definition'))
    terms_without_definition = total_terms - terms_with_definition

    # Check for term relationships and count them
    terms_relationship_count = len(terms_relationship)

    # Add data to table
    return {
        "document_id": doc_id,
        "count_of_terms": total_terms,
        "terms_with_definition": terms_with_definition,
        "terms_without_definition": terms_without_definition,
        "terms_relationship_count": terms_relationship_count,
        "filename": filename
    }

In [13]:
def process_documents_p2(file_path, file_name, doc_ids):
    table_data = []
    with open(file_path, 'r') as file:
        content = json.load(file)

        # Iterate over each document in the file
        for doc_id, doc_data in content.items():
            # Check if the document has terms in its content
            #if doc_id in doc_ids and 'content' in doc_data and 'terms' in doc_data['content']:
            if all([doc_id in doc_ids, 'content' in doc_data, 'terms' in doc_data['content']]):
                table_data.append(calculate_content_quantities_p2(doc_id, doc_data, file_name))
    return table_data

### Get section from KG CFR
Due the mistakes in the original dataset, we need to correct it. This function will not be used in the final version. Instead we will use variables (document_02, document_05, document_07) from the original dataset.

#### Access allegrograph

In [14]:
# conn = ag_connect(repo=config["ALLEGROGRAPH"]["REPO"], catalog=config["ALLEGROGRAPH"]["CATALOG"],
#                 host=f'https://{config["ALLEGROGRAPH"]["HOST"]}:443',
#                 user=config["ALLEGROGRAPH"]["USER"], password=config["ALLEGROGRAPH"]["PASSWORD"])

In [15]:
# section_num = "§ 275.0-7"
# logger.info(get_section_from_kg(conn, section_num=section_num))

Print results formatted

In [16]:
# conn.close()

### Texts to extract the elements

CFR Sections 275.0-2, 275.0-5, 275.0-7

#### Section 275.0-2

In [17]:
manager.add_document(
    Document(
        id="§ 275.0-2",
        type="section",
content = """
§ 275.0-2 General procedures for serving non-residents.
(a) General procedures for serving process, pleadings, or other papers on non-resident investment advisers, general partners and managing agents.  Under Forms ADV and ADV-NR [17 CFR 279.1 and 279.4], a person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents:
  (1) A person may serve a non-resident investment adviser, non-resident general partner, or non-resident managing agent by furnishing the Commission with one copy of the process, pleadings, or papers, for each named party, and one additional copy for the Commission's records.
  (2) If process, pleadings, or other papers are served on the Commission as described in this section, the Secretary of the Commission (Secretary) will promptly forward a copy to each named party by registered or certified mail at that party's last address filed with the Commission.
  (3) If the Secretary certifies that the Commission was served with process, pleadings, or other papers pursuant to paragraph (a)(1) of this section and forwarded these documents to a named party pursuant to paragraph (a)(2) of this section, this certification constitutes evidence of service upon that party.
(b) Definitions.  For purposes of this section:
  (1) Managing agent  means any person, including a trustee, who directs or manages, or who participates in directing or managing, the affairs of any unincorporated organization or association other than a partnership.
  (2) Non-resident  means:
    (i) An individual who resides in any place not subject to the jurisdiction of the United States;
    (ii) A corporation that is incorporated in or that has its principal office and place of business in any place not subject to the jurisdiction of the United States; and
    (iii) A partnership or other unincorporated organization or association that has its principal office and place of business in any place not subject to the jurisdiction of the United States.
  (3) Principal office and place of business  has the same meaning as in § 275.203A-3(c) of this chapter.
"""
    )
)

In [20]:
docs = manager.list_document_ids(doc_type="section")

for doc in docs:
    text = manager.retrieve_document(doc, "section").content
    logger.info(f"Document ID: {doc}")
    paragraphs, words, avg_word_per_paragraph = basic_text_stats(text)
    tokens = estimate_tokens(text)
    logger.info(f"Section paragraphs: {paragraphs}, words: {words}, avg_word_per_paragraph: {avg_word_per_paragraph}, tokens: {tokens}")


2024-11-07 17:13:28 - INFO - Document ID: § 275.0-2
2024-11-07 17:13:28 - INFO - Section paragraphs: 14, words: 362, avg_word_per_paragraph: 26, tokens: 481
2024-11-07 17:13:28 - INFO - Document ID: § 275.0-5


2024-11-07 17:13:28 - INFO - Section paragraphs: 10, words: 260, avg_word_per_paragraph: 26, tokens: 307
2024-11-07 17:13:28 - INFO - Document ID: § 275.0-7
2024-11-07 17:13:28 - INFO - Section paragraphs: 19, words: 513, avg_word_per_paragraph: 27, tokens: 642


In [None]:
estimate_tokens(manager.documents["§ 275.0-2"].content)

Section 275.0-5

In [18]:
manager.add_document(
    Document(
        id="§ 275.0-5",
        type="section",
content = """
§ 275.0-5 Procedure with respect to applications and other matters.
The procedure hereinbelow set forth will be followed with respect to any proceeding initiated by the filing of an application, or upon the Commission's own motion, pursuant to any section of the Act or any rule or regulation thereunder, unless in the particular case a different procedure is provided:
(a) Notice of the initiation of the proceeding will be published in the Federal Register and will indicate the earliest date upon which an order disposing of the matter may be entered. The notice will also provide that any interested person may, within the period of time specified therein, submit to the Commission in writing any facts bearing upon the desirability of a hearing on the matter and may request that a hearing be held, stating his reasons therefor and the nature of his interest in the matter.
(b) An order disposing of the matter will be issued as of course following the expiration of the period of time referred to in paragraph (a) of this section, unless the Commission thereafter orders a hearing on the matter.
(c) The Commission will order a hearing on the matter, if it appears that a hearing is necessary or appropriate in the public interest or for the protection of investors,
  (1) upon the request of any interested person or
  (2) upon its own motion.
(d) Definition of application. For purposes of this rule, an “application” means any application for an order of the Commission under the Act other than an application for registration as an investment adviser.
"""
    )
)

Section 275.0-7

In [19]:
manager.add_document(
    Document(
        id="§ 275.0-7",
        type="section",
content = """
§ 275.0-7 Small entities under the Investment Advisers Act for purposes of the Regulatory Flexibility Act.
(a) For purposes of Commission rulemaking in accordance with the provisions of Chapter Six of the Administrative Procedure Act (5 U.S.C. 601 et seq.) and unless otherwise defined for purposes of a particular rulemaking proceeding, the term small business or small organization for purposes of the Investment Advisers Act of 1940 shall mean an investment adviser that:
  (1) Has assets under management, as defined under Section 203A(a)(3) of the Act (15 U.S.C. 80b-3a(a)(2)) and reported on its annual updating amendment to Form ADV (17 CFR 279.1), of less than $25 million, or such higher amount as the Commission may by rule deem appropriate under Section 203A(a)(1)(A) of the Act (15 U.S.C. 80b-3a(a)(1)(A));
  (2) Did not have total assets of $5 million or more on the last day of the most recent fiscal year; and
  (3) Does not control, is not controlled by, and is not under common control with another investment adviser that has assets under management of $25 million or more (or such higher amount as the Commission may deem appropriate), or any person (other than a natural person) that had total assets of $5 million or more on the last day of the most recent fiscal year.
(b) For purposes of this section:
  (1) Control  means the power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise.
    (i) A person is presumed to control a corporation if the person:
      (A) Directly or indirectly has the right to vote 25 percent or more of a class of the corporation's voting securities; or
      (B) Has the power to sell or direct the sale of 25 percent or more of a class of the corporation's voting securities.
    (ii) A person is presumed to control a partnership if the person has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the partnership.
    (iii) A person is presumed to control a limited liability company (LLC) if the person:
      (A) Directly or indirectly has the right to vote 25 percent or more of a class of the interests of the LLC;
      (B) Has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the LLC; or
      (C) Is an elected manager of the LLC.
    (iv) A person is presumed to control a trust if the person is a trustee or managing agent of the trust.
  (2) Total assets  means the total assets as shown on the balance sheet of the investment adviser or other person described above under paragraph (a)(3) of this section, or the balance sheet of the investment adviser or such other person with its subsidiaries consolidated, whichever is larger.
"""
    )
)

### True tables

True tables are annotated or "golden" datasets in which entities have been manually identified and labeled within the original source data.

True tables for sectiona 275.0-2, 275.0-5 and 275.0-7

Load true table for P1 - Elements extraction and classification, terms, and verb symbols.

In [20]:
with open(f"{config['DEFAULT_DATA_DIR']}/extract_p1_true_table.json", 'r') as file:
    data = json.load(file)

    manager.add_document(
        Document.model_validate(data["§ 275.0-2_P1|true_table"])
    )

    manager.add_document(
        Document.model_validate(data["§ 275.0-5_P1|true_table"])
    )

    manager.add_document(
        Document.model_validate(data["§ 275.0-7_P1|true_table"])
    )

Load true table for P2 - Terms definition and synonyms.

In [21]:
with open(f"{config['DEFAULT_DATA_DIR']}/extract_p2_true_table.json", 'r') as file:
    data = json.load(file)

    manager.add_document(
        Document.model_validate(data["§ 275.0-2_P2|true_table"])
    )

    manager.add_document(
        Document.model_validate(data["§ 275.0-5_P2|true_table"])
    )

    manager.add_document(
        Document.model_validate(data["§ 275.0-7_P2|true_table"])
    )

### Save checkpoint

In [23]:
# Persist the state to a file
save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)

2024-11-05 13:31:06 - INFO - Checkpoint saved.


## extract / classify elements

### General functions and data structures

Functions and data structures used in the notebook.

LLM model for extracting elements P1.

In [24]:
class Term(BaseModel):
    term: str = Field(..., description="The term is a word or a group of words that represents a specific concept, entity, or subject in a particular context")
    classification: str = Field(..., description="The classification of the term, either 'Common Noun' or 'Proper Noun'.")

class Element(BaseModel):
    id: int = Field(..., description="A unique numeric identifier for each fact, fact type, or rule.")
    statement: str = Field(..., description="The full statement or phrase representing the fact, fact type, or rule.")
    terms: List[Term] = Field(..., description="A list of terms involved in the fact, fact type, or rule.")
    verb_symbols: List[str] = Field(..., description="A list of vers, verb phrases or prepositions connecting the terms.")
    classification: str = Field(..., description="Indicates whether the statement is classified as 'Fact', 'Fact Type', or 'Operative Rule'.")
    source: str = Field(..., description="The paragraph ID of the document where the fact, fact type, or rule is located (e.g., '(a)', '(b)(2)').")

class ElementsDocumentModel(BaseModel):
    section: str = Field(..., description="The section ID of the document.")
    summary: str = Field(..., description="The summary of the document.")
    elements: List[Element] = Field(..., description="A list of facts, fact types, and rules extracted from the document.")

LLM model for extracting elements P2.

In [25]:
class Item(BaseModel):
    term: str = Field(..., description="The term is a word or a group of words that represents a specific concept, entity, or subject in a particular context")
    definition: Optional[str] = Field(None, description="Definition is a explanation or description of the meaning of the term.")

class TermsRelationship(BaseModel):
    term_1: str = Field(..., description="First term in the relationship.")
    term_2: str = Field(..., description="Second term in the relationship.")
    relation: str = Field(..., description="The typrelationship between the terms.")

class TermsDocumentModel(BaseModel):
    terms: List[Item] = Field(..., description="A list of terms.")
    terms_relationship: List[TermsRelationship] = Field(..., description="A list of relationships between terms.")

In [26]:
def extract_unique_terms(document: ElementsDocumentModel) -> List[str]:
    """
    Extracts unique terms from the 'terms' attribute of elements within an ElementsDocumentModel instance.

    Args:
        document (ElementsDocumentModel): The document containing elements, each with a list of terms.

    Returns:
        List[str]: A list of unique terms found across all elements in the document.

    This function iterates through each element of the document, accesses the terms list in each element, and collects
    the unique terms. It uses a set to ensure that the terms are unique before converting it back to a list for the output.
    """

    # Initialize a set to store unique terms
    unique_terms: Set[str] = set()

    # Loop through each element in the 'elements' list of the document
    for element in document.elements:
        # Loop through the 'terms' list in each element
        for term_info in element.terms:
            # Add the term to the set
            unique_terms.add(term_info.term)

    # Convert the set to a list and return it
    return list(unique_terms)

In [27]:
def remove_section_symbol(input_string: str) -> str:
    """
    Removes the '§' symbol from the input string and trims whitespace.

    Args:
        input_string (str): The string from which to remove the '§' symbol.

    Returns:
        str: The cleaned string without the '§' symbol and leading/trailing whitespace.

    Raises:
        TypeError: If 'input_string' is not a string.
    """
    if not isinstance(input_string, str):
        raise TypeError("input_string must be a string")
    return input_string.replace("§", "").strip()

In [28]:
def signifier_sources(sources: list) -> list:
    """
    Extract desgnations sources

    Args:
        sources (list): List of sources

    Returns:
        list: List of sources
    """
    # Extract desgnations sources
    sources_lst = []
    for source in sources:
        source_section = str(source.get("section"))
        source_paragraph = str(source.get("paragraph"))
        sources_lst.append(source_section + source_paragraph)
    return sources_lst

In [29]:
def transform_title_cased(input_string: str) -> str:
    """
    Transform the input string to title case, which capitalizes the first letter of each word.

    Args:
        input_string (str): The string to transform.

    Returns:
        title_case_string (str): The transformed string.
    """
    title_case_string = input_string.title()
    # Remove all spaces
    transformed_string = title_case_string.replace(" ", "")
    return transformed_string

In [30]:
def normalize_ns_string(input_string: str) -> str:
    """
    Transform the input string to title case, which capitalizes the first letter of each word.

    Args:
        input_string (str): The string to normalize.

    Returns:
        normalized_string (str): The normalized string.
    """
    normalized_string = remove_section_symbol(input_string)

    # Remove all spaces, change points and hyphens to underscores
    return normalized_string.replace(" ", "").replace("-", "_").replace(".", "_")

In [31]:
# TODO: Implement this function (Move to LAB 5)
def upsert_fact_to_kg(conn, fact):
    """
    Add a fact to the knowledge graph. If exists, replace it.
    Context:
        Facts build on concepts: Facts are statements or assertions about the relationships
        between these concepts. They describe how terms relate to each other in specific ways.
        Example "A customer places an order.".

    Args:
        conn (Connection): The connection to the knowledge graph database.
        fact (str): The fact to add to the knowledge graph.

    Returns:
        True if the fact was added successfully, False otherwise.
    """
    pass

In [32]:
# TODO: Implement this function (Move to LAB 5)
def upsert_rule_to_kg(conn: RepositoryConnection, fact:Dict[str, Any]) -> bool:
    """
    Add a rule to the knowledge graph. If exists, replace it.

    Context:
        Rules build on facts: Rules are constructed based on these facts to enforce
        certain conditions, constraints, or actions within the business.
        Rules dictate what must or must not happen under certain circumstances by referencing
        the relationships described by facts
        Example "A customer must not place more than one order at a time."
    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        fact (str): The fact to add to the knowledge graph.

    Returns:
        True if the rule was added successfully, False otherwise.
    """
    pass

In [33]:
class Designation(BaseModel):
    signifier: str
    statement: str
    concept_type: str
    closeMatch: Optional[List[str]]
    exactMatch: Optional[List[str]]
    vocabulary_name: str
    sources: Optional[List[str]]

def upsert_designation_to_kg(conn: RepositoryConnection, designation: Designation) -> bool:
    """
    Add a term to the knowledge graph. If exists, replace it.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        term (Term): The term to add to the knowledge graph.

    Returns:
        True if the term was added successfully, False otherwise.
    """
    signifier = designation.signifier
    statement = designation.statement
    designation_class = transform_title_cased(signifier)
    concept_type = designation.concept_type
    vocabulary_namespace = f"cfr-sbvr:CFR_SBVR_{designation.vocabulary_name}_NS"

    if concept_type == "IndividualNounConcept":
        designation_type = "Name"
    else:
        designation_type = "Term"

    logger.info(f"Format {signifier} to {designation_class}.")

    # Constructing closeMatch triples
    close_matches_triples = ""
    if designation.closeMatch:
        for close_match in designation.closeMatch:
            close_matches_triples += f"sbvr:closeMatch {close_match} ;\n"

    # Construct exactMatch triple if exactMatch is provided
    exact_match_triples = ""
    if designation.exactMatch:
        for exact_match in designation.exactMatch:
            exact_match_triples += f"sbvr:exactMatch {exact_match} ;\n"

    # Construct surces triple if sources is provided
    sources_triples = ""
    if designation.sources:
        for source in designation.sources:
            sources_triples += f'sbvr:referenceSupportsMeaning "{source}" ;\n'

    designation_upsert_query = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    cfr-sbvr:{designation_class} ?p ?o .
}}
INSERT {{
    cfr-sbvr:{designation_class} a sbvr:{designation_type},
            sbvr:IntensionalDefinition,
            sbvr:{concept_type} ;
        sbvr:signifier "{signifier}" ;
        {exact_match_triples}
        {close_matches_triples}
        {sources_triples}
        sbvr:isImplicitlyUnderstood "false"^^xsd:boolean ;
        sbvr:Statement "{statement}" ;
        sbvr:designationIsInNamespace {vocabulary_namespace} .
}}
WHERE {{
    # Match all existing triples related to {designation_class}
    OPTIONAL {{ cfr-sbvr:{designation_class} ?p ?o . }}
}}
    """

    logger.debug(f"SPARQL Query: {designation_upsert_query}")

    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, designation_upsert_query).evaluate()
        logger.info(f"Designation '{signifier}' upserted successfully.")
        return True
    except Exception as e:
        logger.error(f"Failed to upsert designation {signifier}: {e}")
        return False

**Create vocabulary** (Move to LAB 5)

In [34]:
def define_vocabulary(section_id: str, source_section: str) -> str:
    """
    Determines the vocabulary section ID based on the term's source section.

    Args:
        section_id (str): The section ID of the current document.
        source_section: The section id.

    Returns:
        str: The appropriate vocabulary section ID.

    Raises:
        KeyError: If 'source' or 'section' key is missing in the term.
        TypeError: If 'section_id' is not a string or 'term' is not a dictionary.
    """

    # if not isinstance(section_id, str):
    #     raise TypeError("section_id must be a string")
    # if not isinstance(term, dict):
    #     raise TypeError("term must be a dictionary")
    # if "sources" not in term or "section" not in term["source"]:
    #     raise KeyError("term must contain 'source' with 'section'")

    section_id = remove_section_symbol(section_id)

    try:
        term_section_id = remove_section_symbol(source_section)
    except KeyError:
        term_section_id = section_id

    return section_id if term_section_id == section_id else term_section_id

In [35]:
def create_vocabulary(conn: RepositoryConnection, vocabulary_name: str) -> bool:
    """
    Create a new vocabulary in the knowledge graph.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        vocabulary (str): The name of the vocabulary to create.

    Returns:
        True if the vocabulary was created successfully, False otherwise.
    """

    query_remove_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

DELETE DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_VOC sbvr:vocabulary1IncorporatesVocabulary2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC .
}}
}}
    """

    query_add_triples = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC ?p ?o .
}}

INSERT {{
    cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC
        a owl:Class, sbvr:Vocabulary .
}}
WHERE {{
    # Match all existing triples related to cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC
    OPTIONAL {{ cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC ?p ?o . }}
}}
    """

    query_add_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

INSERT DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_VOC sbvr:vocabulary1IncorporatesVocabulary2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC .
}}
}}
    """

    logger.debug(f"SPARQL Query: {query_remove_association}")

    logger.debug(f"SPARQL Query: {query_add_triples}")

    logger.debug(f"Vocabulary name: cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC")

    # Remove associated vocabulary
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_remove_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} delete associated successfully.")
    except Exception as e:
        logger.error(f"Failed to delete associated vocabulary {vocabulary_name}: {e}")

    # create new vocabulary
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_triples).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} created successfully.")
    except Exception as e:
        logger.error(f"Failed to create vocabulary {vocabulary_name}: {e}")

    # Add association with new vocabulary
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} associated successfully.")
        return True
    except Exception as e:
        logger.error(f"Failed to associate vocabulary {vocabulary_name}: {e}")
        return False


In [36]:
def create_vocabulary_namespace(conn: RepositoryConnection, vocabulary_name: str) -> bool:
    """
    Create a new vocabulary namespace in the knowledge graph.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        vocabulary_namespace (str): The name of the vocabulary namespace to create.

    Returns:
        True if the vocabulary namespace was created successfully, False otherwise.
    """

    query_remove_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

DELETE DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_NS sbvr:namespace1IncorporatesNamespace2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS .
}}
}}
    """

    query_add_triples = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX dct: <http://purl.org/dc/terms/>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS ?p ?o .
}}

INSERT {{
cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS
        a owl:Class, sbvr:VocabularyNamespace;
    sbvr:namespaceHasURI <http://cfr2sbvr.com/cfr/CFR_SBVR_{vocabulary_name}_NS#> ;
    sbvr:vocabularyIsExpressedInLanguage cfr-sbvr:EnglishLanguage ;
    sbvr:vocabularyNamespaceIsDerivedFromVocabulary cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC ;
    dct:title "Semantics of Business Vocabulary and Business Rules (SBVR) for Code of Federal Regulations (CFR)" ;
    skos:definition "SBVR-CFR is an adopted standard of the Object Management Group (OMG) intended to be the basis for formal and detailed natural language declarative description of CFR regulations" ;
    dct:source <https://github.com/asantos2000/dissertacao-santos-anderson-2024> .
}}
WHERE {{
    # Match all existing triples related to cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS
    OPTIONAL {{ cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS ?p ?o . }}
}}
    """
    query_add_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

INSERT DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    cfr-sbvr:CFR_SBVR_NS sbvr:namespace1IncorporatesNamespace2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS .
}}
}}
    """

    logger.debug(f"SPARQL Query: {query_remove_association}")
    logger.debug(f"SPARQL Query: {query_add_triples}")
    logger.debug(f"SPARQL Query: {query_add_association}")

    # Remove associated vocabulary
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_remove_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} delete associated successfully.")
    except Exception as e:
        logger.error(f"Failed to delete associated vocabulary {vocabulary_name}: {e}")

    # create new vocabulary
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_triples).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} created successfully.")
    except Exception as e:
        logger.error(f"Failed to create vocabulary {vocabulary_name}: {e}")

    # Add association with new vocabulary
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} associated successfully.")
        return True
    except Exception as e:
        logger.error(f"Failed to associate vocabulary {vocabulary_name}: {e}")
        return False

**similarity search (P5)** (Move to LAB 5)

Try a similarity search to find the entity in the graph. If not found, create a new entity and corresponding embedding. If exists, create a link between the two.

In [37]:
def get_from_kg(conn: RepositoryConnection, signifier: str, kg: str, vector_db: str) -> List[Dict[str, Any]]:
    """
    Queries the knowledge graph to retrieve similar terms to the given term.

    Args:
        conn (RepositoryConnection): The AllegroGraph repository connection.
        term (str): The term to search for similar terms in the knowledge graph.
        kg (str): The name of the knowledge graph to query.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing information about similar terms,
        including URIs, scores, definitions, and related predicates.
    """

    if kg not in {config["FIBO_GRAPH"], config["CFR_SBVR_GRAPH"]}:
        raise ValueError(f"Unsupported knowledge graph: {kg}")

    query_string = f"""
PREFIX llm: <http://franz.com/ns/allegrograph/8.0.0/llm/>
PREFIX fibo: <https://spec.edmcouncil.org/fibo/ontology/master/2024Q2/QuickFIBOProd#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>

SELECT ?uri (xsd:decimal(?score) as ?score_percent) ?s ?p ?definition
FROM {kg}
WHERE {{
    (?uri ?score ?originalText ?p) llm:nearestNeighbor ("{signifier}" "{vector_db}" 5 0.8) .
    ?s ?p ?originalText .

    OPTIONAL {{ ?s skos:definition ?definition . }}
    OPTIONAL {{ ?s sbvr:Statement ?definition . }}
}}
ORDER BY DESC(?score)
    """

    logger.debug(f"SPARQL Query: {query_string}")

    tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query_string)

    try:
        result = tuple_query.evaluate()
        logger.debug(f"Result metadata: {result.metadata}")

        with result:
            similar_signifiers = [
                {
                    "uri": str(binding.getValue("uri")),
                    "score_percent": Decimal(binding.getValue("score_percent").getLabel()),
                    "located_signifier_uri": str(binding.getValue("s")),
                    "located_signifier_uri_local_name": binding.getValue("s").getLocalName(),
                    "located_signifier_predicate": str(binding.getValue("p")),
                    "definition": str(binding.getValue("definition"))
                }
                for binding in result
            ]
    except Exception as e:
        logger.error(f"Error evaluating SPARQL query: {e}")
        raise

    logger.info(f"Found {len(similar_signifiers)} similar signifier(s) for '{signifier}' on {kg}.")

    return similar_signifiers

In [38]:
def get_similar_signifiers(conn: RepositoryConnection, signifier: str) -> Tuple[list]:
    """
    Get similar signifiers for a given signifier.

    Args:
        conn (allegrograph.AllegroGraphConnection): An AllegroGraph connection object.
        signifier (str): The signifier to search for.

    Returns:
        list (Tuple[list]): A list of exact and close matches for the signifier.
    """
    fibo_similarity =  get_from_kg(conn, signifier, config["FIBO_GRAPH"], config["FIBO_GRAPH_VECTOR_STORE"])
    cfr_sbvr_similarity = get_from_kg(conn, signifier, config["CFR_SBVR_GRAPH"], config["CFR_SBVR_GRAPH_VECTOR_STORE"])

    exact_match = []
    close_match = []

    for item in fibo_similarity:
        if item["score_percent"] > config["SIMILARITY_THRESHOLD"]:
            exact_match.append(item.get("located_signifier_uri"))
        else:
            close_match.append(item.get("located_signifier_uri"))

    for item in cfr_sbvr_similarity:
        if item["score_percent"] > config["SIMILARITY_THRESHOLD"]:
            exact_match.append(item.get("located_signifier_uri"))
        else:
            close_match.append(item.get("located_signifier_uri"))

    logger.info(f"Found {len(exact_match)} exact matche(s) and {len(close_match)} close matche(s) for '{signifier}'.")

    return exact_match, close_match

### Prompt engeneering

Prompt strucuture is based on [1]. It is a zero-shot prompt following the concept of chain of thought.

Following the approaches are taken.

#### 1. facts and fact types
Try to extract all facts and fact types from a given document.

This approach has successful results. It is focused on extracting the elements, and achive the best results, similar to the approach 3.

In [39]:
system_prompt_facts = """

You are tasked with extracting **facts**, **fact types**, and their **relationships** from a given document. Follow these steps carefully:

#### Steps to Perform:

1. **Identify Facts and Fact Types**:
   - A **fact** is a specific instance or statement that describes an event or condition.
   - A **fact type** is a general template or relationship that defines how entities interact.
   - For each fact or fact type:
     - Extract the **statement** that represents the fact or fact type.
     - List the **terms** (Nouns or Proper nouns) involved in the fact or fact type.
     - Identify the **fact symbols** (verbs, verb phrases, or prepositions) connecting the terms.
     - Classify the statement as either a **Fact** or **Fact Type**.
     - Note the section or paragraph where the fact or fact type appears as the **source**.

2. **Classify Terms**:
   - For each fact or fact type, classify all **terms**:
     - Label each term as either a **Noun** or **Proper Noun**.
   - Ensure that the terms are extracted accurately and classified correctly.

3. **Define term**:
   - For each term look in the document for the term definition. If the term definition is not found, use "missing".:

4. **Identify Fact Symbols**:
   - Extract the verbs or prepositions that define the relationships between the terms. These are referred to as **fact symbols**.
   - Each fact or fact type should have a list of fact symbols.

5. **Source Information**:
   - Record the paragraph or section of the document where each fact or fact type is found as **source** information (e.g., “(a)(1)”, “(b)”).

6. **Recognize Term Relationships**:
   - Identify relationships between terms:
     - **Synonyms**: Terms that can be used interchangeably without changing the meaning.
     - **Hypernym-Hyponym**: A broader term (hypernym) that includes a more specific term (hyponym).
   - For each pair of terms:
     - Identify the relationship (either "Synonym" or "Hypernym-Hyponym").
     - Ensure that both terms involved in the relationship are valid terms from the document.

7. **Structure the Output in JSON Format**:
   - Create a JSON object with the following structure:
     - **facts_and_fact_types**: A list of dictionaries, where each dictionary contains:
       - **id**: A unique identifier for the fact or fact type.
       - **statement**: The extracted fact or fact type.
       - **terms**: A list of dictionaries, where each dictionary has a term and its classification (either "Noun" or "Proper Noun").
       - **fact_symbols**: A list of verb phrases or prepositions connecting the terms.
       - **classification**: Either "Fact" or "Fact Type".
       - **source**: The section or paragraph where the fact or fact type appears.
     - **terms_relationship**: A list of dictionaries, where each dictionary contains:
       - **terms**: A list of two related terms.
       - **relation**: Either "Synonym" or "Hypernym-Hyponym".

#### Example Output:

```json
{
  "facts_and_fact_types": [
    {
      "id": 1,
      "statement": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {"Person": "Noun"},
        {"Non-resident investment adviser": "Noun"},
        {"Commission": "Proper Noun"},
        {"Process": "Noun"},
        {"Pleadings": "Noun"},
        {"Papers": "Noun"}
      ],
      "fact_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    }
  ]
}
```

#### Guidelines:
- Be precise in identifying **terms** and **fact symbols**.
- Classify the relationships between terms accurately as **Synonym** or **Hypernym-Hyponym**.
- Ensure the final output adheres to the specified JSON structure.

#### Start of the document
"""

#### 2. facts, fact types, rules, and terms with definitions

Try to extract all facts, fact types, rules, and terms with definitions from a given document. Try to extract the relationships for each term  as well.

**Results**

The result are fairly consistent, but it failed to extract term's definitions, even when the definition was clear in the text, like in the document 275.0-7 from the fragment "... the **term** small business or small organization for purposes of the Investment Advisers Act of 1940 shall **mean** an investment adviser that: ...". The prompt failed to define small business and small organization, what are the main purpose of the document. It also failed to recognize that small business and small organization are synonyms.

In [40]:
system_prompt_v1 = """
You are tasked with extracting **facts**, **fact types**, **rules**, and their **relationships** from a given document. Follow these steps carefully:

<steps>

1. Summarize the document. Use the summary to verify if all important facts, fact types, and rules are present.

2. **Identify Facts, Fact Types, and Rules**:
   - A **fact** is a specific instance or statement that describes an event or condition. Facts are statements of truth without any directive element. They are often associated with relationships between terms or entities. e.g., "John works for X Inc.".
   - A **fact type** is a general, abstract template that describes the potential relationships between terms or entities. It serves as a model for generating specific facts. e.g., "Person works for Company".
   - A **rule** rule is generally defined as a statement that governs or constrains some aspect of the business. It specifies what must be done or what is not allowed, often guiding actions, decisions, and behaviors within an organization. Rules enforce compliance, limit possibilities, or prescribe specific behaviors in response to business situations. e.g., "A customer must provide identification before opening an account.".
   - For each fact, fact type, or rule:
     - Extract the **statement** that represents the fact, fact type, or rule.
     - List the **terms** involved in the fact, fact type, or rule.
     - Identify the **verb symbols** (verbs, verb phrases, or prepositions) connecting the terms.
     - Classify the statement as either a **Fact**, **Fact Type**, or **Rule**.
     - Note the section or paragraph where the fact, fact type, or rule appears as the **source**.
     - For each term look in the document for the term definition. If the term definition is not found, use "missing".:

3. Classify Terms:
   - For each fact, fact type, or rule classify all **terms**:
     - Label each term as either a **Common Noun** or **Proper Noun**.
   - Ensure that the terms are extracted accurately and classified correctly.

4. Define term:
   - For each term look in the document for the term definition, explaining, or meaning. If the term definition is not found, use "missing".:

4. Identify Verb Symbols:
   - Extract the verbs or prepositions that define the relationships between the terms. These are referred to as **verb symbols**.
   - Each fact, fact type, or rule should have a list of verb symbols.

5. Source Information:
   - Record the paragraph or section of the document where each fact, fact type, or rule is found as **source** information (e.g., "(a)(1)", "(b)").

6. Recognize term relationships:
   - Identify relationships between terms:
     - **Synonyms**: Terms that can be used interchangeably without changing the meaning.
     - **Hypernym-Hyponym**: A broader term (hypernym) that includes a more specific term (hyponym).
   - For each pair of terms:
     - Identify the relationship (either "Synonym" or "Hypernym-Hyponym").
     - Ensure that both terms involved in the relationship are valid terms from the document.

7. Answer only with the output example structure in JSON format. All the values are optional.

<output_example>

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "statement": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
            "term": "Person",
            "classification": "Noun",
            "definition": "missing"
        },
      ...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    }
  ]
},
...
```
</output_example>

</steps>
"""

The v2 is a variation of the v1, with more concise description of the steps, and changing the organization of the text. The results are the same, but there was miss classification of the statements.

In [41]:
system_prompt_v2 = """
Extract facts, fact types, and their relationships from a given document, and structure the output in a specified JSON format.

Follow the steps to identify and classify statements, using document details to find definitions and source information.

# Steps

1. **Summarize the Document:**
   - Provide a summary to ensure the completeness of identified facts, fact types, and rules.

2. **Identify Facts, Fact Types, and Rules:**
   - Define and extract each:
     - **Fact:** Instance or statement of event/condition, e.g., "John works for X Inc."
     - **Fact Type:** Template for relationships, e.g., "Person works for Company."
     - **Rule:** Governing statement, e.g., "A customer must provide identification before opening an account."
   - For each, document:
     - **Statement**
     - **Terms** involved
     - **Verb Symbols** connecting the terms
     - **Classification** as Fact, Fact Type, or Rule
     - **Source** paragraph or section in the document

3. **Classify Terms:**
   - Classify each term as **Common Noun** or **Proper Noun**.

4. **Define Term:**
   - Locate definitions for terms in the document, or mark as "missing."

5. **Identify Verb Symbols:**
   - Extract verbs or prepositions (verb symbols) that define term relationships.

6. **Source Information:**
   - Note the document source (section/paragraph) for each statement.

7. **Recognize Term Relationships:**
   - Identify pairs of terms with relationships:
     - **Synonyms:** interchangeable terms.
     - **Hypernym-Hyponym:** broader (hypernym) includes more specific (hyponym).
   - Ensure relationship validity using document terms.

# Output Format

Produce a structured JSON format based on the specified template. Ensure all necessary fields are populated accurately, even if some fields are optional or marked as "missing".

# Examples

**Example JSON Structure:**

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "statement": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
            "term": "Person",
            "classification": "Noun",
            "definition": "missing"
        },
        // Additional terms...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    }
  ]
}
```

# Notes

- Ensure extracted statements are fully detailed and clearly classified.
- Pay careful attention to identifying and classifying terms accurately.
- Follow the precise JSON format for all outputs, populating fields as required.
"""


The v3 is back to v1, changing the organization of the text.

**Results**

The results are the same of v1 and v2. 5 elements were extracted. 16 terms were extracted with 2 definitions.

In [42]:
system_prompt_v3 = """
You are tasked with extracting elements and **relationships** from a given legal document. Please follow these steps carefully and ensure all instructions are adhered to:

**Steps**:

1. **Summarize the document**:
   - Summarize the document to understand its purpose and use it to verify if all important terms, term definitions, facts, fact types, and rules are identified in subsequent steps.

2. **Identify Facts, Fact Types, and Rules**:
   - **Definitions**:
     - **Fact**: A specific instance or statement that describes an event or condition without any directive element. Facts often involve relationships between terms or entities. Example: "John works for X Inc."
     - **Fact Type**: A general, abstract template that describes potential relationships between terms or entities, serving as a model for generating specific facts. Example: "Person works for Company."
     - **Rule**: A statement that governs or constrains some aspect of the business, specifying what must be done or what is not allowed. Rules enforce compliance, limit possibilities, or prescribe specific behaviors in response to business situations. Example: "A customer must provide identification before opening an account."
   - **For each fact, fact type, or rule**:
     - **Extract the statement**: Identify the exact statement or phrase from the document representing the fact, fact type, or rule.
     - **Extract Terms**: List all the terms involved in the statement.
     - **Extract Verb Symbols**: Identify verbs, verb phrases, or prepositions that connect the terms in the statement.
     - **Classification**: Classify the statement as either a **Fact**, **Fact Type**, or **Rule**.
     - **Source**: Note the specific paragraph or section of the document where the statement is found (e.g., "(a)(1)", "(b)").

3. **Classify Terms**:
   - For each term extracted classify it as either a **Common Noun** or a **Proper Noun**.

4. **Define Terms**:
   - For each term:
     - Search the entire document for the term's definition, explanation, or meaning. Also, look in the document summary.
     - If the definition is found, include it.
     - If the definition is not found in the document, use **None**.

5. **Identify Relationships Between Terms**:
   - **Types of Relationships**:
     - **Synonym**: Terms that can be used interchangeably without changing the meaning.
     - **Hypernym-Hyponym**: A broader term (hypernym) that includes a more specific term (hyponym).
   - **For each pair of terms in the document**:
     - Identify if a relationship exists as either "Synonym" or "Hypernym-Hyponym".
     - Only include relationships where both terms are present in the document.

6. **Provide JSON Output**:
   - Format your answer as per the output example below.
   - **All values are optional**: Include as much information as is available based on the document.
   - **Do not include any additional text or explanation outside the JSON structure**.

**Output Example**:

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "statement": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun",
          "definition": "An individual or legal entity."
        },
        {
          "term": "Non-resident investment adviser",
          "classification": "Common Noun",
          "definition": null
        },
        ...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    },
    ...
  ],
  "terms_relationship": [
    {
      "terms": [
        "Principal office",
        "Place of business"
      ],
      "relation": "Synonym"
    },
    {
      "terms": [
        "Person",
        "Individual"
      ],
      "relation": "Synonym"
    },
    ...
  ]
}
```
"""

In [43]:
response_prompt_v3 = {
  "section": "§ 275.0-7",
  "elements": [
    {
      "id": 1,
      "statement": "An investment adviser that has assets under management of less than $25 million is considered a small business for the purposes of the Investment Advisers Act of 1940.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Assets under management",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "$25 million",
          "classification": "Proper Noun",
          "definition": None
        },
        {
          "term": "Small business",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["has", "is considered"],
      "classification": "Fact Type",
      "source": "(a)(1)"
    },
    {
      "id": 2,
      "statement": "An investment adviser is considered a small organization if it did not have total assets of $5 million or more on the last day of the most recent fiscal year.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Total assets",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "$5 million",
          "classification": "Proper Noun",
          "definition": None
        },
        {
          "term": "Small organization",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Fiscal year",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["did not have", "is considered"],
      "classification": "Fact Type",
      "source": "(a)(2)"
    },
    {
      "id": 3,
      "statement": "An investment adviser is not considered a small business if it controls, is controlled by, or is under common control with another investment adviser that has assets under management of $25 million or more.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Control",
          "classification": "Common Noun",
          "definition": "The power, directly or indirectly, to direct the management or policies of a person."
        },
        {
          "term": "Common control",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "$25 million",
          "classification": "Proper Noun",
          "definition": None
        },
        {
          "term": "Small business",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["controls", "is controlled by", "is under"],
      "classification": "Rule",
      "source": "(a)(3)"
    },
    {
      "id": 4,
      "statement": "Control means the power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise.",
      "terms": [
        {
          "term": "Control",
          "classification": "Common Noun",
          "definition": "The power, directly or indirectly, to direct the management or policies of a person."
        },
        {
          "term": "Person",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Securities",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Contract",
          "classification": "Common Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["means", "to direct", "whether through"],
      "classification": "Fact",
      "source": "(b)(1)"
    },
    {
      "id": 5,
      "statement": "A person is presumed to control a corporation if the person has the right to vote 25 percent or more of a class of the corporation's voting securities.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Corporation",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "Voting securities",
          "classification": "Common Noun",
          "definition": None
        },
        {
          "term": "25 percent",
          "classification": "Proper Noun",
          "definition": None
        }
      ],
      "verb_symbols": ["is presumed", "to control", "has the right to vote"],
      "classification": "Fact Type",
      "source": "(b)(1)(i)(A)"
    }
  ],
  "terms_relationship": [
    {
      "terms": [
        "Investment adviser",
        "Small business"
      ],
      "relation": "Hypernym-Hyponym"
    },
    {
      "terms": [
        "Investment adviser",
        "Small organization"
      ],
      "relation": "Hypernym-Hyponym"
    }
  ]
}


In [44]:
# Restore checkpoint
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

2024-11-05 13:31:07 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-11-05-1.json.


In [45]:
len(response_prompt_v3["elements"]), len(response_prompt_v3["terms_relationship"])

(5, 2)

#### 3. facts, fact types, rules, and terms

Try to extract all facts, fact types, rules, and terms without definitions from a given document, and do not try to extract the relationships for each term.

This approach is very similar to the approach used in the previous, but it is more focused on extracting the elements. It is divided in two parts:
- Extract the elements
- Extract the definitions and relationships

**Results**

The result are consistents, 7 elements and 21 terms with definitions are extracted. in contrast, the previous approach, 5 elements and 16 terms with 2 definitions were extracted. An improvement of 40% extracting facts and rules, 31% extracting terms, and 1050% extracting definitions.

ts are extracted in the first part. For the second part the result are much better than the previous approach, more definitions and relationships are extracted.

The prompt for the first part is similar to the previous one, but without the steps 4 and 5. The definition and relationships elements are removed from the output json.

> The summary of the document was added to the output json.

In [46]:
system_prompt_v4_1 = """
You are tasked with extracting elements from a given legal document. Please follow these steps carefully and ensure all instructions are adhered to:

# Steps

1. **Summarize the document** to understand its purpose and use it to verify if all important terms,facts, fact types, and rules are identified in subsequent steps.

2. **Identify elements**:
   - **About the elements**:
     - **Fact**: A specific instance or statement that describes an event or condition without any directive element. Facts often involve relationships between terms or entities. Example: "John works for X Inc."
     - **Fact Type**: A general, abstract template that describes potential relationships between terms or entities, serving as a model for generating specific facts. Example: "Person works for Company."
     - **Operative Rule**: A statement that governs or constrains some aspect of the business, specifying what must be done or what is not allowed. Rules enforce compliance, limit possibilities, or prescribe specific behaviors in response to business situations. Operative rules (otherwise known as normative rules or prescriptive rules) state what must or must not happen in particular circumstances. Operative rules can be contravened: required information may be omitted, inappropriate information supplied, or an attempt may be made to perform a process that is prohibited. Example: "A customer must provide identification before opening an account."
     - **Term**: A word or a group of words that represents a specific concept, entity, or subject in a particular context.
     - Terms, Fact, Fact Type, and Operative Rule are statements that should allow only full compliance or full contravention; partial compliance is not possible. The presence of "or" or "and" often suggests the need to separate a statement into two.
   - **For each fact, fact type, or rule**:
     - **Extract the statement**: Identify the exact statement or phrase from the document representing the fact, fact type, or rule.
     - **Extract and classify Terms**:
       - **Extract all the terms involved in the statement.
       - **Classify each term** as either **Common Noun** or **Proper Noun**.
       - If a Term contains nouns separated by "and," ",", or "or," split it into two or more terms. For example, "Principal office and place of business" should be split into "Principal office" and "Place of business".
     - **Extract Verb Symbols**: Identify verbs, verb phrases, or prepositions that connect the terms in the statement.
     - **Classification**: Classify the statement as either a **Fact**, **Fact Type**, or **Rule**.
     - **Source**: Note the specific paragraph or section of the document where the statement is found (e.g., "(a)(1)", "(b)").

3. **Provide JSON Output**:
   - Format your answer as per the output example below.
   - **All values are optional**: Include as much information as is available based on the document.
   - **Do not include any additional text or explanation outside the JSON structure**.

**Output Example**:

```json
{
  "section": "§ 123.4-5",
  "elements": [
    {
      "id": 1,
      "statement": "A person serves a non-resident investment adviser by furnishing the Commission with process, pleadings, or papers.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun"
        },
        {
          "term": "Non-resident investment adviser",
          "classification": "Common Noun"
        },
        ...
      ],
      "verb_symbols": ["serves", "by furnishing", "with"],
      "classification": "Fact Type",
      "source": "(a)"
    },
    ...
  ]
}
```
"""

In [47]:
response_prompt_v4_1 = {
  "section": "§ 275.0-7",
  "summary": "The definition of small entities under the Investment Advisers Act for the purposes of the Regulatory Flexibility Act. It details criteria for qualifying as a small business or organization and provides definitions for 'control' and 'total assets' within this context.",
  "elements": [
    {
      "id": 1,
      "statement": "The term small business or small organization for purposes of the Investment Advisers Act of 1940 shall mean an investment adviser that has assets under management of less than $25 million.",
      "terms": [
        {
          "term": "Small business",
          "classification": "Common Noun"
        },
        {
          "term": "Small organization",
          "classification": "Common Noun"
        },
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Assets under management",
          "classification": "Common Noun"
        },
        {
          "term": "$25 million",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["mean", "has"],
      "classification": "Fact",
      "source": "(a)(1)"
    },
    {
      "id": 2,
      "statement": "An investment adviser did not have total assets of $5 million or more on the last day of the most recent fiscal year.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Total assets",
          "classification": "Common Noun"
        },
        {
          "term": "$5 million",
          "classification": "Common Noun"
        },
        {
          "term": "Fiscal year",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["did not have"],
      "classification": "Fact",
      "source": "(a)(2)"
    },
    {
      "id": 3,
      "statement": "An investment adviser does not control, is not controlled by, and is not under common control with another investment adviser that has assets under management of $25 million or more.",
      "terms": [
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Control",
          "classification": "Common Noun"
        },
        {
          "term": "$25 million",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["does not control", "is not controlled by", "is not under common control with"],
      "classification": "Fact",
      "source": "(a)(3)"
    },
    {
      "id": 4,
      "statement": "Control means the power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise.",
      "terms": [
        {
          "term": "Control",
          "classification": "Common Noun"
        },
        {
          "term": "Power",
          "classification": "Common Noun"
        },
        {
          "term": "Management",
          "classification": "Common Noun"
        },
        {
          "term": "Policies",
          "classification": "Common Noun"
        },
        {
          "term": "Person",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["means", "to direct"],
      "classification": "Fact Type",
      "source": "(b)(1)"
    },
    {
      "id": 5,
      "statement": "A person is presumed to control a corporation if the person directly or indirectly has the right to vote 25 percent or more of a class of the corporation's voting securities.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun"
        },
        {
          "term": "Corporation",
          "classification": "Common Noun"
        },
        {
          "term": "Voting securities",
          "classification": "Common Noun"
        },
        {
          "term": "25 percent",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["is presumed to control", "has the right to vote"],
      "classification": "Operative Rule",
      "source": "(b)(1)(i)(A)"
    },
    {
      "id": 6,
      "statement": "A person is presumed to control a partnership if the person has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the partnership.",
      "terms": [
        {
          "term": "Person",
          "classification": "Common Noun"
        },
        {
          "term": "Partnership",
          "classification": "Common Noun"
        },
        {
          "term": "Dissolution",
          "classification": "Common Noun"
        },
        {
          "term": "Capital",
          "classification": "Common Noun"
        },
        {
          "term": "25 percent",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["is presumed to control", "has the right to receive", "has contributed"],
      "classification": "Operative Rule",
      "source": "(b)(1)(ii)"
    },
    {
      "id": 7,
      "statement": "Total assets means the total assets as shown on the balance sheet of the investment adviser or other person with its subsidiaries consolidated, whichever is larger.",
      "terms": [
        {
          "term": "Total assets",
          "classification": "Common Noun"
        },
        {
          "term": "Balance sheet",
          "classification": "Common Noun"
        },
        {
          "term": "Investment adviser",
          "classification": "Common Noun"
        },
        {
          "term": "Subsidiaries",
          "classification": "Common Noun"
        }
      ],
      "verb_symbols": ["means", "shown on"],
      "classification": "Fact Type",
      "source": "(b)(2)"
    }
  ]
}

In [48]:
len(response_prompt_v4_1["elements"])

7

The steps 4 and 5 are adapted from the previous approach. The system prompt for the second part is:

In [49]:
system_prompt_v4_2 = """
You are tasked with extracting definitions and **relationships** of terms in the terms list searching a given legal document. Please follow these steps carefully and ensure all instructions are adhered to:

# Steps

1. **Summarize the document** to understand its purpose and use it to verify if all important terms, term definitions, facts, fact types, and rules are identified in subsequent steps.

2. **Define terms**:
  - For each term:
    - Search the entire document for the term's definition, explanation, or meaning. Also, look in the document summary.
    - If the definition is found, include it.
    - If the definition is not found in the document, use null.

3. **Identify synonym relationships between terms**:
  - For each term in the terms list:
    - Compare it against other terms in the text to find synonyms.
    - Ensure both terms exist within the same document context.
  - List all valid synonym pairs identified.

4. **Provide JSON Output**:
  - Format your answer as per the output example below.
  - **All values are optional**: Include as much information as is available based on the document.
  - **Do not include any additional text or explanation outside the JSON structure**.

**Output Example**:

```json
{
  "terms": [
    {
      "term": "Person",
      "definition": "A person is a person."
    },
    {
      "term": "Capital",
      "definition": "The total assets of a person."
    },
    ...
  ],
  "relationships": [
    {
      "term_1": "Person",
      "term_2": "Capital",
      "relationship": "Synonym"
    },
    {
      "term_1": "Capital",
      "term_2": "Person",
      "relationship": "Synonym"
    },
    ...
  ]
}
"""

In the "user prompt", along with the document, a unique list of terms from the result of the previous part, is provided. The drawback of this approach is the document needs to be provided again. It means spending more tokens.

As commented above, the output is better than the previous approach. 21 terms are extracted with definitions, and 6 relationships are identified. More important that the terms small business, and, small organization are extracted.

In [50]:
response_prompt_v4_2 = {
  "terms": [
    {
      "term": "$5 million",
      "definition": "An amount referenced as a threshold for total assets of an investment adviser or other entity on the last day of the most recent fiscal year."
    },
    {
      "term": "Control",
      "definition": "The power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise."
    },
    {
      "term": "Capital",
      "definition": "The amount of financial contribution or investment in a partnership or LLC, particularly relevant to the right to receive upon dissolution or contribution of 25 percent or more."
    },
    {
      "term": "Dissolution",
      "definition": "The act of formally ending a partnership or LLC, at which point capital contributions may be distributed."
    },
    {
      "term": "25 percent",
      "definition": "A threshold used to presume control over a corporation, partnership, or LLC, based on ownership, voting rights, or capital contribution."
    },
    {
      "term": "Subsidiaries",
      "definition": "Companies that are controlled by another company, typically through ownership of more than 50% of the subsidiary’s voting stock."
    },
    {
      "term": "Management",
      "definition": "The act of overseeing and controlling the policies or operations of an entity."
    },
    {
      "term": "Corporation",
      "definition": "A legal entity that is presumed to be controlled if a person has the right to vote or sell 25 percent or more of its voting securities."
    },
    {
      "term": "Balance sheet",
      "definition": "A financial statement that reports total assets, used to determine control and asset thresholds for investment advisers."
    },
    {
      "term": "Assets under management",
      "definition": "The total market value of investments that an investment adviser manages on behalf of clients."
    },
    {
      "term": "$25 million",
      "definition": "An amount referenced as a threshold for assets under management to determine whether an entity qualifies as a small business or small organization under the Investment Advisers Act."
    },
    {
      "term": "Fiscal year",
      "definition": "A one-year period used for accounting purposes and preparing financial statements, relevant to determining total assets."
    },
    {
      "term": "Voting securities",
      "definition": "Securities that give the holder the right to vote on matters of corporate policy or management, used to determine control."
    },
    {
      "term": "Power",
      "definition": "The ability to influence or direct the management or policies of a person or entity, often associated with control."
    },
    {
      "term": "Total assets",
      "definition": "The total value of all assets as shown on an entity's balance sheet, including those of subsidiaries, used to assess financial thresholds."
    },
    {
      "term": "Investment adviser",
      "definition": "An individual or firm that manages the investments of clients, subject to regulations under the Investment Advisers Act of 1940."
    },
    {
      "term": "Person",
      "definition": "An individual, corporation, partnership, LLC, trust, or other entity, potentially subject to control rules under the Investment Advisers Act."
    },
    {
      "term": "Small business",
      "definition": "An investment adviser with less than $25 million in assets under management and less than $5 million in total assets, or as otherwise defined by the Commission."
    },
    {
      "term": "Partnership",
      "definition": "A business structure where control is presumed if a person owns or contributes 25 percent or more of the partnership's capital."
    },
    {
      "term": "Small organization",
      "definition": "An entity, such as an investment adviser, that qualifies as a small business under the Investment Advisers Act by meeting specific asset thresholds."
    },
    {
      "term": "Policies",
      "definition": "The principles or rules governing the management and control of an entity, relevant to determining control under the Investment Advisers Act."
    }
  ],
  "relationships": [
    {
      "term_1": "Small business",
      "term_2": "Small organization",
      "relationship": "Synonym"
    },
    {
      "term_1": "$5 million",
      "term_2": "Total assets",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "$25 million",
      "term_2": "Assets under management",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "Person",
      "term_2": "Corporation",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "Person",
      "term_2": "Partnership",
      "relationship": "Hypernym-Hyponym"
    },
    {
      "term_1": "Person",
      "term_2": "Investment adviser",
      "relationship": "Hypernym-Hyponym"
    }
  ]
}


In [51]:
len(response_prompt_v4_2["terms"]), len(response_prompt_v4_2["relationships"])

(21, 6)

#### Save checkpoint

Define which prompt will be used in the experiment.

In [52]:
# TODO: Refactor name to system_prompt_extract_P1 and use the function above
system_prompt_extract_part_1 = system_prompt_v4_1
system_prompt_extract_part_2 = system_prompt_v4_2

manager.add_document(
    Document(
        id="prompt-extract_P1",
        type="prompt",
        content=f"""
{system_prompt_extract_part_1}
        """,
    )
)

manager.add_document(
    Document(
        id="prompt-extract_P2",
        type="prompt",
        content=f"""
{system_prompt_extract_part_2}
        """,
    )
)

In [53]:
# Persist the state to a file
save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)

2024-11-05 13:31:08 - INFO - Checkpoint saved.


### Execution

#### Restore checkpoint

In [54]:
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

2024-11-05 13:31:08 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-11-05-1.json.


#### extract and classify elements

- Classify statements in the document;
- Extract terms and verb symbols;
- Classify terms.

In [55]:
for doc in manager.list_document_ids(doc_type="section"):
    logger.info(f"Processing document: {doc}")
    retrieved_doc = manager.retrieve_document(doc_id=doc, doc_type="section")

    # Part 1 - Extraction of elements
    # TODO: Refactor to a function. Put it in the prompt engeering section
    user_prompt = f"""
# Document

{manager.retrieve_document(doc_id=doc, doc_type="section").content}
    """

    logger.info("P1. Extracting elements...")
    response_part_1 = query_instruct_llm(
        system_prompt=system_prompt_extract_part_1,
        user_prompt=user_prompt,
        document_model=ElementsDocumentModel,
        llm_model=config["LLM"]["MODEL"],
        temperature=config["LLM"]["TEMPERATURE"],
        max_tokens=config["LLM"]["MAX_TOKENS"],
    )

    logger.debug(response_part_1)

    doc_1 = Document(id=f"{doc}_P1", type="llm_response", content=response_part_1)
    manager.add_document(doc_1)

    # Part 2 - Definition of terms and relationships

    terms_list_part_1 = extract_unique_terms(response_part_1)

    user_prompt = f"""
# Terms list

{terms_list_part_1}

# Document
{manager.retrieve_document(doc_id=doc, doc_type="section").content}
    """

    logger.info("P2. Extracting terms and relationships...")

    response_part_2 = query_instruct_llm(
        system_prompt=system_prompt_extract_part_2,
        user_prompt=user_prompt,
        document_model=TermsDocumentModel,
        llm_model=config["LLM"]["MODEL"],
        temperature=config["LLM"]["TEMPERATURE"],
        max_tokens=config["LLM"]["MAX_TOKENS"],
    )

    logger.debug(response_part_2)

    doc_2 = Document(id=f"{doc}_P2", type="llm_response", content=response_part_2)
    manager.add_document(doc_2)

    logger.info("Saving llm_response to checkpoint...")

    # Save each document to save money.
    save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)

logger.info("Finished processing documents.")

2024-11-05 13:31:08 - INFO - Processing document: § 275.0-2
2024-11-05 13:31:08 - INFO - P1. Extracting elements...


2024-11-05 13:31:36 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-05 13:31:36 - INFO - Execution time for query_instruct_llm: 28.03 seconds
2024-11-05 13:31:36 - INFO - P2. Extracting terms and relationships...
2024-11-05 13:31:47 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-05 13:31:47 - INFO - Execution time for query_instruct_llm: 8.09 seconds
2024-11-05 13:31:47 - INFO - Saving llm_response to checkpoint...
2024-11-05 13:31:47 - INFO - Checkpoint saved.
2024-11-05 13:31:47 - INFO - Processing document: § 275.0-5
2024-11-05 13:31:47 - INFO - P1. Extracting elements...
2024-11-05 13:32:05 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-05 13:32:05 - INFO - Execution time for query_instruct_llm: 18.40 seconds
2024-11-05 13:32:05 - INFO - P2. Extracting terms and relationships...
2024-11-05 13:32:22 - INFO - HTTP Request: POST https://a

Avarage execution time: 32s / per document.

#### Restore checkpoint

In [56]:
# Restore checkpoint
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

2024-11-05 13:33:14 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-11-05-1.json.


#### Check the content of datasets

In [57]:
logger.info("SECTIONS:")
# List all document ids | type
logger.info(f"section docs: {manager.list_document_ids(doc_type='section')}")

# Retrieve a document by id | type
for doc in manager.list_document_ids(doc_type="section"):
    retrieved_doc = manager.retrieve_document(doc_id=doc, doc_type="section")
    logger.debug(retrieved_doc)
    lines, words, avg_words_per_line = basic_text_stats(retrieved_doc.content)
    logger.info(
        f"{doc}: Total number of lines: {lines}, total number of words: {words}, and average words per line: {avg_words_per_line}"
    )

retrieved_true_table_p1 = []
retrieved_true_table_p2 = []

for doc in manager.list_document_ids(doc_type="true_table"):
    logger.info(f"Processing document: {doc} ...")
    # Docs type true_table P1
    if doc.endswith("_P1"):
        retrieved_true_table_p1.append(
            calculate_content_quantities_p1(
                doc,
                manager.retrieve_document(
                    doc_id=doc, doc_type="true_table"
                ).model_dump()["content"],
                filename="p1_true_table.json",
            )
        )
        logger.info("retrieve P1")
    # Docs type true_table P2
    elif doc.endswith("_P2"):
        retrieved_true_table_p2.append(
            calculate_content_quantities_p2(
                doc,
                manager.retrieve_document(
                    doc_id=doc, doc_type="true_table"
                ).model_dump(),
                filename="p2_true_table.json",
            )
        )
        logger.info("retrieve P2")

# Convert collected data to a DataFrame
table_true_df_p1 = pd.DataFrame(retrieved_true_table_p1)
table_true_df_p2 = pd.DataFrame(retrieved_true_table_p2)

# Save DataFrames to CSV if needed
table_true_df_p1.to_excel(f"{config['DEFAULT_OUTPUT_DIR']}/P1_summary_true_table.xlsx", index=False)
table_true_df_p2.to_excel(f"{config['DEFAULT_OUTPUT_DIR']}/P2_summary_true_table.xlsx", index=False)

2024-11-05 13:33:14 - INFO - SECTIONS:
2024-11-05 13:33:14 - INFO - section docs: ['§ 275.0-2', '§ 275.0-5', '§ 275.0-7']
2024-11-05 13:33:14 - INFO - § 275.0-2: Total number of lines: 14, total number of words: 362, and average words per line: 26
2024-11-05 13:33:14 - INFO - § 275.0-5: Total number of lines: 10, total number of words: 260, and average words per line: 26
2024-11-05 13:33:14 - INFO - § 275.0-7: Total number of lines: 19, total number of words: 513, and average words per line: 27
2024-11-05 13:33:14 - INFO - Processing document: § 275.0-2_P1 ...
2024-11-05 13:33:14 - INFO - retrieve P1
2024-11-05 13:33:14 - INFO - Processing document: § 275.0-5_P1 ...
2024-11-05 13:33:14 - INFO - retrieve P1
2024-11-05 13:33:14 - INFO - Processing document: § 275.0-7_P1 ...
2024-11-05 13:33:14 - INFO - retrieve P1
2024-11-05 13:33:14 - INFO - Processing document: § 275.0-2_P2 ...
2024-11-05 13:33:14 - INFO - retrieve P2
2024-11-05 13:33:14 - INFO - Processing document: § 275.0-5_P2 ...
2

True table for P1.

In [58]:
table_true_df_p1

Unnamed: 0,document_id,quantity_of_elements,quantity_of_facts,quantity_of_fact_types,quantity_of_rules,quantity_of_verbs,quantity_of_terms,filename
0,§ 275.0-2_P1,9,0,7,0,20,64,p1_true_table.json
1,§ 275.0-5_P1,5,0,1,0,20,30,p1_true_table.json
2,§ 275.0-7_P1,9,0,9,0,20,44,p1_true_table.json


In [59]:
table_true_p1 = table_true_df_p1.groupby('document_id').describe()

table_true_p1

Unnamed: 0_level_0,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_elements,quantity_of_facts,quantity_of_facts,...,quantity_of_verbs,quantity_of_verbs,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms,quantity_of_terms
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
document_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
§ 275.0-2_P1,1.0,9.0,,9.0,9.0,9.0,9.0,9.0,1.0,0.0,...,20.0,20.0,1.0,64.0,,64.0,64.0,64.0,64.0,64.0
§ 275.0-5_P1,1.0,5.0,,5.0,5.0,5.0,5.0,5.0,1.0,0.0,...,20.0,20.0,1.0,30.0,,30.0,30.0,30.0,30.0,30.0
§ 275.0-7_P1,1.0,9.0,,9.0,9.0,9.0,9.0,9.0,1.0,0.0,...,20.0,20.0,1.0,44.0,,44.0,44.0,44.0,44.0,44.0


True table for P2.

In [60]:
table_true_df_p2

Unnamed: 0,document_id,count_of_terms,terms_with_definition,terms_without_definition,terms_relationship_count,filename
0,§ 275.0-2_P2,35,2,33,0,p2_true_table.json
1,§ 275.0-5_P2,21,21,0,0,p2_true_table.json
2,§ 275.0-7_P2,26,19,7,2,p2_true_table.json


In [61]:
table_true_p2 = table_true_df_p2.groupby('document_id').describe()

table_true_p2

Unnamed: 0_level_0,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,count_of_terms,terms_with_definition,terms_with_definition,...,terms_without_definition,terms_without_definition,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count,terms_relationship_count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
document_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
§ 275.0-2_P2,1.0,35.0,,35.0,35.0,35.0,35.0,35.0,1.0,2.0,...,33.0,33.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
§ 275.0-5_P2,1.0,21.0,,21.0,21.0,21.0,21.0,21.0,1.0,21.0,...,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
§ 275.0-7_P2,1.0,26.0,,26.0,26.0,26.0,26.0,26.0,1.0,19.0,...,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0


#### BREAK

In [62]:
# Stop here. Next sections still in progress.
raise SystemExit("Stop here. Next sections still in progress.")

SystemExit: Stop here. Next sections still in progress.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


#### Main (Move to lab 5)

Orchestrates the process of the semantic annotation.

Processing terms, names, vocabularies and vocabulary namespaces

In [None]:
# Connect to AllegroGraph
conn = ag_connect(
    repo=config["ALLEGROGRAPH_CLOUD"]["REPO"],
    catalog=config["ALLEGROGRAPH_CLOUD"]["CATALOG"],
    host=config["ALLEGROGRAPH_CLOUD"]["HOST"],
    port=config["ALLEGROGRAPH_CLOUD"]["PORT"],
    user=config["ALLEGROGRAPH_CLOUD"]["USER"],
    password=config["ALLEGROGRAPH_CLOUD"]["PASSWORD"],
)

Restore checkpoint

In [None]:
# Restore checkpoint
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

In [None]:
for doc in manager.list_document_ids(doc_type="llm_response"):
    logger.info(f"Processing document: {doc} ...")
    retrieved_llm_response = manager.retrieve_document(doc_id=doc, doc_type="llm_response")

    for response in retrieved_llm_response.content:

        logger.debug(response)

        signifier = response['signifier']
        statement = response['definition']
        concept_type = response['concept_classification']
        sources = response.get("sources")

        logger.info(f"Processing '{signifier}' ...")

        # define vocabulary
        # Assume first occorrence of section is the correct section
        # In case do not have section, use section_id.
        # TODO: Improve this
        vocabulary = define_vocabulary(doc, sources[0]["section"])
        vocabulary = normalize_ns_string(vocabulary)

        logger.info(f"Processing vocabulary {vocabulary}")

        if create_vocabulary(conn, vocabulary):
            logger.info(f"Vocabulary {vocabulary} upserted")
            if create_vocabulary_namespace(conn, vocabulary):
                logger.info(f"Vocabulary namespace {vocabulary} upserted")
            else:
                logger.info(f"Vocabulary namespace {vocabulary} not upserted")
        else:
            logger.info(f"Vocabulary {vocabulary} not upserted")

        # similar search
        exact_match, close_match = get_similar_signifiers(conn, signifier)

        # create designation
        designation = Designation(
            signifier=signifier,
            statement=statement,
            concept_type=concept_type,
            closeMatch=close_match,
            exactMatch=exact_match,
            vocabulary_name=vocabulary,
            sources=signifier_sources(sources) # Associate desgnations the their sources
        )

        upsert_designation_to_kg(conn, designation)

        logger.debug(f"Processed {designation}")
        logger.info(f"Signifier '{signifier}' done.")

    logger.info(f"{doc} done.")


In [None]:
conn.close()

### Discussion

TODO

## Notes

- Kernel conda environment: ipt-cfr2sbvr - Python version 3.11.9