<a href="https://colab.research.google.com/github/asantos2000/master-degree-santos-anderson/blob/main/code/src/chap_6_nlp2sbvr_elements_association_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# nlp2sbvr - elements association and creation

Chapter 6. Ferramentas de suporte
- Section 6.2 Implementação dos principais componentes
  - Section 6.2.4 nlp2sbvr
    - Section Algoritmo "elements association and creation"
    - Section Algoritmo "define vocabular namespace"
    - Section Algoritmo "similarity search"

> Use this version when the last checkpoint is to be inserted into the KG. To insert the best scores instead, use "chap_6_nlp2sbvr_elements_association_creation_best.ipynb". DO NOT USE BOTH.

## Google colab

> Before running, set your keys in the `/content/.env` file and any preferences in `/content/config.yaml`; use the `.env.example` and `config.colab.yaml` files from the git repo as examples.

In [1]:
# Reload imported modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

import sys

# True when the notebook is running inside Google Colab.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  # Colab: mount Drive, clone a fresh copy of the repository, install its
  # requirements and copy the local packages and the Colab config next to
  # the notebook.
  # NOTE(review): `logging_setup` is imported in the imports cell but is not
  # copied here (only `configuration` and `checkpoint` are) - confirm it is
  # available on Colab.
  from google.colab import drive
  drive.mount('/content/drive')
  !rm -rf cfr2sbvr configuration checkpoint
  !git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
  %pip install -r cfr2sbvr/code/requirements.txt
  !cp -r cfr2sbvr/code/src/configuration .
  !cp -r cfr2sbvr/code/src/checkpoint .
  !cp -r cfr2sbvr/code/config.colab.yaml config.yaml
  DEFAULT_CONFIG_FILE="config.yaml"
else:
  # Local run: use the configuration file one directory above the notebook.
  DEFAULT_CONFIG_FILE="../config.yaml"

## Imports

In [2]:
# Standard library imports
import re
from decimal import Decimal
from datetime import datetime

# Third-party libraries
from pydantic import BaseModel
from typing import List, Dict, Optional, Any, Tuple
import spacy
from slugify import slugify

# Franz AllegroGraph (AG) imports
from franz.openrdf.connect import ag_connect
from franz.openrdf.repository.repository import RepositoryConnection
from franz.openrdf.query.query import QueryLanguage

# Local application/library-specific imports
import checkpoint.main as checkpoint
from checkpoint.main import (
    restore_checkpoint,
    DocumentProcessor,
)
import configuration.main as configuration
import logging_setup.main as logging_setup

# When True, the local modules imported above are reloaded so that edits made
# to them since the first import are picked up.
DEV_MODE = True

if DEV_MODE:
    # Development mode
    import importlib

    importlib.reload(configuration)
    importlib.reload(logging_setup)
    importlib.reload(checkpoint)

## Settings

Default settings; check them before running the notebook.

### Get configuration

In [3]:
# load config
config = configuration.load_config(DEFAULT_CONFIG_FILE)

Generated files for analysis in this run

In [4]:
print(f'{config["ALLEGROGRAPH_HOSTING"]=}')
print(f'{config["FIBO_GRAPH"]=}')
print(f'{config["FIBO_GRAPH_VECTOR_STORE"]=}')
print(f'{config["SIMILARITY_THRESHOLD"]=}')

config["ALLEGROGRAPH_HOSTING"]='ALLEGROGRAPH_LOCAL'
config["FIBO_GRAPH"]='fibo:FIBO_Graph'
config["FIBO_GRAPH_VECTOR_STORE"]='fibo-glossary-3m-vec'
config["SIMILARITY_THRESHOLD"]=0.85


### Logging configuration

In [5]:
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])

2025-03-05 20:59:16 - INFO - Logging is set up with daily rotation.


## Checkpoints

Documents, annotated datasets, statistics, and metrics about the execution of the notebook are stored by the checkpoint module.

Checkpoints are stored in / retrieved from the directory set as `DEFAULT_CHECKPOINT_DIR` in the configuration file; the checkpoint file in use is kept in `DEFAULT_CHECKPOINT_FILE`.

During execution, the checkpoint is restored at the beginning of each section and saved at the end. We can run and restore the checkpoint several times. If a run fails, find the closest checkpoint and restore it.

Restore the checkpoint

In [6]:
# To run after transform
# Look up the last "documents" JSON file in the checkpoint directory
# (presumably the most recent one - confirm against configuration.get_last_filename).
last_checkpoint = configuration.get_last_filename(config["DEFAULT_CHECKPOINT_DIR"], "documents", "json")

logger.info(f"{last_checkpoint=}")

# Record the chosen file in the in-memory config so later cells use the same checkpoint.
config["DEFAULT_CHECKPOINT_FILE"] = last_checkpoint

# Rebuild the document manager from that checkpoint file.
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

2025-03-05 21:00:18 - INFO - last_checkpoint='../data/checkpoints/documents-2024-12-08-10.json'
2025-03-05 21:00:18 - INFO - DocumentManager restored from file: ../data/checkpoints/documents-2024-12-08-10.json
2025-03-05 21:00:18 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-12-08-10.json.


## General functions and data structures

In [9]:
def now_as_xsd_dateTime():
    """
    Return the current UTC time as an xsd:dateTime lexical string.

    The value has second precision and ends with the 'Z' UTC designator,
    e.g. '2025-03-05T20:59:16Z'.

    Returns:
        str: Current UTC timestamp formatted as 'YYYY-MM-DDTHH:MM:SSZ'.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; use an aware UTC value.
    current_time = datetime.now(timezone.utc)

    # A fixed format drops the microseconds and the numeric offset and
    # appends the UTC timezone indicator.
    return current_time.strftime("%Y-%m-%dT%H:%M:%SZ")

In [10]:
def remove_section_symbol(input_string: str) -> str:
    """
    Strip every '§' character from a string and trim the result.

    Args:
        input_string (str): Text that may contain the section symbol.

    Returns:
        str: The text with all '§' characters removed and without
            leading/trailing whitespace.

    Raises:
        TypeError: If 'input_string' is not a string.
    """
    if isinstance(input_string, str):
        without_symbol = input_string.replace("§", "")
        return without_symbol.strip()

    raise TypeError("input_string must be a string")

In [11]:
def signifier_sources(sources: list) -> list:
    """
    Build one reference string per designation source.

    Each source's "section" and "paragraph" values are converted to strings
    and concatenated (a missing key contributes the text "None").

    Args:
        sources (list): Mappings that expose "section" and "paragraph" via .get().

    Returns:
        list: One "<section><paragraph>" string per source, in input order.
    """
    return [
        "".join((str(entry.get("section")), str(entry.get("paragraph"))))
        for entry in sources
    ]

In [12]:
def normalize_ns_string(input_string: str) -> str:
    """
    Turn a section identifier into a string usable inside a namespace name.

    The '§' symbol and surrounding whitespace are removed first (through
    remove_section_symbol); then every space is dropped and every hyphen or
    period becomes an underscore.

    Args:
        input_string (str): The string to normalize.

    Returns:
        normalized_string (str): The normalized string.
    """
    cleaned = remove_section_symbol(input_string)

    # One pass over the text: drop spaces, map '-' and '.' to '_'.
    substitutions = str.maketrans({" ": None, "-": "_", ".": "_"})
    return cleaned.translate(substitutions)

In [13]:
def get_metadata_cfr2sbvr(element):
    """
    Collect the extraction, transformation and classification metadata of an element.

    Missing keys fall back to neutral defaults (0, 'missing', 'not available'
    or an empty list).

    Args:
        element: Mapping exposing .get(); holds the raw element attributes.

    Returns:
        dict: Metadata keyed by the names used when building KG triples.
    """
    read = element.get

    # Values with a two-level fallback: preferred key, legacy key, default.
    original_statement = read('definition', read('statement', 'missing'))
    subtype_confidence = read("subtype_confidence", read("confidence", 0))
    subtype_explanation = read("subtype_explanation", read("explanation", 'not available'))

    metadata = {}
    metadata["extract_original_statement"] = original_statement
    metadata["transformation_semscore"] = read("semscore", 0)
    metadata["transformation_similarity_score"] = read("similarity_score", 0)
    metadata["transformation_similarity_score_confidence"] = read("similarity_score_confidence", 0)
    metadata["transformation_accuracy"] = read("transformation_accuracy", 0)
    metadata["transformation_grammar_syntax_accuracy"] = read("grammar_syntax_accuracy", 0)
    metadata["transformation_findings"] = read("findings", [])
    # from classification
    metadata["classification_type"] = read("type", 'missing')
    metadata["classification_subtype"] = read("subtype", 'missing')
    metadata["classification_type_confidence"] = read("type_confidence", 0)
    metadata["classification_type_explanation"] = read("type_explanation", 'not available')
    metadata["classification_subtype_confidence"] = subtype_confidence
    metadata["classification_subtype_explanation"] = subtype_explanation
    metadata["classification_templates_ids"] = read("templates_ids", [])
    return metadata

KG functions

In [14]:
def upsert_section_to_kg(conn: Any,
                   section_chapter: str,
                   section_part: str,
                   section_title: str,
                   section_id: str,
                   section_content: str) -> bool:
    """
    Replace (or create) the Knowledge Graph node that holds one CFR section.

    Every triple currently attached to the section node is deleted and a new
    set (type, id, chapter, part, title and text) is inserted by a single
    SPARQL update on the cfr-sbvr:CFR_SBVR_Graph graph.

    Args:
        conn (RepositoryConnection): The connection to the Knowledge Graph.
        section_chapter (str): The chapter of the section.
        section_part (str): The part of the section.
        section_title (str): The title of the section.
        section_id (str): The ID of the section; its normalized form names the node.
        section_content (str): The content of the section.

    Returns:
        bool: True when the update ran without raising, False when it failed
            (the error is logged, not re-raised).
    """
    normalized_id = normalize_ns_string(section_id)
    section_node = f"cfr-sbvr:{normalized_id}"

    # NOTE(review): the argument values are interpolated into the query text
    # verbatim; a double quote or backslash in them changes the query. Confirm
    # whether callers escape the values before adding escaping here.
    query = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

WITH cfr-sbvr:CFR_SBVR_Graph
DELETE {{
    {section_node} ?p ?o .
}}
INSERT {{
    {section_node} a cfr-sbvr:CFRSession ;
        cfr-sbvr:cfrId "{section_id}" ;
        cfr-sbvr:cfrChapter "{section_chapter}" ;
        cfr-sbvr:cfrPart "{section_part}" ;
        cfr-sbvr:cfrTitle "{section_title}" ;
        cfr-sbvr:cfrText \"""{section_content}\""" 
.
}}
WHERE {{
    # Match all existing triples
    OPTIONAL {{ {section_node} ?p ?o . }}
}}
    """

    logger.debug(f"{query=}")

    try:
        update = conn.prepareUpdate(QueryLanguage.SPARQL, query)
        update.evaluate()
    except Exception as e:
        logger.error(f"Failed to upsert section {normalized_id}: {e}")
        return False
    else:
        logger.info(f"Section '{normalized_id}' upserted successfully.")
        return True


In [15]:
def get_section_from_kg(conn: Any, section_id: str) -> Any:
    """
    Read the stored attributes of one CFR section from the Knowledge Graph.

    The section node is addressed by the normalized form of `section_id`; all
    of its predicate/object pairs are fetched and the five known predicates
    are mapped to dictionary fields. Fields with no matching triple stay "".

    Args:
        conn (RepositoryConnection): The connection to the Knowledge Graph.
        section_id (str): The section number.

    Returns:
        dict: Keys "section_text", "section_part", "section_chapter",
            "section_title" and "section_id". Double quotes are removed from
            every value, and escaped "\\n" / "\\t" sequences in the text are
            turned into real newline / tab characters.
    """
    section_node = f"cfr-sbvr:{normalize_ns_string(section_id)}"

    query = f"""
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>

SELECT ?p ?o
WHERE {{
    {section_node} ?p ?o .
}}
    """
    result = conn.prepareTupleQuery(QueryLanguage.SPARQL, query).evaluate()

    logger.info(f"result.metadata: {result.metadata}")
    logger.info(f"result.variable_names: {result.variable_names}")

    # Predicates compared against the raw string form of ?p.
    exact_fields = {
        '<http://cfr2sbvr.com/cfr#cfrText>': "section_text",
        '<http://cfr2sbvr.com/cfr#cfrChapter>': "section_chapter",
        '<http://cfr2sbvr.com/cfr#cfrTitle>': "section_title",
    }
    # Predicates compared after double quotes are stripped from ?p.
    unquoted_fields = {
        '<http://cfr2sbvr.com/cfr#cfrPart>': "section_part",
        '<http://cfr2sbvr.com/cfr#cfrId>': "section_id",
    }

    res_dict = {
        "section_text": "",
        "section_part": "",
        "section_chapter": "",
        "section_title": "",
        "section_id": "",
    }

    with result:
        for row in result:
            predicate = str(row.getValue("p"))
            field = exact_fields.get(predicate)
            if field is None:
                field = unquoted_fields.get(predicate.replace('"', ''))
            if field is not None:
                # Objects come back quoted; keep only the bare text.
                res_dict[field] = str(row.getValue("o")).replace('"', '')

        unescaped_text = res_dict["section_text"].replace('\\n', '\n')
        res_dict["section_text"] = unescaped_text.replace('\\t', '\t')

        return res_dict

In [16]:
def transform_to_rdf_subject(input_string: str) -> str:
    """
    Derive an RDF subject local name from free text.

    The text is split on whitespace, each word is capitalized (first character
    upper-cased, the rest lower-cased), the words are joined without
    separators, and every character other than ASCII letters, digits and
    underscores is removed.

    Args:
        input_string (str): The string to transform.

    Returns:
        rdf_subject (str): The transformed RDF subject name.
    """
    capitalized_words = map(str.capitalize, input_string.split())
    joined_words = "".join(capitalized_words)

    # Keep only characters that are safe in the subject name.
    return re.sub(r'[^a-zA-Z0-9_]', '', joined_words)

In [17]:
class Term(BaseModel):
    """A term referenced by a rule or fact statement."""
    # Text of the term; it is converted into a cfr-sbvr resource name when the
    # owning statement is upserted.
    term: str
    # NOTE(review): under Pydantic v2 an Optional[...] field without a default
    # is still required - confirm against the installed version.
    classification: Optional[str]  # Allows additional information about the term

class RuleAndFact(BaseModel):
    """Input model for upserting one rule or fact statement into the knowledge graph."""
    # Used as the local name of the cfr-sbvr resource that is upserted.
    statement_id: str
    # Statement text stored as sbvr:statement.
    statement: str
    # "Fact" is stored as a DefinitionalRule; any other value as a BehavioralBusinessRule.
    concept_type: str  # Maps to "element_name" in the structure
    terms: Optional[List[Term]]  # Supports nested terms structure
    verb_symbols: Optional[List[str]]  # Supports verb symbols as a list of strings
    # Interpolated as-is into the query, so it must already be a prefixed name or IRI.
    vocabulary_namespace: str  # Maps to a constant or inferred namespace
    # Each source is appended to doc_id to form a reference string.
    sources: Optional[List[str]]
    doc_id: Optional[str]
    # Metadata dictionary; presumably produced by get_metadata_cfr2sbvr - verify against caller.
    metadata_cfr2sbvr: Optional[Dict[str, Any]]

def upsert_rule_and_fact_to_kg(conn: RepositoryConnection, rule_fact_model: RuleAndFact) -> bool:
    """
    Add a rule or fact statement to the knowledge graph. If it exists, replace it.

    All triples of the resource named after `statement_id` are deleted and a
    new set is inserted in the cfr-sbvr:CFR_SBVR graph: the SBVR types, links
    to terms and verb symbols, source references, the statement, its namespace,
    the cfr2sbvr metadata (when provided) and a creation timestamp.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        rule_fact_model (RuleAndFact): The rule or fact to store.

    Returns:
        bool: True if the update ran without raising, False otherwise (the
            error is logged, not re-raised).
    """
    statement = rule_fact_model.statement
    designation_class = rule_fact_model.statement_id
    concept_type = rule_fact_model.concept_type
    vocabulary_namespace = rule_fact_model.vocabulary_namespace
    doc_id = rule_fact_model.doc_id
    # The model declares the metadata as Optional; treat None as "no metadata"
    # instead of failing on `.get`.
    metadata_cfr2sbvr = rule_fact_model.metadata_cfr2sbvr or {}

    if concept_type == "Fact":
        designation_type = "DefinitionalRule"
    else:
        designation_type = "BehavioralBusinessRule"

    # Construct sources triples if sources are provided
    sources_triples = "".join(
        f'sbvr:referenceSupportsMeaning "{doc_id}{source}" ;\n'
        for source in (rule_fact_model.sources or [])
    )

    # Construct hasTerm triples if terms are provided
    terms_triples = "".join(
        f"cfr-sbvr:hasTerm cfr-sbvr:{transform_to_rdf_subject(term.term)} ;\n"
        for term in (rule_fact_model.terms or [])
    )

    # Construct hasVerbSymbol triples if verb symbols are provided
    verb_symbols_triples = "".join(
        f'cfr-sbvr:hasVerbSymbol cfr-sbvr:{transform_to_rdf_subject(verb_symbol)} ;\n'
        for verb_symbol in (rule_fact_model.verb_symbols or [])
    )

    logger.debug(f'{metadata_cfr2sbvr.get("classification_type_confidence")=}')

    # Construct metadata triples if metadata_cfr2sbvr is provided
    metadata = ""
    if metadata_cfr2sbvr:
        # Only rules go through two levels of classification; facts are
        # DefinitionalRules by definition.
        type_classification = ""
        if concept_type == "Rule":
            type_classification = f"""
            cfr-sbvr:classificationTypeConfidence {metadata_cfr2sbvr.get("classification_type_confidence")} ;
            cfr-sbvr:classificationTypeExplanation "{metadata_cfr2sbvr.get("classification_type_explanation")}" ;
        """

        # Always defined, so the metadata template below can be rendered even
        # when the statement is "missing".
        finding_triples = ""
        templates_triples = ""
        if statement != "missing":
            for find in metadata_cfr2sbvr.get("transformation_findings") or []:
                finding_triples += f'cfr-sbvr:transformationFinding "{find}" ;\n'

            for template in metadata_cfr2sbvr.get("classification_templates_ids") or []:
                templates_triples += f'cfr-sbvr:classificationTemplatesId "{template}" ;\n'

        metadata = f"""
        cfr-sbvr:extractOriginalStatement "{metadata_cfr2sbvr.get("extract_original_statement")}" ;
        cfr-sbvr:transformationSemscore {metadata_cfr2sbvr.get("transformation_semscore")} ;
        cfr-sbvr:transformationSimilarityScore {metadata_cfr2sbvr.get("transformation_similarity_score")} ;
        cfr-sbvr:transformationSimilarityScoreConfidence {metadata_cfr2sbvr.get("transformation_similarity_score_confidence")} ;
        cfr-sbvr:transformationAccuracy {metadata_cfr2sbvr.get("transformation_accuracy")} ;
        cfr-sbvr:transformationGrammarSyntaxAccuracy {metadata_cfr2sbvr.get("transformation_grammar_syntax_accuracy")} ;
        cfr-sbvr:classificationType "{metadata_cfr2sbvr.get("classification_type")}" ;
        cfr-sbvr:classificationSubtype "{metadata_cfr2sbvr.get("classification_subtype")}" ;
        {type_classification}
        {finding_triples}
        {templates_triples}
        cfr-sbvr:classificationSubtypeConfidence {metadata_cfr2sbvr.get("classification_subtype_confidence")} ;
        cfr-sbvr:classificationSubtypeExplanation "{metadata_cfr2sbvr.get("classification_subtype_explanation")}" ;
    """

    # NOTE(review): text values are interpolated verbatim; a double quote in a
    # statement or explanation makes the update fail (reported via the False
    # return). Confirm whether callers escape the values.
    upsert_query = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    cfr-sbvr:{designation_class} ?p ?o .
}}
INSERT {{
    cfr-sbvr:{designation_class} a sbvr:{concept_type},
            sbvr:{designation_type} ;
        {terms_triples}
        {verb_symbols_triples}
        {sources_triples}
        sbvr:isImplicitlyUnderstood "false"^^xsd:boolean ;
        sbvr:statement "{statement}" ;
        sbvr:designationIsInNamespace {vocabulary_namespace} ;
        {metadata} 
        cfr-sbvr:createDate "{now_as_xsd_dateTime()}"^^xsd:dateTime .
}}
WHERE {{
    # Match all existing triples related to {designation_class}
    OPTIONAL {{ cfr-sbvr:{designation_class} ?p ?o . }}
}}
    """

    logger.info(f"SPARQL Query: {upsert_query}")

    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, upsert_query).evaluate()
        logger.info(f"{concept_type} '{designation_class}' upserted successfully.")
        return True
    except Exception as e:
        logger.error(f"Failed to upsert {concept_type} {designation_class}: {e}")
        return False

Class to represent a verb symbol, terms, and names.

In [18]:
class Designation(BaseModel):
    """Input model for upserting a verb symbol, term or name into the knowledge graph."""
    # Text of the designation; combined with doc_id to name the stored resource.
    signifier: str
    # Statement (definition) text associated with the designation.
    statement: str
    # SBVR concept type; inserted as `sbvr:<concept_type>` in the update queries.
    concept_type: str
    # Resources interpolated as-is into closeMatch / exactMatch triples.
    closeMatch: Optional[List[str]]
    exactMatch: Optional[List[str]]
    # Interpolated as-is into the query, so it must already be a prefixed name or IRI.
    vocabulary_namespace: str
    # Each source is appended to doc_id to form a reference string.
    sources: Optional[List[str]]
    # NOTE(review): the upsert functions pass doc_id to remove_section_symbol,
    # which raises TypeError for None - confirm callers always set it.
    doc_id: Optional[str]
    # Metadata dictionary; presumably produced by get_metadata_cfr2sbvr - verify against caller.
    metadata_cfr2sbvr: Optional[Dict[str, Any]]

In [19]:
def upsert_verb_symbol_to_kg(
    conn: RepositoryConnection, designation: Designation
) -> bool:
    """
    Insert a verb symbol into the knowledge graph, replacing any existing one.

    The resource name is derived from the signifier and the document id. The
    designation's statement is stored as cfr-sbvr:transformedStatement, while
    sbvr:statement always receives the placeholder "missing".

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        designation (Designation): The verb symbol to add to the knowledge graph.

    Returns:
        True if the update ran without raising, False otherwise.
    """
    signifier = designation.signifier
    doc_id = designation.doc_id
    concept_type = designation.concept_type  # sbvr:VerbConcept
    vocabulary_namespace = designation.vocabulary_namespace
    metadata_cfr2sbvr = designation.metadata_cfr2sbvr

    transformed_statement = designation.statement
    statement = "missing"

    designation_class = transform_to_rdf_subject(
        f"{signifier}-{remove_section_symbol(doc_id)}"
    )
    subject = f"cfr-sbvr:{designation_class}"

    logger.info(f"Format {signifier} to {designation_class}.")

    # One reference triple per source, when sources are provided.
    sources_triples = "".join(
        f'sbvr:referenceSupportsMeaning "{doc_id}{source}" ;\n'
        for source in (designation.sources or [])
    )

    # Metadata triples: original statement, transformed statement, timestamp.
    original_statement = metadata_cfr2sbvr.get("extract_original_statement")
    metadata = f"""
        cfr-sbvr:extractOriginalStatement "{original_statement}" ;
        cfr-sbvr:transformedStatement "{transformed_statement}" ;
        cfr-sbvr:createDate "{now_as_xsd_dateTime()}"^^xsd:dateTime ;
    """

    query = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    {subject} ?p ?o .
}}
INSERT {{
    {subject} a sbvr:VerbSymbol,
            sbvr:{concept_type} ;
        sbvr:signifier "{signifier}" ;
        {sources_triples}
        sbvr:isImplicitlyUnderstood "false"^^xsd:boolean ;
        sbvr:statement "{statement}" ;
        sbvr:designationIsInNamespace {vocabulary_namespace} ;
        {metadata} .
}}
WHERE {{
    # Match all existing triples related to {designation_class}
    OPTIONAL {{ {subject} ?p ?o . }}
}}
    """

    logger.debug(f"SPARQL Query: {query}")

    try:
        update = conn.prepareUpdate(QueryLanguage.SPARQL, query)
        update.evaluate()
    except Exception as e:
        logger.error(f"Failed to upsert verb symbol {signifier}: {e}")
        return False
    else:
        logger.info(f"Designation '{signifier}' upserted successfully.")
        return True

In [20]:
def upsert_term_and_name_to_kg(conn: RepositoryConnection, designation: Designation) -> bool:
    """
    Add a term to the knowledge graph. If exists, replace it.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        term (Term): The term to add to the knowledge graph.

    Returns:
        True if the term was added successfully, False otherwise.
    """
    signifier = designation.signifier
    statement = designation.statement
    concept_type = designation.concept_type
    vocabulary_namespace = designation.vocabulary_namespace
    doc_id = designation.doc_id
    metadata_cfr2sbvr = designation.metadata_cfr2sbvr

    designation_class = transform_to_rdf_subject(f"{signifier}-{remove_section_symbol(doc_id)}")

    if concept_type == "Name":
        designation_type = "IndividualNounConcept"
    else:
        designation_type = "GeneralConcept"

    logger.info(f"Format {signifier} to {designation_class}.")

    # Rule type
    match metadata_cfr2sbvr.get("classification_subtype"):
        case "Formal intensional definitions":
            rule_type = "sbvr:IntensionalDefinition"
        case "Formal extensional definitions":
            rule_type = "sbvr:Extensionaldefinition"
        case "Categorization scheme enumerations":
            rule_type = "sbvr:Categorizationscheme"
        case _:
            rule_type = "sbvr:DefinitionalRule"

    # Constructing closeMatch triples
    close_matches_triples = ""
    if designation.closeMatch:
        for close_match in designation.closeMatch:
            close_matches_triples += f"sbvr:closeMatch {close_match} ;\n"

    # Construct exactMatch triple if exactMatch is provided
    exact_match_triples = ""
    if designation.exactMatch:
        for exact_match in designation.exactMatch:
            exact_match_triples += f"sbvr:exactMatch {exact_match} ;\n"

    # Construct surces triple if sources is provided
    sources_triples = ""
    if designation.sources:
        for source in designation.sources:
            sources_triples += f'sbvr:referenceSupportsMeaning "{doc_id}{source}" ;\n'

    # Construct metadata triples if metadata_cfr2sbvr is provided
    metadata = ""
    if statement != "missing":
        finding_triples = ""
        if metadata_cfr2sbvr.get("transformation_findings"):
            for find in metadata_cfr2sbvr.get("transformation_findings"):
                finding_triples += f'cfr-sbvr:transformationFinding "{find}" ;\n'

        templates_triples = ""
        if metadata_cfr2sbvr.get("classification_templates_ids"):
            for template in metadata_cfr2sbvr.get("classification_templates_ids"):
                templates_triples += f'cfr-sbvr:classificationTemplatesId "{template}" ;\n'

        metadata = f"""
            cfr-sbvr:extractOriginalStatement "{metadata_cfr2sbvr.get("extract_original_statement")}" ;
            cfr-sbvr:transformationSemscore {metadata_cfr2sbvr.get("transformation_semscore")} ;
            cfr-sbvr:transformationSimilarityScore {metadata_cfr2sbvr.get("transformation_similarity_score")} ;
            cfr-sbvr:transformationSimilarityScoreConfidence {metadata_cfr2sbvr.get("transformation_similarity_score_confidence")} ;
            cfr-sbvr:transformationAccuracy {metadata_cfr2sbvr.get("transformation_accuracy")} ;
            cfr-sbvr:transformationGrammarSyntaxAccuracy {metadata_cfr2sbvr.get("transformation_grammar_syntax_accuracy")} ;
            {finding_triples}
            {templates_triples}
            cfr-sbvr:classificationType "{metadata_cfr2sbvr.get("classification_type")}" ;
            cfr-sbvr:classificationSubtype "{metadata_cfr2sbvr.get("classification_subtype")}" ;
            cfr-sbvr:classificationSubtypeConfidence {metadata_cfr2sbvr.get("classification_subtype_confidence")} ;
            cfr-sbvr:classificationSubtypeExplanation "{metadata_cfr2sbvr.get("classification_subtype_explanation")}" ;
        """

    designation_upsert_query = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    cfr-sbvr:{designation_class} ?p ?o .
}}
INSERT {{
    cfr-sbvr:{designation_class} a sbvr:{designation_type},
            {rule_type},
            sbvr:{concept_type} ;
        sbvr:signifier "{signifier}" ;
        {exact_match_triples}
        {close_matches_triples}
        {sources_triples}
        sbvr:isImplicitlyUnderstood "false"^^xsd:boolean ;
        sbvr:statement "{statement}" ;
        sbvr:designationIsInNamespace {vocabulary_namespace} ;
        {metadata}
        cfr-sbvr:createDate "{now_as_xsd_dateTime()}"^^xsd:dateTime .
}}
WHERE {{
    # Match all existing triples related to {designation_class}
    OPTIONAL {{ cfr-sbvr:{designation_class} ?p ?o . }}
}}
    """

    logger.info(f"SPARQL Query: {designation_upsert_query}")

    try:
        #conn.prepareUpdate(QueryLanguage.SPARQL, designation_upsert_query).evaluate()
        logger.info(f"Designation '{signifier}' upserted successfully.")
        return True
    except Exception as e:
        logger.error(f"Failed to upsert designation {signifier}: {e}")
        return False

In [21]:
def create_vocabulary(conn: RepositoryConnection, vocabulary_name: str) -> bool:
    """
    Create a new vocabulary in the knowledge graph.

    Runs three SPARQL updates against the cfr-sbvr:CFR_SBVR graph:
    1. remove the "incorporates" association from the CFR Title 17 Part 275
       vocabulary to this vocabulary;
    2. replace all triples of cfr-sbvr:CFR_SBVR_<vocabulary_name>_VOC with a
       fresh definition;
    3. add the association again.
    Every step is attempted even if an earlier one failed.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        vocabulary_name (str): The name of the vocabulary to create.

    Returns:
        True if the vocabulary was created and associated successfully, False otherwise.
    """

    query_remove_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

DELETE DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    fro-cfr:CFR_Title_17_Part_275_VOC sbvr:vocabulary1IncorporatesVocabulary2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC .
}}
}}
    """

    # The owl: prefix is declared explicitly because the INSERT uses owl:Class.
    query_add_triples = f"""
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC ?p ?o .
}}

INSERT {{
    cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC
        a owl:Class, sbvr:Vocabulary ;
        cfr-sbvr:createDate "{now_as_xsd_dateTime()}"^^xsd:dateTime .
}}
WHERE {{
    # Match all existing triples related to cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC
    OPTIONAL {{ cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC ?p ?o . }}
}}
    """

    query_add_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

INSERT DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    fro-cfr:CFR_Title_17_Part_275_VOC sbvr:vocabulary1IncorporatesVocabulary2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC .
}}
}}
    """

    logger.debug(f"SPARQL Query: {query_remove_association}")
    logger.debug(f"SPARQL Query: {query_add_triples}")
    logger.debug(f"SPARQL Query: {query_add_association}")
    logger.debug(f"Vocabulary name: cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC")

    # Remove associated vocabulary (best effort: a failure here is only logged)
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_remove_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} delete associated successfully.")
    except Exception as e:
        logger.error(f"Failed to delete associated vocabulary {vocabulary_name}: {e}")

    # create new vocabulary; remember the outcome so a failure is reported to the caller
    created = False
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_triples).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} created successfully.")
        created = True
    except Exception as e:
        logger.error(f"Failed to create vocabulary {vocabulary_name}: {e}")

    # Add association with new vocabulary
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_association).evaluate()
        logger.info(f"Vocabulary {vocabulary_name} associated successfully.")
    except Exception as e:
        logger.error(f"Failed to associate vocabulary {vocabulary_name}: {e}")
        return False

    return created

In [22]:
def create_vocabulary_namespace(conn: RepositoryConnection, vocabulary_name: str) -> bool:
    """
    Create a new vocabulary namespace in the knowledge graph.

    Runs three SPARQL updates against the cfr-sbvr:CFR_SBVR graph:
    1. remove the "incorporates" association from the CFR Title 17 Part 275
       namespace to this namespace;
    2. replace all triples of cfr-sbvr:CFR_SBVR_<vocabulary_name>_NS with a
       fresh definition (URI, language, source vocabulary, title, definition,
       source and creation date);
    3. add the association again.
    Every step is attempted even if an earlier one failed.

    Args:
        conn (RepositoryConnection): The connection to the knowledge graph database.
        vocabulary_name (str): The name of the vocabulary whose namespace is created.

    Returns:
        True if the vocabulary namespace was created and associated successfully,
        False otherwise.
    """

    query_remove_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

DELETE DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    fro-cfr:CFR_Title_17_Part_275_NS sbvr:namespace1IncorporatesNamespace2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS .
}}
}}
    """

    # The owl: prefix is declared explicitly because the INSERT uses owl:Class.
    query_add_triples = f"""
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

WITH cfr-sbvr:CFR_SBVR
DELETE {{
    cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS ?p ?o .
}}

INSERT {{
cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS
        a owl:Class, sbvr:VocabularyNamespace;
    sbvr:namespaceHasURI <http://cfr2sbvr.com/cfr/CFR_SBVR_{vocabulary_name}_NS#> ;
    sbvr:vocabularyIsExpressedInLanguage cfr-sbvr:EnglishLanguage ;
    sbvr:vocabularyNamespaceIsDerivedFromVocabulary cfr-sbvr:CFR_SBVR_{vocabulary_name}_VOC ;
    dct:title "Semantics of Business Vocabulary and Business Rules (SBVR) for Code of Federal Regulations (CFR)" ;
    skos:definition "SBVR-CFR is an adopted standard of the Object Management Group (OMG) intended to be the basis for formal and detailed natural language declarative description of CFR regulations" ;
    dct:source <https://github.com/asantos2000/dissertacao-santos-anderson-2024> ;
    cfr-sbvr:createDate "{now_as_xsd_dateTime()}"^^xsd:dateTime .
}}
WHERE {{
    # Match all existing triples related to cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS
    OPTIONAL {{ cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS ?p ?o . }}
}}
    """
    query_add_association = f"""
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

INSERT DATA {{
GRAPH cfr-sbvr:CFR_SBVR {{
    fro-cfr:CFR_Title_17_Part_275_NS sbvr:namespace1IncorporatesNamespace2 cfr-sbvr:CFR_SBVR_{vocabulary_name}_NS .
}}
}}
    """

    logger.debug(f"SPARQL Query: {query_remove_association}")
    logger.debug(f"SPARQL Query: {query_add_triples}")
    logger.debug(f"SPARQL Query: {query_add_association}")

    # Remove associated namespace (best effort: a failure here is only logged)
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_remove_association).evaluate()
        logger.info(f"Vocabulary NS {vocabulary_name} delete associated successfully.")
    except Exception as e:
        logger.error(f"Failed to delete associated vocabulary NS {vocabulary_name}: {e}")

    # create new namespace; remember the outcome so a failure is reported to the caller
    created = False
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_triples).evaluate()
        logger.info(f"Vocabulary NS {vocabulary_name} created successfully.")
        created = True
    except Exception as e:
        logger.error(f"Failed to create vocabulary NS {vocabulary_name}: {e}")

    # Add association with new namespace
    try:
        conn.prepareUpdate(QueryLanguage.SPARQL, query_add_association).evaluate()
        logger.info(f"Vocabulary NS {vocabulary_name} associated successfully.")
    except Exception as e:
        logger.error(f"Failed to associate vocabulary NS {vocabulary_name}: {e}")
        return False

    return created

In [23]:
def define_vocabulary_ns(conn: RepositoryConnection, doc_id: str, is_local_scope: bool) -> str:
    """
    Resolve the vocabulary namespace for a document, creating it when missing.

    The document id is normalized and a namespace is chosen: a
    document-specific one for local scope, the shared Title 17 Part 275
    namespace otherwise. The CFR_SBVR graph is then queried for triples whose
    subject is that namespace; when none exist, the vocabulary and its
    namespace are created for the document.

    Args:
        conn (RepositoryConnection): The AllegroGraph repository connection.
        doc_id (str): The document (section) id, e.g. "§ 275.0-2".
        is_local_scope (bool): True to use the document-specific namespace,
            False to use the shared part-level namespace.

    Returns:
        str: The prefixed name of the vocabulary namespace.

    Raises:
        Exception: If the vocabulary or its namespace could not be created.
    """

    doc_id = remove_section_symbol(normalize_ns_string(doc_id))

    if is_local_scope:
        ns = f"cfr-sbvr:CFR_SBVR_{doc_id}_NS"
    else:
        ns = "fro-cfr:CFR_Title_17_Part_275_NS"

    query = f"""
        PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
        PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>

        select ?p ?o {{
            graph cfr-sbvr:CFR_SBVR {{
                {ns} ?p ?o 
            }}
        }}
    """

    tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query)

    # Count rows while iterating instead of relying on the truthiness of an
    # already-consumed result object; the context manager closes the result.
    row_count = 0
    with tuple_query.evaluate() as result:
        logger.debug(f"result.metadata: {result.metadata}")
        logger.debug(f"result.variable_names: {result.variable_names}")

        for binding in result:
            row_count += 1
            logger.debug(f"binding: {binding}")

    if row_count == 0:
        # NOTE(review): creation always targets the document vocabulary, even
        # when the shared (non-local) namespace was the one looked up - confirm
        # this is intended.
        # Vocabulary
        logger.info("Vocabulary not found. Creating vocabulary and namespace...")
        if create_vocabulary(conn, doc_id):
            logger.info("Vocabulary created")
        else:
            raise Exception("Failed to create vocabulary")

        # Vocabulary namespace
        if create_vocabulary_namespace(conn, doc_id):
            logger.info("Vocabulary namespace created")
        else:
            raise Exception("Failed to create vocabulary namespace")
    else:
        logger.info("Vocabulary already exists")

    logger.info(f"Vocabulary namespace: {ns}")

    return ns

**similarity search (P5)** (Move to LAB 5)

Try a similarity search to find the entity in the graph. If it is not found, create a new entity and its corresponding embedding. If it exists, create a link between the two.

In [24]:
def get_from_kg(conn: RepositoryConnection, signifier: str, kg: str, vector_db: str, exact: bool = False) -> List[Dict[str, Any]]:
    """
    Queries the knowledge graph for entries matching the given signifier.

    Two query modes are available: a nearest-neighbor (vector) search against
    `vector_db`, and an exact, case-insensitive label match.

    Args:
        conn (RepositoryConnection): The AllegroGraph repository connection.
        signifier (str): The signifier to search for.
        kg (str): The prefixed name of the knowledge graph to query; must be
            one of config["FIBO_GRAPH"] or config["CFR_SBVR_GRAPH"].
        vector_db (str): The vector store used by the nearest-neighbor search.
        exact (bool): When True, run the exact label match instead of the
            nearest-neighbor search. Defaults to False.

    Returns:
        List[Dict[str, Any]]: One dictionary per result row with the keys
        "uri", "score_percent", "located_signifier_uri",
        "located_signifier_uri_local_name", "located_signifier_predicate"
        and "definition".

    Raises:
        ValueError: If `kg` is not one of the supported graphs.
        Exception: Any error raised while evaluating the query is logged and
            re-raised.
    """

    if kg not in {config["FIBO_GRAPH"], config["CFR_SBVR_GRAPH"]}:
        raise ValueError(f"Unsupported knowledge graph: {kg}")

    def _sparql_escape(text: str) -> str:
        """Escape text so it is safe inside a double-quoted SPARQL literal."""
        return (
            text.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    # The signifier comes from extracted text, so quotes or backslashes must
    # not be able to terminate the literal or alter the query.
    close_signifier = _sparql_escape(signifier)
    # The exact query compares against LCASE(...), so the input is lower-cased too.
    exact_signifier = _sparql_escape(signifier.lower())

    query_string_close = f"""
PREFIX llm: <http://franz.com/ns/allegrograph/8.0.0/llm/>
PREFIX fibo: <https://spec.edmcouncil.org/fibo/ontology/master/2024Q2/QuickFIBOProd#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>

SELECT ?uri (xsd:decimal(?score) as ?score_percent) ?s ?p ?definition
FROM {kg}
WHERE {{
    (?uri ?score ?originalText ?p) llm:nearestNeighbor ("{close_signifier}" "{vector_db}" 5 {config["SIMILARITY_THRESHOLD"]}) .
    ?s ?p ?originalText .

    OPTIONAL {{ ?s skos:definition ?definition . }}
    OPTIONAL {{ ?s sbvr:Statement ?definition . }}
}}
ORDER BY DESC(?score)
    """

    # NOTE(review): this query always reads fibo:FIBO_Graph regardless of `kg`,
    # and the fro-cfr prefix declared here is not used - confirm both are intended.
    query_string_exact = f"""
PREFIX fibo: <https://spec.edmcouncil.org/fibo/ontology/master/2024Q2/QuickFIBOProd#>
PREFIX fro-cfr: <http://cfr2sbvr.com/fro/cfr/Code_Federal_Regulations.ttl#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ("https://spec.edmcouncil.org/fibo"^^xsd:anyUri as ?uri) (xsd:decimal(100) as ?score_percent) ?s (rdfs:label as ?p) ?definition ?originalText
FROM NAMED fibo:FIBO_Graph
WHERE {{
  GRAPH ?g {{
    ?s a ?type ;
    skos:definition ?definition ;
    (rdfs:label | skos:prefLabel) ?originalText .
    
    FILTER(?type IN (owl:Class, owl:NamedIndividual))
    FILTER(LCASE(STR(?originalText)) = "{exact_signifier}")
  }}
}}
    """

    if exact:
        query_string = query_string_exact
    else:
        query_string = query_string_close

    logger.debug(f"SPARQL Query: {query_string}")

    tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query_string)

    try:
        result = tuple_query.evaluate()
        logger.debug(f"Result metadata: {result.metadata}")

        with result:
            # NOTE(review): "definition" is OPTIONAL in the nearest-neighbor
            # query; an unbound value presumably ends up as the string "None"
            # because of str() - confirm against the client behavior.
            similar_signifiers = [
                {
                    "uri": str(binding.getValue("uri")),
                    "score_percent": Decimal(binding.getValue("score_percent").getLabel()),
                    "located_signifier_uri": str(binding.getValue("s")),
                    "located_signifier_uri_local_name": binding.getValue("s").getLocalName(),
                    "located_signifier_predicate": str(binding.getValue("p")),
                    "definition": str(binding.getValue("definition"))
                }
                for binding in result
            ]
    except Exception as e:
        logger.error(f"Error evaluating SPARQL query: {e}")
        raise

    logger.info(f"Found {len(similar_signifiers)} similar signifier(s) for '{signifier}' on {kg}.")

    return similar_signifiers

In [25]:
def get_similar_signifiers(conn: RepositoryConnection, signifier: str) -> Tuple[List[str], List[str]]:
    """
    Collect exact and close matches for a signifier from the knowledge graphs.

    Three lookups are made through `get_from_kg`: an exact match on the FIBO
    graph, a similarity search on the FIBO graph, and a similarity search on
    the CFR-SBVR graph.

    Args:
        conn (RepositoryConnection): The AllegroGraph repository connection.
        signifier (str): The signifier to search for.

    Returns:
        Tuple[List[str], List[str]]: `(exact_match, close_match)`, each a list
        of located signifier URIs. Close matches list the FIBO similarity
        results first, followed by the CFR-SBVR similarity results.
    """
    fibo_exact = get_from_kg(conn, signifier, config["FIBO_GRAPH"], config["FIBO_GRAPH_VECTOR_STORE"], True)
    fibo_similarity = get_from_kg(conn, signifier, config["FIBO_GRAPH"], config["FIBO_GRAPH_VECTOR_STORE"], False)
    cfr_sbvr_similarity = get_from_kg(conn, signifier, config["CFR_SBVR_GRAPH"], config["CFR_SBVR_GRAPH_VECTOR_STORE"], False)

    # Only the FIBO hits are logged item by item, similarity results first.
    for item in fibo_similarity:
        logger.info(f"{item=}")
    for item in fibo_exact:
        logger.info(f"{item=}")

    exact_match = [item.get("located_signifier_uri") for item in fibo_exact]
    close_match = [
        item.get("located_signifier_uri")
        for item in (*fibo_similarity, *cfr_sbvr_similarity)
    ]

    logger.info(f"Found {len(exact_match)} exact matche(s) and {len(close_match)} close matche(s) for '{signifier}'.")

    return exact_match, close_match

Generate an RDF subject from a statement.

In [26]:
def to_camel_case(snake_str):
    """
    Turn an underscore-separated string into CamelCase.

    Each underscore-delimited piece is title-cased and the pieces are
    concatenated without a separator.
    """
    return "".join(map(str.title, snake_str.split("_")))


# spaCy pipelines already loaded in this kernel, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Load a spaCy pipeline on first use and reuse it on later calls."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def generate_meaningful_rdf_subject(statement, domain_context=""):
    """
    Generate a meaningful and descriptive RDF subject from a statement in CamelCase.

    The statement is parsed with spaCy; the first subject, the first verb
    (lemmatized) and the first object are combined with selected named
    entities, all adjectives and the optional domain context.

    Args:
        statement (str): The input sentence or statement.
        domain_context (str): Additional domain context for enrichment.

    Returns:
        str: A rich, meaningful RDF subject in CamelCase.
    """
    # Loading the model is expensive, so it is done once and cached.
    nlp = _get_spacy_pipeline("en_core_web_sm")
    doc = nlp(statement)

    # Initialize components
    main_subject = None
    predicate = None
    obj = None
    attributes = []

    # Extract subject, verb (predicate), and object; only the first of each is kept
    for token in doc:
        if token.dep_ in ("nsubj", "nsubjpass") and not main_subject:
            main_subject = token.text
        elif token.pos_ == "VERB" and not predicate:
            predicate = token.lemma_  # Base form of the verb
        elif token.dep_ in ("dobj", "pobj") and not obj:
            obj = token.text

    # Extract additional attributes (e.g., named entities or adjectives)
    attributes.extend(
        [ent.text for ent in doc.ents if ent.label_ in {"ORG", "GPE", "LAW", "EVENT"}]
    )
    attributes.extend([token.text for token in doc if token.pos_ == "ADJ"])

    # Combine extracted components
    components = [main_subject, predicate, obj] + attributes
    if domain_context:
        components.append(domain_context)

    logger.debug(f"{components=}")

    # Filter out None and empty values, and drop a component that repeats the
    # one immediately before it (case-insensitive). The comparison is made
    # against the unfiltered list.
    components = [
        comp for i, comp in enumerate(components)
        if comp and isinstance(comp, str) and (
            i == 0
            or not components[i-1]
            or isinstance(components[i-1], str) and comp.lower() != components[i-1].lower()
        )
    ]

    # Normalize to snake_case, then convert to CamelCase
    snake_case_subject = slugify("_".join(components), separator="_")
    camel_case_subject = to_camel_case(snake_case_subject)

    return camel_case_subject

## Datasets

Datasets used in the notebook.

### Elements to save to KG

Get expressions to save

In [27]:
# With merge=True, the same element found in several paragraphs becomes a
# single element carrying a list of paragraphs.
processor = DocumentProcessor(manager, merge=True)

pred_operative_rules = processor.get_rules()
pred_facts = processor.get_facts()
pred_terms = processor.get_terms()
pred_names = processor.get_names()

# Label -> collection, in the order the summaries are logged.
_pred_by_label = (
    ("Rules", pred_operative_rules),
    ("Facts", pred_facts),
    ("Terms", pred_terms),
    ("Names", pred_names),
)

for _label, _items in _pred_by_label:
    logger.debug(f"{_label}: {_items}")

for _label, _items in _pred_by_label:
    logger.info(f"{_label} to evaluate: {len(_items)}")

2025-03-05 21:01:15 - INFO - Not filtering terms based on definition presence.
2025-03-05 21:01:15 - INFO - Not filtering names based on definition presence.
2025-03-05 21:01:15 - INFO - Rules to evaluate: 6
2025-03-05 21:01:15 - INFO - Facts to evaluate: 16
2025-03-05 21:01:15 - INFO - Terms to evaluate: 77
2025-03-05 21:01:15 - INFO - Names to evaluate: 8


Search for a specific element by statement_id

In [28]:
# (position, element) pairs of every predicted term whose statement_id is "Trustee"
[
    (position, term)
    for position, term in enumerate(pred_terms)
    if term['statement_id'] == "Trustee"
]

[(17,
  {'doc_id': '§ 275.0-2',
   'statement_id': 'Trustee',
   'definition': None,
   'isLocalScope': False,
   'sources': ['(b)(1)'],
   'element_name': 'Term'}),
 (73,
  {'doc_id': '§ 275.0-7',
   'statement_id': 'Trustee',
   'definition': None,
   'isLocalScope': False,
   'sources': ['(b)(1)(iv)'],
   'element_name': 'Term'})]

## Execution

### elements association and creation

Orchestrates the process of saving metadata as triples in the KGs.

Processing terms, names, fact types, operative rules, vocabularies, and vocabulary namespaces

#### Connect to KG

In [29]:
# The hosting name selects which block of connection settings to read from config.
hosting = config["ALLEGROGRAPH_HOSTING"]
_ag_settings = config[hosting]

conn = ag_connect(
    repo=_ag_settings["REPO"],
    catalog=_ag_settings["CATALOG"],
    host=_ag_settings["HOST"],
    port=_ag_settings["PORT"],
    user=_ag_settings["USER"],
    password=_ag_settings["PASSWORD"],
)

logger.info(f"Connected to AllegroGraph: {hosting}")

2025-03-05 21:01:41 - INFO - Connected to AllegroGraph: ALLEGROGRAPH_LOCAL


#### Sections

In [103]:
# CFR coordinates shared by every section upserted below
cfr_title = "17"
cfr_chapter = "II"
cfr_part = "275"

true_table_manager = restore_checkpoint(
    filename=f'{config["DEFAULT_DATA_DIR"]}/documents_true_table.json'
)

docs = true_table_manager.list_document_ids(doc_type="section")
logger.info(f"Documents to evaluate: {len(docs)}")

# Store each section document, with its content, in the KG
for doc_id in docs:
    logger.info(f"Processing document {doc_id}...")
    document = true_table_manager.retrieve_document(doc_id=doc_id, doc_type="section")

    upsert_section_to_kg(
        conn,
        section_chapter=cfr_chapter,
        section_part=cfr_part,
        section_title=cfr_title,
        section_id=doc_id,
        section_content=document.content,
    )

2024-12-26 23:23:28 - INFO - DocumentManager restored from file: ../data/documents_true_table.json
2024-12-26 23:23:28 - INFO - Checkpoint restored from ../data/documents_true_table.json.
2024-12-26 23:23:28 - INFO - Documents to evaluate: 3
2024-12-26 23:23:28 - INFO - Processing document § 275.0-2...
2024-12-26 23:23:28 - INFO - Section '275_0_2' upserted successfully.
2024-12-26 23:23:28 - INFO - Processing document § 275.0-5...
2024-12-26 23:23:28 - INFO - Section '275_0_5' upserted successfully.
2024-12-26 23:23:28 - INFO - Processing document § 275.0-7...
2024-12-26 23:23:28 - INFO - Section '275_0_7' upserted successfully.


Retrieve a section for testing purposes

In [104]:
# Read one section back from the KG as a quick check of what is stored there.
section = get_section_from_kg(conn=conn, section_id="§ 275.0-2")

# Show the value stored under the "section_text" key of the returned object.
print(section["section_text"])

2024-12-26 23:23:31 - INFO - result.metadata: {'time': {'output': 0.001048, 'parse': 0.0, 'plan': 0.000182, 'query': 0.000114, 'total': 0.001344}, 'memory': {'maximumChunk': 2800000, 'maximumMap': 5200000}, 'other': {'generation': 20510000, 'info': 'bindings-set', 'rowCount': 6, 'verb': 'select'}}
2024-12-26 23:23:31 - INFO - result.variable_names: ['p', 'o']


§ 275.0-2 General procedures for serving non-residents.
(a) General procedures for serving process, pleadings, or other papers on non-resident investment advisers, general partners and managing agents.  Under Forms ADV and ADV-NR [17 CFR 279.1 and 279.4], a person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents:
  (1) A person may serve a non-resident investment adviser, non-resident general partner, or non-resident managing agent by furnishing the Commission with one copy of the process, pleadings, or papers, for each named party, and one additional copy for the Commission's records.
  (2) If process, pleadings, or other papers are served on the Commission as described in this section, the Secretary of the Commission (Secretary) will promptly forward a copy to each named party by registered or certified mail at t

#### Terms

In [30]:
# Save every predicted term: resolve its vocabulary namespace, look up
# similar signifiers, and upsert the resulting designation.
for index, element in enumerate(pred_terms):

    logger.info(f"{index=} {element=}")

    # Values coming from the extraction step
    doc_id = element.get('doc_id')  # section
    signifier = element.get('statement_id')
    statement = element.get('transformed') or "missing"  # None/empty becomes "missing"
    concept_type = element.get("element_name").replace(" ", "")
    sources = element.get("sources")  # paragraphs
    is_local_scope = element.get("isLocalScope")

    # Values coming from the transformation step
    metadata_cfr2sbvr = get_metadata_cfr2sbvr(element)

    logger.info(f"Processing '{index}: {signifier}' ...")

    for _name, _value in (
        ("doc_id", doc_id),
        ("signifier", signifier),
        ("statement", statement),
        ("concept_type", concept_type),
        ("is_local_scope", is_local_scope),
        ("metadata_cfr2sbvr", metadata_cfr2sbvr),
    ):
        logger.debug(f"{_name}={_value!r}")

    # Make sure the vocabulary and its namespace exist before saving
    logger.info("Creating vocabulary and namespace if not exists...")
    logger.debug(f"{doc_id=} - {is_local_scope=}")
    vocabulary = define_vocabulary_ns(conn, doc_id, is_local_scope)
    logger.info(f"{vocabulary=}")

    # Similarity search uses the lower-cased signifier
    exact_match, close_match = get_similar_signifiers(conn, signifier.lower())

    designation = Designation(
        signifier=signifier,
        statement=statement,
        concept_type=concept_type,
        closeMatch=close_match,
        exactMatch=exact_match,
        vocabulary_namespace=vocabulary,
        sources=sources,
        doc_id=doc_id,
        metadata_cfr2sbvr=metadata_cfr2sbvr,
    )
    logger.debug(f"{designation=}")

    upsert_term_and_name_to_kg(conn, designation)

    logger.info(f"Signifier '{signifier}' done.")

2025-03-05 21:03:26 - INFO - index=0 element={'doc_id': '§ 275.0-2', 'statement_id': 'Person', 'definition': None, 'isLocalScope': False, 'sources': ['(a)', '(a)(1)', '(b)(1)'], 'element_name': 'Term'}
2025-03-05 21:03:26 - INFO - Processing '0: Person' ...
2025-03-05 21:03:26 - INFO - Creating vocabulary and namespace if not exists...
2025-03-05 21:03:26 - INFO - Vocabulary already exists
2025-03-05 21:03:26 - INFO - Vocabulary namespace: fro-cfr:CFR_Title_17_Part_275_NS
2025-03-05 21:03:26 - INFO - vocabulary='fro-cfr:CFR_Title_17_Part_275_NS'
2025-03-05 21:03:27 - INFO - Found 1 similar signifier(s) for 'person' on fibo:FIBO_Graph.
2025-03-05 21:03:29 - INFO - Found 4 similar signifier(s) for 'person' on fibo:FIBO_Graph.
2025-03-05 21:03:30 - INFO - Found 0 similar signifier(s) for 'person' on cfr-sbvr:CFR_SBVR.
2025-03-05 21:03:30 - INFO - item={'uri': '<http://franz.com/vdb/id/1409>', 'score_percent': Decimal('0.8905864953994751'), 'located_signifier_uri': '<https://spec.edmcounci

#### Names

In [None]:
# Number of predicted names that the next cell iterates over.
len(pred_names)

In [31]:
# Save every predicted name: resolve its vocabulary namespace, look up
# similar signifiers, and upsert the resulting designation.
for index, element in enumerate(pred_names):

    logger.info(f"{index=} {element=}")

    # Values coming from the extraction step
    doc_id = element.get('doc_id')  # section
    signifier = element.get('statement_id')
    statement = element.get('transformed') or "missing"  # None/empty becomes "missing"
    concept_type = element.get("element_name").replace(" ", "")
    sources = element.get("sources")  # paragraph
    is_local_scope = element.get("isLocalScope")

    # Values coming from the transformation step
    metadata_cfr2sbvr = get_metadata_cfr2sbvr(element)

    logger.info(f"Processing '{index}: {signifier}' ...")

    for _name, _value in (
        ("doc_id", doc_id),
        ("signifier", signifier),
        ("statement", statement),
        ("concept_type", concept_type),
        ("is_local_scope", is_local_scope),
        ("metadata_cfr2sbvr", metadata_cfr2sbvr),
    ):
        logger.debug(f"{_name}={_value!r}")

    # Make sure the vocabulary and its namespace exist before saving
    logger.info("Creating vocabulary and namespace if not exists...")
    logger.debug(f"{doc_id=} - {is_local_scope=}")
    vocabulary = define_vocabulary_ns(conn, doc_id, is_local_scope)
    logger.info(f"{vocabulary=}")

    # Similarity search uses the lower-cased signifier
    exact_match, close_match = get_similar_signifiers(conn, signifier.lower())

    designation = Designation(
        signifier=signifier,
        statement=statement,
        concept_type=concept_type,
        closeMatch=close_match,
        exactMatch=exact_match,
        vocabulary_namespace=vocabulary,
        sources=sources,
        doc_id=doc_id,
        metadata_cfr2sbvr=metadata_cfr2sbvr,
    )
    logger.debug(f"{designation=}")

    upsert_term_and_name_to_kg(conn, designation)

    logger.info(f"Signifier '{signifier}' done.")

2025-03-05 21:06:35 - INFO - index=0 element={'doc_id': '§ 275.0-2', 'statement_id': 'Commission', 'definition': None, 'isLocalScope': False, 'sources': ['(a)(3)', '(a)(1)', '(a)(2)'], 'element_name': 'Name'}
2025-03-05 21:06:35 - INFO - Processing '0: Commission' ...
2025-03-05 21:06:35 - INFO - Creating vocabulary and namespace if not exists...
2025-03-05 21:06:36 - INFO - Vocabulary already exists
2025-03-05 21:06:36 - INFO - Vocabulary namespace: fro-cfr:CFR_Title_17_Part_275_NS
2025-03-05 21:06:36 - INFO - vocabulary='fro-cfr:CFR_Title_17_Part_275_NS'
2025-03-05 21:06:37 - INFO - Found 0 similar signifier(s) for 'commission' on fibo:FIBO_Graph.
2025-03-05 21:06:39 - INFO - Found 5 similar signifier(s) for 'commission' on fibo:FIBO_Graph.
2025-03-05 21:06:39 - INFO - Found 0 similar signifier(s) for 'commission' on cfr-sbvr:CFR_SBVR.
2025-03-05 21:06:39 - INFO - item={'uri': '<http://franz.com/vdb/id/1875>', 'score_percent': Decimal('0.8834182024002075'), 'located_signifier_uri': '

#### Fact types

In [None]:
# Save every predicted fact type in the document-specific vocabulary namespace.
for index, element in enumerate(pred_facts):

    logger.info(f"{index=} {element=}")

    # from extraction
    doc_id = element.get('doc_id') # section

    statement_id = element.get('statement_id')

    statement = element.get('transformed')
    statement = statement if statement else "missing" # Change None to "missing"

    # SBVR ontology just have Fact
    concept_type = "Fact"

    # The RDF subject is derived from the statement title
    logger.debug(f"{index}. {element.get('statement')}")
    statement_subject = transform_to_rdf_subject(element.get('statement_title'))
    logger.debug(f"{index}. {statement_subject}")

    # Recorded outputs show elements carrying either a "sources" list or a
    # single "source" paragraph; prefer the list and fall back to wrapping
    # the single value so the model never receives [None] when a list exists.
    source = element.get("source") # paragraph
    sources = element.get("sources") or [source]

    terms = element.get("terms")

    verb_symbols = element.get("verb_symbols")

    # from transformation
    metadata_cfr2sbvr = get_metadata_cfr2sbvr(element)

    logger.info(f"Processing '{index}: {statement_subject}' ...")

    logger.debug(f"{doc_id=}")
    logger.debug(f"{statement_id=}")
    logger.debug(f"{statement=}")
    logger.debug(f"{concept_type=}")
    logger.debug(f"{metadata_cfr2sbvr=}")

    # create vocabulary and namespace if not exists (facts always use local scope)
    logger.info("Creating vocabulary and namespace if not exists...")

    vocabulary = define_vocabulary_ns(conn, doc_id, True)
    logger.info(f"{vocabulary=}")

    # create Fact model
    rule_fact_model = RuleAndFact(
        statement_id=statement_subject,
        statement=statement,
        concept_type=concept_type,
        terms=terms,
        verb_symbols=verb_symbols,
        vocabulary_namespace=vocabulary,
        sources=sources,
        doc_id=doc_id,
        metadata_cfr2sbvr=metadata_cfr2sbvr
    )

    logger.debug(f"{rule_fact_model=}")

    # upsert
    upsert_rule_and_fact_to_kg(conn, rule_fact_model)

    logger.info(f"Fact '{statement_subject}' done.")

2024-11-24 01:54:28 - INFO - index=0 element={'doc_id': '§ 275.0-2', 'statement_id': 4, 'statement': 'Managing agent means any person, including a trustee, who directs or manages, or who participates in directing or managing, the affairs of any unincorporated organization or association other than a partnership.', 'source': '(b)(1)', 'terms': [{'term': 'Managing agent', 'classification': 'Common Noun', 'confidence': 0.9, 'reason': 'The term is defined within the document.', 'extracted_confidence': 0.9, 'extracted_reason': 'The term is explicitly defined.'}, {'term': 'Person', 'classification': 'Common Noun', 'confidence': 0.9, 'reason': 'The term is a general reference to an individual or entity.', 'extracted_confidence': 0.9, 'extracted_reason': 'The term is part of the definition.'}, {'term': 'Trustee', 'classification': 'Common Noun', 'confidence': 0.9, 'reason': 'The term refers to a specific role within an organization.', 'extracted_confidence': 0.9, 'extracted_reason': 'The term 

#### Rules

In [55]:
# Save every predicted operative rule in the document-specific vocabulary namespace.
for index, element in enumerate(pred_operative_rules):

    logger.info(f"{index=} {element=}")

    # from extraction
    doc_id = element.get('doc_id') # section

    statement_id = element.get('statement_id')

    statement = element.get('transformed')
    statement = statement if statement else "missing" # Change None to "missing"

    # SBVR ontology just have Rule
    concept_type = "Rule"

    # The RDF subject is derived from the statement title
    logger.debug(f"{index}. {element.get('statement')}")
    statement_subject = transform_to_rdf_subject(element.get('statement_title'))

    logger.debug(f"{index}. {statement_subject}")

    # The recorded rule element carries a "sources" list, so reading only
    # "source" would yield None; prefer "sources" and fall back to "source".
    sources = element.get("sources") or element.get("source") # paragraphs

    terms = element.get("terms")

    verb_symbols = element.get("verb_symbols")

    # from transformation
    metadata_cfr2sbvr = get_metadata_cfr2sbvr(element)

    logger.info(f"Processing '{index}: {statement_subject}' ...")

    logger.debug(f"{doc_id=}")
    logger.debug(f"{statement_id=}")
    logger.debug(f"{statement=}")
    logger.debug(f"{concept_type=}")
    logger.debug(f"{metadata_cfr2sbvr=}")

    # create vocabulary and namespace if not exists (rules always use local scope)
    logger.info("Creating vocabulary and namespace if not exists...")

    vocabulary = define_vocabulary_ns(conn, doc_id, True)
    logger.info(f"{vocabulary=}")

    # create Rule model
    rule_fact_model = RuleAndFact(
        statement_id=statement_subject,
        statement=statement,
        concept_type=concept_type,
        terms=terms,
        verb_symbols=verb_symbols,
        vocabulary_namespace=vocabulary,
        sources=sources,
        doc_id=doc_id,
        metadata_cfr2sbvr=metadata_cfr2sbvr
    )

    logger.debug(f"{rule_fact_model=}")

    # upsert
    upsert_rule_and_fact_to_kg(conn, rule_fact_model)

    logger.info(f"Rule '{statement_subject}' done.")

2024-12-01 17:14:32 - INFO - index=0 element={'doc_id': '§ 275.0-2', 'statement_id': 3, 'statement_title': 'Forwarding documents by the Secretary', 'statement': "If process, pleadings, or other papers are served on the Commission as described in this section, the Secretary of the Commission (Secretary) will promptly forward a copy to each named party by registered or certified mail at that party's last address filed with the Commission.", 'sources': ['(a)(2)'], 'terms': [{'term': 'Process, pleadings, or other papers', 'classification': 'Common Noun', 'confidence': 0.8, 'reason': 'The term refers to legal documents involved in the service process.', 'extracted_confidence': 0.8, 'extracted_reason': 'The term is explicitly mentioned as the object.'}, {'term': 'Commission', 'classification': 'Proper Noun', 'confidence': 0.9, 'reason': 'The term refers to a specific governmental body.', 'extracted_confidence': 0.9, 'extracted_reason': 'The term is explicitly mentioned as the recipient.'}, {

#### Verb symbols

**verb symbol**

**Definition**: designation that represents a verb concept and that is demonstrated by a verb concept wording

**Reference Scheme**: a verb concept wording that incorporates the verb symbol

**Example**: In the expression, ‘Each customer rents a car’, ‘rents’ is a verb symbol denoting a verb concept.

**Example**: In the expression, ‘A driver of a car returns the car to a branch office’, ‘of’ is a verb symbol for one verb concept (relating a driver to a car) and ‘returns to’ is another verb symbol denoting a verb concept (relating a driver to a car and a branch office).

Source: SBVR Specification 1.5

In [34]:
# Save one designation per verb symbol found in the predicted facts and rules.
for index, element in enumerate(pred_facts + pred_operative_rules):

    logger.info(f"{index=} {element=}")

    # from extraction
    doc_id = element.get('doc_id') # section

    # Read the id from the current element; otherwise the debug line below
    # would show a stale value left over from an earlier cell (or fail on a
    # fresh kernel).
    statement_id = element.get('statement_id')

    # There is no extracted / transformed statement for verb symbols
    # Storing the fact or rule statement as cfr-sbvr:transformedStatement
    statement = element.get('transformed')

    # SBVR ontology
    concept_type = "VerbConcept"

    # Recorded outputs show elements carrying either a "sources" list or a
    # single "source" paragraph; prefer the list and fall back to wrapping
    # the single value.
    source = element.get("source") # paragraph
    sources = element.get("sources") or [source]

    verb_symbols = element.get("verb_symbols")

    # from transformation
    metadata_cfr2sbvr = get_metadata_cfr2sbvr(element)

    logger.info(f"Processing '{index}: {verb_symbols}' ...")

    logger.debug(f"{doc_id=}")
    logger.debug(f"{statement_id=}")
    logger.debug(f"{statement=}")
    logger.debug(f"{concept_type=}")
    logger.debug(f"{metadata_cfr2sbvr=}")

    # create vocabulary and namespace if not exists (always local scope here)
    logger.info("Creating vocabulary and namespace if not exists...")

    vocabulary = define_vocabulary_ns(conn, doc_id, True)
    logger.info(f"{vocabulary=}")

    # An element without verb symbols contributes nothing instead of crashing the loop
    for verb_symbol in verb_symbols or []:
        # create the verb symbol designation
        designation_model = Designation(
            signifier=verb_symbol,
            statement=statement, # There is no extracted / transformed statement
            concept_type=concept_type,
            closeMatch=[],
            exactMatch=[],
            vocabulary_namespace=vocabulary,
            sources=sources,
            doc_id=doc_id,
            metadata_cfr2sbvr=metadata_cfr2sbvr
        )

        logger.info(f"{designation_model=}")

        # upsert
        upsert_verb_symbol_to_kg(conn, designation_model)

        logger.info(f"Verb symbol '{verb_symbol}' done.")

2024-11-24 01:54:40 - INFO - index=0 element={'doc_id': '§ 275.0-2', 'statement_id': 4, 'statement': 'Managing agent means any person, including a trustee, who directs or manages, or who participates in directing or managing, the affairs of any unincorporated organization or association other than a partnership.', 'source': '(b)(1)', 'terms': [{'term': 'Managing agent', 'classification': 'Common Noun', 'confidence': 0.9, 'reason': 'The term is defined within the document.', 'extracted_confidence': 0.9, 'extracted_reason': 'The term is explicitly defined.'}, {'term': 'Person', 'classification': 'Common Noun', 'confidence': 0.9, 'reason': 'The term is a general reference to an individual or entity.', 'extracted_confidence': 0.9, 'extracted_reason': 'The term is part of the definition.'}, {'term': 'Trustee', 'classification': 'Common Noun', 'confidence': 0.9, 'reason': 'The term refers to a specific role within an organization.', 'extracted_confidence': 0.9, 'extracted_reason': 'The term 

Close database connection

In [35]:
# Release the AllegroGraph connection opened in the "Connect to KG" step.
conn.close()

### Discussion

TODO