<a href="https://colab.research.google.com/github/asantos2000/master-degree-santos-anderson/blob/main/code/src/chap_6_create_kg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# create kg

Chapter 6. Ferramentas de suporte
- Section 6.2.1 Grafo de conhecimento para CFR e FIBO

## Google colab

In [1]:
%load_ext autoreload
%autoreload 2

import sys

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  !rm -rf cfr2sbvr configuration checkpoint
  !git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
  %pip install -r cfr2sbvr/code/requirements.txt
  !cp -r cfr2sbvr/code/src/configuration .
  !cp -r cfr2sbvr/code/src/checkpoint .
  !cp -r cfr2sbvr/code/config.colab.yaml config.yaml
  DEFAULT_CONFIG_FILE="config.yaml"
else:
  DEFAULT_CONFIG_FILE="../config.yaml"

## Imports

In [2]:
# Standard library imports
import subprocess
import os

def update_llm_spec_file(llm_spec, key, value):
    # Read in the file
    with open(llm_spec, "r") as file:
        filedata = file.read()

    # Replace the target string
    filedata = filedata.replace(key, value)

    # Write the file out again
    with open(llm_spec, "w") as file:
        file.write(filedata)
# Franz AllegroGraph (AG) imports
from franz.openrdf.connect import ag_connect
from franz.openrdf.rio.rdfformat import RDFFormat

# Local application/library-specific imports
import configuration.main as configuration
import logging_setup.main as logging_setup

DEV_MODE = True

if DEV_MODE:
    # Development mode
    import importlib

    importlib.reload(configuration)
    importlib.reload(logging_setup)

## Settings

Default settings, check them before run the notebook.

### Get configuration

Adjust the algorithm behavior. 

Configuration is loaded from `config.yaml`

In [7]:
# load config
config = configuration.load_config(DEFAULT_CONFIG_FILE)

### Logging configuration

In [8]:
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])

2025-02-10 20:05:10 - INFO - Logging is set up with daily rotation.


## Execution

### Connect to KG

Connect to AllegroGraph and create repo if not exists.

Workaround to connect AllegroGraph Cloud via stunnel.

> Configure the host using the connect variable.

In [None]:
%%writefile agraph_stunnel.conf

[allegrograph_proxy]
client = yes
accept = 127.0.0.1:8443
connect = ag1eawvuu0p3zv35.allegrograph.cloud:443

In [5]:
hosting = config["ALLEGROGRAPH_HOSTING"]

if hosting == "ALLEGROGRAPH_CLOUD":
    # Start tunnel
    import getpass
    import os

    password = getpass.getpass()
    command = "sudo -S stunnel agraph_stunnel.conf" #can be any command but don't forget -S as it enables input from stdin
    os.system('echo %s | %s' % (password, command)) # Start stunnel

    config[hosting]["HOST"]="localhost"
    config[hosting]["PORT"]=8443
    conn = ag_connect(
        repo=config[hosting]["REPO"],
        catalog=config[hosting]["CATALOG"],
        host=config[hosting]["HOST"],
        port=config[hosting]["PORT"],
        protocol=config[hosting]["PROTOCOL"],
        user=config[hosting]["USER"],
        password=config[hosting]["PASSWORD"],
    )
else:
    conn = ag_connect(
        repo=config[hosting]["REPO"],
        catalog=config[hosting]["CATALOG"],
        host=config[hosting]["HOST"],
        port=config[hosting]["PORT"],
        user=config[hosting]["USER"],
        password=config[hosting]["PASSWORD"],
    )

logger.info(f"Connected to AllegroGraph: {hosting}")

2025-02-10 20:03:55 - INFO - Connected to AllegroGraph: ALLEGROGRAPH_LOCAL


Run scripts even if the KG is not empty

In [None]:
# If FORCE is true will run anyway
is_empty = conn.isEmpty()
logger.info(f"Repository is empty: {is_empty}")

ALLEGROGRAPH_FORCE_RUN = config["ALLEGROGRAPH_FORCE_RUN"]
logger.info(f"{ALLEGROGRAPH_FORCE_RUN=}")

if not conn.isEmpty() or not ALLEGROGRAPH_FORCE_RUN:
    raise Exception(
        "The repository is not empty. If you want to force run, set FORCE to True"
    )

2025-02-10 20:05:21 - INFO - Repository is empty: False
2025-02-10 20:05:21 - INFO - ALLEGROGRAPH_FORCE_RUN=True


Exception: The repository is not empty. If you want to force run, set FORCE to True

Clean KG before run

In [None]:
# If true will clean all data
CLEAN_BEFORE_RUN = config["ALLEGROGRAPH_CLEAN_BEFORE_RUN"]
logger.info(f"{CLEAN_BEFORE_RUN=}")
logger.info(f"Repo number of statements (all graphs): {conn.size()}")

if CLEAN_BEFORE_RUN:
    conn.clear()
    logger.info(f"Repo cleaned. Number of statements (all graphs): {conn.size()}")

2025-02-10 20:06:32 - INFO - CLEAN_BEFORE_RUN=True


2025-02-10 20:06:32 - INFO - Repo number of statements (all graphs): 14
2025-02-10 20:06:32 - INFO - Repo cleaned. Number of statements (all graphs): 0


### Create vocabulary triples

#### Create language triples

In [11]:
result = conn.executeUpdate(
    """
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

INSERT DATA {
    GRAPH cfr-sbvr:CFR_SBVR {
        cfr-sbvr:EnglishLanguage
            a sbvr:Language ;
            skos:label "English" ;
            sbvr:signifier "English" .
    }
}
    """
)

logger.info(f"Create English language: {result}")

2025-02-10 20:07:00 - INFO - Create English language: True


#### CFR vocabulary and vocabulary namespace

In [12]:
result = conn.executeUpdate(
    """
# Insert CFR-FRO graph metadata
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

INSERT DATA {
    GRAPH cfr-sbvr:CFR_SBVR {
        fro-cfr:CFR_Title_17_Part_275_VOC
            a owl:Class, sbvr:Vocabulary .

        fro-cfr:CFR_Title_17_Part_275_NS
            a owl:Class, sbvr:VocabularyNamespace;
            sbvr:vocabularyNamespaceIsDerivedFromVocabulary fro-cfr:CFR_Title_17_Part_275_VOC ;
            sbvr:namespaceHasURI <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#> ;
            sbvr:vocabularyIsExpressedInLanguage cfr-sbvr:EnglishLanguage ;
            dct:title "RULES AND REGULATIONS, INVESTMENT ADVISERS ACT OF 1940" ;
            skos:definition "Financial Regulation Ontology: FRO CFR Title 17 Part 275" ;
            dct:source <https://finregont.com/fro/cfr/FRO_CFR_Title_17_Part_275.ttl> .

        cfr-sbvr:CFR_SBVR_VOC sbvr:vocabulary1IncorporatesVocabulary2 fro-cfr:CFR_Title_17_Part_275_VOC .
    }
}
    """
)

logger.info(f"Create CFR vocabulary and vocabulary namespace: {result}")

2025-02-10 20:07:06 - INFO - Create CFR vocabulary and vocabulary namespace: True


#### FIBO vocabulary and vocabulary namespace

Definition: Financial Industry Business Ontology (FIBO) is a set of ontologies that describes financial industry business. The FIBO ontologies are used to describe financial industry business.

Language: English

Synonym: FIBO

Namespace URI: https://spec.edmcouncil.org/fibo/ontology/QuickFIBOProd

In [13]:
result = conn.executeUpdate(
    """
# Insert FIBO graph metadata
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX fibo: <https://spec.edmcouncil.org/fibo/ontology/QuickFIBOProd#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

INSERT DATA {
    GRAPH cfr-sbvr:CFR_SBVR {
        fibo:FIBO_VOC
            a owl:Class, sbvr:Vocabulary .

        fibo:FIBO_NS
            a owl:Class, sbvr:VocabularyNamespace ;
            sbvr:vocabularyNamespaceIsDerivedFromVocabulary fibo:FIBO_VOC ;
            sbvr:namespaceHasURI <https://spec.edmcouncil.org/fibo/ontology/QuickFIBOProd#> ;
            sbvr:vocabularyIsExpressedInLanguage cfr-sbvr:EnglishLanguage ;
            dct:title "Financial Business Ontology" ;
            skos:definition "This ontology is provided for the convenience of FIBO users. It loads all of the very latest FIBO production ontologies based on the contents of GitHub, rather than those that comprise a specific version, such as a quarterly release. Note that metadata files and other 'load' files, such as the various domain-specific 'all' files, are intentionally excluded." ;
            dct:source <https://spec.edmcouncil.org/fibo/ontology/master/2024Q2/LoadFIBOProd.ttl> .

        cfr-sbvr:CFR_SBVR_VOC sbvr:vocabulary1IncorporatesVocabulary2 fibo:FIBO_VOC .
    }
}
    """
)

logger.info(f"Create FIBO vocabulary and vocabulary namespace: {result}")

2025-02-10 20:07:10 - INFO - Create FIBO vocabulary and vocabulary namespace: True


#### CFR_SBVR vocabulary and vocabulary namespace

In [14]:
result = conn.executeUpdate(
    """
# Insert SBVR graph metadata
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX cfr-sbvr: <http://cfr2sbvr.com/cfr#>
PREFIX sbvr: <https://www.omg.org/spec/SBVR/20190601#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

INSERT DATA {
    GRAPH cfr-sbvr:CFR_SBVR {
        cfr-sbvr:CFR_SBVR_VOC
            a owl:Class, sbvr:Vocabulary .
                                
        cfr-sbvr:CFR_SBVR_NS
            a owl:Class, sbvr:VocabularyNamespace;
            sbvr:namespaceHasURI <http://cfr2sbvr.com/cfr#> ;
            sbvr:vocabularyIsExpressedInLanguage cfr-sbvr:EnglishLanguage ;
            sbvr:vocabularyNamespaceIsDerivedFromVocabulary cfr-sbvr:CFR_SBVR_VOC ;
            dct:title "Semantics of Business Vocabulary and Business Rules (SBVR) for Code of Federal Regulations (CFR)" ;
            skos:definition "SBVR-CFR is an adopted standard of the Object Management Group (OMG) intended to be the basis for formal and detailed natural language declarative description of CFR regulations" ;
            dct:source <https://github.com/asantos2000/dissertacao-santos-anderson-2024> .
    }
}
    """
)

logger.info(f"Create CFR_SBVR vocabulary and vocabulary namespace: {result}")

2025-02-10 20:07:12 - INFO - Create CFR_SBVR vocabulary and vocabulary namespace: True


### Load data and ontologies

#### CFR ontology - US_LegalReference.ttl

In [15]:
result = conn.addFile(
    filePath="../data/US_LegalReference.ttl",
    context="<http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#CFR_Title_17_Part_275>",
    format=RDFFormat.TURTLE,
)

logger.info(f"US_LegalReference.ttl added to graph CFR_Title_17_Part_275: {result}")

2025-02-10 20:07:17 - INFO - US_LegalReference.ttl added to graph CFR_Title_17_Part_275: None


#### CFR Ontology - Code_Federal_Regulations

Language: English

Synonym: FRO-CFR

Namespace URI: http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl

In [16]:
result = conn.addFile(
    filePath="../data/Code_Federal_Regulations.ttl",
    context="<http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#CFR_Title_17_Part_275>",
    format=RDFFormat.TURTLE,
)

logger.info(f"Code_Federal_Regulations.ttl added to graph CFR_Title_17_Part_275: {result}")

2025-02-10 20:07:20 - INFO - Code_Federal_Regulations.ttl added to graph CFR_Title_17_Part_275: None


#### CFR Data - FRO CFR Title 17 Part 275

In [17]:
result = conn.addFile(
    filePath="../data/FRO_CFR_Title_17_Part_275.ttl",
    context="<http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#CFR_Title_17_Part_275>",
    format=RDFFormat.TURTLE,
)

logger.info(f"FRO_CFR_Title_17_Part_275.ttl added to graph CFR_Title_17_Part_275: {result}")

2025-02-10 20:07:22 - INFO - FRO_CFR_Title_17_Part_275.ttl added to graph CFR_Title_17_Part_275: None


#### FIBO Data and Ontology - QuickFIBOProd

Definition: Financial Industry Business Ontology (FIBO) is a set of ontologies that describes financial industry business. The FIBO ontologies are used to describe financial industry business.

Language: English

Synonym: FIBO

Namespace URI: https://spec.edmcouncil.org/fibo/ontology/QuickFIBOProd

In [18]:
result = conn.addFile(
    filePath="../data/prod-fibo-quickstart-2024Q3.ttl",
    context="<https://spec.edmcouncil.org/fibo/ontology/master/2024Q3/QuickFIBOProd#FIBO>",
    format=RDFFormat.TURTLE,
)

logger.info(f"prod-fibo-quickstart-2024Q3.ttl added to graph FIBO: {result}")

2025-02-10 20:07:28 - INFO - prod-fibo-quickstart-2024Q3.ttl added to graph FIBO: None


#### SBVR ontology - sbvr-dtc-19-05-32-ontology-v1

Adapted by author from <https://www.omg.org/spec/SBVR/20190601/SBVR-XML-Schema.xsd>

In [19]:
result = conn.addFile(
    filePath="../data/sbvr-dtc-19-05-32-ontology-v1.ttl",
    context="<https://www.omg.org/spec/SBVR/20190601#SBVR_Onto>",
    format=RDFFormat.TURTLE,
)

logger.info(f"sbvr-dtc-19-05-32-ontology-v1.ttl added to graph SBVR: {result}")

2025-02-10 20:07:32 - INFO - sbvr-dtc-19-05-32-ontology-v1.ttl added to graph SBVR: None


### Close connection

In [20]:
conn.close()

### Create / Update similarity index

AllegroGraph configuration

In [21]:
hosting = config["ALLEGROGRAPH_HOSTING"]
open_ai_api_key = os.getenv("OPENAI_API_KEY")
repo_spec = f'http://{config[hosting]["USER"]}:{config[hosting]["PASSWORD"]}@{config[hosting]["HOST"]}:{config[hosting]["PORT"]}/repositories/{config[hosting]["REPO"]}'
tool = f'{config[hosting]["HOME_DIR"]}/bin/agtool'

### FIBO

Index FIBO ontologies and data into AllegroGraph vector store.

> Required: Used for "elements association and creation" algorithm to associate SBVR terms with FIBO terms.

In [22]:
llm_spec = f'{config["DEFAULT_SCRIPTS_DIR"]}/fibo-vec.def'

update_llm_spec_file(llm_spec, "{open_ai_api_key}", open_ai_api_key)

result = subprocess.run(
    [
        tool,
        "llm",
        "index",
        repo_spec,
        llm_spec,
    ],
    capture_output=True,
    text=True,
    check=True
)

print("Output:", result.stdout)
print("Return Code:", result.returncode)

# Remove the API key from the file
update_llm_spec_file(llm_spec, open_ai_api_key, "{open_ai_api_key}")

CalledProcessError: Command '['/home/adsantos/agraph-8.3.1/bin/agtool', 'llm', 'index', 'http://super:2002@localhost:10035/repositories/cfr2sbvr', '../scripts/fibo-vec.def']' returned non-zero exit status 2.

Estimate time: 115 minutes

Estimate cost: 5.00 USD

### CFR

Index CFR data and ontologies into AllegroGraph vector store.

> Optional

In [None]:
llm_spec = f'{config["DEFAULT_SCRIPTS_DIR"]}/cfr-sbvr-vec'

update_llm_spec_file(llm_spec, "{open_ai_api_key}", open_ai_api_key)

result = subprocess.run(
    [
        tool,
        "llm",
        "index",
        repo_spec,
        llm_spec,
    ],
    capture_output=True,
    text=True,
    check=True
)

print("Output:", result.stdout)
print("Return Code:", result.returncode)

# Remove the API key from the file
update_llm_spec_file(llm_spec, open_ai_api_key, "{open_ai_api_key}")

query=select DISTINCT ?id ?text ?pred ?type {
?id a ?type.
FILTER (isLiteral(?text))
?id ?pred ?text.
VALUES ?pred {<http://www.w3.org/2000/01/rdf-schema#label> }
VALUES ?type {<http://www.w3.org/2002/07/owl#Class> <http://www.w3.org/2002/07/owl#NamedIndividual> }


  } LIMIT 1000000

Begin indexing, 15640 strings to consider.  checkpoint every 50 strings
checkpoint, 50 processed


Estimate time: 35 minutes

Estimate cost: 8.00 USD