# LAB 3 - Metadata

If the experiment is in the final version, the results will be in the ../src directory in cfr2sbvr_modules.ipynb or as a script.

## Transform witt yaml files to rdf/ttl

Experimental version. See:

- [../data/witt_rules_taxonomy_v1.ttl](../data/witt_rules_taxonomy_v1.ttl)
- [../data/witt_templates.yaml](../data/witt_templates.yaml)
- [../data/witt_template_subtemplate_relationship.yaml](../data/witt_template_subtemplate_relationship.yaml)
- [../data/witt_subtemplates.yaml](../data/witt_subtemplates.yaml)

In [1]:
# TODO: Add to the src directory

# Import required libraries
from rdflib import Graph, URIRef, Literal, Namespace, RDF
import yaml

# Define namespaces
EX = Namespace("http://example.org/schema#")
g = Graph()
g.bind("ns1", EX)

# File paths
file_paths = {
    "subtemplate_list": "../data/witt_subtemplates.yaml",
    "template_list": "../data/witt_templates.yaml",
    "template_subtemplate_relationship": "../data/witt_template_subtemplate_relationship.yaml"
}

# Reading and parsing the content of the files
with open(file_paths["subtemplate_list"], 'r') as file:
    subtemplate_data = yaml.safe_load(file)
    print("Full Subtemplate YAML Data Loaded:", subtemplate_data)
    # Extract subtemplate data using the correct key
    if isinstance(subtemplate_data, dict):
        subtemplate_data = subtemplate_data.get('subtemplate_list', [])
    print("Subtemplate Data Extracted:", subtemplate_data)

with open(file_paths["template_list"], 'r') as file:
    template_data = yaml.safe_load(file)
    print("Full Template YAML Data Loaded:", template_data)
    # Extract template data using the correct key
    if isinstance(template_data, dict):
        template_data = template_data.get('template_list', [])
    print("Template Data Extracted:", template_data)

with open(file_paths["template_subtemplate_relationship"], 'r') as file:
    template_subtemplate_relationship_data = yaml.safe_load(file)
    if isinstance(template_subtemplate_relationship_data, dict):
        template_subtemplate_relationship_data = template_subtemplate_relationship_data.get('template_subtemplate_relationship', {})
    print("Template-Subtemplate Relationships Loaded:", template_subtemplate_relationship_data)

# Create RDF triples for subtemplates
for subtemplate in subtemplate_data:
    if isinstance(subtemplate, dict) and 'id' in subtemplate:
        subtemplate_uri = EX[subtemplate['id']]
        g.add((subtemplate_uri, RDF.type, EX.Subtemplate))
        g.add((subtemplate_uri, EX.id, Literal(subtemplate['id'])))
        if 'explanation' in subtemplate:
            g.add((subtemplate_uri, EX.explanation, Literal(subtemplate['explanation'])))
        if 'text' in subtemplate:
            g.add((subtemplate_uri, EX.text, Literal(subtemplate['text'])))

# Create RDF triples for templates
for template in template_data:
    if isinstance(template, dict) and 'id' in template:
        template_uri = EX[template['id']]
        g.add((template_uri, RDF.type, EX.Template))
        g.add((template_uri, EX.id, Literal(template['id'])))
        if 'explanation' in template:
            g.add((template_uri, EX.explanation, Literal(template['explanation'])))
        if 'text' in template:
            g.add((template_uri, EX.text, Literal(template['text'])))

# Create RDF triples for template-subtemplate relationships
for template_id, subtemplate_ids in template_subtemplate_relationship_data.items():
    template_uri = EX[template_id]
    if isinstance(subtemplate_ids, list):
        for subtemplate_id in subtemplate_ids:
            subtemplate_uri = EX[subtemplate_id]
            g.add((template_uri, EX.usesSubtemplate, subtemplate_uri))

# Serialize the graph to Turtle format and save to file
rdf_output = g.serialize(format="turtle")
output_file_path = "../data/witt_rules_taxonomy_v1.ttl"
with open(output_file_path, 'w') as output_file:
    output_file.write(rdf_output)

# Display the RDF/Turtle representation
print("RDF/Turtle Output:\n", rdf_output)


Full Subtemplate YAML Data Loaded: {'subtemplate_list': [{'id': 'S1', 'explanation': '', 'text': '<operative rule statement subject>::=\n{<term>|combination of [<term>, and]|set of <term>}\n  {<qualifying clause>|}\n'}, {'id': 'S2', 'explanation': '', 'text': '<article>::={a |an|the}\n'}, {'id': 'S3', 'explanation': 'For example, "exactly one", "at least two", "at least one and at most four".\n', 'text': '<cardinality>::=\n{exactly|at least {<positive integer 1> and at most| }}\n<positive integer 2>\n'}, {'id': 'S4', 'explanation': "One or more determiners can be used before a noun to provide some information as to which (or how many) instances of the noun's concept are being referred to.\nThe most commonly used determiners are articles.\nSpecific determiners are used to limit the noun to referring only to a specific instance or instances.\nThere is an infinite set of ordinal numbers: 'first', 'second', 'third', etc. Any of these may be used between 'the' and a noun to indicate which m

## Get Witt (2012) rules taxonomy templates from yaml files

### Final version

See:

- [../src/cfr2sbvr_modules.py](../src/cfr2sbvr_modules.py)
- [../data/witt_templates.yaml](../data/witt_templates.yaml)
- [../data/witt_template_subtemplate_relationship.yaml](../data/witt_template_subtemplate_relationship.yaml)
- [../data/witt_subtemplates.yaml](../data/witt_subtemplates.yaml)

In [15]:
import yaml
from pathlib import Path

class RulesTemplateProvider:
    """
    A class to provide information about rules templates and their relationships from YAML data.

    This class loads and processes template data, subtemplate data, and their relationships from specified YAML files.
    It is used to extract information about templates and format them into readable output.

    Attributes:
    -----------
    data_directory : Path
        Path to the directory containing the YAML files.
    data_dicts : dict
        Dictionary containing data loaded from YAML files, including templates, subtemplates, and relationships.
    """
    
    def __init__(self, data_directory):
        """
        Initializes the RulesTemplateProvider with the specified data directory.

        Parameters:
        -----------
        data_directory : str or Path
            Path to the directory containing the YAML files with templates, subtemplates, and relationships.
        """
        self.data_directory = Path(data_directory)
        self.data_dicts = self._load_data()

    def _load_yaml(self, file_path):
        """
        Loads data from a YAML file.

        Parameters:
        -----------
        file_path : Path
            Path to the YAML file to be loaded.

        Returns:
        --------
        dict
            A dictionary containing the data from the YAML file.
        """
        with open(file_path, 'r') as file:
            return yaml.safe_load(file) or {}

    def _load_data(self):
        """
        Loads data from multiple YAML files required for template processing.

        Returns:
        --------
        dict
            A dictionary containing data from templates, subtemplates, and template relationships YAML files.
        """
        witt_template_relationship_file = self.data_directory / 'witt_template_subtemplate_relationship.yaml'
        witt_templates_file = self.data_directory / 'witt_templates.yaml'
        witt_subtemplates_file = self.data_directory / 'witt_subtemplates.yaml'

        witt_template_relationship_data = self._load_yaml(witt_template_relationship_file).get('template_subtemplate_relationship', {})
        witt_templates_data = self._load_yaml(witt_templates_file).get('template_list', [])
        witt_subtemplates_data = self._load_yaml(witt_subtemplates_file).get('subtemplate_list', [])

        return {
            'witt_template_relationship_data': witt_template_relationship_data,
            'witt_templates_data': witt_templates_data,
            'witt_subtemplates_data': witt_subtemplates_data
        }

    def _get_template_data(self, template_key, data):
        """
        Retrieves data for a specific template or subtemplate based on its key.

        Parameters:
        -----------
        template_key : str
            The key of the template or subtemplate to be retrieved.
        data : list or dict
            The data to search in, which can be a list of templates or a dictionary of relationships.

        Returns:
        --------
        dict or None
            The data corresponding to the specified template key, or None if not found.
        """
        if isinstance(data, dict):
            return data.get(template_key, None)
        elif isinstance(data, list):
            for item in data:
                if isinstance(item, dict) and item.get('id', '') == template_key:
                    return item
        return None

    def _format_template_output(self, template_key, template_data):
        """
        Formats the output for a given template or subtemplate.

        Parameters:
        -----------
        template_key : str
            The key of the template or subtemplate.
        template_data : dict
            The data of the template or subtemplate to be formatted.

        Returns:
        --------
        str
            A formatted string representation of the template data.
        """
        output = f"# {template_key}\n\n"
        if not template_data:
            output += "Template data not found.\n\n"
            return output
        if 'usesSubtemplate' in template_data:
            uses_subtemplate = template_data['usesSubtemplate']
            if isinstance(uses_subtemplate, list):
                uses_subtemplate = ', '.join(uses_subtemplate)
            output += f"## usesSubtemplate\n{uses_subtemplate}\n\n"
        if 'text' in template_data:
            output += f"## text\n\n{template_data['text']}\n\n"
        if 'explanation' in template_data:
            output += f"## explanation\n\n{template_data['explanation']}\n\n"
        return output

    def _process_template(self, template_key, processed_keys=None):
        """
        Processes a template or subtemplate recursively, including any subtemplates used.

        Parameters:
        -----------
        template_key : str
            The key of the template or subtemplate to be processed.
        processed_keys : set, optional
            A set of keys that have already been processed to prevent circular references.

        Returns:
        --------
        str
            A formatted string representation of the template and its subtemplates.
        """
        if processed_keys is None:
            processed_keys = set()

        if template_key in processed_keys:
            return ''
        processed_keys.add(template_key)

        template_data = None

        if template_key.startswith('T'):
            template_data = self._get_template_data(template_key, self.data_dicts['witt_templates_data']) or {}
            uses_subtemplate = self._get_template_data(template_key, self.data_dicts['witt_template_relationship_data'])
            if uses_subtemplate:
                template_data['usesSubtemplate'] = uses_subtemplate if isinstance(uses_subtemplate, list) else [uses_subtemplate]
        elif template_key.startswith('S'):
            template_data = self._get_template_data(template_key, self.data_dicts['witt_subtemplates_data']) or {}
            uses_subtemplate = self._get_template_data(template_key, self.data_dicts['witt_template_relationship_data'])
            if uses_subtemplate:
                template_data['usesSubtemplate'] = uses_subtemplate if isinstance(uses_subtemplate, list) else [uses_subtemplate]

        if not template_data:
            return f"# {template_key}\n\nTemplate data not found.\n\n"

        output = self._format_template_output(template_key, template_data)

        if 'usesSubtemplate' in template_data:
            subtemplate_keys = template_data['usesSubtemplate']
            subtemplate_keys = [subtemplate_keys] if isinstance(subtemplate_keys, str) else subtemplate_keys
            for sub_key in subtemplate_keys:
                sub_key = sub_key.strip()
                output += self._process_template(sub_key, processed_keys)

        return output

    def get_rules_template(self, template_key):
        """
        Retrieves the formatted rules template for the specified template key.

        Parameters:
        -----------
        template_key : str
            The key of the template to be retrieved.

        Returns:
        --------
        str
            A formatted string representation of the template and its associated subtemplates.
        """
        return self._process_template(template_key)

# Example usage:
# processor = RulesTemplateProcessor("../data")
# output = processor.get_rules_template("T7")
# print(output)


In [None]:
# Example usage:
rule_template_provider = RulesTemplateProvider("../data")
markdown_data = rule_template_provider.get_rules_template("T7")
print(markdown_data)

# T7

## usesSubtemplate
S14

## text

{A|An} <term 1>
  {of {a|an} <term 2>| }
is by definition
{a|an|the} <term 3>
  <qualifying clause>.


## explanation



# S14

## usesSubtemplate
S15, S9, S13, S18, S11, S16, S17

## text

<qualifying clause>::=
{{that |who} <verb phrase> {<object>| } |
<verb part> <object>|
other than {<object>| [<object>, or]} |
{<preposition> {which|whom}|whose} <conditional clause>|
{that |who} <verb phrase> {that | if |whether} <conditional clause>|
<and-qualifying clause>|
<or-qualifying clause>|
<both-and-qualifying clause>|
<either-or-qualifying clause>}


## explanation

A qualifying clause (also known as a restrictive relative clause) can be used after a term in two ways:
1. Following the subject term of a rule statement, a qualifying clause restricts the scope of that rule statement to a subset of the set of objects signified by that term, rather than the set of all objects signified by that term.
2. Following any other term in a rule statement, a qual

### Other attemps

TODO: Needs improvements.

In [None]:
# TODO: Refactoring the code to encapsulate the logic

import yaml

def load_yaml(file_path):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file) or {}

def get_template_data(template_key, data):
    if isinstance(data, dict):
        # For dictionary data (like witt_template_relationship_data), get by key
        return data.get(template_key, None)
    elif isinstance(data, list):
        # For list data (like witt_templates_data and witt_subtemplates_data), find the item where item['id'] == template_key
        for item in data:
            if isinstance(item, dict) and item.get('id', '') == template_key:
                return item
    return None

def format_template_output(template_key, template_data):
    output = f"# {template_key}\n\n"
    if not template_data:
        output += "Template data not found.\n\n"
        return output
    if 'usesSubtemplate' in template_data:
        uses_subtemplate = template_data['usesSubtemplate']
        # Ensure usesSubtemplate is a string
        if isinstance(uses_subtemplate, list):
            uses_subtemplate = ', '.join(uses_subtemplate)
        output += f"## usesSubtemplate\n{uses_subtemplate}\n\n"
    if 'text' in template_data:
        output += f"## text\n\n{template_data['text']}\n\n"
    if 'explanation' in template_data:
        output += f"## explanation\n\n{template_data['explanation']}\n\n"
    return output

def process_template(template_key, data_dicts, processed_keys=None):
    if processed_keys is None:
        processed_keys = set()

    if template_key in processed_keys:
        # Avoid infinite recursion in case of circular references
        return ''
    processed_keys.add(template_key)

    template_data = None

    # Determine which data source to use based on template key
    if template_key.startswith('T'):
        # Get data from witt_templates_data
        template_data = get_template_data(template_key, data_dicts['witt_templates_data'])
        if not template_data:
            template_data = {}
        # Get 'usesSubtemplate' from witt_template_relationship_data
        uses_subtemplate = get_template_data(template_key, data_dicts['witt_template_relationship_data'])
        if uses_subtemplate:
            if isinstance(uses_subtemplate, list):
                template_data['usesSubtemplate'] = uses_subtemplate
            else:
                template_data['usesSubtemplate'] = [uses_subtemplate]
    elif template_key.startswith('S'):
        # Get data from witt_subtemplates_data
        template_data = get_template_data(template_key, data_dicts['witt_subtemplates_data'])
        if not template_data:
            template_data = {}
        # Get 'usesSubtemplate' from witt_template_relationship_data (in case subtemplates have subtemplates)
        uses_subtemplate = get_template_data(template_key, data_dicts['witt_template_relationship_data'])
        if uses_subtemplate:
            if isinstance(uses_subtemplate, list):
                template_data['usesSubtemplate'] = uses_subtemplate
            else:
                template_data['usesSubtemplate'] = [uses_subtemplate]
    else:
        # Unknown template key format
        template_data = None

    if not template_data:
        output = f"# {template_key}\n\nTemplate data not found.\n\n"
        return output

    # Build the output for this template
    output = format_template_output(template_key, template_data)

    # If the template uses subtemplates, process each subtemplate recursively
    if 'usesSubtemplate' in template_data:
        subtemplate_keys = template_data['usesSubtemplate']
        if isinstance(subtemplate_keys, str):
            subtemplate_keys = [key.strip() for key in subtemplate_keys.split(',')]
        elif isinstance(subtemplate_keys, list):
            subtemplate_keys = subtemplate_keys
        else:
            subtemplate_keys = [subtemplate_keys]
        for sub_key in subtemplate_keys:
            sub_key = sub_key.strip()
            sub_output = process_template(sub_key, data_dicts, processed_keys)
            output += sub_output

    return output

def main():
    # Load the YAML files
    witt_template_relationship_file = '../data/witt_template_subtemplate_relationship.yaml'
    witt_templates_file = '../data/witt_templates.yaml'
    witt_subtemplates_file = '../data/witt_subtemplates.yaml'

    witt_template_relationship_data = load_yaml(witt_template_relationship_file).get('template_subtemplate_relationship', {})
    witt_templates_data = load_yaml(witt_templates_file).get('template_list', [])
    witt_subtemplates_data = load_yaml(witt_subtemplates_file).get('subtemplate_list', [])

    data_dicts = {
        'witt_template_relationship_data': witt_template_relationship_data,
        'witt_templates_data': witt_templates_data,
        'witt_subtemplates_data': witt_subtemplates_data
    }

    # Template key to search for
    template_key = 'T7'

    # Process the template and get the output
    output = process_template(template_key, data_dicts)

    # Print the output
    print(output)

if __name__ == "__main__":
    main()


# T7

## usesSubtemplate
S14

## text

{A|An} <term 1>
  {of {a|an} <term 2>| }
is by definition
{a|an|the} <term 3>
  <qualifying clause>.


## explanation



# S14

## usesSubtemplate
S15, S9, S13, S18, S11, S16, S17

## text

<qualifying clause>::=
{{that |who} <verb phrase> {<object>| } |
<verb part> <object>|
other than {<object>| [<object>, or]} |
{<preposition> {which|whom}|whose} <conditional clause>|
{that |who} <verb phrase> {that | if |whether} <conditional clause>|
<and-qualifying clause>|
<or-qualifying clause>|
<both-and-qualifying clause>|
<either-or-qualifying clause>}


## explanation

A qualifying clause (also known as a restrictive relative clause) can be used after a term in two ways:
1. Following the subject term of a rule statement, a qualifying clause restricts the scope of that rule statement to a subset of the set of objects signified by that term, rather than the set of all objects signified by that term.
2. Following any other term in a rule statement, a qual

In [None]:
# First attempt at using YAML (o1 preview)

import yaml

def load_yaml(file_path):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file) or {}

def get_template_data(template_key, data):
    if isinstance(data, dict):
        return data.get(template_key, None)
    elif isinstance(data, list):
        for item in data:
            if isinstance(item, dict) and template_key in item:
                return item[template_key]
    return None

def format_template_output(template_key, template_data):
    output = f"# {template_key}\n\n"
    if not template_data:
        output += "Template data not found.\n\n"
        return output
    if 'usesSubtemplate' in template_data:
        output += f"## usesSubtemplate\n{template_data['usesSubtemplate']}\n\n"
    if 'text' in template_data:
        output += f"## text\n\n{template_data['text']}\n\n"
    if 'explanation' in template_data:
        output += f"## explanation\n\n{template_data['explanation']}\n\n"
    return output

def process_template(template_key, data_dicts, processed_keys=None):
    if processed_keys is None:
        processed_keys = set()

    if template_key in processed_keys:
        # Avoid infinite recursion in case of circular references
        return ''
    processed_keys.add(template_key)

    # Get data for the given template key
    template_data = get_template_data(template_key, data_dicts['witt_template_data'])
    if not template_data:
        # If not found in witt_template_file, look in witt_templates and witt_subtemplates
        template_data = get_template_data(template_key, data_dicts['witt_templates_data'])
        if not template_data:
            template_data = get_template_data(template_key, data_dicts['witt_subtemplates_data'])

    # If the data is a list (like ['S14']), convert it to a dictionary with usesSubtemplate key
    if template_data and isinstance(template_data, list):
        template_data = {'usesSubtemplate': ', '.join(template_data)}

    # For T* templates, add text and explanation from witt_templates_data if not already present
    if template_key.startswith('T') and template_data:
        additional_data = get_template_data(template_key, data_dicts['witt_templates_data'])
        if additional_data:
            template_data.update({k: v for k, v in additional_data.items() if k not in template_data})

    # If template_data is None, handle it before proceeding
    if not template_data:
        output = f"# {template_key}\n\nTemplate data not found.\n\n"
        return output

    # Build the output for this template
    output = format_template_output(template_key, template_data)

    # If the template uses subtemplates, process each subtemplate recursively
    if 'usesSubtemplate' in template_data:
        subtemplate_keys = template_data['usesSubtemplate'].split(', ')
        for sub_key in subtemplate_keys:
            sub_key = sub_key.strip()
            sub_output = process_template(sub_key, data_dicts, processed_keys)
            output += sub_output

    return output

def main():
    # Load the YAML files
    witt_template_file = '../data/witt_template_subtemplate_relationship.yaml'
    witt_templates_file = '../data/witt_templates.yaml'
    witt_subtemplates_file = '../data/witt_subtemplates.yaml'

    witt_template_data = load_yaml(witt_template_file).get('template_subtemplate_relationship', {})
    witt_templates_data = load_yaml(witt_templates_file).get('template_list', [])
    witt_subtemplates_data = load_yaml(witt_subtemplates_file).get('subtemplate_list', [])

    data_dicts = {
        'witt_template_data': witt_template_data,
        'witt_templates_data': witt_templates_data,
        'witt_subtemplates_data': witt_subtemplates_data
    }

    # Template key to search for
    template_key = 'T7'

    # Process the template and get the output
    output = process_template(template_key, data_dicts)

    # Print the output
    print(output)

if __name__ == "__main__":
    main()


# T7

## usesSubtemplate
S14

# S14

## usesSubtemplate
S15, S9, S13, S18, S11, S16, S17

# S15

## usesSubtemplate
S11, S9

# S11

## usesSubtemplate
S5, S14, S4

# S5

Template data not found.

# S4

## usesSubtemplate
S3, S2

# S3

Template data not found.

# S2

Template data not found.

# S9

Template data not found.

# S13

## usesSubtemplate
S10, S12, S14, S4

# S10

## usesSubtemplate
S6, S11, S7

# S6

Template data not found.

# S7

Template data not found.

# S12

## usesSubtemplate
S11, S5, S14, S4

# S18

## usesSubtemplate
S11, S9

# S16

## usesSubtemplate
S11, S9

# S17

## usesSubtemplate
S11, S9




In [None]:
# Attemps with graphdb
# Import necessary libraries
from franz.openrdf.connect import ag_connect
from franz.openrdf.query.query import QueryLanguage

# Function to recursively fetch subtemplate relationships
def fetch_subtemplate_details(repo, host, port, user, password, initial_templates):
    # Connect to the AllegroGraph repository
    conn = ag_connect(repo=repo, catalog='root', host=host, port=port, 
                      user=user, password=password)
    try:
        # Initialize a set to store all visited templates and avoid re-querying them
        visited_templates = set()
        templates_to_query = set(initial_templates)
        results = []  # Store results here
        
        while templates_to_query:
            template_id = templates_to_query.pop()
            if template_id in visited_templates:
                continue
            
            # Mark this template as visited
            visited_templates.add(template_id)

            # Query to get predicates and objects of the given template
            query_string = f"""
                PREFIX ns1: <http://example.org/schema#>
                SELECT ?p ?o
                WHERE {{
                    ns1:{template_id} ?p ?o .
                }}
                ORDER BY ?p
            """
            
            # Prepare and execute the SPARQL query
            tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query_string)
            result = tuple_query.evaluate()
            
            with result:
                for binding_set in result:
                    p = binding_set.getValue("p")
                    o = binding_set.getValue("o")
                    result_str = f"{template_id} {p} {o}"
                    results.append(result_str)
                    print(result_str)
                    
                    # If the predicate is 'usesSubtemplate', add the object to the query list
                    if str(p).endswith("usesSubtemplate"):
                        subtemplate_id = str(o).split('#')[-1]
                        if subtemplate_id not in visited_templates:
                            templates_to_query.add(subtemplate_id)
                    
                    # If the object is a template or subtemplate, add it to the query list
                    if str(o).startswith("http://example.org/schema#S") or str(o).startswith("http://example.org/schema#T"):
                        subtemplate_id = str(o).split('#')[-1]
                        if subtemplate_id not in visited_templates:
                            templates_to_query.add(subtemplate_id)
        
        # Return all results
        return results
    
    finally:
        # Close the connection
        conn.close()

# Configuration for the AllegroGraph connection
repo = 'cfr2sbvr'
host = 'localhost'
port = '10035'
user = 'super'
password = '2002'

# Initial template to start the query
initial_templates = ['T7']

# Fetch subtemplate details
fetch_subtemplate_details(repo, host, port, user, password, initial_templates)


T7 <http://example.org/schema#explanation> ""
T7 <http://example.org/schema#id> "T7"
T7 <http://example.org/schema#text> "{A|An} <term 1>\n  {of {a|an} <term 2>| }\nis by definition\n{a|an|the} <term 3>\n  <qualifying clause>.\n"
T7 <http://example.org/schema#usesSubtemplate> <http://example.org/schema#S14>
T7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/schema#Template>


['T7 <http://example.org/schema#explanation> ""',
 'T7 <http://example.org/schema#id> "T7"',
 'T7 <http://example.org/schema#text> "{A|An} <term 1>\\n  {of {a|an} <term 2>| }\\nis by definition\\n{a|an|the} <term 3>\\n  <qualifying clause>.\\n"',
 'T7 <http://example.org/schema#usesSubtemplate> <http://example.org/schema#S14>',
 'T7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/schema#Template>']

In [None]:
# Import necessary libraries
from franz.openrdf.connect import ag_connect
from franz.openrdf.query.query import QueryLanguage
from rdflib import URIRef, Namespace

# Function to recursively fetch subtemplate relationships
def fetch_subtemplate_details(repo, host, port, user, password, initial_templates):
    # Connect to the AllegroGraph repository
    conn = ag_connect(repo=repo, catalog='root', host=host, port=port, 
                      user=user, password=password)
    try:
        # Initialize a set to store all visited templates and avoid re-querying them
        visited_templates = set()
        templates_to_query = set(initial_templates)
        results = []  # Store results here

        # Define the namespace for predicates and objects
        ns1 = Namespace("http://example.org/schema#")
        uses_subtemplate_predicate = URIRef(ns1.usesSubtemplate)
        
        while templates_to_query:
            template_id = templates_to_query.pop()
            if template_id in visited_templates:
                continue

            # Mark this template as visited
            visited_templates.add(template_id)

            # Query to get predicates and objects of the given template
            query_string = f"""
                PREFIX ns1: <http://example.org/schema#>
                SELECT ?p ?o
                WHERE {{
                    ns1:{template_id} ?p ?o .
                }}
                ORDER BY ?p
            """
            
            # Prepare and execute the SPARQL query
            tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query_string)
            result = tuple_query.evaluate()

            print(result)
            
            with result:
                for binding_set in result:
                    p = binding_set.getValue("p")
                    o = binding_set.getValue("o")
                    result_str = f"{template_id} {p} {o}"
                    results.append(result_str)
                    
                    # Check if the predicate is 'usesSubtemplate'
                    if p == uses_subtemplate_predicate:
                        subtemplate_id = str(o).split('#')[-1]
                        if subtemplate_id not in visited_templates:
                            templates_to_query.add(subtemplate_id)
                    
                    # Check if the object is within the namespace and starts with 'S' or 'T'
                    if isinstance(o, URIRef) and o.startswith(ns1):
                        subtemplate_id = str(o).split('#')[-1]
                        if subtemplate_id not in visited_templates:
                            templates_to_query.add(subtemplate_id)
                            print(subtemplate_id)
                    print(templates_to_query)

        # Return all results
        return results
    
    finally:
        # Close the connection
        conn.close()

# Configuration for the AllegroGraph connection
repo = 'cfr2sbvr'
host = 'localhost'
port = '10035'
user = 'super'
password = '2002'

# Initial template to start the query
initial_templates = ['T7']

# Fetch subtemplate details
fetch_subtemplate_details(repo, host, port, user, password, initial_templates)

<franz.openrdf.query.queryresult.TupleQueryResult object at 0x7f52d758f290>
T7 <http://example.org/schema#explanation> ""
set()
T7 <http://example.org/schema#id> "T7"
set()
T7 <http://example.org/schema#text> "{A|An} <term 1>\n  {of {a|an} <term 2>| }\nis by definition\n{a|an|the} <term 3>\n  <qualifying clause>.\n"
set()
T7 <http://example.org/schema#usesSubtemplate> <http://example.org/schema#S14>
set()
T7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/schema#Template>
set()


['T7 <http://example.org/schema#explanation> ""',
 'T7 <http://example.org/schema#id> "T7"',
 'T7 <http://example.org/schema#text> "{A|An} <term 1>\\n  {of {a|an} <term 2>| }\\nis by definition\\n{a|an|the} <term 3>\\n  <qualifying clause>.\\n"',
 'T7 <http://example.org/schema#usesSubtemplate> <http://example.org/schema#S14>',
 'T7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/schema#Template>']

In [39]:
import yaml

def load_yaml(file_path):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file) or {}

def get_template_data(template_key, data):
    if isinstance(data, dict):
        return data.get(template_key, None)
    elif isinstance(data, list):
        for item in data:
            if isinstance(item, dict) and template_key in item:
                return item[template_key]
    return None

def format_template_output(template_key, template_data):
    output = f"# {template_key}\n\n"
    if 'usesSubtemplate' in template_data:
        output += f"## usesSubtemplate\n{template_data['usesSubtemplate']}\n\n"
    if 'text' in template_data:
        output += f"## text\n{template_data['text']}\n\n"
    if 'explanation' in template_data:
        output += f"## explanation\n{template_data['explanation']}\n\n"
    return output

def main():
    # Load the YAML files
    witt_template_file = '../data/witt_template_subtemplate_relationship.yaml'
    witt_templates_file = '../data/witt_templates.yaml'
    witt_subtemplates_file = '../data/witt_subtemplates.yaml'
    
    witt_template_data = load_yaml(witt_template_file).get('template_subtemplate_relationship', {})
    witt_templates_data = load_yaml(witt_templates_file).get('template_list', [])
    witt_subtemplates_data = load_yaml(witt_subtemplates_file).get('subtemplate_list', [])
    
    # Template key to search for
    template_key = 'T7'

    # Get data for the given template key from witt_template_file
    template_data = get_template_data(template_key, witt_template_data)
    if not template_data:
        # If not found in witt_template_file, look in witt_templates and witt_subtemplates
        template_data = get_template_data(template_key, witt_templates_data)
        if not template_data:
            template_data = get_template_data(template_key, witt_subtemplates_data)

    # If the data is a list (like ['S14']), convert it to a dictionary with usesSubtemplate key
    if template_data and isinstance(template_data, list):
        template_data = {'usesSubtemplate': ', '.join(template_data)}

    # For T* templates, add text and explanation from witt_templates_data if not already present
    if template_key.startswith('T') and template_data:
        additional_data = get_template_data(template_key, witt_templates_data)
        if additional_data:
            template_data.update({k: v for k, v in additional_data.items() if k not in template_data})
        
    # If the template uses subtemplates, get details for each subtemplate from witt_subtemplates_data
    if template_data and 'usesSubtemplate' in template_data:
        subtemplate_keys = template_data['usesSubtemplate'].split(', ')
        for sub_key in subtemplate_keys:
            sub_key_data = get_template_data(sub_key, witt_subtemplates_data)
            if sub_key_data:
                # Append subtemplate text and explanation to the main template data
                if 'text' in sub_key_data:
                    if 'text' in template_data:
                        template_data['text'] += f"\n\nFrom {sub_key} text:\n{sub_key_data['text']}"
                    else:
                        template_data['text'] = f"From {sub_key} text:\n{sub_key_data['text']}"
                if 'explanation' in sub_key_data:
                    if 'explanation' in template_data:
                        template_data['explanation'] += f"\n\nFrom {sub_key} explanation:\n{sub_key_data['explanation']}"
                    else:
                        template_data['explanation'] = f"From {sub_key} explanation:\n{sub_key_data['explanation']}"

    # Debugging output to check the final merged template data
    print(f"Final merged data for key '{template_key}':", template_data)

    # Print the output if the template data is found
    if template_data:
        output = format_template_output(template_key, template_data)
        print(output)
    else:
        print(f"Template key '{template_key}' not found in any of the templates.")

if __name__ == "__main__":
    main()

Final merged data for key 'T7': {'usesSubtemplate': 'S14'}
# T7

## usesSubtemplate
S14




## Get Witt (2012) rules taxonomy classification

### Final version

See:

- [../src/cfr2sbvr_modules.py](../src/cfr2sbvr_modules.py)
- [../data/classify_subtypes.yaml](../data/classification.yaml)
- [../data/witt_templates.yaml](../data/witt_templates.yaml)
- [../data/witt_examples.yaml](../data/witt_examples.yaml)

In [4]:
import yaml

class RuleInformationProvider:
    """
    A class to provide information about rule classifications and templates based on YAML data.

    This class loads and processes rule classification data, template data, and example data from specified YAML files.
    It is used to generate markdown documentation for a given rule type, including details such as templates and examples.

    Attributes:
    -----------
    data_path : str
        Path to the directory containing the YAML files.
    template_dict : dict
        Dictionary containing template information loaded from the templates YAML file.
    examples_dict : dict
        Dictionary containing example information loaded from the examples YAML file.
    """
    
    def __init__(self, data_path):
        """
        Initializes the RuleInformationProvider with the specified data path.

        Parameters:
        -----------
        data_path : str
            Path to the directory containing the YAML files with rules, templates, and examples.
        """
        self.data_path = data_path
        self.template_dict = self._load_yaml(f'{data_path}/witt_templates.yaml', 'template_list')
        self.examples_dict = self._load_yaml(f'{data_path}/witt_examples.yaml', 'example_list')

    def _load_yaml(self, file_path, list_key=None):
        """
        Loads data from a YAML file.

        Parameters:
        -----------
        file_path : str
            Path to the YAML file to be loaded.
        list_key : str, optional
            Key used to extract a specific list from the YAML data. If provided, returns a dictionary indexed by 'id'.

        Returns:
        --------
        dict
            If list_key is provided, returns a dictionary with items indexed by 'id'.
        Any type
            If list_key is not provided, returns the entire data structure from the YAML file.
        """
        with open(file_path, 'r') as file:
            data = yaml.safe_load(file)
            if list_key:
                return {item['id']: item for item in data[list_key]}
            return data

    def get_classification_and_templates(self, section_title):
        """
        Retrieves classification information and templates for a specified rule section.

        Parameters:
        -----------
        section_title : str
            Title of the section for which to retrieve information.

        Returns:
        --------
        str
            A markdown formatted string containing the classification details, templates, and examples for the given section.
        """
        data = self._load_yaml(f'{self.data_path}/classify_subtypes.yaml')
        filtered_data = self._filter_sections_by_title(data, section_title)
        return self._convert_to_markdown(filtered_data)

    def _filter_sections_by_title(self, data, title):
        """
        Filters sections based on the given title.

        Parameters:
        -----------
        data : list
            List of sections to filter from.
        title : str
            Title to filter sections by.

        Returns:
        --------
        list
            A list of sections that match the given title.
        """
        return [section for section in data if section['section_title'] == title]

    def _convert_to_markdown(self, filtered_data):
        """
        Converts filtered rule classification data to markdown format.

        Parameters:
        -----------
        filtered_data : list
            List of filtered sections to convert into markdown.

        Returns:
        --------
        str
            A markdown formatted string representing the filtered sections.
        """
        def process_section(section, level=2):
            """
            Processes a section recursively and converts it to markdown format.

            Parameters:
            -----------
            section : dict
                The section to process.
            level : int, optional
                The heading level for the section title in markdown (default is 1).

            Returns:
            --------
            str
                A markdown formatted string for the section and its subsections.
            """
            markdown = f"{'#' * level} {section['section_title']}\n\n"
            markdown += f"**ID**: {section['section_id']}\n\n"
            markdown += f"**Definition**: {section['section_definition']}\n\n"

            if 'templates' in section and section['templates']:
                for template_id in section['templates']:
                    if template_id in self.template_dict:
                        template = self.template_dict[template_id]
                        markdown += f"**Template ID**: {template_id}\n\n"
                        markdown += f"**Template Explanation**: {template['explanation']}\n\n"
                        markdown += f"**Template Text**:\n\n```template\n{template['text']}```\n\n"
                    else:
                        markdown += f"**Template ID**: {template_id} - No details found.\n\n"
            else:
                markdown += "**Templates**: Look in the subsection(s).\n\n"

            if 'examples' in section and section['examples']:
                for example_id in section['examples']:
                    if example_id in self.examples_dict:
                        example = self.examples_dict[example_id]
                        markdown += f"**Example ID**: {example_id}\n\n"
                        markdown += f"**Example Text**:\n\n```example\n{example['text']}```\n\n"
                    else:
                        markdown += f"**Example ID**: {example_id} - No details found.\n\n"

            if 'subsections' in section:
                for subsection in section['subsections']:
                    markdown += process_section(subsection, level + 1)

            return markdown

        markdown = ""
        for section in filtered_data:
            markdown += process_section(section)
        return markdown



In [9]:
# Example usage
rule_provider = RuleInformationProvider("../data")
markdown_data = rule_provider.get_classification_and_templates("Data rules")
print(markdown_data)

## Data rules

**ID**: 9.3

**Definition**: Data rules (all of which are operative rules) constrains the data included in a transaction (a form or message) or a persistent dataset (e.g., a database record). Data rules can in turn be categorized as:

**Templates**: Look in the subsection(s).

### Data cardinality rules

**ID**: 9.3.1

**Definition**: A data cardinality rule requires the presence or absence of a data item and/or places a restriction on the maximum or minimum number of occurrences of a data item.

**Templates**: Look in the subsection(s).

#### Mandatory data rules

**ID**: 9.3.1.1

**Definition**: A mandatory data rule mandates the presence of data.

**Templates**: Look in the subsection(s).

##### Mandatory data item rules

**ID**: 9.3.1.1.1

**Definition**: A mandatory data item rule requires that a particular data item be present.

**Template ID**: T19

**Template Explanation**: 

**Template Text**:

```template
Each <transaction signifier>
must {specify|contain} <car

### Other attempts

In [None]:
# TODO: Refactor the code to encapsulate the logic. With template and example
import yaml

def generate_markdown_for_section(section_title, data_path):
    def filter_sections_by_title(data, title):
        # Filters the given structure by section_title at the first level
        filtered_sections = [section for section in data if section['section_title'] == title]
        return filtered_sections

    # Read data from yaml file
    with open(f'{data_path}/classify_subtypes.yaml', 'r') as file:
        data = yaml.safe_load(file)

    # Read template explanations from yaml file
    with open(f'{data_path}/witt_templates.yaml', 'r') as yaml_file:
        templates_data = yaml.safe_load(yaml_file)
        template_dict = {template['id']: template for template in templates_data['template_list']}

    # Read examples from yaml file
    with open(f'{data_path}/witt_examples.yaml', 'r') as examples_file:
        examples_data = yaml.safe_load(examples_file)
        examples_dict = {example['id']: example for example in examples_data['example_list']}

    # Filter data for the given section
    filtered_data = filter_sections_by_title(data, section_title)

    # Convert filtered data to markdown format
    def convert_to_markdown(filtered_data):
        def process_section(section, level=1):
            markdown = f"{'#' * level} {section['section_title']}\n\n"
            markdown += f"**ID**: {section['section_id']}\n\n"
            markdown += f"**Definition**: {section['section_definition']}\n\n"
            if 'templates' in section and section['templates']:
                for template_id in section['templates']:
                    if template_id in template_dict:
                        template = template_dict[template_id]
                        markdown += f"**Template ID**: {template_id}\n\n"
                        markdown += f"**Template Explanation**: {template['explanation']}\n\n"
                        markdown += f"**Template Text**:\n\n```template\n{template['text']}```\n\n"
                    else:
                        markdown += f"**Template ID**: {template_id} - No details found.\n\n"
            else:
                markdown += "**Templates**: Look in the subsection(s).\n\n"
            if 'examples' in section and section['examples']:
                for example_id in section['examples']:
                    if example_id in examples_dict:
                        example = examples_dict[example_id]
                        markdown += f"**Example ID**: {example_id}\n\n"
                        markdown += f"**Example Text**:\n\n```example\n{example['text']}```\n\n"
                    else:
                        markdown += f"**Example ID**: {example_id} - No details found.\n\n"
            if 'subsections' in section:
                for subsection in section['subsections']:
                    markdown += process_section(subsection, level + 1)
            return markdown

        markdown = ""
        for section in filtered_data:
            markdown += process_section(section)
        return markdown

    return convert_to_markdown(filtered_data)

# Example usage
markdown_data = generate_markdown_for_section("Data rules", "../data")
print(markdown_data)


# Data rules

**ID**: 9.3

**Definition**: Data rules (all of which are operative rules) constrains the data included in a transaction (a form or message) or a persistent dataset (e.g., a database record). Data rules can in turn be categorized as:

**Templates**: Look in the subsection(s).

## Data cardinality rules

**ID**: 9.3.1

**Definition**: A data cardinality rule requires the presence or absence of a data item and/or places a restriction on the maximum or minimum number of occurrences of a data item.

**Templates**: Look in the subsection(s).

### Mandatory data rules

**ID**: 9.3.1.1

**Definition**: A mandatory data rule mandates the presence of data.

**Templates**: Look in the subsection(s).

#### Mandatory data item rules

**ID**: 9.3.1.1.1

**Definition**: A mandatory data item rule requires that a particular data item be present.

**Template ID**: T19

**Template Explanation**: 

**Template Text**:

```template
Each <transaction signifier>
must {specify|contain} <cardina

In [None]:
# TODO: Refactor the code to encapsulate the logic. With template only
import yaml

def filter_sections_by_title(data, title):
    # Filters the given structure by section_title at the first level
    filtered_sections = [section for section in data if section['section_title'] == title]
    return filtered_sections

# Read data from yaml file
with open('../data/classify_subtypes.yaml', 'r') as file:
    data = yaml.safe_load(file)

# Read template explanations from yaml file
with open('../data/witt_templates.yaml', 'r') as yaml_file:
    templates_data = yaml.safe_load(yaml_file)
    template_dict = {template['id']: template for template in templates_data['template_list']}

# Example usage
filtered_data = filter_sections_by_title(data, "Data rules")

# Convert filtered data to markdown format
def convert_to_markdown(filtered_data):
    def process_section(section, level=1):
        markdown = f"{'#' * level} {section['section_title']}\n\n"
        markdown += f"**Section ID**: {section['section_id']}\n\n"
        markdown += f"**Definition**: {section['section_definition']}\n\n"
        if 'templates' in section and section['templates']:
            for template_id in section['templates']:
                if template_id in template_dict:
                    template = template_dict[template_id]
                    markdown += f"**Template ID**: {template_id}\n\n"
                    markdown += f"**Explanation**: {template['explanation']}\n\n"
                    markdown += f"**Text**: {template['text']}\n\n"
                else:
                    markdown += f"**Template ID**: {template_id} - No details found.\n\n"
        else:
            markdown += "**Templates**: Look in the subsection.\n\n"
        if 'subsections' in section:
            for subsection in section['subsections']:
                markdown += process_section(subsection, level + 1)
        return markdown

    markdown = ""
    for section in filtered_data:
        markdown += process_section(section)
    return markdown

markdown_data = convert_to_markdown(filtered_data)
print(markdown_data)

# Data rules

**Section ID**: 9.3

**Definition**: Data rules (all of which are operative rules) constrains the data included in a transaction (a form or message) or a persistent dataset (e.g., a database record). Data rules can in turn be categorized as:

**Templates**: Look in the subsection.

## Data cardinality rules

**Section ID**: 9.3.1

**Definition**: A data cardinality rule requires the presence or absence of a data item and/or places a restriction on the maximum or minimum number of occurrences of a data item

**Templates**: Look in the subsection.

### Mandatory data rules

**Section ID**: 9.3.1.1

**Definition**: A mandatory data rule mandates the presence of data:

**Templates**: Look in the subsection.

#### Mandatory data item rules

**Section ID**: 9.3.1.1.1

**Definition**: A mandatory data item rule requires that a particular data item be present.

**Template ID**: T19

**Explanation**: 

**Text**: Each <transaction signifier>
must {specify|contain} <cardinality> <da

In [None]:
# Without template

import json

def filter_sections_by_title(data, title):
    # Filters the given structure by section_title at the first level
    filtered_sections = [section for section in data if section['section_title'] == title]
    return filtered_sections

# Read data from file
with open('../data/classify_subtypes.json', 'r') as file:
    data = json.load(file)

# Example usage
filtered_data = filter_sections_by_title(data, "Data rules")

# Convert filtered data to markdown format
def convert_to_markdown(filtered_data):
    def process_section(section, level=1):
        markdown = f"{'#' * level} {section['section_title']}\n\n"
        markdown += f"**Section ID**: {section['section_id']}\n\n"
        markdown += f"**Definition**: {section['section_definition']}\n\n"
        if 'templates' in section and section['templates']:
            markdown += f"**Templates**: {', '.join(section['templates'])}\n\n"
        else:
            markdown += "**Templates**: Look in the subsection(s).\n\n"
        if 'subsections' in section:
            for subsection in section['subsections']:
                markdown += process_section(subsection, level + 1)
        return markdown

    markdown = ""
    for section in filtered_data:
        markdown += process_section(section)
    return markdown

markdown_data = convert_to_markdown(filtered_data)
print(markdown_data)


# Data rules

**Section ID**: 9.3

**Definition**: Data rules (all of which are operative rules) constrains the data included in a transaction (a form or message) or a persistent dataset (e.g., a database record). Data rules can in turn be categorized as:

**Templates**: Look in the subsection.

## Data cardinality rules

**Section ID**: 9.3.1

**Definition**: A data cardinality rule requires the presence or absence of a data item and/or places a restriction on the maximum or minimum number of occurrences of a data item

**Templates**: Look in the subsection.

### Mandatory data rules

**Section ID**: 9.3.1.1

**Definition**: A mandatory data rule mandates the presence of data:

**Templates**: Look in the subsection.

#### Mandatory data item rules

**Section ID**: 9.3.1.1.1

**Definition**: A mandatory data item rule requires that a particular data item be present.

**Templates**: T19

#### Mandatory option selection rules

**Section ID**: 9.3.1.1.2

**Definition**: A mandatory option

## Json to yaml

### Experimental version

In [1]:
import json
import yaml


def json_to_yaml(input_file, output_file):
    # Read the JSON data
    with open(input_file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # Convert the data to YAML and write it to the output file
    with open(output_file, "w", encoding="utf-8") as yaml_file:
        yaml.dump(data, yaml_file, allow_unicode=True, sort_keys=False)

    print(f"Data has been successfully transformed to {output_file}")


# Specify input and output files
input_file = "../data/classify_subtypes.json"
output_file = "../data/classify_subtypes.yaml"

# Run the transformation
json_to_yaml(input_file, output_file)

Data has been successfully transformed to ../data/classify_subtypes.yaml


## Estimate number of tokens

### Final version

See:

- [../src/cfr2sbvr_modules.py](../src/cfr2sbvr_modules.py)

In [38]:
import tiktoken

def estimate_tokens_tiktoken(text, model="gpt-3.5-turbo"):
    """
    Estimates the number of tokens in a given text using the OpenAI `tiktoken` library, 
    which closely approximates the tokenization method used by OpenAI language models.

    Parameters:
        text (str): The text to be tokenized and counted.
        model (str): The model to use for tokenization. Defaults to "gpt-3.5-turbo".
                     Supported models include "gpt-3.5-turbo" and "gpt-4".

    Returns:
        int: The estimated number of tokens in the text.
    
    Raises:
        ValueError: If the specified model is not supported by `tiktoken`.

    Example:
        >>> text = "This is a sample text."
        >>> estimate_tokens_tiktoken(text)
        6
    """
    # Load the appropriate tokenizer
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        raise ValueError(f"Model '{model}' is not supported by tiktoken.")
    
    # Tokenize the text and return the token count
    tokens = tokenizer.encode(text)
    return len(tokens)

# Example usage
text = "This is an example sentence to estimate token count."
print("Estimated tokens:", estimate_tokens_tiktoken(text))

print("Estimated tokens:", estimate_tokens_tiktoken(markdown_data, model="gpt-4o"))


Estimated tokens: 10
Estimated tokens: 6244
