# Lab 4 - Rules classification

Initial version.

## Imports

In [3]:
# only for labs
import sys
sys.path.append(r'../src')

In [4]:
# Standard library imports
from collections import defaultdict
import json
import re
from pathlib import Path

# Third-party libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pydantic import BaseModel, Field
from sklearn.metrics import confusion_matrix, classification_report
from typing import List, Dict, Optional, Any, Tuple, Set

# Local application/library-specific imports
import checkpoint.main as checkpoint
from checkpoint.main import restore_checkpoint, save_checkpoint, Document, DocumentProcessor, get_all_checkpoints
import configuration.main as configuration
import logging_setup.main as logging_setup
import token_estimator.main as token_estimator
from token_estimator.main import estimate_tokens
import rules_taxonomy_provider.main as rules_taxonomy_provider
from rules_taxonomy_provider.main import RuleInformationProvider, RulesTemplateProvider
import llm_query.main as llm_query
from llm_query.main import query_instruct_llm

DEV_MODE = True

if DEV_MODE:
    # Development mode: pick up edits made to the local modules without
    # restarting the kernel.
    import importlib

    importlib.reload(configuration)
    importlib.reload(logging_setup)
    importlib.reload(checkpoint)
    importlib.reload(token_estimator)
    importlib.reload(rules_taxonomy_provider)
    importlib.reload(llm_query)

    # importlib.reload() does not rebind names that were copied out of a module
    # with ``from ... import ...``. Re-execute those statements so the notebook
    # uses the reloaded objects instead of the stale ones.
    from checkpoint.main import restore_checkpoint, save_checkpoint, Document, DocumentProcessor, get_all_checkpoints
    from token_estimator.main import estimate_tokens
    from rules_taxonomy_provider.main import RuleInformationProvider, RulesTemplateProvider
    from llm_query.main import query_instruct_llm

## Settings

Default settings; check them before running the notebook.

### Get configuration

In [5]:
# Load the notebook configuration.
# The path is relative, so it is resolved against the current working directory.
DEFAULT_CONFIG_FILE = "../config.yaml"
# `config` is indexed like a mapping by the cells below (e.g. config["LOG_LEVEL"]).
config = configuration.load_config(DEFAULT_CONFIG_FILE)

Generated files for analysis in this run

In [6]:
# Print the configured checkpoint, extraction-report and Excel file settings
# on one line, separated by spaces.
print(*(
    config[setting]
    for setting in (
        "DEFAULT_CHECKPOINT_FILE",
        "DEFAULT_EXTRACTION_REPORT_FILE",
        "DEFAULT_EXCEL_FILE",
    )
))

../data/checkpoints/documents-2024-11-03-2.json ../outputs/extraction_report-2024-11-03-1.html ../outputs/compare_items_metrics.xlsx


### Logging configuration

In [7]:
# Set up logging from the configured log directory and level; the returned
# `logger` is the one used by the functions and cells below.
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])

2024-11-03 23:19:59 - INFO - Logging is set up with daily rotation.


## Checkpoints

Documents, annotated datasets, statistics, and metrics about the execution of the notebook are stored by the checkpoint module.

Checkpoints are stored in and retrieved from the file given by `DEFAULT_CHECKPOINT_FILE` in the configuration file.

During execution, the checkpoint is restored at the beginning of a section and saved at the end. We can run the notebook and restore the checkpoint several times. If a run fails, find the closest checkpoint and restore it.

### Restore the checkpoint

In [8]:
# Restore the checkpoint

# For development only: point the notebook at a fixed checkpoint file.
# The override is gated on DEV_MODE so that it cannot silently replace the
# configured checkpoint in a non-development run.
if DEV_MODE:
    config["DEFAULT_CHECKPOINT_FILE"] = "../data/checkpoints/documents-2024-11-01-3.json"

# `manager` holds the restored documents; later cells add documents to it and
# pass it to save_checkpoint().
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

2024-11-03 23:19:59 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-11-01-3.json.


## Datasets

Datasets used in the notebook. They are divided into sections and true tables. The sections are the documents from the CFR, and the true tables are annotated or "golden" datasets.

### General functions and data structures

In [9]:
def basic_text_stats(text: str) -> Tuple[int, int, int]:
    """
    Computes basic text statistics: number of lines, words, and average words per line.

    Lines are counted by splitting on the newline character, so any string
    (including the empty one) has at least one line. Words are counted by
    splitting on any run of whitespace, so words separated by newlines or by
    several spaces are each counted exactly once and empty text has zero words.

    Args:
        text (str): The text to analyze.

    Returns:
        Tuple[int, int, int]: A tuple containing the number of lines, total words,
        and average words per line (rounded to the nearest integer).
    """
    line_count = len(text.split("\n"))
    # str.split() without arguments splits on whitespace runs and drops empty
    # strings; split(" ") would miss newline-separated words and count the
    # gaps between repeated spaces as words.
    word_count = len(text.split())
    # line_count is always >= 1, so the division is safe.
    avg_words_per_line = round(word_count / line_count)
    return line_count, word_count, avg_words_per_line

In [10]:
def get_section_from_kg(conn: Any, section_num: str) -> str:
    """
    Retrieves a section from the Knowledge Graph based on the section number.

    Runs a SPARQL query for the section and its paragraphs, then renders the
    rows as plain text: one header (section number, subject, id, citations,
    notes) followed by one entry per paragraph row, in query order.

    Relies on `QueryLanguage` and `logger` being available in the enclosing
    scope.

    Args:
        conn: The connection object to the Knowledge Graph. It must provide
            `prepareTupleQuery(language, query)`.
        section_num (str): The section number to query. NOTE(review): the value
            is interpolated into the SPARQL text unescaped, so it must come
            from a trusted source.

    Returns:
        str: The retrieved section content as a string, or an empty string
        when the query returns no rows.

    Raises:
        Exception: If there is an error executing the query.
    """
    # Query section number from KG
    query = """
    PREFIX fro-cfr: <http://finregont.com/fro/cfr/Code_Federal_Regulations.ttl#>
    PREFIX fro-leg-ref: <http://finregont.com/fro/ref/LegalReference.ttl#>

    SELECT ?section ?section_seq ?section_num ?section_subject ?section_citation ?section_notes ?divide ?divide_seq ?paragraph_enum ?paragraph_text
    WHERE {
      ?section a fro-cfr:CFR_Section ;
        fro-leg-ref:hasSequenceNumber ?section_seq ;
        fro-cfr:hasSectionNumber ?section_num ;
        fro-cfr:hasSectionSubject ?section_subject .
      OPTIONAL {?section fro-leg-ref:refers_toNote ?section_notes} .
      OPTIONAL {?section fro-cfr:hasSectionCitation ?section_citation} .

      ?divide fro-leg-ref:divides ?section ; # rdf:type fro-cfr:CFR_Parapraph
        fro-leg-ref:hasSequenceNumber ?divide_seq ;
        fro-cfr:hasParagraphText ?paragraph_text ;
        fro-leg-ref:hasSequenceNumber ?paragraph_seq .
      OPTIONAL {?divide fro-cfr:hasParagraphEnumText ?paragraph_enum} .
    """ + f"""
      FILTER("{section_num}" = ?section_num)
    """ + """
    }
    ORDER BY ?section_num ?section ?divide_seq
    """
    tuple_query = conn.prepareTupleQuery(QueryLanguage.SPARQL, query)
    result = tuple_query.evaluate()

    logger.debug(f"result.metadata: {result.metadata}")
    logger.debug(f"result.variable_names: {result.variable_names}")

    def _text(binding_set, name):
        # Render a bound value as text with double quotes removed. A missing
        # value is rendered as the string "None".
        return str(binding_set.getValue(name)).replace('"', '')

    # Start with an empty header so that an empty result set returns "" instead
    # of failing on an unbound local.
    header = ""
    body_text = ""
    previous_section = None
    with result:
        for binding_set in result:
            section = binding_set.getValue("section")
            # From here on `section_num` holds the value read from the row; the
            # query text has already been built from the argument.
            section_num = _text(binding_set, "section_num")
            section_subject = _text(binding_set, "section_subject")
            section_citation = _text(binding_set, "section_citation")
            section_notes = _text(binding_set, "section_notes")
            paragraph_enum = _text(binding_set, "paragraph_enum")
            paragraph_text = _text(binding_set, "paragraph_text")
            # Header: rebuilt whenever the section changes, so only the header
            # of the last section seen is kept.
            if previous_section != section:
                previous_section = section
                header = f"""
    section_number: {section_num}
    section_subject: {section_subject}
    section_id: {section}
    citations: {section_citation}
    notes: {section_notes}
            """
            # Body: the enumeration line is emitted only when the row has one
            # (a missing value was rendered as "None" above).
            if paragraph_enum != "None":
                body_text += f"""
    paragraph_enumeration: {paragraph_enum}
    paragraph_text: {paragraph_text}
    """
            else:
                body_text += f"""
    paragraph_text: {paragraph_text}
    """

    return header + body_text


In [11]:
def calculate_content_quantities_p1(doc_id, content_data, filename):
    """
    Builds one summary row of element counts for a document's P1 content.

    Args:
        doc_id: Identifier stored in the row as "document_id".
        content_data: Mapping whose optional "elements" entry is a list of
            element mappings. Each element may carry "classification",
            "verb_symbols" and "terms".
        filename: Value stored in the row as "filename".

    Returns:
        dict: The number of elements, how many are classified as "Fact",
        "Fact Type" and "Rule", and the total number of verb symbols and
        terms over all elements.
    """
    items = content_data.get("elements", [])
    logger.debug(items)

    # Elements without a classification are labelled "Unknown" and therefore
    # fall into none of the three counted categories.
    labels = [item.get("classification", "Unknown") for item in items]
    verb_total = sum(len(item.get("verb_symbols", [])) for item in items)
    term_total = sum(len(item.get("terms", [])) for item in items)

    return {
        "document_id": doc_id,
        "quantity_of_elements": len(items),
        "quantity_of_facts": labels.count("Fact"),
        "quantity_of_fact_types": labels.count("Fact Type"),
        "quantity_of_rules": labels.count("Rule"),
        "quantity_of_verbs": verb_total,
        "quantity_of_terms": term_total,
        "filename": filename,
    }

In [12]:
def process_documents_p1(file_path, file_name, doc_ids):
    """
    Builds the P1 summary table for selected documents of a JSON file.

    Args:
        file_path: Path of a JSON file mapping document ids to document data.
        file_name: Label passed through to each row (not used to open the file).
        doc_ids: Collection of document ids to include.

    Returns:
        list: One row, as produced by calculate_content_quantities_p1, for each
        document whose id is in `doc_ids` and that has a "content" entry.
    """
    # Read as UTF-8 explicitly: document ids contain non-ASCII characters
    # (e.g. "§"), which a platform-default encoding may decode incorrectly.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = json.load(file)

    table_data = []
    # Iterate over each document in the file
    for doc_id, content_data in content.items():
        # Give logging a real format string; passing the id as the message and
        # the data as its argument never rendered the data.
        logger.debug("%s: %s", doc_id, content_data)
        # Check if the document ID is in the list to process
        if doc_id in doc_ids and 'content' in content_data:
            table_data.append(calculate_content_quantities_p1(doc_id, content_data['content'], file_name))

    return table_data


In [13]:
def calculate_content_quantities_p2(doc_id, content_data, filename):
    """
    Builds one summary row of term counts for a document's P2 content.

    Args:
        doc_id: Identifier stored in the row as "document_id".
        content_data: Document mapping. Its "content" entry must contain
            "terms" (a list of term mappings) and may contain
            "terms_relationship".
        filename: Value stored in the row as "filename".

    Returns:
        dict: The number of terms, how many have a truthy "definition", how
        many do not, and the number of term relationships.
    """
    relationships = content_data['content'].get('terms_relationship', [])
    logger.debug(f"terms_relationship: {relationships}")
    term_entries = content_data['content']['terms']
    logger.debug(f"terms: {term_entries}")

    # A term counts as defined when its "definition" is present and non-empty.
    defined = len([entry for entry in term_entries if entry.get('definition')])
    term_total = len(term_entries)

    return {
        "document_id": doc_id,
        "count_of_terms": term_total,
        "terms_with_definition": defined,
        "terms_without_definition": term_total - defined,
        "terms_relationship_count": len(relationships),
        "filename": filename
    }

In [14]:
def process_documents_p2(file_path, file_name, doc_ids):
    """
    Builds the P2 summary table for selected documents of a JSON file.

    Args:
        file_path: Path of a JSON file mapping document ids to document data.
        file_name: Label passed through to each row (not used to open the file).
        doc_ids: Collection of document ids to include.

    Returns:
        list: One row, as produced by calculate_content_quantities_p2, for each
        document whose id is in `doc_ids` and whose "content" has "terms".
        Documents without "content" or without "terms" are skipped.
    """
    # Read as UTF-8 explicitly: document ids contain non-ASCII characters
    # (e.g. "§"), which a platform-default encoding may decode incorrectly.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = json.load(file)

    table_data = []
    # Iterate over each document in the file
    for doc_id, doc_data in content.items():
        # The checks must short-circuit: building a list for all() evaluated
        # doc_data['content'] even when the document had no "content" entry,
        # which raised KeyError.
        if doc_id in doc_ids and 'content' in doc_data and 'terms' in doc_data['content']:
            table_data.append(calculate_content_quantities_p2(doc_id, doc_data, file_name))
    return table_data

### Texts to extract the elements

CFR Sections 275.0-2, 275.0-5, 275.0-7

Section 275.0-2

In [15]:
# Register the source text of CFR § 275.0-2 as a "section" document.
# The content is the regulation text to be analysed and is kept verbatim.
manager.add_document(
    Document(
        id="§ 275.0-2",
        type="section",
content = """
§ 275.0-2 General procedures for serving non-residents.
(a) General procedures for serving process, pleadings, or other papers on non-resident investment advisers, general partners and managing agents.  Under Forms ADV and ADV-NR [17 CFR 279.1 and 279.4], a person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents:
  (1) A person may serve a non-resident investment adviser, non-resident general partner, or non-resident managing agent by furnishing the Commission with one copy of the process, pleadings, or papers, for each named party, and one additional copy for the Commission's records.
  (2) If process, pleadings, or other papers are served on the Commission as described in this section, the Secretary of the Commission (Secretary) will promptly forward a copy to each named party by registered or certified mail at that party's last address filed with the Commission.
  (3) If the Secretary certifies that the Commission was served with process, pleadings, or other papers pursuant to paragraph (a)(1) of this section and forwarded these documents to a named party pursuant to paragraph (a)(2) of this section, this certification constitutes evidence of service upon that party.
(b) Definitions.  For purposes of this section:
  (1) Managing agent  means any person, including a trustee, who directs or manages, or who participates in directing or managing, the affairs of any unincorporated organization or association other than a partnership.
  (2) Non-resident  means:
    (i) An individual who resides in any place not subject to the jurisdiction of the United States;
    (ii) A corporation that is incorporated in or that has its principal office and place of business in any place not subject to the jurisdiction of the United States; and
    (iii) A partnership or other unincorporated organization or association that has its principal office and place of business in any place not subject to the jurisdiction of the United States.
  (3) Principal office and place of business  has the same meaning as in § 275.203A-3(c) of this chapter.
"""
    )
)

Section 275.0-5

In [16]:
# Register the source text of CFR § 275.0-5 as a "section" document.
# The content is the regulation text to be analysed and is kept verbatim.
manager.add_document(
    Document(
        id="§ 275.0-5",
        type="section",
content = """
§ 275.0-5 Procedure with respect to applications and other matters.
The procedure hereinbelow set forth will be followed with respect to any proceeding initiated by the filing of an application, or upon the Commission's own motion, pursuant to any section of the Act or any rule or regulation thereunder, unless in the particular case a different procedure is provided:
(a) Notice of the initiation of the proceeding will be published in the Federal Register and will indicate the earliest date upon which an order disposing of the matter may be entered. The notice will also provide that any interested person may, within the period of time specified therein, submit to the Commission in writing any facts bearing upon the desirability of a hearing on the matter and may request that a hearing be held, stating his reasons therefor and the nature of his interest in the matter.
(b) An order disposing of the matter will be issued as of course following the expiration of the period of time referred to in paragraph (a) of this section, unless the Commission thereafter orders a hearing on the matter.
(c) The Commission will order a hearing on the matter, if it appears that a hearing is necessary or appropriate in the public interest or for the protection of investors,
  (1) upon the request of any interested person or
  (2) upon its own motion.
(d) Definition of application. For purposes of this rule, an “application” means any application for an order of the Commission under the Act other than an application for registration as an investment adviser.
"""
    )
)

Section 275.0-7

In [17]:
# Register the source text of CFR § 275.0-7 as a "section" document.
# The content is the regulation text to be analysed and is kept verbatim.
manager.add_document(
    Document(
        id="§ 275.0-7",
        type="section",
content = """
§ 275.0-7 Small entities under the Investment Advisers Act for purposes of the Regulatory Flexibility Act.
(a) For purposes of Commission rulemaking in accordance with the provisions of Chapter Six of the Administrative Procedure Act (5 U.S.C. 601 et seq.) and unless otherwise defined for purposes of a particular rulemaking proceeding, the term small business or small organization for purposes of the Investment Advisers Act of 1940 shall mean an investment adviser that:
  (1) Has assets under management, as defined under Section 203A(a)(3) of the Act (15 U.S.C. 80b-3a(a)(2)) and reported on its annual updating amendment to Form ADV (17 CFR 279.1), of less than $25 million, or such higher amount as the Commission may by rule deem appropriate under Section 203A(a)(1)(A) of the Act (15 U.S.C. 80b-3a(a)(1)(A));
  (2) Did not have total assets of $5 million or more on the last day of the most recent fiscal year; and
  (3) Does not control, is not controlled by, and is not under common control with another investment adviser that has assets under management of $25 million or more (or such higher amount as the Commission may deem appropriate), or any person (other than a natural person) that had total assets of $5 million or more on the last day of the most recent fiscal year.
(b) For purposes of this section:
  (1) Control  means the power, directly or indirectly, to direct the management or policies of a person, whether through ownership of securities, by contract, or otherwise.
    (i) A person is presumed to control a corporation if the person:
      (A) Directly or indirectly has the right to vote 25 percent or more of a class of the corporation's voting securities; or
      (B) Has the power to sell or direct the sale of 25 percent or more of a class of the corporation's voting securities.
    (ii) A person is presumed to control a partnership if the person has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the partnership.
    (iii) A person is presumed to control a limited liability company (LLC) if the person:
      (A) Directly or indirectly has the right to vote 25 percent or more of a class of the interests of the LLC;
      (B) Has the right to receive upon dissolution, or has contributed, 25 percent or more of the capital of the LLC; or
      (C) Is an elected manager of the LLC.
    (iv) A person is presumed to control a trust if the person is a trustee or managing agent of the trust.
  (2) Total assets  means the total assets as shown on the balance sheet of the investment adviser or other person described above under paragraph (a)(3) of this section, or the balance sheet of the investment adviser or such other person with its subsidiaries consolidated, whichever is larger.
"""
    )
)

### True tables

True tables are annotated or "golden" datasets in which entities have been manually identified and labeled within the original source data.

True tables for sections 275.0-2, 275.0-5 and 275.0-7

Load true table for part 1.

In [18]:
# Load the P1 true tables and register one document per section.
# Read as UTF-8 explicitly: the lookup keys contain "§", which a
# platform-default encoding may decode incorrectly and then fail to match.
with open(f"{config['DEFAULT_DATA_DIR']}/p1_true_table.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

# The file is only needed for parsing, so the documents are registered after it
# has been closed.
for section_id in ("§ 275.0-2", "§ 275.0-5", "§ 275.0-7"):
    manager.add_document(
        Document.model_validate(data[f"{section_id}_P1|true_table"])
    )

Load true table for part 2.

In [19]:
# Load the P2 true tables and register one document per section.
# Read as UTF-8 explicitly: the lookup keys contain "§", which a
# platform-default encoding may decode incorrectly and then fail to match.
with open(f"{config['DEFAULT_DATA_DIR']}/p2_true_table.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

# The file is only needed for parsing, so the documents are registered after it
# has been closed.
for section_id in ("§ 275.0-2", "§ 275.0-5", "§ 275.0-7"):
    manager.add_document(
        Document.model_validate(data[f"{section_id}_P2|true_table"])
    )

### Save checkpoint

In [20]:
# Persist the state to a file: writes `manager` (now including the sections and
# true tables added above) to the configured checkpoint file.
save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)

2024-11-03 23:19:59 - INFO - Checkpoint saved.


## Processes

### extract / classify elements

#### General functions and data structures

LLM response model for P1

In [21]:
# One rule-type label proposed for an expression, with the confidence and the
# justification for it.
# NOTE(review): documented with comments rather than a class docstring, because
# pydantic is understood to include class docstrings in the generated JSON
# schema — confirm before adding one.
class Classification(BaseModel):
    # Rule type label; the field description names Party, Data and Activity as examples.
    type: str = Field(..., description="Type of the rule (e.g., Party, Data, Activity)")
    # Validated to lie in the closed interval [0, 1] (ge=0, le=1).
    confidence: float = Field(..., ge=0, le=1, description="Confidence level of the classification")
    explanation: str = Field(..., description="Explanation of why the classification was made")


# One classified expression: the identifying fields echoed from the input plus
# the list of proposed rule types. A list of these is the response model
# requested from the LLM in the P1 classification cell.
class ExpressionClassification(BaseModel):
    doc_id: str = Field(..., description="Document ID associated with the expression")
    # Declared as int, although the prompt examples show the id as a string.
    expression_id: int = Field(..., description="Unique ID of the expression")
    expression_text: str = Field(..., description="The text of the expression to be classified")
    expression_source: str = Field(..., description="Source of the expression")
    # An expression may receive several classifications, each with its own confidence.
    classification: List[Classification] = Field(..., description="List of classifications with explanations")

LLM response model for P2

In [22]:
# One rule-subtype label proposed for an expression, with the matching template
# ids, the confidence and the justification.
class SubClassification(BaseModel):
    subtype: str = Field(..., description="Subtype of the rule. The title of the section/subsection.")
    # Template identifiers; the P2 prompt example shows values such as "T123".
    templates_ids: List[str] = Field(..., description="List of template IDs that matched the expression.")
    # Validated to lie in the closed interval [0, 1] (ge=0, le=1).
    confidence: float = Field(..., ge=0, le=1, description="Confidence level of the classification")
    explanation: str = Field(..., description="Explanation of why the classification was made")


# One expression with its proposed rule subtypes; same identifying fields as
# ExpressionClassification, with SubClassification entries instead.
# NOTE(review): presumably the response model for the P2 prompt — confirm
# against the cell that issues the P2 query.
class ExpressionSubClassification(BaseModel):
    doc_id: str = Field(..., description="Document ID associated with the expression")
    # Declared as int, although the prompt examples show the id as a string.
    expression_id: int = Field(..., description="Unique ID of the expression")
    expression_text: str = Field(..., description="The text of the expression to be classified")
    expression_source: str = Field(..., description="Source of the expression")
    # An expression may receive several sub-classifications, each with its own confidence.
    classification: List[SubClassification] = Field(..., description="List of classifications with explanations")


#### Prompt engineering

##### Taxonomy classification and templates for rules

In [23]:
# TODO: Integrate lab 4
...

Instructions for classifying rules using the top level of the Witt (2012) taxonomy.

In examining the classification of terms, names, fact types, and rules within business systems, key distinctions arise between definitional and operative rules, as well as in the structuring of term definitions. Terms, names, and fact types are foundational elements in the taxonomy of definitional rules, though they are not rules themselves. Instead, they serve as the core vocabulary for creating precise definitions and facilitating the rule-making process within an organization. Terms represent general concepts or classes, names uniquely identify specific instances or entities, and fact types capture relationships between terms. Together, these elements enable the consistent and unambiguous use of language across rule statements, models, and documentation, ensuring that definitional and operative rules function cohesively.

Definitional rules aim to provide precise structures for organizational concepts, establishing clear, logical statements that support consistent interpretation and application of business language. These rules formalize term definitions, establish categorization schemes, and delineate relationships using fact types, which are classified by their structure as unary, binary, or higher-order and allow organizations to express relationships from simple Boolean properties to complex multi-term associations. When structured through definitional rule templates, fact types enhance clarity, facilitate shared understanding, and ensure accurate application within organizational rules, while supporting a conceptual backbone for business definitions.

Names function as unique identifiers for specific instances or entities, adding precision to rule statements that require exact identification. Embedded within definitional rules, names provide specific references crucial for rules that depend on individual entities, distinguishing them from broader terms and reducing ambiguity in complex business contexts. Through structured templates within the definitional rule taxonomy, organizations can integrate names with uniformity and clarity, allowing for reliable reference to distinct entities within rule statements and models. 

Operative rules, on the other hand, govern actionable requirements, setting conditions under which actions must or must not occur. These include data, activity, and party rules, each ensuring compliance, standardization, and procedural integrity across business processes. Operative rules provide the necessary conditions for maintaining organizational consistency, detailing what actions are authorized in particular circumstances or specifying roles within defined tasks, thereby aligning processes with business objectives.

To formalize term definitions, expressions must adhere to specific templates within the definitional rule taxonomy, which clearly articulate each term's scope, meaning, and responsibilities. For instance, the term "Commission" can be formally defined within Template T7, part of the definitional rule taxonomy, to clarify its procedural functions as the entity that receives and forwards legal documents. By using templates for definitional rules, organizations achieve consistent, unambiguous documentation of terms, fact types, and names, which minimizes ambiguity and ensures standardized interpretation across all business contexts. This structured approach supports precise rule governance and enhances communication within organizational processes, reinforcing the integrity and reliability of rule-based systems.

System prompt (instructions) for classifying rules using the top level of the Witt (2012) taxonomy.

In [24]:
def get_system_prompt_classify_p1() -> str:
    """
    Returns the fixed system prompt for the P1 classification step.

    The prompt asks the model to classify each expression of a JSON list as
    one or more operative rule types (Party, Data, Activity) and to return,
    per expression, a list of classifications with a confidence between 0 and
    1 and an explanation.

    Returns:
        str: The prompt text. It takes no parameters and is the same on every
        call.
    """
    # The prompt text is runtime behaviour (it is what the LLM receives), so it
    # is kept verbatim.
    return """
You are an expert in SBVR (Semantics of Business Vocabulary and Business Rules).

You will be provided with a list of expressions formatted as JSON.

Your task is to classify each expression into one or more Operative Rules types according to the given definitions.

You also need to record a confidence level for each classification and provide an explanation for why the classification was made.

# Classifications
The **Operative rules** Govern actions or constraints that must or must not happen under certain conditions, such as Data Rules, Activity Rules, and Party Rules. types to classify are:
- **Party rules**: A type of operative rule that restrict what parties can perform processes, activities, or play roles. They are operative rules.
- **Data rules**: A type of operative rule that constrain the data included in a transaction (e.g., forms or messages) or a persistent dataset.
- **Activity rules**: A type of operative rule that constrain the operation of one or more business processes or activities.

# Input JSON Format
The expressions are provided in the following JSON format:

```
[
    {
        "doc_id": "some doc id",
        "expression_id": "some id",
        "expression_source": "some source",
        "expression_text": "some text",
    }
]
```

# Task Requirements

1. Classify each "expression" into one or more of the provided rule types (Party, Data, Activity).
2. Assess a **confidence level** for each classification between 0 and 1.
3. Provide a **clear explanation** for the classification decision.

# Output Format
Your output must also be in JSON format. It should contain, for each expression:
- The `doc_id`
- The `expression_id`
- The original `expression_text`
- The `expression_source` of the expression
- A list of classifications (`classification`), each containing:
  - The `type` of the rule.
  - The `confidence` in your classification.
  - An `explanation` detailing why you made the classification decision.

Here is an example of the expected output:

```
[
    {
        "doc_id": "some doc id",
        "expression_id": "some id",
        "expression_text": "some text",
        "expression_source": "some source",
        "classification": [
            {
                "type": "Data",
                "confidence": 0.9,
                "explanation": "This expression defines a constaint that mandates the presence of data."
            },
            {
                "type": "Party",
                "confidence": 0.2,
                "explanation": "There is little reference to any restriction on participants or parties, which means this may not be a valid classification."
            },
            ...
        ]
    },
    {
        "doc_id": ...,
        "expression_id": ...,
        "expression_text": ...,
        "expression_source": ...,
        "classification": ...
    }
]
```

# Notes
- **Detail the Reasoning**: Make sure to provide explanations that justify why a particular rule type was chosen.
- **Confidence Values**: The confidence value should genuinely represent how strongly you believe the classification is correct, with 1 being an absolute match and 0 meaning unlikely.

Make sure that every expression is analyzed thoroughly, and the final justification for each classification is straightforward and adequately supports both the type choice and confidence level.
"""

In [25]:
def get_user_prompt_classify_p1(rules_to_classify):
    """
    Builds the user prompt for the P1 classification step.

    Args:
        rules_to_classify: JSON-serialisable data (a list of expression
            records in this notebook) to embed in the prompt.

    Returns:
        str: A heading line followed by the data rendered as JSON with a
        two-space indent.
    """
    heading = "\n# Here's the expressions you'll need to classify\n\n"
    payload = json.dumps(rules_to_classify, indent=2)
    return heading + payload + "\n"

System prompt (instructions) for classifying rules into Operative Rule subtypes, with their templates, from the Witt (2012) taxonomy.

In [26]:
def get_system_prompt_classify_p2(classification) -> str:
    """
    Builds the system prompt for the P2 (subtype and template) classification step.

    Looks up the subtype definitions and templates for "<classification> rules"
    through a RuleInformationProvider and embeds them in a fixed instruction
    text that asks for subtype, matching template ids, confidence and
    explanation per expression.

    Args:
        classification: Rule type name placed before " rules" in the lookup
            key. NOTE(review): presumably one of the P1 types (Party, Data,
            Activity) — confirm against the caller.

    Returns:
        str: The complete system prompt.
    """
    # NOTE(review): the data directory is hard-coded here, while other cells
    # read config['DEFAULT_DATA_DIR'] — confirm the two are meant to agree.
    rule_information_provider = RuleInformationProvider("../data")

    subclassification_text = rule_information_provider.get_classification_and_templates(f"{classification} rules")

    # The prompt is an f-string: {subclassification_text} is substituted and the
    # doubled braces render as literal braces in the JSON example. The text is
    # runtime behaviour and is kept verbatim.
    return f"""
Classify each expression from the given JSON list into one or more Operative Rules subtypes, provide an explanation for each classification, and assign a confidence value between 0 and 1.

Use the Operative Rule subtype definitions, templates, and guidelines provided to perform a thorough analysis of each expression.

# Steps

1. **Identify Operative Rule Type(s)**: For each `expression_text` provided, classify the rule type according to the given Operative Rules subtypes and templates.
    - Use provided templates, definitions, and details on each subtype to determine the correct classification.
    - Cross-reference with template examples to accurately determine the appropriate rule subtype.

2. **Assign Confidence Level**: Assess the suitability of each classification by assigning a confidence level between 0 and 1. 
    - 1 indicates a very strong match, while lower numbers indicate weaker matches.

3. **Provide Explanation**: Provide a detailed, but concise, explanation justifying why a given subtype was assigned to the expression.
    - Include reasoning related to the template structure, terminology used, or specific conditions the rule mentions.

# Operative Rule subtype

{subclassification_text}

## Definitions
- **attribute term**: A term that signifies a non-Boolean property of an entity class (or object class).
- **role term**: A term that signifies the role played by one of the participating parties or objects in a relationship: for example, employer and employee are role terms (with respect to the relationship whereby an organization employs a person), whereas organization and person are not role terms.
- **category attribute term**: A term is usually admin-defined, with some external inputs. They have unique labels (e.g., 'Cash') and may use internal codes. Boolean attributes indicate "Yes" or "No" responses, shown as checkboxes or "Y/N" fields.
- **quantitative attribute**: An attribute on which some arithmetic can be performed (e.g., addition, subtraction) and on which comparisons other than "=" and "<>" can be performed.
- **qualifying clause**: refines a rule's scope or specificity by limiting the subject or other terms to particular subsets or conditions (e.g., “for a return journey” or “that is current”).

# Output Format

The output must be provided in JSON format. Each element of the expression list must contain:
- `doc_id`: The Document ID from the input.
- `expression_id`: The Expression ID from the input.
- `expression_text`: The original expression text from the input.
- `expression_source`: The source of the expression from the input.
- `classification`: A list that may contain multiple entries, each of which should have:
- `subtype`: Assigned rule subtype, use the title of the section/subsection (e.g., "Activity time limit rules").
- `templates_ids`: A list of template IDs that matched the expression.
- `confidence`: A float indicating confidence in classification.
- `explanation`: A textual explanation detailing why the classification was appropriate.

The output JSON should look like this:

```
[
    {{
        "doc_id": "some doc id",
        "expression_id": "some id",
        "expression_text": "some text",
        "expression_source": "some source",
        "classification": [
            {{
                "subtype": "Some Subtype Title",
                "templates_ids": ["T123", "T456"],
                "confidence": 0.9,
                "explanation": "This expression restricts the occurrence of an activity during a specified time. The use of 'must not occur' clearly indicates an Activity time limit rule."
            }},
            {{
                "subtype": "Another Subtype Title",
                "templates_ids": ["T789"],
                "confidence": 0.4,
                "explanation": "There are elements of a participation restriction, but since it isn't clearly specified, the match is weak."
            }}
        ]
    }},
    {{
        "doc_id": "another doc id",
        "expression_id": "another id",
        "expression_text": "another text",
        "expression_source": "another source",
        "classification": [
            {{
                "subtype": "Subtype Title",
                "templates_ids": ["T123"],
                "confidence": 0.7,
                "explanation": "The clause dictates who is allowed to perform the task, indicating a party restriction context."
            }}
        ]
    }}
]
```

# Notes
- **Detail the Reasoning**: Ensure the explanations refer to template structure, specific terminology used, or matching requirements for the rule subtype.
- **Confidence Assessment**: Be honest about the degree of certainty in your classification, and provide meaningful values for confidence (e.g., if templates do not match perfectly but there are similarities, confidence should be moderate to low).
- **Multiple Classifications**: In cases where one expression seems to fit multiple rule subtypes, include multiple classifications with appropriate confidence ratings and explanations for each.

This output will be used to not only understand classifications but also inform next steps regarding validation and business rule structuring.
"""

In [27]:
def get_user_prompt_classify_p2(content):
    """Build the P2 user prompt that lists the expressions to classify.

    Args:
        content: JSON-serializable data (the expressions) to embed.

    Returns:
        The prompt text: a blank line, a markdown heading, a blank line,
        ``content`` rendered as JSON with a two-space indent, and a final
        newline.
    """
    heading = "# Here's the expressions you'll need to classify"
    payload = json.dumps(content, indent=2)
    # The empty first/last entries yield the leading and trailing newline.
    return "\n".join(["", heading, "", payload, ""])

##### Taxonomy and templates for terms and fact types

In [28]:
# TODO: Develop the classification functions for the two prompts

#### Execution

##### Rule classification

Processing P1 - Classify Operative Rules

User prompt (Data)

In [29]:
processor = DocumentProcessor(manager)

# Keep only the fields the P1 prompt needs, renaming "source"/"expression"
# to the "expression_*" keys used in the prompt payload.
rules_to_classify_p1 = [
    dict(
        doc_id=rule["doc_id"],
        expression_id=rule["expression_id"],
        expression_source=rule["source"],
        expression_text=rule["expression"],
    )
    for rule in processor.get_rules()
]

In [30]:
# Part 1 - Classify Operative Rules
# The user prompt carries the expressions collected in rules_to_classify_p1;
# the system prompt is built without arguments.
user_prompt = get_user_prompt_classify_p1(rules_to_classify_p1)
system_prompt = get_system_prompt_classify_p1()

logger.info("P1. Classifing Operative Rules...")
logger.debug(system_prompt)
logger.debug(user_prompt)

# Single LLM call for the whole batch. document_model asks for a list of
# ExpressionClassification items; model, temperature and token limit come
# from the "LLM" section of the configuration.
response_classify_p1 = query_instruct_llm(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    document_model=List[ExpressionClassification],
    llm_model=config["LLM"]["MODEL"],
    temperature=config["LLM"]["TEMPERATURE"],
    max_tokens=config["LLM"]["MAX_TOKENS"],
)

logger.debug(response_classify_p1)

# Register the response under id "classify_P1"; the P2 preparation cell
# retrieves it by this id and document type.
doc_1 = Document(id="classify_P1", type="llm_response_classification", content=response_classify_p1)
manager.add_document(doc_1)

# Write the manager to the configured checkpoint file.
logger.info("Saving checkpoint...")
save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)

logger.info("Finished processing classification.")

2024-11-03 23:20:00 - INFO - P1. Classifing Operative Rules...
2024-11-03 23:20:10 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-03 23:20:10 - INFO - Execution time for query_instruct_llm: 10.36 seconds
2024-11-03 23:20:10 - INFO - Saving checkpoint...
2024-11-03 23:20:10 - INFO - Checkpoint saved.
2024-11-03 23:20:10 - INFO - Finished processing classification.


Processing P2 - Classify and get templates

User prompt (Data)

System and user prompt data

Create a prompt for each type of rule (Activity, Data, Party) from P1, containing the expressions classified as that type.

In [31]:
document = manager.retrieve_document("classify_P1", doc_type="llm_response_classification")

# One record per classified expression. When P1 proposed several rule types
# for an expression, only the type with the highest confidence is kept
# (the first one wins on ties).
rules_to_classify_p2 = [
    dict(
        doc_id=entry.doc_id,
        expression_id=entry.expression_id,
        expression_source=entry.expression_source,
        expression_text=entry.expression_text,
        expression_type=max(entry.classification, key=lambda candidate: candidate.confidence).type,
    )
    for entry in document.content
]

In [35]:
user_prompt_classify_p2 = []
system_prompt_classify_p2 = []

# Group by 'expression_type'
grouped_data = defaultdict(list)
for item in rules_to_classify_p2:
    grouped_data[item["expression_type"]].append(item)

# Build one system/user prompt pair per rule type. Both lists are appended to
# in lockstep, so position i in each list belongs to the same rule type.
for item in grouped_data:
    system_prompt = get_system_prompt_classify_p2(item)
    system_prompt_classify_p2.append(system_prompt)
    user_prompt = get_user_prompt_classify_p2(grouped_data[item])
    user_prompt_classify_p2.append(user_prompt)
    print(f"token count system prompt {item}: {token_estimator.estimate_tokens(system_prompt)}")
    print(f"token count user prompt {item}: {token_estimator.estimate_tokens(user_prompt)}")

    # Store both prompts under ids that name the step (P2), the prompt role
    # and the rule type. The earlier fixed ids ("prompt-classify_P1" and
    # "prompt-classify_P2") were reused on every iteration, so -- assuming the
    # manager keys documents by (id, type) -- each rule type replaced the
    # prompts saved for the previous one, and the P2 system prompt was filed
    # under a P1 name.
    manager.add_document(
        Document(
            id=f"prompt-classify_P2-system-{item}",
            type="prompt",
            content=system_prompt
        )
    )

    manager.add_document(
        Document(
            id=f"prompt-classify_P2-user-{item}",
            type="prompt",
            content=user_prompt
        )
    )

token count system prompt Party: 2195
token count user prompt Party: 225
token count system prompt Activity: 4086
token count user prompt Activity: 383


In [36]:
# Persist the current state of `manager` to the checkpoint file named in the
# configuration.
save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)

2024-11-03 23:24:18 - INFO - Checkpoint saved.


P2. User Prompt to classify Operative Rules and get templates

Processing P2 - Subclassify Operative Rules

For each type of rule, get a response from the LLM.

In [37]:
# Part 2 - Classify and get templates for Operative Rules
logger.info(f"P2. processing {len(rules_to_classify_p2)} expressions and {len(system_prompt_classify_p2)} prompts...")
logger.info("P2. Classifying and getting templates for Operative Rules...")

# Initialize an empty list to accumulate all responses
all_responses_classify_p2 = []

# The two prompt lists were filled in lockstep (one entry per rule type), so
# zip pairs each user prompt with the system prompt of the same rule type.
for user_prompt, system_prompt in zip(user_prompt_classify_p2, system_prompt_classify_p2):
    logger.info("Processing classification and templates...")
    logger.debug(system_prompt)
    logger.debug(user_prompt)

    # One LLM call per rule type; document_model asks for a list of
    # ExpressionSubClassification items.
    response_classify_p2 = query_instruct_llm(
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        document_model=List[ExpressionSubClassification],
        llm_model=config["LLM"]["MODEL"],
        temperature=config["LLM"]["TEMPERATURE"],
        max_tokens=config["LLM"]["MAX_TOKENS"],
    )

    logger.debug(response_classify_p2)

    # Accumulate the responses in the list
    all_responses_classify_p2.extend(response_classify_p2)

    logger.info("Finished processing classification and templates.")

# After the loop, create a single Document with all the accumulated responses.
# The transformation step retrieves it by id "classify_P2" and this type.
doc_2 = Document(
    id="classify_P2",
    type="llm_response_classification",
    content=all_responses_classify_p2,
)
manager.add_document(doc_2)

# Save the checkpoint after adding the combined document
logger.info("Saving checkpoint...")
save_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"], manager=manager)


2024-11-03 23:25:01 - INFO - P2. processing 6 expressions and 2 prompts...
2024-11-03 23:25:01 - INFO - P2. Classifying and getting templates for Operative Rules...
2024-11-03 23:25:01 - INFO - Processing classification and templates...
2024-11-03 23:25:10 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-03 23:25:10 - INFO - Execution time for query_instruct_llm: 5.32 seconds
2024-11-03 23:25:10 - INFO - Finished processing classification and templates.
2024-11-03 23:25:10 - INFO - Processing classification and templates...
2024-11-03 23:25:19 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-03 23:25:19 - INFO - Execution time for query_instruct_llm: 8.55 seconds
2024-11-03 23:25:19 - INFO - Finished processing classification and templates.
2024-11-03 23:25:19 - INFO - Saving checkpoint...
2024-11-03 23:25:19 - INFO - Checkpoint saved.


##### Rule transformation

Restore checkpoint

In [62]:
# Rebind `manager` to the state restored from the configured checkpoint file.
manager = restore_checkpoint(filename=config["DEFAULT_CHECKPOINT_FILE"])

2024-11-03 23:55:17 - INFO - Checkpoint restored from ../data/checkpoints/documents-2024-11-01-3.json.


Prompt

Formulation is expressed using a template (WITT, 2012, p. 162).

In [None]:
# Explanation of the template notation (placeholders, option braces, the null
# option, nesting, repetition brackets). It is interpolated into the P3 system
# prompt under the "How to interpret the templates" heading.
rule_template_formulation = """
Each formulation is expressed using a template, in which the various symbols have the following meanings:
1. Each item enclosed in “angle brackets” ("<" and ">") is a placeholder, in place of which any suitable text may be substituted. For example, any of the following may be substituted in place of <operative rule statement subject>:
    a. a term: for example, "flight booking request",
    b. a term followed by a qualifying clause: for example, "flight booking request for a one-way journey",
    c. a reference to a combination of items: for example, "combination of enrollment date and graduation date", with or without a qualifying clause,
    d. a reference to a set of items: for example, "set of passengers", with or without a qualifying clause.
2. Each pair of braces ("{" and "}") encloses a set of options (separated from each other by the bar symbol: "|"), one of which is included in the rule statement. For example,
    a. each rule statement conforming to formulation T1, T3, or T4 starts with either "Each" or "The";
    b. a conditional clause in a rule statement conforming to formulation T1, T2, or T4 is preceded by either "if" or "unless".
3. If a pair of braces includes a bar symbol immediately before the closing brace, the null option is allowed: that is, you can, if necessary, include none of the options at that point in the rule statement. For example, each rule statement conforming to formulation T1, T2, or T4 may include or omit a conditional clause preceded by "if" or "unless".
4. Sets of options may be nested. For example, in each of the templates above
    a. a conditional clause may be included or omitted,
    b. if included, the conditional clause should be preceded by either "if" or "unless".
5. A further notation, introduced later in this section, uses square brackets to indicate that a syntactic element may be repeated indefinitely.
"""

Get expressions to transform

In [63]:
document_p2 = manager.retrieve_document("classify_P2", doc_type="llm_response_classification")


def _to_transform_record(item):
    """Flatten one P2 classification result into the record used by P3.

    Args:
        item: Mapping with the keys "doc_id", "expression_id",
            "expression_source", "expression_text" and "classification"
            (a list of mappings with "subtype", "templates_ids" and
            "confidence").

    Returns:
        A dict with the expression fields plus the subtype and template ids
        of the classification with the highest confidence (the first one
        wins on ties).

    Raises:
        ValueError: If "classification" is empty (raised by ``max``).
    """
    # Select the winner once, so the subtype and its template ids are read
    # from the same classification entry instead of two separate max() calls.
    best = max(item["classification"], key=lambda x: x["confidence"])
    return {
        "doc_id": item["doc_id"],
        "expression_id": item["expression_id"],
        "expression_source": item["expression_source"],
        "expression_text": item["expression_text"],
        "expression_subtype": best["subtype"],  # Subtype with highest confidence
        "expression_template_ids": best["templates_ids"],  # Template ids of that same subtype
    }


rules_to_transform_p3 = [_to_transform_record(item) for item in document_p2.content]


Main prompt

In [66]:
def get_system_prompt_transform_p3(rule_template_formulation, markdown_data):
    """Assemble the P3 system prompt for transforming expressions.

    Args:
        rule_template_formulation: Text explaining how to read the template
            notation; placed under "How to interpret the templates".
        markdown_data: Text of the templates to apply; placed under
            "Templates for Operative Rules".

    Returns:
        The prompt text: an instruction line, the two sections above, and a
        fixed "Output Format" section sketching the expected JSON list.
    """
    # Plain strings here, so the literal braces need no f-string escaping.
    output_format_lines = [
        "[",
        "    {",
        '        "doc_id": <doc_id>,',
        '        "expression_id": <expression_id>,',
        '        "expression_text": <expression_text>,',
        '        "expression_transformed": <expression_transformed>,',
        '        "expression_template": <expression_template>,',
        "    },",
        "    ...",
        "]",
    ]
    prompt_lines = [
        "",
        "Transform the given expressions into a structured format using the provided templates.",
        "",
        "# How to interpret the templates",
        "",
        f"{rule_template_formulation}",
        "",
        "# Templates for Operative Rules",
        "",
        f"{markdown_data}",
        "",
        "# Output Format",
        "",
        *output_format_lines,
        "",
    ]
    # Empty first/last entries give the leading and trailing newline.
    return "\n".join(prompt_lines)

User prompt

In [75]:
def get_user_prompt_transform_p3(expression, template_id):
    """Build the P3 user prompt for one expression.

    Args:
        expression: The expression to transform; rendered with ``str``
            formatting.
        template_id: The template id(s) named in the heading; rendered with
            ``str`` formatting (a list shows up in its list form).

    Returns:
        The prompt text: a blank line, the heading naming the template(s),
        a blank line, the expression, and a final newline.
    """
    heading = f"# Here's the expressions you'll need to transform using template(s) {template_id}"
    # Empty first/last entries give the leading and trailing newline.
    return "\n".join(("", heading, "", f"{expression}", ""))

Run LLM prompt for each expression

In [76]:
# Template provider constructed with the "../data" path.
rule_template_provider = RulesTemplateProvider("../data")

for item in rules_to_transform_p3:
    # Concatenate the text of every template selected for this expression.
    markdown_data = ""
    for template_id in item["expression_template_ids"]:
        markdown_data += rule_template_provider.get_rules_template(template_id)
    system_prompt = get_system_prompt_transform_p3(rule_template_formulation, markdown_data)
    user_prompt = get_user_prompt_transform_p3(item["expression_text"], item["expression_template_ids"])
    # The prompts are only printed for inspection; no LLM call is made in this cell.
    print(system_prompt)
    print(user_prompt)


Transform the given expressions into a structured format using the provided templates.

# How to interpret the templates


Each formulation is expressed using a template, in which the various symbols have the following meanings:
1. Each item enclosed in “angle brackets” ("<" and ">") is a placeholder, in place of which any suitable text may be substituted. For example, any of the following may be substituted in place of <operative rule statement subject>:
    a. a term: for example, "flight booking request",
    b. a term followed by a qualifying clause: for example, "flight booking request for a one-way journey",
    c. a reference to a combination of items: for example, "combination of enrollment date and graduation date", with or without a qualifying clause,
    d. a reference to a set of items: for example, "set of passengers", with or without a qualifying clause.
2. Each pair of braces ("{" and "}") encloses a set of options (separated from each other by the bar symbol: "|"), one

Get template

In [None]:
# Print a single template (T7) to inspect what the provider returns.
rule_template_provider = RulesTemplateProvider("../data")
# Ask the template provider created above, as the transformation loop does.
# The previous call went to `processor` (a DocumentProcessor) and left the
# provider unused.
markdown_data = rule_template_provider.get_rules_template("T7")
print(markdown_data)

#### Discussion

For the first part (prompt_classify_p1), the assigned confidence levels reflect a calibrated approach to expressions involving multiple classifications where a dominant rule type is not explicitly evident. For instance, when an expression primarily constrains data (Data rule) but also includes specific parties (Party rule), a high confidence level is attributed to Data while a moderate confidence level is applied to Party, acknowledging its secondary relevance. Similarly, expressions referencing roles such as “Secretary” or “interested person” without explicit party restrictions are assigned moderate confidence for Party classification due to interpretive ambiguity. Procedural elements that impact data handling, such as document forwarding, receive high confidence for Data rules; however, a moderate confidence level is assigned for Activity rules when procedural references are indirect. This methodology prioritizes primary rule types while accounting for the interpretive limits of secondary classifications.

In [None]:
# Deliberate failure; presumably a guard so that running the whole notebook
# stops before the cells below.
raise Exception("Stop here")

Exception: Stop here

## Validation (Move to Chapter 7 - Validation)

### DocumentProcessor

Load every checkpoint as a list of managers and iterate over them, wrapping each manager in a DocumentProcessor.

In [None]:
# TODO: Refactor the semantic annotations and validation to use the DocumentProcessor class

# Load the checkpoints found in the configured checkpoint directory; the call
# returns the managers together with a parallel list of file information.
managers, file_info_list = get_all_checkpoints(config["DEFAULT_CHECKPOINT_DIR"])

# NOTE(review): this loop rebinds the notebook-level names `manager` and
# `processor`; once it finishes they refer to the last checkpoint rather than
# the one used by the earlier cells -- confirm that is intended.
for manager, file_info in zip(managers, file_info_list):
    print(manager)
    # Process documents
    print(file_info)
    processor = DocumentProcessor(manager)

    # Access processed data
    unique_terms = processor.get_unique_terms()
    unique_names = processor.get_unique_names()
    terms = processor.get_terms()
    names = processor.get_names()
    facts = processor.get_facts()
    rules = processor.get_rules()

    # Only the unique-term and unique-name counts are reported; terms, names,
    # facts and rules are fetched but not used further in this cell.
    print(f"Unique terms: {len(unique_terms)}")
    print(f"Unique names: {len(unique_names)}")


../data/checkpoints/documents-2024-10-30-1.json
../data/checkpoints/documents-2024-11-01-3.json
documents={('§ 275.0-2', 'section'): Document(id='§ 275.0-2', type='section', content="\n§ 275.0-2 General procedures for serving non-residents.\n(a) General procedures for serving process, pleadings, or other papers on non-resident investment advisers, general partners and managing agents.  Under Forms ADV and ADV-NR [17 CFR 279.1 and 279.4], a person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents:\n  (1) A person may serve a non-resident investment adviser, non-resident general partner, or non-resident managing agent by furnishing the Commission with one copy of the process, pleadings, or papers, for each named party, and one additional copy for the Commission's records.\n  (2) If process, pleadings, or other papers 

### Validate classification

In [None]:
# Predictions and True values
predictions = response.model_dump()

In [None]:
# Convert to DataFrames
pred_df = pd.DataFrame(predictions["paragraphs"])
true_df = pd.DataFrame(true_values)

# Merge the two datasets on paragraph_id
merged_df = pd.merge(pred_df, true_df, on="id")

In [None]:
# Show the merged frame as the cell output.
merged_df

Compute confusion matrix and precision, recall, and accuracy.

In [None]:
# Compute confusion matrix
conf_matrix = confusion_matrix(merged_df['classification_y'], merged_df['classification_x'], labels=labels)

# Compute precision, recall, and accuracy
precision = precision_score(merged_df['classification_y'], merged_df['classification_x'], average='weighted')
recall = recall_score(merged_df['classification_y'], merged_df['classification_x'], average='weighted')
accuracy = accuracy_score(merged_df['classification_y'], merged_df['classification_x'])
f1 = f1_score(merged_df['classification_y'], merged_df['classification_x'], average='weighted')

disp = ConfusionMatrixDisplay.from_predictions(merged_df['classification_y'], merged_df['classification_x'], labels=labels)

print(f"precison: {precision}, recall: {recall}, accuracy: {accuracy}, f1_score: {f1}")

## Classify (P6)

### Classify rule type

In [None]:
# Schema of one classified paragraph: its rule type, its text and its id.
# All three fields are required (Field(...)). Documented with comments rather
# than a docstring so the pydantic model definition itself stays unchanged.
class ParagraphData(BaseModel):
    type: str = Field(..., description="Type of rule according to the definition.")
    text: str = Field(..., description="The text of the given paragraph.")
    id: str = Field(..., description="The id of the given paragraph.")

# Container model for a list of ParagraphData items (empty by default); it is
# passed as response_model to the LLM query further down.
class ParagraphDataset(BaseModel):
    paragraphs: List[ParagraphData] = []

Paragraphs to classify:

In [None]:
# Sample rule statements to classify, one `'<id>', '<text>',` pair per line.
# The string is passed as the user prompt of the LLM query below; the ids
# match the entries of true_values.
document1 = """
'R70', 'A senior passenger is by definition a passenger whose age is at least 70 years at the time of travel.',
'R73', 'End of financial year is by definition June 30.',
'R76', 'A payment is by definition one of the following: a cash payment, a credit card payment, or an electronic funds transfer payment.',
'R85', '1 ft is by definition equal to 12 in.',
'R91', 'Each flight booking confirmation must specify exactly one travel class for each flight.',
'R98', 'Each flight booking confirmation must specify exactly one of the following: a postal address, an e-mail address, or a fax number.',
'R99', 'A flight booking request for a one-way journey must not specify a return date.',
'R130', 'Online check-in for a flight may occur only during the 24 h before the departure time of that flight.',
'R135', 'A driver must not operate any vehicle if that driver is intoxicated.',
'R140', 'A person may travel alone only if the age of that person is at least 2 years.',
'R78', 'The status of an employee is by definition one of the following: probational, permanent, or temporary.',
'R349', 'Each flight booking confirmation for an international journey must specify for each passenger specified in that flight booking request a passport number or a visa number but not both.',
'R75', 'A person is by definition either an adult or a minor.',
'R93', 'Each combination of departure date, flight number, and departure city must be allocated exactly one departure time.',
'R96', 'Each flight booking confirmation must specify a mobile phone number, an e-mail address, or both.',
'R143', 'A person may be rostered on a flight crew only if that person holds an airline transport pilot license that is current and a type endorsement that is current for each aircraft type to be flown by that flight crew.',
'R142', 'A passenger may be allocated to a seat in an exit row only if that passenger is able to open an aircraft door.',
'R77', 'The gender of a person is by definition either male or female.',
'R348', 'Each customer complaint must specify a mobile phone number, an e-mail address, or both.'
"""

True values

In [None]:
# Expected rule type for each rule id in document1; used as the ground truth
# when the predictions are merged on "id" in the validation cells.
true_values = [
    {"id": "R70", "type": "Definitional"},
    {"id": "R73", "type": "Definitional"},
    {"id": "R76", "type": "Definitional"},
    {"id": "R85", "type": "Definitional"},
    {"id": "R91", "type": "Data"},
    {"id": "R98", "type": "Data"},
    {"id": "R99", "type": "Data"},
    {"id": "R130", "type": "Activity"},
    {"id": "R135", "type": "Activity"},
    {"id": "R140", "type": "Party"},
    {"id": "R78", "type": "Definitional"},
    {"id": "R349", "type": "Data"},
    {"id": "R75", "type": "Definitional"},
    {"id": "R93", "type": "Data"},
    {"id": "R96", "type": "Data"},
    {"id": "R143", "type": "Party"},
    {"id": "R142", "type": "Party"},
    {"id": "R77", "type": "Definitional"},
    {"id": "R348", "type": "Data"}
]
# Label order passed to the confusion-matrix calls.
labels = ["Definitional", "Data", "Activity", "Party"]

Choose prompt to test:

In [None]:
# Select the system prompt variant to evaluate.
# NOTE(review): prompt_b is not defined in the visible part of the notebook --
# confirm it exists before running this cell.
system_prompt = prompt_b

# Echo the chosen prompt as the cell output.
system_prompt

In [None]:
# Send the sample paragraphs (user prompt) with the selected system prompt;
# response_model asks for a ParagraphDataset.
# NOTE(review): the import cell only brings in query_instruct_llm -- confirm
# that query_llm is defined elsewhere in the notebook.
response = query_llm(user_prompt=document1, system_prompt=system_prompt, response_model=ParagraphDataset)

In [None]:
# Show the classified paragraphs as the cell output.
response.paragraphs

#### Validation


Set predictions.

In [None]:
# Predictions and True values
predictions = response.model_dump()

Convert to dataframe.

In [None]:
# Convert to DataFrames
pred_df = pd.DataFrame(predictions["paragraphs"])
true_df = pd.DataFrame(true_values)

# Merge the two datasets on paragraph_id
merged_df = pd.merge(pred_df, true_df, on="id")

In [None]:
# Show the merged frame as the cell output.
merged_df

Compute confusion matrix and precision, recall, and accuracy.

In [None]:
# Compute confusion matrix
conf_matrix = confusion_matrix(merged_df['type_y'], merged_df['type_x'], labels=labels)

# Compute precision, recall, and accuracy
precision = precision_score(merged_df['type_y'], merged_df['type_x'], average='weighted')
recall = recall_score(merged_df['type_y'], merged_df['type_x'], average='weighted')
accuracy = accuracy_score(merged_df['type_y'], merged_df['type_x'])
f1 = f1_score(merged_df['type_y'], merged_df['type_x'], average='weighted')

disp = ConfusionMatrixDisplay.from_predictions(merged_df['type_y'], merged_df['type_x'], labels=labels)

print(f"precison: {precision}, recall: {recall}, accuracy: {accuracy}, f1_score: {f1}")

### Classify rule sub-type

In [None]:
# Schema of one paragraph for sub-type classification. It redefines the
# earlier ParagraphData with three extra required fields: subtype, templates
# and examples. Documented with comments rather than a docstring so the
# pydantic model definition itself stays unchanged.
class ParagraphData(BaseModel):
    type: str = Field(..., description="The type of the rule.")
    text: str = Field(..., description="The text of the given paragraph.")
    id: str = Field(..., description="The id of the given paragraph.")
    subtype: str = Field(..., description="The sub-type of the rule.")
    templates: list[str] = Field(..., description="The templates of the rule.")
    examples: list[str] = Field(..., description="The examples of the rule.")

# Container model for a list of ParagraphData items (empty by default),
# redefined here so it refers to the sub-type version of ParagraphData.
class ParagraphDataset(BaseModel):
    paragraphs: List[ParagraphData] = []

In [None]:
prompt_a = """
You are an expert in SBVR (Semantics of Business Vocabulary and Business Rules). You'll be asked to classify paragraphs into types of rules. You'll be given a definition of the types of rules.

Given the definition of types of rules, and their templates, classify each rule by their sub-types. see template syntax to interpret the templates.
# Template and subtemplate sintaxe
1. Each item enclosed in angle brackets (\< and \>) is a placeholder, in place of which any suitable text may be substituted. For example, any of the following may be substituted in place of \<operative rule statement subject\>:
	- a term: for example, "flight booking request",
	- a term followed by a qualifying clause: for example, "flight booking request for a one-way journey",
	- a reference to a combination of items: for example, "combination of enrollment date and graduation date", with or without a qualifying clause,
	- a reference to a set of items: for example, "set of passengers", with or without a qualifying clause.
2. Each pair of braces ({ and }) encloses a set of options (separated from each other by the bar symbol: |), one of which is included in the rule statement. For example,
	- each rule statement starts with either "Each" or "The";
	- a conditional clause in a rule statement is preceded by either "if" or "unless".
3. If a pair of braces includes a bar symbol immediately before the closing brace, the null option is allowed: that is, you can, if necessary, include none of the options at that point in the rule statement. For example, each rule statement may include or omit a conditional clause preceded by "if" or "unless".
4. Sets of options may be nested. For example, in each of the templates above
	- a conditional clause may be included or omitted,
	- if included, the conditional clause should be preceded by either "if" or "unless".
5. A further notation, introduced later in this section, uses square brackets to indicate that a syntactic element may be repeated indefinitely.
6. Any text not enclosed in either angle brackets or braces (i.e., "must", "not", "may", and "only") is included in every rule statement conforming to the relevant template.
7. Subtemplate has a designator before the symbol "::=". The text after the symbol "::=" can be substituted in place of any placeholder (in a template or subtemplate) that has the same designator.

# Definition
## 9.2 Definitional rules
Definitional rules constrains how we define a construct created or used by the organization or the industry within which it operates. Definitional rules can in turn be categorized as:
### 9.2.1 Formal term definitions:
A formal term definition defines a particular business term in a formal manner. They are categorized as:
#### 9.2.1.1 Formal intensional definitions
A formal intensional definition defines the subject business term using an intensional definition: one that cites both a hypernym (a term that refers to a superset of the set referred to by the original term) and the characteristics that distinguish members of the set referred to by the original term.
T7.
```template
{A|An} <term 1>
	{of {a|an} <term 2>| }
is by definition
{a|an|the} <term 3>
	<qualifying clause>.
```
#### 9.2.1.2 Formal extensional definitions
Formal extensional definition defines the subject business term by using an extensional definition: one that lists a complete set of hyponyms (terms that refer to subsets of the set referred to by the original term).
T8.
```template
{A|An} <term 1>
	{of {a|an} <term 2>| }
is by definition
[<article> <term 3>, or]
	{of that <term 2>| }.
```
#### 9.2.1.3 Symbolic literal definitions
A symbolic literal definition defines the subject business term using one or more literals.
T9.
```template
{<literal 1>|{A|An} <term 1>
	{of {a|an} <term 2>| }}
is by definition
{<literal 2>|
[<literal 3>, or] from a <literal 4> to the following <literal 5>}.
```
### 9.2.2 Categorization scheme enumerations
A categorization scheme enumeration defines the members of a categorization scheme that is both mutually exclusive and jointly exhaustive.
T10.
```template
{{A|An} <category attribute term>|
The <category attribute term>
	of {a |an} <entity class term>}
is by definition
{either <literal 1> or <literal 2>|
one of the following: [<literal 3>, or]}.
```
### 9.2.3 Category transition constraints
A category transition constraint specifies allowed or disallowed transitions between categories or statuses.
T11.
```template
A transition
	of the <category attribute term> of {a|an} <entity class term>
	from {<literal 1>| [<literal 2>, or]}
	to {<literal 3>| [<literal 4>, or]}
is by definition
impossible.
```
### 9.2.4 Complex concept structure rules
A complex concept structure rule defines a particular constraint on one or more components of a complex concept. They are categorized as:
#### 9.2.4.1 Complex concept cardinality rules
A complex concept cardinality rule defines the number of (or minimum and/or maximum number of) components of a particular type within a particular concept.
T12.
```template
{A|An} <term 1>
<verb phrase> by definition
{<cardinality>|at most <positive integer>} <term 2>
	{{for |in} {each|the} <term 3>| }.
```
#### 9.2.4.2 Complex concept equivalence rules
A complex concept equivalence rule defines a pair of components within a particular concept that are of necessity the same.
T13.
```template
The <term 1>
	<qualifying clause 1>
is by definition
the same as the <term 2>
	<qualifying clause 2>.
```
#### 9.2.4.3 Complex concept set constraints
A complex concept set constraint defines two sets of components within a particular concept that must be identical.
T14.
```template
The set of <term 1>
	<qualifying clause 1>
is by definition
the same as the set of <term 1>
	<qualifying clause 2>.
```
### 9.2.5 Valid value definitions
A valid value definition defines the valid values of a particular measure as a range or (occasionally) as a list of discrete values.
T15.
```template
{The| } <attribute term>
	{of {a|an} <entity class term>| }
is by definition
{<inequality operator> <literal 1>
	{and <inequality operator> <literal 2>| } |
	[<literal 3>, or]}.
```
### 9.2.6 Data calculation rules
A data calculation rule defines the algorithm or formula for a particular quantity or a conversion factor between two units. They are categorized as:
#### 9.2.6.1 Data calculation algorithms
A data calculation algorithm defines how a particular quantity or amount (whether for operational purposes, such as a fee, or for business intelligence purposes, such as a performance measure) is calculated.
T16.
```template
{The| } <attribute term>
	{of | for} {a|an} <entity class term>
	{<qualifying clause>| }
is by definition calculated as
<expression>.
```
#### 9.2.6.2 Conversion factor definitions
A conversion factor definition defines a conversion factor between two units of measurement.
T17.
```template
<literal 1>
is by definition {approximately | } equal to
<literal 2>.
```
### 9.2.7 Standard format definitions
A standard format definition defines the standard format for data items of a particular type in terms of individual characters and/or component data items.
T18.
```template
A valid <term>
is by definition composed of
<format definition>.
```
## 9.3 Data rules
Data rules (all of which are operative rules) constrains the data included in a transaction (a form or message) or a persistent dataset (e.g., a database record). Data rules can in turn be categorized as:
### 9.3.1 Data cardinality rules
A data cardinality rule requires the presence or absence of a data item and/or places a restriction on the maximum or minimum number of occurrences of a data item
#### 9.3.1.1 Mandatory data rules
A mandatory data rule mandates the presence of data:
##### 9.3.1.1.1 Mandatory data item rules
A mandatory data item rule requires that a particular data item be present.
T19.
```template
Each <transaction signifier>
must {specify|contain} <cardinality> <data item term>
	{{in| for} {each|the} <subform term> {(if any)| }
	{<qualifying clause>| } | }
{{if |unless} <conditional clause>| }.
```
##### 9.3.1.1.2 Mandatory option selection rules
A mandatory option selection rule requires that one of a set of pre-defined options be specified.
with two or more options:
T20.
```template
Each <transaction signifier>
must
{({if |unless} <conditional clause>) | }
specify whether {it |{the |each} <term>
	{<qualifying clause>| }}
<verb phrase> [<object>, or].
```
with a single option:
T21.
```template
Each <transaction signifier>
must
{({if |unless} <conditional clause>) | }
specify whether {or not| } {it |{the |each} <term>
	{<qualifying clause>| }}
<verb phrase> {<object>| }.
```
##### 9.3.1.1.3 Mandatory group rules:
A mandatory group rule requires that at least one of a group of data items be present.
two data items in the group:
T22.
```template
Each <transaction signifier>
must {specify|contain}
	{{in| for} {each|the} <subform term> {(if any)| }
	{<qualifying clause>| } | }
	{a|an} <data item term 1>, {a|an} <data item term 2>
	{, or|but not} both
{{if |unless} <conditional clause>| }.
```
more than two data items in the group:
T23.
```template
Each <transaction signifier>
must
{({if |unless} <conditional clause>) | }
{specify |contain}
	{{in| for} {each|the} <subform term> {(if any)| }
	{<qualifying clause>| } | }
	<cardinality> of the following:
	[<data item term>, or].
```
#### 9.3.1.2 Prohibited data rules
A prohibited data rule mandates the absence of some data item in a particular situation.
T24.
```template
{A|An} <transaction signifier>
must not {specify |contain} a <data item term>
	{{in | for} {any|the} <subform term> {(if any)| }
	{<qualifying clause>| } | }
{{if |unless} <conditional clause>| }.
```
#### 9.3.1.3 Maximum cardinality rules
A maximum cardinality rule places an upper limit (usually but not necessarily one) on how many instances of a particular data item there may be.
T25.
```template
{A|An} <transaction signifier>
must not {specify |contain} more than <positive integer>
	<data item term>
	{{in | for} {any one|the} <subform term> {(if any)| }
	{<qualifying clause>| } | }
{{if |unless} <conditional clause>| }.
```
#### 9.3.1.4 Multiple data rules
A multiple data rule mandates the presence of two or more instances of a particular data item in a particular situation.
T19.
```template
Each <transaction signifier>
	must {specify|contain} <cardinality> <data item term>
	{{in| for} {each|the} <subform term> {(if any)| }
	{<qualifying clause>| } | }
{{if |unless} <conditional clause>| }.
```

In these rule statements, `<cardinality>` may only take one of the following forms:
1. exactly `<positive integer>`, where `<positive integer>` is at least two;
2. at least `<positive integer>`, where `<positive integer>` is at least two;
3. at least `<positive integer 1>` and at most `<positive integer 2>`, where `<positive integer 1>` is at least two.
#### 9.3.1.5 Dependent cardinality rules
A dependent cardinality rule mandates how many of a particular data item must be present based on the value of another data item.
T26.
```template
The number of <data item term 1>
	{specified|contained}
	{{in| for} {the|each} <subform term> {(if any) | } | }
	in each <transaction signifier>
must be {{no|} {more|less} than|equal to} the <data item term 2>
	{<qualifying clause>| }
{{if |unless} <conditional clause>| }.
```
### 9.3.2 Data content rules
A data content rule places a restriction on the values contained in a data item or set of data items (rather than whether they must be present and how many there may or must be).
#### 9.3.2.1 Value set rules
A value set rule requires either: that the content of a data item be (or not be) one of a particular set of values (either a fixed set, or a set that may change over time), or; that the content of a combination of data items match or not match a corresponding combination in a set of records;
##### 9.3.2.1.1 Value set rules constraining single data items
T27.
```template
{The|Each} <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
must be
	{{other than| } one of the <term> <qualifying clause>| [<literal>, or]}
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.1.2 Value set rules constraining combinations of data items
T28.
```template
{The|Each} combination of [<data item term 1>, and] {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
must be one of the combinations of [<data item term 2>, and]
	{<qualifying clause>| }
{{if |unless} <conditional clause>| }.
```
#### 9.3.2.2. Range rules
A range rule requires that the content of a data item be a value within a particular inclusive or exclusive single-bounded or double-bounded range.
T29.
```template
{The|Each} <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
must be <inequality operator> <object> {and <inequality operator> <object>| }
{{if |unless} <conditional clause>| }.
```
#### 9.3.2.3 Equality rules
An equality rule requires that the content of a data item be the same as or not the same as that of some other data item.
T30.
```template
{The|Each} <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
must be <equality operator> <object>
{{if |unless} <conditional clause>| }.
```
#### 9.3.2.4 Uniqueness constraints
A uniqueness constraint requires that the content of a data item (or combination or set of data items) be different from that of the corresponding data item(s) in the same or other records or transactions;
##### 9.3.2.4.1 Uniqueness constraints constraining single data items.
T31.
```template
{The|Each} <data item term 1> {(if any)| }
	<verb part> {the <subform term 1> {(if any)| }
	in|} each <transaction signifier 1>
	{<qualifying clause 1>| }
must be different from the <data item term 1>
	<verb part> {{the |any other} <subform term 1> {(if any)| }
	in| } {that |any other} <transaction signifier 1>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.4.2 Uniqueness constraints constraining combinations of data items
T32.
```template
{The|Each} combination of [<data item term 1>, and] {(if any)| }
	<verb part> {the <subform term 1> {(if any)| }
	in| } each <transaction signifier 1>
	{<qualifying clause 1>| }
must be different from the combination of [<data item term 1>, and]
	<verb part> {{the|any other} <subform term 1> {(if any) | }
	in| } {that |any other} <transaction signifier 1>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.4.3 Uniqueness constraints constraining sets of data items
T33.
```template
{The|Each} set of <data item term 1> {(if any)| }
	<verb part> {the <subform term 1> {(if any)| }
	in|} each <transaction signifier 1>
	{<qualifying clause 1>| }
must be different from the set of <data item term 1>
	<verb part> {{the |any other} <subform term 1> {(if any)| }
	in| } {that |any other} <transaction signifier 1>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
#### 9.3.2.5 Data consistency rules
A data consistency rule requires the content of multiple data items to be consistent with each other, other than as provided for by a value set rule, range rule, or equality rule;
##### 9.3.2.5.1 Data consistency rules constraining a combination of data items
T34.
```template
{The|Each} combination of [<data item term>, and] {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
	{<qualifying clause>| }
must be such that <conditional clause 1>
{{if |unless} <conditional clause 2>| }.
```
##### 9.3.2.5.2 Data consistency rules constraining a set function
T35.
```template
The <set function> of {the| } <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
	{<qualifying clause>| }
must be {<inequality operator>|<equality operator>} <object>
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.5.3 Data consistency rules constraining a set
T36.
```template
{The|Each} set of <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
	{<qualifying clause 1>| }
must {be {the same as| different from} |include} the set of <term>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
#### 9.3.2.6 Temporal data constraints
A temporal data constraint constrains one or more temporal data items (data items that represent time points or time periods). There are various subcategories of temporal constraint:
##### 9.3.2.6.1 Simple temporal data constraints
A simple temporal data constraint requires that a particular date or time fall within a certain temporal range.
T29.
```template
{The|Each} <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
must be <temporal inequality operator> <object> {and <temporal inequality operator> <object>| }
{{if |unless} <conditional clause>| }.
```
R373.
```example
The departure time of the outgoing flight
	specified in each flight booking confirmation
	that is made online
must be no earlier than 3 h
	after the booking confirmation time
	of that flight booking confirmation.
```
##### 9.3.2.6.2 Temporal data non-overlap constraints
Temporal data non-overlap constraint requires that the time periods specified in a set of records do not overlap each other.
T37.
```template
{The|Each} <time period term 1> {(if any)| }
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in each <transaction signifier 1>
	{<qualifying clause 1>| }
must not overlap the <time period term 1>
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in any other <transaction signifier 1>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.6.3 Temporal data completeness constraints
A temporal data completeness constraint requires that the time periods specified in a set of records be contiguous and between them completely span some other time period.
T38.
```template
Each <time period term 1>
	within the <time period term 2> {(if any)| }
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in each <transaction signifier 1>
	{<qualifying clause 1>| }
must be within the <time period term 3>
	specified {{in| for} {the|each} <subform term 2> {(if any)| } | }
	in <cardinality> <transaction signifier 2>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.6.4 Temporal data inclusion constraints
A temporal data inclusion constraint requires that the time periods specified in a set of records do not fall outside some other time period.
T38.
```template
Each <time period term 1>
	within the <time period term 2> {(if any)| }
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in each <transaction signifier 1>
	{<qualifying clause 1>| }
must be within the <time period term 3>
	specified {{in| for} {the|each} <subform term 2> {(if any)| } | }
	in <cardinality> <transaction signifier 2>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.6.5 Temporal single record constraints
A temporal single record constraint requires that a temporal state of affairs be recorded using a single record rather than multiple records.
single data item is involved:
T39.
```template
{The|Each} <data item term 1> {(if any)| }
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in each <transaction signifier 1>
must be different from the <data item term 1>
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in the latest of the earlier <transaction signifier 1>
{{if |unless} <conditional clause>| }.
```
combination of data items is involved:
T40.
```template
{The|Each} combination of [<data item term 1>, and] {(if any)| }
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in each <transaction signifier 1>
must be different from the combination of [<data item term 1>, and]
	specified {{in| for} {the|each} <subform term 1> {(if any)| } | }
	in the latest of the earlier <transaction signifier 1>
{{if |unless} <conditional clause>| }.
```
##### 9.3.2.6.6 Day type constraints
A day type constraint restricts a date to one or more days of the week or a particular type of day such as a working day (typically but not necessarily any day other than a Saturday, Sunday, or public holiday).
T41.
```template
{The|Each} <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
must be a {<term>|<literal 1>| [<literal 2>, or]}
{{if |unless} <conditional clause>| }.
```
#### 9.3.2.7 Spatial data constraints
A spatial data constraint prescribes or prohibits relationships between data items representing spatial properties (points, line segments or polygons).
T42.
```template
{The|Each|A|An} <spatial term 1> {(if any)| }
	<qualifying clause 1>
must {not| } <spatial operator> the <spatial term 2>
	<qualifying clause 2>
{{if |unless} <conditional clause>| }.
```
#### 9.3.2.8 Data item format rules
A data item format rule specifies the required format of a data item.
T43.
```template
The <data item term> {(if any)| }
	specified {{in| for} {the|each} <subform term> {(if any)| } | }
	in each <transaction signifier>
must be {represented using| } a valid <term>
{{if |unless} <conditional clause>| }.
```
### 9.3.3 Data update rules
A data update rule either prohibits update of a data item or places restrictions on the new value of a data item in terms of its existing value. There are three subcategories of data update rule:
#### 9.3.3.1 Data update prohibition rules
A data update prohibition rule prohibits update of a particular data item or set of data items.
non-transferable relationships:
T44.
```template
{A|An} <transaction signifier 1>
must not be transferred
	from one <transaction signifier 2> to another <transaction signifier 2>
{{if |unless} <conditional clause>| }.
```
other data update:
T45.
```template
{The|A|An} <data item term> {(if any)| }
	{{in |for} {any|the} <subform term> {(if any)| } | }
	{in|of} a <transaction signifier>
must not be updated
{{if |unless} <conditional clause>| }.
```
#### 9.3.3.2 State transition constraints
A state transition constraint limits the changes in a data item to a set of valid transitions.
T46.
```template
The <data item term> {(if any)| }
	{{in |for} {any|the} <subform term> {(if any)| } | }
	{in|of} a <transaction signifier>
may be updated to {<literal 1>| [<literal 2>, or]}
only if <conditional clause>.
```
#### 9.3.3.3 Monotonic transition constraints
A monotonic transition constraint requires that a numeric value either only increase or only decrease.
T47.
```template
The <data item term> {(if any)| }
	{{in| for} {any|the} <subform term> {(if any)| } | }
	{in |of} a <transaction signifier>
must not be {increased|decreased}
{{if |unless} <conditional clause>| }.
```
## 9.4 Activity rules
Activity rules (all of which are operative rules) constrain the operation of one or more business processes or other activities. Activity rules can in turn be categorized as:
### 9.4.1 Activity restriction rules
An activity restriction rule restricts a business process or other activity in some way. There are various subcategories of activity restriction rules:
#### 9.4.1.1 Rules restricting when an activity can occur
Many activity restriction rules place time restrictions on activities.
##### 9.4.1.1.1 Activity time limit rules
An activity time limit rule restricts a business process or other activity to within a particular time period.
T48.
```template
{The| } <process term> {of | for} {a|an} <object term>
	{<qualifying clause>| }
{must {not| } occur|may occur only}
<time restriction 1> {{and| or} <time restriction 2>| }
{{if |unless} <conditional clause>| }.
```
##### 9.4.1.1.2 Activity exclusion period rules
An activity exclusion period rule prohibits a business process or other activity during a particular time period.
T49.
```template
{Each|A|An} <term>
	{<qualifying clause 1>| }
{must {not| } <verb phrase 1> {<object 1>| }
	{<qualifying clause 2>| } |
may <verb phrase 2> {<object 2>| }
	{<qualifying clause 3>| } only}
<time restriction 1> {{and| or} <time restriction 2>| }
{{if |unless} <conditional clause>| }.
```
##### 9.4.1.1.3 Activity obligation rule
An activity obligation rule requires a business process or other activity to occur either within a maximum time after a particular event (such as the completion of some other process) or as soon as practical after a particular event.
T49.
```template
{Each|A|An} <term>
	{<qualifying clause 1>| }
{must {not| } <verb phrase 1> {<object 1>| }
	{<qualifying clause 2>| } |
may <verb phrase 2> {<object 2>| }
	{<qualifying clause 3>| } only}
<time restriction 1> {{and| or} <time restriction 2>| }
{{if |unless} <conditional clause>| }.
```
#### 9.4.1.2 Activity pre-condition rules
An activity pre-condition rule prohibits a business process or other activity unless some other activity or event has previously occurred or some prerequisite condition exists.
T50.
```template
{A|An} <subject term>
	{<qualifying clause>| }
may <verb phrase> {<object>| }
only {<time restriction>| if <conditional clause>}.
```
#### 9.4.1.3 Activity prohibition rules
An activity prohibition rule prohibits a business process or other activity if some event or other process has previously occurred or some dangerous or illegal condition exists.
T51.
```template
{A|An} <subject term>
	{<qualifying clause>| }
must not <verb phrase> {<object>| }
if <conditional clause>.
```
#### 9.4.1.4 Information retention rules
An information retention rule defines the minimum period for which a particular type of information is retained.
T49.
```template
{Each|A|An} <term>
	{<qualifying clause 1>| }
{must {not| } <verb phrase 1> {<object 1>| }
	{<qualifying clause 2>| } |
may <verb phrase 2> {<object 2>| }
	{<qualifying clause 3>| } only}
<time restriction 1> {{and| or} <time restriction 2>| }
{{if |unless} <conditional clause>| }.
```
#### 9.4.1.5 Activity conflict rules
An activity conflict rule restricts the simultaneous occurrence of multiple processes or other activities.
R136.
```example
R136. A folder
must not be renamed
while any file within that folder is open for editing.
```
### 9.4.2 Process decision rules
A process decision rule determines what action a business process or device is to take in specific situations.
### 9.4.3 Activity obligation rules
An activity obligation rule requires a business process or other activity to occur either within a maximum time after a particular event (such as the completion of some other process) or when particular conditions apply.
T52.
```template
Each <actor term>
must <verb phrase> {<object>| }
	{<qualifying clause>| }
{{if |unless} <conditional clause>| }.
```
## 9.5 Party rules
Party rules (all of which are operative rules) restrict the parties who can perform a process or activity or play a role. Party rules can in turn be categorized as:
### 9.5.1 Party restriction rules
A party restriction rule places restrictions on who can perform some processes or activities or play some roles, based on age, some other physical characteristic or capability, or training, testing, and certification in the appropriate skills.
T53.
```template
A <party signifier 1>
	{<qualifying clause>| }
may <predicate 1>
only if {the <attribute signifier> of| } that <party signifier 1>
	<predicate 2>.
```
### 9.5.2 Role separation and binding rules
A role separation rule prohibits the same party from performing two activities.
T54.
```template
The <party signifier 1>
	<qualifying clause 1>
must {not| } be {the same|one of the} <party signifier 1>
	<qualifying clause 2>
{{if |unless} <conditional clause>| }.
```
### 9.5.3 Information access rules
An information access rule defines who can view, create, or update particular information.
T55.
```template
{The|A|An} <information signifier>
	<qualifying clause>
may be <information access process> by
only {<object 1>| [<object 2>, or]}
{{if |unless} <conditional clause>| }.
```
### 9.5.4 Responsibility rules
A responsibility rule defines who is responsible for performing a particular process or liable for a particular fee, duty, or tax.
T56.
```template
{The|A|An| } <responsibility signifier>
	{<qualifying clause 1>| }
must <verb phrase> {the |a|an} <party signifier>
	{<qualifying clause 2>| }
{{if |unless} <conditional clause>| }.
```
### Subtemplates
S1.
```subtemplate
<operative rule statement subject>::=
{<term>|combination of [<term>, and]|set of <term>}
{<qualifying clause>|}
```
S2.
```subtemplate
<article>::= {a|an|the}
```
S3.
```subtemplate
<cardinality>::=
{exactly|at least {<positive integer 1> and at most| }}
<positive integer 2>
```
S4.
```subtemplate
<determiner>::=
{<article>|each|that |those|
<cardinality>|at most <positive integer>}
```
S5.
```subtemplate
<set function>::=
{number |sum| total |maximum|minimum|average|mean|median|
latest | earliest}
```
S6.
```subtemplate
<inequality operator>::=
{{no|} {more|less | later | earlier} than|
at {least |most} <literal> {more| later} than|
{no|} {later | earlier} than <literal> {after |before}}
```
S7.
```subtemplate
<equality operator>::=
{the same as| different from|equal to|unequal to}
```
S8.
```subtemplate
<transaction signifier>::=
{<term>|{record of a| } combination of [<term>, and]}
{<qualifying clause>| }
```
S9.
```subtemplate
<verb part>::=
{<participle>|<adjective>| } <preposition>
```
S10.
```subtemplate
<predicate>::=
{{<verb phrase>| is {<equality operator>|<inequality operator>}}
<object>|
<verb phrase> {[<object>, and] | [<object>, or] | }}
```
S11.
```subtemplate
<object>::=
{{<determiner>|the <set function> of {<determiner>| } | } <term>
{<qualifying clause>| } |
{<determiner>| } <literal>}
```
S12.
```subtemplate
<expression>::=
{<object>|
<set function> of {<determiner>| } <term> {<qualifying clause>| } |
<expression> {plus|minus|multiplied by|divided by} <expression>|
{sum|product} of [<expression>, and] |
{square|cube} {root |} of <expression>}
```
S13.
```subtemplate
<conditional clause>::=
{{<determiner> <term> {<qualifying clause>| } |<expression>| it}
{<predicate>| [<predicate> and]| [<predicate> or]} |
[<conditional clause> and] | [<conditional clause> or]}
```
S14.
```subtemplate
<qualifying clause>::=
{{that |who} <verb phrase> {<object>| } |
<verb part> <object>|
other than {<object>| [<object>, or]} |
{<preposition> {which|whom}|whose} <conditional clause>|
{that |who} <verb phrase> {that | if |whether} <conditional clause>|
<and-qualifying clause>|
<or-qualifying clause>|
<both-and-qualifying clause>|
<either-or-qualifying clause>}
```
S15.
```subtemplate
<and-qualifying clause>::=
{that |who}
{[<verb phrase> {<object>| } and]|
is [<verb part> {<object>| } and]
<verb phrase> [<object> and]}
```
S16.
```subtemplate
<both-and-qualifying clause>::=
{that |who}
{both <verb phrase> {<object>| } and <verb phrase> {<object>| } |
{is |are} both <verb part> {<object>| } and <verb part> {<object>| } |
<verb phrase> both <object> and <object>}
```
S17.
```subtemplate
<or-qualifying clause>::=
{that |who}
{[<verb phrase> {<object>| } or] |
is [<verb part> {<object>| } or]
<verb phrase> [<object> or]}
```
S18.
```subtemplate
<either-or-qualifying clause>::=
{that |who}
{either <verb phrase> {<object>|} or <verb phrase> {<object>| } |
{is |are} either <verb part> {<object>|} or <verb part> {<object>| } |
<verb phrase> either <object> or <object>}
```
S19.
```subtemplate
<format definition>::=
{{exactly|at least |up to} <positive integer 1>|
from <positive integer 2> to <positive integer 3>} <term>
{followed by <format definition>| }
```
S20.
```subtemplate
<spatial operator>::=
{overlap|be within|enclose|span|intersect |meet|be on}
```
S21.
```subtemplate
<time restriction>::=
{at any time| }
{{before| after |during| until |within| {no|} {earlier | later} than} <object>| }
{{before| after |during| until |within|on} <object>|
{before| after |while| until} <conditional clause>}
```
S22.
```subtemplate
<information access process>::=
{viewed|created|updated|deleted}
```

Return a JSON array with one object per paragraph, each with the following fields: id, text, type, subtype, templates, examples.

 If the query is invalid, return an empty classification.

 Here's an example
 [
    {
        "id": "some id",
        "text": "some text",
        "type": "Definitional",
        "subtype": "Formal intensional definitions",
        "templates": ["T7"],
        "examples": ["R70"]
    },
    {
        "id": ...
        "text": ...
        "type": ...
        "subtype": ...
        "templates": ...
        "examples": ...
    },

]

Here are the paragraphs you'll need to classify:
"""

In [None]:
prompt_b = """
You are an expert in SBVR (Semantics of Business Vocabulary and Business Rules). You'll be asked to classify paragraphs into types of rules. You'll be given a definition of the types of rules.

Given the definitions of the types of rules, classify each rule into one of the following subtypes.

Hints:
- If a section has no templates, look in the next section. E.g. 9.2.1 has no template, so look at 9.2.1.1, it has a template T7;
- If a section has more than one template, check when to apply each template. E.g. 9.3.1.1.2 has two templates: T20 applies to rules with two or more options and T21 to rules with a single option;
- The subtype is the section name.

# Definition

## 9.2 Definitional rules
Definitional rules constrain how we define a construct created or used by the organization or the industry within which it operates. Definitional rules can in turn be categorized as:

### 9.2.1 Formal term definitions:
A formal term definition defines a particular business term in a formal manner. They are categorized as:

#### 9.2.1.1 Formal intensional definitions
A formal intensional definition defines the subject business term using an intensional definition: one that cites both a hypernym (a term that refers to a superset of the set referred to by the original term) and the characteristics that distinguish members of the set referred to by the original term.
Template: T7

#### 9.2.1.2 Formal extensional definitions
Formal extensional definition defines the subject business term by using an extensional definition: one that lists a complete set of hyponyms (terms that refer to subsets of the set referred to by the original term).
Template: T8

#### 9.2.1.3 Symbolic literal definitions
A symbolic literal definition defines the subject business term using one or more literals.
Template: T9

### 9.2.2 Categorization scheme enumerations
A categorization scheme enumeration defines the members of a categorization scheme that is both mutually exclusive and jointly exhaustive.
Template: T10

### 9.2.3 Category transition constraints
A category transition constraint specifies allowed or disallowed transitions between categories or statuses.
Template: T11

### 9.2.4 Complex concept structure rules
A complex concept structure rule defines a particular constraint on one or more components of a complex concept. They are categorized as:

#### 9.2.4.1 Complex concept cardinality rules
A complex concept cardinality rule defines the number of (or minimum and/or maximum number of) components of a particular type within a particular concept.
Template: T12

#### 9.2.4.2 Complex concept equivalence rules
A complex concept equivalence rule defines a pair of components within a particular concept that are of necessity the same.
Template: T13

#### 9.2.4.3 Complex concept set constraints
A complex concept set constraint defines two sets of components within a particular concept that must be identical.
Template: T14

### 9.2.5 Valid value definitions
A valid value definition defines the valid values of a particular measure as a range or (occasionally) as a list of discrete values.
Template: T15

### 9.2.6 Data calculation rules
A data calculation rule defines the algorithm or formula for a particular quantity or a conversion factor between two units. They are categorized as:

#### 9.2.6.1 Data calculation algorithms
A data calculation algorithm defines how a particular quantity or amount (whether for operational purposes, such as a fee, or for business intelligence purposes, such as a performance measure) is calculated.
Template: T16

#### 9.2.6.2 Conversion factor definitions
A conversion factor definition defines a conversion factor between two units of measurement.
Template: T17

### 9.2.7 Standard format definitions
A standard format definition defines the standard format for data items of a particular type in terms of individual characters and/or component data items.
Template: T18

## 9.3 Data rules
Data rules (all of which are operative rules) constrain the data included in a transaction (a form or message) or a persistent dataset (e.g., a database record). Data rules can in turn be categorized as:

### 9.3.1 Data cardinality rules
A data cardinality rule requires the presence or absence of a data item and/or places a restriction on the maximum or minimum number of occurrences of a data item

#### 9.3.1.1 Mandatory data rules
A mandatory data rule mandates the presence of data:

##### 9.3.1.1.1 Mandatory data item rules
A mandatory data item rule requires that a particular data item be present.
Template: T19

##### 9.3.1.1.2 Mandatory option selection rules
A mandatory option selection rule requires that one of a set of pre-defined options be specified.
When to apply: Rule statements for rules with two or more options.
Template: T20

When to apply: Rule statements for rules with a single option which may or may not be the case.
Template: T21

##### 9.3.1.1.3 Mandatory group rules:
A mandatory group rule requires that at least one of a group of data items be present.
When to apply: If there are only two data items in the group.
Template: T22

When to apply: If there are more than two data items in the group.
Template: T23

#### 9.3.1.2 Prohibited data rules
A prohibited data rule mandates the absence of some data item in a particular situation.
Template: T24

#### 9.3.1.3 Maximum cardinality rules
A maximum cardinality rule places an upper limit (usually but not necessarily one) on how many instances of a particular data item there may be.
Template: T25

#### 9.3.1.4 Multiple data rules
A multiple data rule mandates the presence of two or more instances of a particular data item in a particular situation.
Template: T19

#### 9.3.1.5 Dependent cardinality rules
A dependent cardinality rule mandates how many of a particular data item must be present based on the value of another data item.
Template: T26

### 9.3.2 Data content rules
A data content rule places a restriction on the values contained in a data item or set of data items (rather than whether they must be present and how many there may or must be).

#### 9.3.2.1 Value set rules
A value set rule requires either: that the content of a data item be (or not be) one of a particular set of values (either a fixed set, or a set that may change over time), or; that the content of a combination of data items match or not match a corresponding combination in a set of records;

##### 9.3.2.1.1 Value set rules constraining single data items
Template: T27

##### 9.3.2.1.2 Value set rules constraining combinations of data items
Template: T28

#### 9.3.2.2. Range rules
A range rule requires that the content of a data item be a value within a particular inclusive or exclusive single-bounded or double-bounded range.
Template: T29

#### 9.3.2.3 Equality rules
An equality rule requires that the content of a data item be the same as or not the same as that of some other data item.
Template: T30

#### 9.3.2.4 Uniqueness constraints
A uniqueness constraint requires that the content of a data item (or combination or set of data items) be different from that of the corresponding data item(s) in the same or other records or transactions;

##### 9.3.2.4.1 Uniqueness constraints constraining single data items.
Template: T31

##### 9.3.2.4.2 Uniqueness constraints constraining combinations of data items
Template: T32

##### 9.3.2.4.3 Uniqueness constraints constraining sets of data items
Template: T33

#### 9.3.2.5 Data consistency rules
A data consistency rule requires the content of multiple data items to be consistent with each other, other than as provided for by a value set rule, range rule, or equality rule;

##### 9.3.2.5.1 Data consistency rules constraining a combination of data items
Template: T34

##### 9.3.2.5.2 Data consistency rules constraining a set function
Template: T35

##### 9.3.2.5.3 Data consistency rules constraining a set
Template: T36

#### 9.3.2.6 Temporal data constraints
A temporal data constraint constrains one or more temporal data items (data items that represent time points or time periods). There are various subcategories of temporal constraint:

##### 9.3.2.6.1 Simple temporal data constraints
A simple temporal data constraint requires that a particular date or time fall within a certain temporal range.
Template: T29

##### 9.3.2.6.2 Temporal data non-overlap constraints
Temporal data non-overlap constraint requires that the time periods specified in a set of records do not overlap each other.
Template: T37

##### 9.3.2.6.3 Temporal data completeness constraints
A temporal data completeness constraint requires that the time periods specified in a set of records be contiguous and between them completely span some other time period.
Template: T38

##### 9.3.2.6.4 Temporal data inclusion constraints
A temporal data inclusion constraint requires that the time periods specified in a set of records do not fall outside some other time period.
Template: T38

##### 9.3.2.6.5 Temporal single record constraints
A temporal single record constraint requires that a temporal state of affairs be recorded using a single record rather than multiple records.
When to apply: a single data item is involved.
Template: T39

When to apply: a combination of data items is involved.
Template: T40

##### 9.3.2.6.6 Day type constraints
A day type constraint restricts a date to one or more days of the week or a particular type of day such as a working day (typically but not necessarily any day other than a Saturday, Sunday, or public holiday).
Template: T41

#### 9.3.2.7 Spatial data constraints
A spatial data constraint prescribes or prohibits relationships between data items representing spatial properties (points, line segments or polygons).
Template: T41

#### 9.3.2.8 Data item format rules
A data item format rule specifies the required format of a data item.
Template: T43

### 9.3.3 Data update rules
A data update rule either prohibits update of a data item or places restrictions on the new value of a data item in terms of its existing value. There are three subcategories of data update rule:

#### 9.3.3.1 Data update prohibition rules
A data update prohibition rule prohibits update of a particular data item or set of data items.
Rule statements for rules governing non-transferable relationships:
Template: T44

Rule statements for other data update prohibition rules:
Template: T45

#### 9.3.3.2 State transition constraints
A state transition constraint limits the changes in a data item to a set of valid transitions.
Template: T46

#### 9.3.3.3 Monotonic transition constraints
A monotonic transition constraint requires that a numeric value either only increase or only decrease.
Template: T47

## 9.4 Activity rules
Activity rules (all of which are operative rules) constrains the operation of one or more business processes or other activities. Activity rules can in turn be categorized as:

### 9.4.1 Activity restriction rules
An activity restriction rule restricts a business process or other activity in some way. There are various subcategories of activity restriction rules:

#### 9.4.1.1 Rules restricting when an activity can occur
Many activity restriction rules place time restrictions on activities.

##### 9.4.1.1.1 Activity time limit rules
An activity time limit rule restricts a business process or other activity to within a particular time period.
Template: T48

##### 9.4.1.1.2 Activity exclusion period rules
An activity exclusion period rule prohibits a business process or other activity during a particular time period.
Template: T49

##### 9.4.1.1.3 Activity obligation rule
An activity obligation rule requires a business process or other activity to occur either within a maximum time after a particular event (such as the completion of some other process) or as soon as practical after a particular event.
Template: T49

#### 9.4.1.2 Activity pre-condition rules
An activity pre-condition rule prohibits a business process or other activity unless some other activity or event has previously occurred or some prerequisite condition exists.
Template: T50

#### 9.4.1.3 Activity prohibition rules
An activity prohibition rule prohibits a business process or other activity if some event or other process has previously occurred or some dangerous or illegal condition exists.
Template: T51

#### 9.4.1.4 Information retention rules
An information retention rule defines the minimum period for which a particular type of information is retained.
Template: T49

#### 9.4.1.5 Activity conflict rules
An activity conflict rule restricts the simultaneous occurrence of multiple processes or other activities.
Template: T?

### 9.4.2 Process decision rules
A process decision rule determines what action a business process or device is to take in specific situations;
9.4.3 Activity obligation rules: An activity obligation rule requires a business process or other activity to occur either within a maximum time after a particular event (such as the completion of some other process) or when particular conditions apply.
Template: T52

## 9.5 Party rules
Party rules (all of which are operative rules) restricts the parties who can perform a process or activity or play a role. Party rules can in turn be categorized as

### 9.5.1 Party restriction rules
A party restriction rule places restrictions on who can perform some processes or activities or play some roles, based on age, some other physical characteristic or capability, or training, testing, and certification in the appropriate skills.
Template: T53

### 9.5.2 Role separation and binding rules
A role separation rule prohibits the same party from performing two activities.
Template: T54

### 9.5.3 Information access rules
An information access rule defines who can view, create, or update particular information.
Template: T55

### 9.5.4 Responsibility rules
A responsibility rule defines who is responsible for performing a particular process or liable for a particular fee, duty, or tax.
Template: T56

Return a JSON object with the following fields.

 If the query is invalid, return an empty classification.

 Here's an example of output:
 [
    {
        "id": "some id",
        "text": "some text",
        "type": "Definitional"
        "subtype": "Formal intensional definitions",
        "templates": ["T7"],
        "examples": ["R70"]
    },
    {
        "id": ...
        "text": ...
        "type": ...
        "subtype": ...
        "templates": ...
        "examples": ...
    },

]

Here's the paragraphs you'll need to classify:
"""

In [None]:
true_values = [
    {
        "id": "R70",
        "text": "A senior passenger is by definition a passenger whose age is at least 70 years at the time of travel.",
        "type": "Definitional",
        "subtype": "Formal intensional definitions"
    },
    {
        "id": "R73",
        "text": "End of financial year is by definition June 30.",
        "type": "Definitional",
        "subtype": "Symbolic literal definitions"
    },
    {
        "id": "R76",
        "text": "A payment is by definition one of the following: a cash payment, a credit card payment, or an electronic funds transfer payment.",
        "type": "Definitional",
        "subtype": "Categorization scheme enumerations"
    },
    {
        "id": "R85",
        "text": "1 ft is by definition equal to 12 in.",
        "type": "Definitional",
        "subtype": "Conversion factor definitions"
    },
    {
        "id": "R91",
        "text": "Each flight booking confirmation must specify exactly one travel class for each flight.",
        "type": "Data",
        "subtype": "Mandatory data item rules"
    },
    {
        "id": "R98",
        "text": "Each flight booking confirmation must specify exactly one of the following: a postal address, an e-mail address, or a fax number.",
        "type": "Data",
        "subtype": "Mandatory group rules"
    },
    {
        "id": "R99",
        "text": "A flight booking request for a one-way journey must not specify a return date.",
        "type": "Data",
        "subtype": "Prohibited data rules"
    },
    {
        "id": "R130",
        "text": "Online check-in for a flight may occur only during the 24 h before the departure time of that flight.",
        "type": "Activity",
        "subtype": "Activity time limit rules"
    },
    {
        "id": "R135",
        "text": "A driver must not operate any vehicle if that driver is intoxicated.",
        "type": "Activity",
        "subtype": "Activity prohibition rules"
    },
    {
        "id": "R140",
        "text": "A person may travel alone only if the age of that person is at least 2 years.",
        "type": "Party",
        "subtype": "Party restriction rules"
    },
    {
        "id": "R78",
        "text": "The status of an employee is by definition one of the following: probational, permanent, or temporary.",
        "type": "Definitional",
        "subtype": "Categorization scheme enumerations"
    },
    {
        "id": "R349",
        "text": "Each flight booking confirmation for an international journey must specify for each passenger specified in that flight booking request a passport number or a visa number but not both.",
        "type": "Data",
        "subtype": "Mandatory group rules"
    },
    {
        "id": "R75",
        "text": "A person is by definition either an adult or a minor.",
        "type": "Definitional",
        "subtype": "Categorization scheme enumerations"
    },
    {
        "id": "R93",
        "text": "Each combination of departure date, flight number, and departure city must be allocated exactly one departure time.",
        "type": "Data",
        "subtype": "Mandatory data item rules"
    },
    {
        "id": "R96",
        "text": "Each flight booking confirmation must specify a mobile phone number, an e-mail address, or both.",
        "type": "Data",
        "subtype": "Mandatory group rules"
    },
    {
        "id": "R143",
        "text": "A person may be rostered on a flight crew only if that person holds an airline transport pilot license that is current and a type endorsement that is current for each aircraft type to be flown by that flight crew.",
        "type": "Party",
        "subtype": "Party restriction rules"
    },
    {
        "id": "R142",
        "text": "A passenger may be allocated to a seat in an exit row only if that passenger is able to open an aircraft door.",
        "type": "Party",
        "subtype": "Party restriction rules"
    },
    {
        "id": "R77",
        "text": "The gender of a person is by definition either male or female.",
        "type": "Definitional",
        "subtype": "Categorization scheme enumerations"
    },
    {
        "id": "R348",
        "text": "Each customer complaint must specify a mobile phone number, an e-mail address, or both.",
        "type": "Data",
        "subtype": "Mandatory group rules"
    }
]

In [None]:
# User prompt for the subtype classification: the sample rule statements
# (id, text and top-level type) serialised as a JSON array. indent=4 plus
# the leading/trailing newline yields the same text as a hand-written
# triple-quoted literal with 4-space indentation.
document1 = "\n" + json.dumps(
    [
        {"id": rule_id, "text": statement, "type": rule_type}
        for rule_id, rule_type, statement in (
            ("R70", "Definitional",
             "A senior passenger is by definition a passenger whose age is at least 70 years at the time of travel."),
            ("R73", "Definitional",
             "End of financial year is by definition June 30."),
            ("R76", "Definitional",
             "A payment is by definition one of the following: a cash payment, a credit card payment, or an electronic funds transfer payment."),
            ("R85", "Definitional",
             "1 ft is by definition equal to 12 in."),
            ("R91", "Data",
             "Each flight booking confirmation must specify exactly one travel class for each flight."),
            ("R98", "Data",
             "Each flight booking confirmation must specify exactly one of the following: a postal address, an e-mail address, or a fax number."),
            ("R99", "Data",
             "A flight booking request for a one-way journey must not specify a return date."),
            ("R130", "Activity",
             "Online check-in for a flight may occur only during the 24 h before the departure time of that flight."),
            ("R135", "Activity",
             "A driver must not operate any vehicle if that driver is intoxicated."),
            ("R140", "Party",
             "A person may travel alone only if the age of that person is at least 2 years."),
            ("R78", "Definitional",
             "The status of an employee is by definition one of the following: probational, permanent, or temporary."),
            ("R349", "Data",
             "Each flight booking confirmation for an international journey must specify for each passenger specified in that flight booking request a passport number or a visa number but not both."),
            ("R75", "Definitional",
             "A person is by definition either an adult or a minor."),
            ("R93", "Data",
             "Each combination of departure date, flight number, and departure city must be allocated exactly one departure time."),
            ("R96", "Data",
             "Each flight booking confirmation must specify a mobile phone number, an e-mail address, or both."),
            ("R143", "Party",
             "A person may be rostered on a flight crew only if that person holds an airline transport pilot license that is current and a type endorsement that is current for each aircraft type to be flown by that flight crew."),
            ("R142", "Party",
             "A passenger may be allocated to a seat in an exit row only if that passenger is able to open an aircraft door."),
            ("R77", "Definitional",
             "The gender of a person is by definition either male or female."),
            ("R348", "Data",
             "Each customer complaint must specify a mobile phone number, an e-mail address, or both."),
        )
    ],
    indent=4,
) + "\n"

Query LLM

In [None]:
# Send the rules in document1 to the LLM, with prompt_b as the system prompt.
response = query_llm(
    user_prompt=document1,
    system_prompt=prompt_b,
    response_model=ParagraphDataset,
)

In [None]:
# Display the paragraphs held by the LLM response.
response.paragraphs

#### Validation


Set predictions.

In [None]:
# Predictions: dump of the LLM response (read via predictions["paragraphs"]
# in the cells below). The true values come from the `true_values` list.
predictions = response.model_dump()

In [None]:
# Predicted subtype of every paragraph, in response order.
labels_predict = [paragraph['subtype'] for paragraph in predictions["paragraphs"]]

In [None]:
# Gold subtype of every rule, in true_values order.
labels_true = [rule['subtype'] for rule in true_values]

In [None]:
# Union of gold and predicted subtypes, sorted so the confusion-matrix rows
# and columns come out in the same order on every run (iteration order of a
# set of strings changes between interpreter sessions). key=str keeps the
# sort from failing should a prediction carry a non-string subtype.
labels = sorted(set(labels_true + labels_predict), key=str)

In [None]:
# Display the combined set of true and predicted subtype labels.
labels

Convert to dataframe.

In [None]:
# Convert to DataFrames
pred_df = pd.DataFrame(predictions["paragraphs"])
true_df = pd.DataFrame(true_values)

# Inner-join predictions and gold labels on the rule "id". Columns that exist
# in both frames get pandas' default suffixes: "_x" comes from pred_df
# (prediction) and "_y" from true_df (true value).
merged_df = pd.merge(pred_df, true_df, on="id")

# An inner join silently drops rules the LLM omitted or whose id it changed,
# and duplicates rows for repeated ids; either case skews the metrics
# computed from merged_df, so make it visible.
if len(merged_df) != len(true_df):
    logger.warning(
        f"Merged {len(merged_df)} rows but {len(true_df)} labelled rules were expected; "
        "check the predictions for missing, altered or duplicated ids."
    )

In [None]:
# Display the joined predictions and true values.
merged_df

Compute confusion matrix and precision, recall, and accuracy.

In [None]:
# Imported here so the cell does not rely on another cell having brought
# these names into scope (the main import cell only imports
# confusion_matrix and classification_report from sklearn.metrics).
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

y_true = merged_df['subtype_y']  # gold labels (right-hand frame of the merge)
y_pred = merged_df['subtype_x']  # LLM predictions (left-hand frame of the merge)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)

# Compute precision, recall, accuracy and F1 (support-weighted averages).
# zero_division=0 yields the same value scikit-learn falls back to for a class
# that is never predicted / never present, without the UndefinedMetricWarning.
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

# Plot the confusion matrix; vertical tick labels keep long subtype names readable.
disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, labels=labels, xticks_rotation='vertical')

print(f"precision: {precision}, recall: {recall}, accuracy: {accuracy}, f1_score: {f1}")

# Classification evaluation

## Classify (P6)

Two prompts were tested.

As shown in the next sections, the accuracy of prompt a was 0.8947.

The confusion matrix for prompt a reveals misclassifications involving Activity and Party. The table presents the paragraph ID, prediction, and true value. For instance, paragraph R99, which is a Data rule, was misclassified as an Activity. Similarly, paragraph R135, which is an Activity rule, was misclassified as a Party.

In contrast, prompt b achieved 100% correct classifications. The key difference between prompt a and prompt b is that prompt b includes only the definitions of the rules, without instructions on how to classify them.

### prompt a

precision: 0.9078947368421053, recall: 0.8947368421052632, accuracy: 0.8947368421052632

![classify_prompt_a.png](https://github.com/asantos2000/master-degree-santos-anderson/blob/main/code/media/classify_prompt_a.png?raw=1)

> Rows where the prediction differs from the true value are highlighted in **bold**.

| Paragraph ID | Prediction    | Paragraph                                                                                                                                                                    | True Value    |
|--------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|
| R70          | Definitional  | A senior passenger is by definition a passenger whose age is at least 70 years at the time of travel.                                                                        | Definitional  |
| R73          | Definitional  | End of financial year is by definition June 30.                                                                                                                              | Definitional  |
| R76          | Definitional  | A payment is by definition one of the following: a cash payment, a credit card payment, or an electronic funds transfer payment.                                             | Definitional  |
| R85          | Definitional  | 1 ft is by definition equal to 12 in.                                                                                                                                        | Definitional  |
| R91          | Data          | Each flight booking confirmation must specify exactly one travel class for each flight.                                                                                      | Data          |
| R98          | Data          | Each flight booking confirmation must specify exactly one of the following: a postal address, an e-mail address, or a fax number.                                            | Data          |
| **R99**      | **Activity**  | **A flight booking request for a one-way journey must not specify a return date.**                                                                                               | **Data**      |
| R130         | Activity      | Online check-in for a flight may occur only during the 24 h before the departure time of that flight.                                                                        | Activity      |
| **R135**         | **Party**      | **A driver must not operate any vehicle if that driver is intoxicated.**                                                                                                         | **Activity**      |
| R140     | Party  | A person may travel alone only if the age of that person is at least 2 years.                                                                                                | Party      |
| R78          | Definitional  | The status of an employee is by definition one of the following: probational, permanent, or temporary.                                                                       | Definitional  |
| R349         | Data          | Each flight booking confirmation for an international journey must specify for each passenger specified in that flight booking request a passport number or a visa number but not both. | Data          |
| R75          | Definitional  | A person is by definition either an adult or a minor.                                                                                                                        | Definitional  |
| R93          | Data          | Each combination of departure date, flight number, and departure city must be allocated exactly one departure time.                                                          | Data          |
| R96          | Data          | Each flight booking confirmation must specify a mobile phone number, an e-mail address, or both.                                                                             | Data          |
| R143         | Party         | A person may be rostered on a flight crew only if that person holds an airline transport pilot license that is current and a type endorsement that is current for each aircraft type to be flown by that flight crew. | Party         |
| R142         | Party         | A passenger may be allocated to a seat in an exit row only if that passenger is able to open an aircraft door.                                                               | Party         |
| R77          | Definitional  | The gender of a person is by definition either male or female.                                                                                                               | Definitional  |
| R348         | Data          | Each customer complaint must specify a mobile phone number, an e-mail address, or both.                                                                                      | Data          |

### prompt b
precision: 1.0, recall: 1.0, accuracy: 1.0

![image.png](https://github.com/asantos2000/master-degree-santos-anderson/blob/main/code/media/classify_prompt_b.png?raw=1)

> Rows where the prediction differs from the true value are highlighted in **bold**.

| Paragraph ID | Prediction    | Paragraph                                                                                                                                                                    | True Value    |
|--------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|
| R70          | Definitional  | A senior passenger is by definition a passenger whose age is at least 70 years at the time of travel.                                                                        | Definitional  |
| R73          | Definitional  | End of financial year is by definition June 30.                                                                                                                              | Definitional  |
| R76          | Definitional  | A payment is by definition one of the following: a cash payment, a credit card payment, or an electronic funds transfer payment.                                             | Definitional  |
| R85          | Definitional  | 1 ft is by definition equal to 12 in.                                                                                                                                        | Definitional  |
| R91          | Data          | Each flight booking confirmation must specify exactly one travel class for each flight.                                                                                      | Data          |
| R98          | Data          | Each flight booking confirmation must specify exactly one of the following: a postal address, an e-mail address, or a fax number.                                            | Data          |
| R99          | Data          | A flight booking request for a one-way journey must not specify a return date.                                                                                               | Data          |
| R130         | Activity      | Online check-in for a flight may occur only during the 24 h before the departure time of that flight.                                                                        | Activity      |
| R135         | Activity      | A driver must not operate any vehicle if that driver is intoxicated.                                                                                                         | Activity      |
| R140         | Party         | A person may travel alone only if the age of that person is at least 2 years.                                                                                                | Party         |
| R78          | Definitional  | The status of an employee is by definition one of the following: probational, permanent, or temporary.                                                                       | Definitional  |
| R349         | Data          | Each flight booking confirmation for an international journey must specify for each passenger specified in that flight booking request a passport number or a visa number but not both. | Data          |
| R75          | Definitional  | A person is by definition either an adult or a minor.                                                                                                                        | Definitional  |
| R93          | Data          | Each combination of departure date, flight number, and departure city must be allocated exactly one departure time.                                                          | Data          |
| R96          | Data          | Each flight booking confirmation must specify a mobile phone number, an e-mail address, or both.                                                                             | Data          |
| R143         | Party         | A person may be rostered on a flight crew only if that person holds an airline transport pilot license that is current and a type endorsement that is current for each aircraft type to be flown by that flight crew. | Party         |
| R142         | Party         | A passenger may be allocated to a seat in an exit row only if that passenger is able to open an aircraft door.                                                               | Party         |
| R77          | Definitional  | The gender of a person is by definition either male or female.                                                                                                               | Definitional  |
| R348         | Data          | Each customer complaint must specify a mobile phone number, an e-mail address, or both.                                                                                      | Data          |



### Classification 1st attempt

### True tables

True tables are annotated or "golden" datasets in which entities have been manually identified and labeled within the original source data.

True tables for sections 275.0-2, 275.0-5 and 275.0-7

Load true table for part 1.

In [None]:
# Register the manually annotated ("true table") documents for part 1.
# The file is read as UTF-8 explicitly: the lookup keys contain the non-ASCII
# "§" sign, which a platform-default codec (e.g. cp1252) may decode differently.
with open(f"{config['DEFAULT_DATA_DIR']}/p1_true_table.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

# Same three sections, added in the same order as before.
for section in ("§ 275.0-2", "§ 275.0-5", "§ 275.0-7"):
    manager.add_document(
        Document.model_validate(data[f"{section}_P1|true_table"])
    )

Load true table for part 2.

In [None]:
# Register the manually annotated ("true table") documents for part 2.
# The file is read as UTF-8 explicitly: the lookup keys contain the non-ASCII
# "§" sign, which a platform-default codec (e.g. cp1252) may decode differently.
with open(f"{config['DEFAULT_DATA_DIR']}/p2_true_table.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

# Same three sections, added in the same order as before.
for section in ("§ 275.0-2", "§ 275.0-5", "§ 275.0-7"):
    manager.add_document(
        Document.model_validate(data[f"{section}_P2|true_table"])
    )

### Elements classification taxonomy

In [None]:
# System prompt asking the LLM to attach a taxonomy classification (plus the
# matching template) to every extracted element.
# The output-format example used to read `classification": ...` with a stray
# double quote; it is removed so the model is not shown an unbalanced quote
# to imitate.
# NOTE(review): the example is YAML-like but not strict YAML (the `- id:` items
# sit under a `doc_id` scalar and the `template: |` body is not indented);
# left as is because changing the layout would change the requested format.
system_prompt_taxonomy_classification = """
Classify each element using the provided taxonomy. Use the example of the class to help you.

Answer adding taxonomy classification to each element in the following format:


elements:
  - doc_id: § 275.0-2_P1
    - id: 1
      expression: "A person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents."
      template_id: T7
      template: |
      {A|An|The|} <definitional rule statement subject>
      {<qualifying clause>|}
      <verb phrase> by definition
      <definition>
      classification: Formal intensional definitions
    - id: 2
      expression: "A person may serve a non-resident investment adviser, non-resident general partner, or non-resident managing agent by furnishing the Commission with one copy of the process, pleadings, or papers, for each named party, and one additional copy for the Commission's records."
      template_id: T8
      template: |
      {A|An|The|} <definitional rule statement subject>
      {<qualifying clause>|}
      <verb phrase> by definition
      <definition>
      classification: Formal extensional definitions
  - doc_id: § 275.0-5_P1
    - id: 1
      expression: "An order disposing of the matter will be issued as of course following the expiration of the period of time referred to in paragraph (a) of this section, unless the Commission thereafter orders a hearing on the matter."
      template_id: T9
      template: |
      {A|An|The|} <definitional rule statement subject>
      {<qualifying clause>|}
      <verb phrase> by definition
      <definition>
      classification: Formal extensional definitions
    ...
  ...
"""

In [None]:
# Sample of one classified document: a single fact type carrying its template
# and taxonomy classification, with no rules or terms.
example = {
    "§ 275.0-2_P1": {
        "id": "§ 275.0-2_P1",
        "type": "section",
        "elements": {
            "fact_types": [
                {
                    "id": 1,
                    "expression": (
                        "A person may serve process, pleadings, or other papers on a "
                        "non-resident investment adviser, or on a non-resident general "
                        "partner or non-resident managing agent of an investment adviser "
                        "by serving any or all of its appointed agents."
                    ),
                    "template_id": "T7",
                    "template": (
                        "{A|An|The|} <definitional rule statement subject> "
                        "{<qualifying clause>|} <verb phrase> by definition <definition>"
                    ),
                    "classification": "Formal intensional definitions",
                },
            ],
            "rules": [],
            "terms": [],
        },
    },
}

In [None]:
# Load the subtype taxonomy used by find_sections_by_title below.
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(f"{config['DEFAULT_DATA_DIR']}/classify_subtypes.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

print(data)

In [None]:
def find_sections_by_title(data, title):
    """
    Collects every section whose ``section_title`` equals ``title``.

    The search walks ``data`` depth-first, descending into each section's
    ``subsections``, and returns matches in pre-order (a matching parent is
    listed before any matching descendant).

    Args:
        data (list[dict]): Top-level sections; each may carry a
            ``section_title`` and a nested ``subsections`` list.
        title (str): Exact title to look for.

    Returns:
        list[dict]: The matching section dicts themselves (not copies).
    """
    def _matching(sections):
        # ``or ()`` tolerates a ``subsections`` value that is missing or None.
        for section in sections or ():
            # A section without a title is skipped instead of raising KeyError.
            if 'section_title' in section and section['section_title'] == title:
                yield section
            yield from _matching(section.get('subsections'))

    return list(_matching(data))

In [None]:
# Example: find every section titled 'Definitional rules', at any nesting depth
filtered_sections = find_sections_by_title(data, 'Definitional rules')

# Output the filtered sections as indented JSON
print(json.dumps(filtered_sections, indent=2))

In [None]:
# Example: find every section titled 'Formal term definitions', at any nesting depth
filtered_sections = find_sections_by_title(data, 'Formal term definitions')

# Output the filtered sections as indented JSON
print(json.dumps(filtered_sections, indent=2))

## Other approaches

### Classification

In [None]:
# 1st approach

# process terms
def add_definition(definitions, doc_id, term, definition):
    # Ensure the doc_id dictionary exists, and then add the term
    definitions.setdefault(doc_id, {})[term] = definition

docs_p2 = [s for s in manager.list_document_ids(doc_type="llm_response") if s.endswith("_P2")]

elements_terms_definition = {}

for doc in docs_p2:
    print(doc)
    doc_id = doc.replace("_P2", "")
    doc_content = manager.retrieve_document(doc, doc_type="llm_response").content
    doc_terms = doc_content.get("terms")
    for term in doc_terms:
        #print(term)
        add_definition(elements_terms_definition, doc_id, term.get("term"), term.get("definition"))

# process elements

docs_p1 = [s for s in manager.list_document_ids(doc_type="llm_response") if s.endswith("_P1")]

elements_terms_set = set()
elements_names_set = set()
elements_terms = []
elements_names = []
elements_facts = []
elements_rules = []

for doc in docs_p1: 
    doc_content = manager.retrieve_document(doc, doc_type="llm_response").content
    #print(doc_content)
    doc_id = doc_content.get("section")
    #print(f"Document: {doc_id}")
    doc_elements = doc_content.get("elements")
    #print(f"Elements: {doc_elements}")
    for element in doc_elements:
        #print(f"Element: {element}")
        element_classification = element.get("classification")
        #print(f"Classification: {element_classification}")
        element_id = element.get("id")
        element_dict = {
            "doc_id": doc_id,
            "expression_id": element_id,
            "expression": element.get("expression"),
            "source": element.get("source")
        }
        #print(element_dict)
        match element_classification:
            case "Fact" | "Fact Type":
                elements_facts.append(element_dict)
            case "Operative Rule":
                elements_rules.append(element_dict)
        element_terms = element.get("terms", [])
        if element_terms:
            #print(element_terms)
            for term in element_terms:
                signifier = term.get("term")
                term_dict = {
                    "doc_id": doc_id,
                    "signifier": signifier,
                    "expression_id": element_id,
                    "definition": elements_terms_definition.get(doc_id).get(signifier),
                    "source": element.get("source")
                }
                if term.get("classification") == "Common Noun":
                    elements_terms.append(term_dict)
                    elements_terms_set.add(signifier)
                else:
                    elements_names.append(term_dict)
                    elements_names_set.add(signifier)





Take a functional approach to the problem

In [None]:
def add_definition(definitions_dict, doc_id, term, definition):
    """
    Record the definition of a term under its document.

    The dictionary is updated in place. A definition already stored for the
    same document and term is overwritten.

    Args:
        definitions_dict (dict): Maps a document id to a dict of term -> definition.
        doc_id (str): Identifier of the document the term belongs to.
        term (str): The term being defined.
        definition (str): The definition to store for the term.
    """
    # Create the per-document mapping the first time the document is seen.
    if doc_id not in definitions_dict:
        definitions_dict[doc_id] = {}
    per_document = definitions_dict[doc_id]
    per_document[term] = definition


def process_definitions(manager):
    """
    Collects term definitions from the LLM response documents whose id ends with "_P2".

    Args:
        manager: Object used to manage document retrieval. It must provide
            ``list_document_ids(doc_type=...)`` and
            ``retrieve_document(doc_id, doc_type=...)``; the ``content`` of a
            retrieved document is read as a dict.

    Returns:
        dict: Maps a document id (the stored id without its trailing "_P2")
            to a dict of term -> definition. A document that lists no terms
            gets no entry.
    """
    elements_terms_definition = {}
    docs_p2 = [s for s in manager.list_document_ids(doc_type="llm_response") if s.endswith("_P2")]

    for doc in docs_p2:
        # Strip only the trailing marker: str.replace would also remove any
        # "_P2" that happens to occur earlier in the identifier.
        doc_id = doc.removesuffix("_P2")
        doc_content = manager.retrieve_document(doc, doc_type="llm_response").content
        # `or []` also covers a document whose "terms" entry is missing or null.
        for term in doc_content.get("terms") or []:
            add_definition(elements_terms_definition, doc_id, term.get("term"), term.get("definition"))

    return elements_terms_definition


def process_elements(manager, elements_terms_definition):
    """
    Processes elements from documents and categorizes them into terms, names, facts, and rules.

    Args:
        manager: Object used to manage document retrieval.
        elements_terms_definition (dict): Dictionary of term definitions.

    Returns:
        dict: A dictionary containing categorized elements including terms, names, facts, and rules.
    """
    elements_terms_set = set()
    elements_names_set = set()
    elements_terms = []
    elements_names = []
    elements_facts = []
    elements_rules = []

    docs_p1 = [s for s in manager.list_document_ids(doc_type="llm_response") if s.endswith("_P1")]

    for doc in docs_p1:
        doc_content = manager.retrieve_document(doc, doc_type="llm_response").content
        doc_id = doc_content.get("section")
        doc_elements = doc_content.get("elements", [])
        for element in doc_elements:
            element_classification = element.get("classification")
            element_id = element.get("id")
            verb_symbols = element.get("verb_symbols") or element.get("verb_symbol")
            if isinstance(verb_symbols, str):
                verb_symbols = [verb_symbols]
            elif verb_symbols is None:
                verb_symbols = []
            element_dict = {
                "doc_id": doc_id,
                "expression_id": element_id,
                "expression": element.get("expression"),
                "source": element.get("source"),
                "terms": element.get("terms", []),
                "verb_symbols": verb_symbols
            }

            match element_classification:
                case "Fact" | "Fact Type":
                    elements_facts.append(element_dict)
                case "Operative Rule":
                    elements_rules.append(element_dict)

            element_terms = element.get("terms", [])
            if element_terms:
                for term in element_terms:
                    signifier = term.get("term")
                    term_dict = {
                        "doc_id": doc_id,
                        "signifier": signifier,
                        "expression_id": element_id,
                        "definition": elements_terms_definition.get(doc_id, {}).get(signifier),
                        "source": element.get("source")
                    }
                    if term.get("classification") == "Common Noun":
                        elements_terms.append(term_dict)
                        elements_terms_set.add(signifier)
                    else:
                        elements_names.append(term_dict)
                        elements_names_set.add(signifier)

    return {
        "elements_terms_set": elements_terms_set,
        "elements_names_set": elements_names_set,
        "elements_terms": elements_terms,
        "elements_names": elements_names,
        "elements_facts": elements_facts,
        "elements_rules": elements_rules
    }


def get_term_info(elements_terms, elements_names, elements_terms_definition, doc_id, term):
    """
    Look up a defined term and report where it was first seen.

    The term must have a non-empty definition for the document; the term
    records are searched before the name records and the first match wins.

    Args:
        elements_terms (list): Term records with "doc_id", "signifier", "source" and "expression_id".
        elements_names (list): Name records with the same keys.
        elements_terms_definition (dict): Maps a document id to a dict of term -> definition.
        doc_id (str): Document identifier.
        term (str): Signifier to look up.

    Returns:
        dict or None: "definition", "source" and "expression_id" of the first
            matching record, or None when the term has no definition or no
            record matches.
    """
    per_document = elements_terms_definition.get(doc_id, {})
    definition = per_document.get(term)
    if not definition:
        return None

    candidates = elements_terms + elements_names
    found = next(
        (
            entry
            for entry in candidates
            if entry["doc_id"] == doc_id and entry["signifier"] == term
        ),
        None,
    )
    if found is None:
        return None

    return {
        "definition": definition,
        "source": found["source"],
        "expression_id": found["expression_id"]
    }


def get_name_info(elements_names, doc_id, name):
    """
    Look up a name record by document and signifier.

    Args:
        elements_names (list): Name records with "doc_id", "signifier", "source",
            "expression_id" and optionally "definition".
        doc_id (str): Document identifier.
        name (str): Signifier of the name to look up.

    Returns:
        dict or None: "definition", "source" and "expression_id" of the first
            matching record, or None when no record matches.
    """
    found = next(
        (
            entry
            for entry in elements_names
            if entry["doc_id"] == doc_id and entry["signifier"] == name
        ),
        None,
    )
    if found is None:
        return None

    return {
        "definition": found.get("definition"),
        "source": found["source"],
        "expression_id": found["expression_id"]
    }


def get_fact_info(elements_facts, doc_id, expression_id):
    """
    Retrieves information about a specific fact from elements.

    Args:
        elements_facts (list): Fact records with "doc_id" and "expression_id",
            and optionally "expression", "source", "terms" and "verb_symbols".
        doc_id (str): Document identifier.
        expression_id (str): Expression identifier of the fact.

    Returns:
        dict or None: For the first matching record, a dict with "expression",
            "source", "terms" (signifiers classified "Common Noun"), "names"
            (signifiers classified "Proper Noun") and "verb_symbols";
            None when no record matches.
    """
    for fact_dict in elements_facts:
        if fact_dict["doc_id"] != doc_id or fact_dict["expression_id"] != expression_id:
            continue

        terms = []
        names = []
        # `or []` tolerates a record whose "terms" entry is missing or None.
        for term in fact_dict.get("terms") or []:
            classification = term.get("classification")
            if classification == "Common Noun":
                terms.append(term.get("term"))
            elif classification == "Proper Noun":
                names.append(term.get("term"))

        return {
            # .get keeps this lookup as tolerant as get_rule_info.
            "expression": fact_dict.get("expression"),
            "source": fact_dict.get("source"),
            "terms": terms,
            "names": names,
            "verb_symbols": fact_dict.get("verb_symbols", [])
        }
    return None


def get_rule_info(elements_rules, doc_id, expression_id):
    """
    Retrieves information about a specific rule from elements.

    Args:
        elements_rules (list): Rule records with "doc_id" and "expression_id",
            and optionally "expression", "source", "terms" and "verb_symbols".
        doc_id (str): Document identifier.
        expression_id (str): Expression identifier of the rule.

    Returns:
        dict or None: For the first matching record, a dict with "expression",
            "source", "terms" (signifiers classified "Common Noun"), "names"
            (signifiers classified "Proper Noun") and "verb_symbols";
            None when no record matches.
    """
    for rule_dict in elements_rules:
        if rule_dict["doc_id"] != doc_id or rule_dict["expression_id"] != expression_id:
            continue

        terms = []
        names = []
        # `or []` tolerates a record whose "terms" entry is missing or None.
        for term in rule_dict.get("terms") or []:
            classification = term.get("classification")
            if classification == "Common Noun":
                terms.append(term.get("term"))
            elif classification == "Proper Noun":
                names.append(term.get("term"))

        return {
            "expression": rule_dict.get("expression"),
            "source": rule_dict.get("source"),
            "terms": terms,
            "names": names,
            "verb_symbols": rule_dict.get("verb_symbols", [])
        }
    return None

# Example usage

# Definitions are built first because element processing looks terms up in them
elements_terms_definition = process_definitions(manager)
elements_data = process_elements(manager, elements_terms_definition)

# Unpack the processed collections into individual names
unique_terms, unique_names, terms, names, facts, rules = (
    elements_data[key]
    for key in (
        "elements_terms_set",
        "elements_names_set",
        "elements_terms",
        "elements_names",
        "elements_facts",
        "elements_rules",
    )
)

# Look up one term by document and signifier
term_info = get_term_info(terms, names, elements_terms_definition, "doc1", "example_term")
print(term_info)

# Look up one fact by document and expression id
fact_info = get_fact_info(facts, "doc1", "fact1")
print(fact_info)

# Look up one rule by document and expression id
rule_info = get_rule_info(rules, "doc1", "rule1")
print(rule_info)
