# Evaluating checkpoints

Dependencies:
- Checkpoint files at: `cfr2sbvr_db`
- Python DuckDB: `pip install duckdb --upgrade`

## Imports

In [2]:
import duckdb
import json
from pathlib import Path
import glob

## Functions

In [4]:
def upsert_table_from_json(conn, suffix_key_pattern, prefix_key_pattern, table_name, source, key_value, drop, content_key, alias, doc_id_key="id") -> bool:

    _data = []
    _source_file = Path(source).name
    directory_path = Path("temp")

    # Combine the directory path and the filename
    _temp_file = directory_path / _source_file

    with open(source, "r") as f:
        loaded_data = json.load(f)

    keys = loaded_data.keys()

    for key in keys:
        if key.startswith(prefix_key_pattern) and key.endswith(suffix_key_pattern):
            print(key)
            _data.append(loaded_data[key])

    if not _data:
        print("No data found. Check the key pattern.")
        return False
    
    with open(_temp_file, "w") as f:
        json.dump(_data, f, indent=4)

    if drop:
        _query_drop_table = f"""
        DROP TABLE IF EXISTS {table_name};
        """

        print(f"{_query_drop_table=}")
        print(f"Trying to drop table because drop parameter is {drop}")

        try:
            conn.execute(_query_drop_table)
            print(f"Table {table_name} dropped")
        except duckdb.CatalogException as e:
            print(e)

    _query_insert_data = f"""
    INSERT INTO {table_name} ({doc_id_key}, prompt, file_source, {alias}, created_at)
    SELECT 
        {doc_id_key}, 
        '{key_value}' as prompt, 
        '{_source_file}' as file_source, 
        unnest({content_key}) as {alias},
        now() as created_at
    FROM 
        read_json_auto("{_temp_file}");
    """

    print(f"{_query_insert_data=}")
    print(f"Trying to insert into {table_name}")

    try:
        conn.execute(_query_insert_data)
        print(f"Data inserted into {table_name}.")
    except duckdb.CatalogException as e:
        print(e)
        print(f"Failed to insert, trying create {table_name}")
        _query_create_table = f"""
        CREATE TABLE {table_name} AS
        SELECT {doc_id_key}, 
        '{key_value}' as prompt, 
        '{_source_file}' as file_source, 
        unnest({content_key}) as {alias},
        now() as 'created_at'
        FROM read_json_auto("{_temp_file}");
        """
        try:
            print(f"{_query_create_table=}")
            conn.execute(_query_create_table)
            print(f"Table {table_name} created and loaded.")
        except duckdb.CatalogException as e:
            print(e)
            print(f"Failed to create {table_name}")
            return False

    return True

## Connect database

In [5]:
conn = duckdb.connect("cfr2sbvr_db/database_v3.db")

### Clean-up

In [4]:
CLEAN_UP = False

In [5]:
query = """
show tables;
"""

tables = conn.sql(query).fetchall()

tables

[('CHECKPOINT_METADATA',),
 ('RAW_CLASSIFY_P1_OPERATIVE_RULES',),
 ('RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_FACTS',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_NAMES',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_TERMS',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE',),
 ('RAW_CLASSIFY_P2_OPERATIVE_RULES',),
 ('RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE',),
 ('RAW_SECTION_P1_EXTRACTED_ELEMENTS',),
 ('RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE',),
 ('RAW_SECTION_P2_EXTRACTED_NOUN',),
 ('RAW_SECTION_P2_EXTRACTED_NOUN_TRUE',),
 ('RAW_TRANSFORM_FACT_TYPES',),
 ('RAW_TRANSFORM_NAMES',),
 ('RAW_TRANSFORM_OPERATIVE_RULES',),
 ('RAW_TRANSFORM_TERMS',)]

In [6]:
len(tables)

19

In [7]:
for table_name in tables:
    query = f"""
    drop table {table_name[0]};
    """

    # If True then drop the table
    if CLEAN_UP:
        print(query)
        conn.sql(query)

## Metadata

In [8]:
query = """
DROP TABLE IF EXISTS CHECKPOINT_METADATA;

CREATE TABLE CHECKPOINT_METADATA AS
SELECT process,
doc_source,
doc_id,
doc_type,
table_name, 
now() as 'created_at'
FROM read_csv_auto("cfr2sbvr_db/metadata/checkpoints_metadata.csv");
"""

if CLEAN_UP:
    conn.sql(query)

In [9]:
query = """
SELECT *,
FROM CHECKPOINT_METADATA;
"""

conn.sql(query)

┌────────────────┬────────────┬────────────────────────────────┬─────────────────────────────┬─────────────────────────────────────────┬────────────────────────────┐
│    process     │ doc_source │             doc_id             │          doc_type           │               table_name                │         created_at         │
│    varchar     │  varchar   │            varchar             │           varchar           │                 varchar                 │  timestamp with time zone  │
├────────────────┼────────────┼────────────────────────────────┼─────────────────────────────┼─────────────────────────────────────────┼────────────────────────────┤
│ classification │ true       │ classify_P1                    │ true_table                  │ RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE    │ 2024-12-18 00:31:26.968-03 │
│ classification │ pred       │ classify_P1                    │ llm_response_classification │ RAW_CLASSIFY_P1_OPERATIVE_RULES         │ 2024-12-18 00:31:26.968-03 │
│ cl

In [10]:
query = """
SELECT DISTINCT TABLE_NAME,
FROM CHECKPOINT_METADATA
ORDER BY 1;
"""

conn.sql(query)

┌─────────────────────────────────────────┐
│               table_name                │
│                 varchar                 │
├─────────────────────────────────────────┤
│ RAW_CLASSIFY_P1_OPERATIVE_RULES         │
│ RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE    │
│ RAW_CLASSIFY_P2_DEFINITIONAL_FACTS      │
│ RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE │
│ RAW_CLASSIFY_P2_DEFINITIONAL_NAMES      │
│ RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE │
│ RAW_CLASSIFY_P2_DEFINITIONAL_TERMS      │
│ RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE │
│ RAW_CLASSIFY_P2_OPERATIVE_RULES         │
│ RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE    │
│ RAW_SECTION_P1_EXTRACTED_ELEMENTS       │
│ RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE  │
│ RAW_SECTION_P2_EXTRACTED_NOUN           │
│ RAW_SECTION_P2_EXTRACTED_NOUN_TRUE      │
│ RAW_TRANSFORM_FACT_TYPES                │
│ RAW_TRANSFORM_NAMES                     │
│ RAW_TRANSFORM_OPERATIVE_RULES           │
│ RAW_TRANSFORM_TERMS                     │
├───────────────────────────────

## Extract

Metadata

In [11]:
query = """
SELECT *,
FROM CHECKPOINT_METADATA
WHERE PROCESS='extraction'
ORDER BY DOC_ID, DOC_TYPE DESC;
"""

conn.sql(query)

┌────────────┬────────────┬──────────────┬──────────────┬────────────────────────────────────────┬────────────────────────────┐
│  process   │ doc_source │    doc_id    │   doc_type   │               table_name               │         created_at         │
│  varchar   │  varchar   │   varchar    │   varchar    │                varchar                 │  timestamp with time zone  │
├────────────┼────────────┼──────────────┼──────────────┼────────────────────────────────────────┼────────────────────────────┤
│ extraction │ true       │ § 275.0-2_P1 │ true_table   │ RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE │ 2024-12-18 00:31:26.968-03 │
│ extraction │ pred       │ § 275.0-2_P1 │ llm_response │ RAW_SECTION_P1_EXTRACTED_ELEMENTS      │ 2024-12-18 00:31:26.968-03 │
│ extraction │ true       │ § 275.0-2_P2 │ true_table   │ RAW_SECTION_P2_EXTRACTED_NOUN_TRUE     │ 2024-12-18 00:31:26.968-03 │
│ extraction │ pred       │ § 275.0-2_P2 │ llm_response │ RAW_SECTION_P2_EXTRACTED_NOUN          │ 2024-

### Extract elements from sections (P1)

True tables

In [12]:
upsert_table_from_json(
    conn,
    suffix_key_pattern="_P1|true_table",
    prefix_key_pattern="§",
    table_name="RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE",
    source="cfr2sbvr_db/true/documents_true_table.json",
    key_value="extract_p1",
    drop=True,
    content_key="content.elements",
    alias="elements",
    doc_id_key="id"
)

§ 275.0-2_P1|true_table
§ 275.0-5_P1|true_table
§ 275.0-7_P1|true_table
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE;\n        '
Trying to drop table because drop parameter is True
Table RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE dropped
_query_insert_data='\n    INSERT INTO RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE (id, prompt, file_source, elements, created_at)\n    SELECT \n        id, \n        \'extract_p1\' as prompt, \n        \'documents_true_table.json\' as file_source, \n        unnest(content.elements) as elements,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents_true_table.json");\n    '
Trying to insert into RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
Catalog Error: Table with name RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE does not exist!
Did you mean "database_v3.RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE"?
Failed to insert, trying create RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
_query_create_table='\n        CREATE TA

True

In [13]:
query = """
SELECT * FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE;
"""

conn.sql(query)

┌──────────────┬────────────┬───────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

Total verb symbols extracted

In [14]:
query = """
SELECT
    COUNT(verb) AS total_verbs
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
) AS flattened_verbs;
"""

conn.sql(query)

┌─────────────┐
│ total_verbs │
│    int64    │
├─────────────┤
│          61 │
└─────────────┘

distinct verb symbols

In [15]:
query = """
SELECT
    COUNT(DISTINCT verb) AS distinct_verbs_count
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
) AS flattened_verbs;
"""

conn.sql(query)

┌──────────────────────┐
│ distinct_verbs_count │
│        int64         │
├──────────────────────┤
│                   43 │
└──────────────────────┘

In [16]:
query = """
SELECT DISTINCT
    verb,
    doc_id,
    source
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb,
        id AS doc_id,
        CAST(json_extract(elements, '$.sources') AS VARCHAR) AS source
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
) AS distinct_combinations;
"""

section_p1_extracted_elements_true_values=conn.sql(query).fetchall()

conn.sql(query)



┌────────────────────────┬──────────────┬─────────────────┐
│          verb          │    doc_id    │     source      │
│        varchar         │   varchar    │     varchar     │
├────────────────────────┼──────────────┼─────────────────┤
│ orders                 │ § 275.0-5_P1 │ ["(b)"]         │
│ had                    │ § 275.0-7_P1 │ ["(a)(3)"]      │
│ has                    │ § 275.0-7_P1 │ ["(b)(1)(ii)"]  │
│ means                  │ § 275.0-7_P1 │ ["(b)(2)"]      │
│ shown                  │ § 275.0-7_P1 │ ["(b)(2)"]      │
│ with                   │ § 275.0-2_P1 │ ["(a)(1)"]      │
│ at                     │ § 275.0-2_P1 │ ["(a)(2)"]      │
│ was served             │ § 275.0-2_P1 │ ["(a)(3)"]      │
│ may submit             │ § 275.0-5_P1 │ ["(a)"]         │
│ is appropriate         │ § 275.0-5_P1 │ ["(c)"]         │
│    ·                   │      ·       │    ·            │
│    ·                   │      ·       │    ·            │
│    ·                   │      ·       

Predict tables

In [87]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="_P1|llm_response",
        prefix_key_pattern="§",
        table_name="RAW_SECTION_P1_EXTRACTED_ELEMENTS",
        source=file_path,
        key_value="extract_p1",
        drop=drop,
        content_key="content.elements",
        alias="elements",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
§ 275.0-2_P1|llm_response
§ 275.0-5_P1|llm_response
§ 275.0-7_P1|llm_response
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_SECTION_P1_EXTRACTED_ELEMENTS;\n        '
Trying to drop table because drop parameter is True
Table RAW_SECTION_P1_EXTRACTED_ELEMENTS dropped
_query_insert_data='\n    INSERT INTO RAW_SECTION_P1_EXTRACTED_ELEMENTS (id, prompt, file_source, elements, created_at)\n    SELECT \n        id, \n        \'extract_p1\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content.elements) as elements,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_SECTION_P1_EXTRACTED_ELEMENTS
Catalog Error: Table with name RAW_SECTION_P1_EXTRACTED_ELEMENTS does not exist!
Did you mean "database_v3.RAW_SECTION_P1_EXTRACTED_ELEMENTS"?
Failed to insert, trying create RAW_SECTION_P1_EXTRACTED_ELEMENTS
_query_create_

In [88]:
query = """
SELECT * FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS;
"""

conn.sql(query)

┌──────────────┬────────────┬─────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

Total verb symbols extracted

In [89]:
query = """
SELECT
    COUNT(verb) AS total_verbs
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) AS flattened_verbs;
"""

conn.sql(query)

┌─────────────┐
│ total_verbs │
│    int64    │
├─────────────┤
│         610 │
└─────────────┘

distinct verb symbols

In [90]:
query = """
SELECT
    COUNT(DISTINCT verb) AS distinct_verbs_count
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) AS flattened_verbs;
"""

conn.sql(query)

┌──────────────────────┐
│ distinct_verbs_count │
│        int64         │
├──────────────────────┤
│                   43 │
└──────────────────────┘

Distinct verb symbols with doc_id and source

In [91]:
query = """
SELECT DISTINCT
    verb,
    doc_id,
    source
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb,
        id AS doc_id,
        CAST(json_extract(elements, '$.sources') AS VARCHAR) AS source
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) AS distinct_combinations;
"""

#pred_values=duckdb.sql(query).fetchall()

conn.sql(query)



┌────────────────────────┬──────────────┬──────────────────┐
│          verb          │    doc_id    │      source      │
│        varchar         │   varchar    │     varchar      │
├────────────────────────┼──────────────┼──────────────────┤
│ orders                 │ § 275.0-5_P1 │ ["(b)"]          │
│ had                    │ § 275.0-7_P1 │ ["(a)(3)"]       │
│ has                    │ § 275.0-7_P1 │ ["(b)(1)(ii)"]   │
│ means                  │ § 275.0-7_P1 │ ["(b)(2)"]       │
│ shown                  │ § 275.0-7_P1 │ ["(b)(2)"]       │
│ will forward           │ § 275.0-2_P1 │ ["(a)(2)"]       │
│ forwarded              │ § 275.0-2_P1 │ ["(a)(3)"]       │
│ manages                │ § 275.0-2_P1 │ ["(b)(1)"]       │
│ means                  │ § 275.0-2_P1 │ ["(b)(2)"]       │
│ resides                │ § 275.0-2_P1 │ ["(b)(2)"]       │
│  ·                     │      ·       │     ·            │
│  ·                     │      ·       │     ·            │
│  ·                    

Verb symbols with doc_id and source for evaluation

In [92]:
query = """
SELECT
    verb,
    doc_id,
    source
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb,
        id AS doc_id,
        CAST(json_extract(elements, '$.sources') AS VARCHAR) AS source
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) AS distinct_combinations;
"""

section_p1_extracted_elements_pred_values=conn.sql(query).fetchall()

conn.sql(query)



┌────────────────────────┬──────────────┬─────────────────┐
│          verb          │    doc_id    │     source      │
│        varchar         │   varchar    │     varchar     │
├────────────────────────┼──────────────┼─────────────────┤
│ serve                  │ § 275.0-2_P1 │ ["(a)"]         │
│ by serving             │ § 275.0-2_P1 │ ["(a)"]         │
│ serve                  │ § 275.0-2_P1 │ ["(a)(1)"]      │
│ by furnishing          │ § 275.0-2_P1 │ ["(a)(1)"]      │
│ with                   │ § 275.0-2_P1 │ ["(a)(1)"]      │
│ will forward           │ § 275.0-2_P1 │ ["(a)(2)"]      │
│ by                     │ § 275.0-2_P1 │ ["(a)(2)"]      │
│ at                     │ § 275.0-2_P1 │ ["(a)(2)"]      │
│ certifies              │ § 275.0-2_P1 │ ["(a)(3)"]      │
│ was served             │ § 275.0-2_P1 │ ["(a)(3)"]      │
│  ·                     │      ·       │     ·           │
│  ·                     │      ·       │     ·           │
│  ·                     │      ·       

### Extract Terms and Names definitions (P2)

True table

In [25]:
upsert_table_from_json(
    conn,
    suffix_key_pattern="_P2|true_table",
    prefix_key_pattern="§",
    table_name="RAW_SECTION_P2_EXTRACTED_NOUN_TRUE",
    source="cfr2sbvr_db/true/documents_true_table.json",
    key_value="extract_p2",
    drop=True,
    content_key="content.terms",
    alias="terms",
    doc_id_key="id"
)

§ 275.0-2_P2|true_table
§ 275.0-5_P2|true_table
§ 275.0-7_P2|true_table
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_SECTION_P2_EXTRACTED_NOUN_TRUE;\n        '
Trying to drop table because drop parameter is True
Table RAW_SECTION_P2_EXTRACTED_NOUN_TRUE dropped
_query_insert_data='\n    INSERT INTO RAW_SECTION_P2_EXTRACTED_NOUN_TRUE (id, prompt, file_source, terms, created_at)\n    SELECT \n        id, \n        \'extract_p2\' as prompt, \n        \'documents_true_table.json\' as file_source, \n        unnest(content.terms) as terms,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents_true_table.json");\n    '
Trying to insert into RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
Catalog Error: Table with name RAW_SECTION_P2_EXTRACTED_NOUN_TRUE does not exist!
Did you mean "database_v3.RAW_SECTION_P2_EXTRACTED_NOUN_TRUE"?
Failed to insert, trying create RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
_query_create_table='\n        CREATE TABLE RAW_SECTION_P2_EXTRACTED_NOUN_TRU

True

In [26]:
query = """
SELECT * FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE;
"""

conn.sql(query)

┌──────────────┬────────────┬───────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│      id      │   prompt   │        file_source        │                                                                                                                                            terms                                                                                                                                             │         created_at         │
│   varchar    │  varchar   │          varchar          │                                                                                                                struct(term varchar, definition varchar, islocalscope boolean)                     

Total terms per id

In [27]:
query = """
SELECT
    id, COUNT(terms) AS total_terms, COUNT(DISTINCT terms) AS total_terms_distinct
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id;
"""

conn.sql(query)

┌──────────────┬─────────────┬──────────────────────┐
│      id      │ total_terms │ total_terms_distinct │
│   varchar    │    int64    │        int64         │
├──────────────┼─────────────┼──────────────────────┤
│ § 275.0-2_P2 │          31 │                   31 │
│ § 275.0-7_P2 │          31 │                   31 │
│ § 275.0-5_P2 │          22 │                   22 │
└──────────────┴─────────────┴──────────────────────┘

terms

In [28]:
query = """
SELECT
    id, terms.term as term, terms.definition as definition, if(terms.definition is NULL, 0, 1) as hasDefinition, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
-- GROUP BY id;
"""

conn.sql(query)

┌──────────────┬────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────────┐
│      id      │                  term                  │                                                                                                            definition                                                                                                            │ hasDefinition │ isLocalScope │
│   varchar    │                varchar                 │                                                                                                             varchar                                                                                                              │     int32     │   boolean    │
├──────────────┼────────────────────────────────────

How many terms has definition

In [29]:
query = """
SELECT
    id, count(terms) as term, count(if(terms.definition is NULL, NULL, 1)) as hasDefinition, count(if(terms.definition is NULL, NULL, 1)) / count(terms) as ratio
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id
ORDER BY id
;
"""

conn.sql(query)

┌──────────────┬───────┬───────────────┬─────────────────────┐
│      id      │ term  │ hasDefinition │        ratio        │
│   varchar    │ int64 │     int64     │       double        │
├──────────────┼───────┼───────────────┼─────────────────────┤
│ § 275.0-2_P2 │    31 │             9 │  0.2903225806451613 │
│ § 275.0-5_P2 │    22 │            19 │  0.8636363636363636 │
│ § 275.0-7_P2 │    31 │             8 │ 0.25806451612903225 │
└──────────────┴───────┴───────────────┴─────────────────────┘

In [30]:
query = """
SELECT
    id, count(terms) as term, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id, terms.isLocalScope
ORDER BY id
;
"""

conn.sql(query)

┌──────────────┬───────┬──────────────┐
│      id      │ term  │ isLocalScope │
│   varchar    │ int64 │   boolean    │
├──────────────┼───────┼──────────────┤
│ § 275.0-2_P2 │     7 │ true         │
│ § 275.0-2_P2 │    24 │ false        │
│ § 275.0-5_P2 │    19 │ true         │
│ § 275.0-5_P2 │     3 │ false        │
│ § 275.0-7_P2 │     8 │ true         │
│ § 275.0-7_P2 │    23 │ false        │
└──────────────┴───────┴──────────────┘

isLocalScope per document

In [31]:
query = """
WITH TotalCounts AS (
    SELECT
        id,
        COUNT(terms) AS total_terms
    FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
    GROUP BY id
)
SELECT
    e.id,
    COUNT(e.terms) AS term,
    e.terms.isLocalScope AS isLocalScope,
    (COUNT(e.terms) * 100.0 / tc.total_terms) AS percentage
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE e
JOIN TotalCounts tc ON e.id = tc.id
GROUP BY e.id, e.terms.isLocalScope, tc.total_terms
ORDER BY e.id, e.terms.isLocalScope;
"""

conn.sql(query)

┌──────────────┬───────┬──────────────┬────────────────────┐
│      id      │ term  │ isLocalScope │     percentage     │
│   varchar    │ int64 │   boolean    │       double       │
├──────────────┼───────┼──────────────┼────────────────────┤
│ § 275.0-2_P2 │    24 │ false        │  77.41935483870968 │
│ § 275.0-2_P2 │     7 │ true         │ 22.580645161290324 │
│ § 275.0-5_P2 │     3 │ false        │ 13.636363636363637 │
│ § 275.0-5_P2 │    19 │ true         │  86.36363636363636 │
│ § 275.0-7_P2 │    23 │ false        │  74.19354838709677 │
│ § 275.0-7_P2 │     8 │ true         │ 25.806451612903224 │
└──────────────┴───────┴──────────────┴────────────────────┘

Total terms per id

In [32]:
query = """
SELECT
    id, COUNT(terms) AS total_terms, COUNT(DISTINCT terms) AS total_terms_distinct
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id;
"""

conn.sql(query)

┌──────────────┬─────────────┬──────────────────────┐
│      id      │ total_terms │ total_terms_distinct │
│   varchar    │    int64    │        int64         │
├──────────────┼─────────────┼──────────────────────┤
│ § 275.0-5_P2 │          22 │                   22 │
│ § 275.0-7_P2 │          31 │                   31 │
│ § 275.0-2_P2 │          31 │                   31 │
└──────────────┴─────────────┴──────────────────────┘

terms

In [33]:
query = """
SELECT
    id, terms.term as term, terms.definition as definition, if(terms.definition is NULL, 0, 1) as hasDefinition, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
-- GROUP BY id;
"""

conn.sql(query)

┌──────────────┬────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────────┐
│      id      │                  term                  │                                                                                                            definition                                                                                                            │ hasDefinition │ isLocalScope │
│   varchar    │                varchar                 │                                                                                                             varchar                                                                                                              │     int32     │   boolean    │
├──────────────┼────────────────────────────────────

How many terms has definition

In [34]:
query = """
SELECT
    id, count(terms) as term, count(if(terms.definition is NULL, NULL, 1)) as hasDefinition, count(if(terms.definition is NULL, NULL, 1)) / count(terms) as ratio
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id
ORDER BY id
;
"""

conn.sql(query)

┌──────────────┬───────┬───────────────┬─────────────────────┐
│      id      │ term  │ hasDefinition │        ratio        │
│   varchar    │ int64 │     int64     │       double        │
├──────────────┼───────┼───────────────┼─────────────────────┤
│ § 275.0-2_P2 │    31 │             9 │  0.2903225806451613 │
│ § 275.0-5_P2 │    22 │            19 │  0.8636363636363636 │
│ § 275.0-7_P2 │    31 │             8 │ 0.25806451612903225 │
└──────────────┴───────┴───────────────┴─────────────────────┘

In [35]:
query = """
SELECT
    id, count(terms) as term, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id, terms.isLocalScope
ORDER BY id
;
"""

conn.sql(query)

┌──────────────┬───────┬──────────────┐
│      id      │ term  │ isLocalScope │
│   varchar    │ int64 │   boolean    │
├──────────────┼───────┼──────────────┤
│ § 275.0-2_P2 │     7 │ true         │
│ § 275.0-2_P2 │    24 │ false        │
│ § 275.0-5_P2 │    19 │ true         │
│ § 275.0-5_P2 │     3 │ false        │
│ § 275.0-7_P2 │     8 │ true         │
│ § 275.0-7_P2 │    23 │ false        │
└──────────────┴───────┴──────────────┘

In [36]:
query = """
SELECT
    id, terms, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
WHERE terms.isLocalScope is NULL
;
"""

conn.sql(query)

┌─────────┬────────────────────────────────────────────────────────────────┬──────────────┐
│   id    │                             terms                              │ isLocalScope │
│ varchar │ struct(term varchar, definition varchar, islocalscope boolean) │   boolean    │
├─────────┴────────────────────────────────────────────────────────────────┴──────────────┤
│                                         0 rows                                          │
└─────────────────────────────────────────────────────────────────────────────────────────┘

In [37]:
query = """
SELECT
    id, terms, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
WHERE terms.isLocalScope is NULL
;
"""

conn.sql(query)

┌─────────┬────────────────────────────────────────────────────────────────┬──────────────┐
│   id    │                             terms                              │ isLocalScope │
│ varchar │ struct(term varchar, definition varchar, islocalscope boolean) │   boolean    │
├─────────┴────────────────────────────────────────────────────────────────┴──────────────┤
│                                         0 rows                                          │
└─────────────────────────────────────────────────────────────────────────────────────────┘

Predict table

In [94]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="_P2|llm_response",
        prefix_key_pattern="§",
        table_name="RAW_SECTION_P2_EXTRACTED_NOUN",
        source=file_path,
        key_value="extract_p2",
        drop=drop,
        content_key="content.terms",
        alias="terms",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
§ 275.0-2_P2|llm_response
§ 275.0-5_P2|llm_response
§ 275.0-7_P2|llm_response
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_SECTION_P2_EXTRACTED_NOUN;\n        '
Trying to drop table because drop parameter is True
Table RAW_SECTION_P2_EXTRACTED_NOUN dropped
_query_insert_data='\n    INSERT INTO RAW_SECTION_P2_EXTRACTED_NOUN (id, prompt, file_source, terms, created_at)\n    SELECT \n        id, \n        \'extract_p2\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content.terms) as terms,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_SECTION_P2_EXTRACTED_NOUN
Catalog Error: Table with name RAW_SECTION_P2_EXTRACTED_NOUN does not exist!
Did you mean "database_v3.RAW_SECTION_P2_EXTRACTED_NOUN"?
Failed to insert, trying create RAW_SECTION_P2_EXTRACTED_NOUN
_query_create_table='\n        CREATE TABLE RAW_SEC

In [95]:
query = """
SELECT * FROM RAW_SECTION_P2_EXTRACTED_NOUN;
"""

conn.sql(query)

┌──────────────┬────────────┬─────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│      id      │   prompt   │         file_source         │                                                                                                                                                                                                                                                  terms                                                                                                                   

Total verb symbols extracted

In [104]:
query = """
SELECT
    id, COUNT(terms) AS total_terms, COUNT(DISTINCT terms) AS total_terms_distinct
FROM RAW_SECTION_P2_EXTRACTED_NOUN
GROUP BY id;
"""

conn.sql(query)

┌──────────────┬─────────────┬──────────────────────┐
│      id      │ total_terms │ total_terms_distinct │
│   varchar    │    int64    │        int64         │
├──────────────┼─────────────┼──────────────────────┤
│ § 275.0-7_P2 │         320 │                   32 │
│ § 275.0-5_P2 │         230 │                   23 │
│ § 275.0-2_P2 │         300 │                   30 │
└──────────────┴─────────────┴──────────────────────┘

distinct verb symbols

In [97]:
query = """
SELECT
    id, terms.term as term, terms.definition as definition, if(terms.definition is NULL, 0, 1) as hasDefinition, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN
-- GROUP BY id;
"""

conn.sql(query)

┌──────────────┬────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────────┐
│      id      │                  term                  │                                                                                                            definition                                                                                                            │ hasDefinition │ isLocalScope │
│   varchar    │                varchar                 │                                                                                                             varchar                                                                                                              │     int32     │   boolean    │
├──────────────┼────────────────────────────────────

How many terms has definition

In [107]:
query = """
SELECT
    id, count(terms) as term, count(if(terms.definition is NULL, NULL, 1)) as hasDefinition, count(if(terms.definition is NULL, NULL, 1)) / count(terms) as ratio
FROM RAW_SECTION_P2_EXTRACTED_NOUN
GROUP BY id
ORDER BY id
;
"""

conn.sql(query)

┌──────────────┬───────┬───────────────┬────────────────────┐
│      id      │ term  │ hasDefinition │       ratio        │
│   varchar    │ int64 │     int64     │       double       │
├──────────────┼───────┼───────────────┼────────────────────┤
│ § 275.0-2_P2 │   300 │            60 │                0.2 │
│ § 275.0-5_P2 │   230 │           190 │ 0.8260869565217391 │
│ § 275.0-7_P2 │   320 │            80 │               0.25 │
└──────────────┴───────┴───────────────┴────────────────────┘

In [111]:
query = """
SELECT
    id, count(terms) as term, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN
GROUP BY id, terms.isLocalScope
ORDER BY id
;
"""

conn.sql(query)

┌──────────────┬───────┬──────────────┐
│      id      │ term  │ isLocalScope │
│   varchar    │ int64 │   boolean    │
├──────────────┼───────┼──────────────┤
│ § 275.0-2_P2 │   240 │ false        │
│ § 275.0-2_P2 │    60 │ true         │
│ § 275.0-5_P2 │   180 │ true         │
│ § 275.0-5_P2 │    50 │ false        │
│ § 275.0-7_P2 │    80 │ true         │
│ § 275.0-7_P2 │   240 │ false        │
└──────────────┴───────┴──────────────┘

isLocalScope per id

In [100]:
query = """
WITH TotalCounts AS (
    SELECT
        id,
        COUNT(terms) AS total_terms
    FROM RAW_SECTION_P2_EXTRACTED_NOUN
    GROUP BY id
)
SELECT
    e.id,
    COUNT(e.terms) AS term,
    e.terms.isLocalScope AS isLocalScope,
    (COUNT(e.terms) * 100.0 / tc.total_terms) AS percentage
FROM RAW_SECTION_P2_EXTRACTED_NOUN e
JOIN TotalCounts tc ON e.id = tc.id
GROUP BY e.id, e.terms.isLocalScope, tc.total_terms
ORDER BY e.id, e.terms.isLocalScope;
"""

conn.sql(query)

┌──────────────┬───────┬──────────────┬───────────────────┐
│      id      │ term  │ isLocalScope │    percentage     │
│   varchar    │ int64 │   boolean    │      double       │
├──────────────┼───────┼──────────────┼───────────────────┤
│ § 275.0-2_P2 │   240 │ false        │              80.0 │
│ § 275.0-2_P2 │    60 │ true         │              20.0 │
│ § 275.0-5_P2 │    50 │ false        │ 21.73913043478261 │
│ § 275.0-5_P2 │   180 │ true         │ 78.26086956521739 │
│ § 275.0-7_P2 │   240 │ false        │              75.0 │
│ § 275.0-7_P2 │    80 │ true         │              25.0 │
└──────────────┴───────┴──────────────┴───────────────────┘

In [101]:
query = """
SELECT
    id, terms, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN
WHERE terms.isLocalScope is NULL
;
"""

conn.sql(query)

┌─────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────┐
│   id    │                                                                            terms                                                                             │ isLocalScope │
│ varchar │ struct(term varchar, definition varchar, confidence double, reason varchar, islocalscope boolean, local_scope_confidence double, local_scope_reason varchar) │   boolean    │
├─────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┤
│                                                                                        0 rows                                                                                         │
└─────────────────────────────────────────────────────────────────────

## Classify

Metadata

In [46]:
query = """
SELECT *,
FROM CHECKPOINT_METADATA
WHERE PROCESS='classification'
ORDER BY DOC_ID, DOC_TYPE DESC;
"""

conn.sql(query)

┌────────────────┬────────────┬────────────────────────────────┬─────────────────────────────┬─────────────────────────────────────────┬────────────────────────────┐
│    process     │ doc_source │             doc_id             │          doc_type           │               table_name                │         created_at         │
│    varchar     │  varchar   │            varchar             │           varchar           │                 varchar                 │  timestamp with time zone  │
├────────────────┼────────────┼────────────────────────────────┼─────────────────────────────┼─────────────────────────────────────────┼────────────────────────────┤
│ classification │ true       │ classify_P1                    │ true_table                  │ RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE    │ 2024-12-18 00:31:26.968-03 │
│ classification │ pred       │ classify_P1                    │ llm_response_classification │ RAW_CLASSIFY_P1_OPERATIVE_RULES         │ 2024-12-18 00:31:26.968-03 │
│ cl

### Classify P1 - Operative Rules into top-level Witt(2012) taxonomy (P1)

True table

In [47]:
upsert_table_from_json(
    conn,
    suffix_key_pattern="classify_P1|true_table",
    prefix_key_pattern="",
    table_name="RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE",
    source="cfr2sbvr_db/true/documents_true_table.json",
    key_value="classify_p1",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id"
)

classify_P1|true_table
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_p1\' as prompt, \n        \'documents_true_table.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents_true_table.json");\n    '
Trying to insert into RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE
Catalog Error: Table with name RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE"?
Failed to insert, trying create RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE
_query_create_table='\n        CREATE TABLE RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE AS\n        SELECT id, \n       

True

In [48]:
query = """
SELECT
    content.doc_id as id,
    content.statement_id,
    content.statement,
    content.sources,
    'documents_true_table.json' as file_source,
    content.type
FROM RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE
;
"""

true_clasify_p1 = conn.sql(query).fetchall()

conn.sql(query)

┌───────────┬──────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────┬───────────────────────────┬────────────────┐
│    id     │ statement_id │                                                                                                                                                    statement                                                                                                                                                     │  sources  │        file_source        │      type      │
│  varchar  │   varchar    │                                                                                                                                                     varchar                              

Predict table

In [49]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="classify_P1|llm_response_classification",
        prefix_key_pattern="",
        table_name="RAW_CLASSIFY_P1_OPERATIVE_RULES",
        source=file_path,
        key_value="classify_p1",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
classify_P1|llm_response_classification
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P1_OPERATIVE_RULES;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P1_OPERATIVE_RULES dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P1_OPERATIVE_RULES (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_p1\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_CLASSIFY_P1_OPERATIVE_RULES
Catalog Error: Table with name RAW_CLASSIFY_P1_OPERATIVE_RULES does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P1_OPERATIVE_RULES"?
Failed to insert, trying create RAW_CLASSIFY_P1_OPERATIVE_RULES
_query_create_table='\n        CREATE TABLE RAW_CLASSIFY_P1_OPERATIVE_RULES 

In [50]:
query = """
SELECT
    *
FROM RAW_CLASSIFY_P1_OPERATIVE_RULES
;
"""

conn.sql(query)

┌─────────────┬─────────────┬──────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [51]:
query = """
SELECT
    content.doc_id as id,
    content.statement_id,
    content.statement_title,
    content.statement_text,
    content.statement_sources,
    content.classification
FROM RAW_CLASSIFY_P1_OPERATIVE_RULES
;
"""

conn.sql(query)

┌───────────┬──────────────┬───────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [52]:
query = """
SELECT 
    content.doc_id as doc_id,
    content.statement_id as statement_id,
    content.statement_title as statement_title,
    content.statement_sources as statement_sources,
    content.file_source as file_source,
    --classification.type as classification_type, 
    --classification.explanation as classification_explanation, 
    --classification.confidence as classification_confidence
    MAX(classification.type) as classification_type, 
    MAX(classification.explanation) as classification_explanation, 
    MAX(classification.confidence) as classification_confidence,
    MAX(file_source) as file_source
FROM (
    SELECT
        file_source,
        content.doc_id,
        content.statement_id,
        content.statement_sources,
        content.statement_title,
        unnest(content.classification) as classification
    FROM
        RAW_CLASSIFY_P1_OPERATIVE_RULES
) AS content
GROUP BY doc_id, statement_id, statement_title, statement_sources, file_source
ORDER BY file_source, doc_id, statement_id
"""

pred_classify_p1_operative_rules= conn.sql(query).fetchall()

conn.sql(query)

┌───────────┬──────────────┬───────────────────────────────────────────┬───────────────────┬──────────────────────────────┬─────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────┬──────────────────────────────┐
│  doc_id   │ statement_id │              statement_title              │ statement_sources │         file_source          │ classification_type │                                                                                       classification_explanation                                                                                        │ classification_confidence │         file_source          │
│  varchar  │   varchar    │                  varchar                  │     varchar[]     │           varchar            │       varchar       │                                         

In [53]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Criar dicionários para fácil acesso, incluindo file_source na chave
pred_dict = {(item[0], item[1], tuple(item[3]), item[4]): item[5] for item in pred_classify_p1_operative_rules}
true_dict = {(item[0], item[1], tuple(item[3])): item[5] for item in true_clasify_p1}

# Alinhar e comparar os dados
y_true = []
y_pred = []

# Usar todas as previsões
for item in pred_classify_p1_operative_rules:
    key = (item[0], item[1], tuple(item[3]))
    y_pred.append(item[5])
    y_true.append(true_dict.get(key, 'No Match'))  # 'No Match' se não houver correspondência

# Calcular métricas
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print(f'Precisão: {precision:.2f}')
print(f'Sensibilidade: {recall:.2f}')
print(f'Acurácia: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

Precisão: 0.25
Sensibilidade: 0.50
Acurácia: 0.50
F1 Score: 0.33


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The best classification of all checkpoints.

In [54]:
query = """
WITH ExpandedClassifications AS (
    SELECT
        content.doc_id AS id,
        content.statement_id,
        content.statement_title,
        content.statement_text,
        content.statement_sources,
        classification_item.value.type AS type,
        classification_item.value.confidence AS confidence,
        classification_item.value.explanation AS explanation,
        content.file_source as file_source,
        ROW_NUMBER() OVER (PARTITION BY content.doc_id, content.statement_id ORDER BY classification_item.value.confidence DESC) AS rn
    FROM
        RAW_CLASSIFY_P1_OPERATIVE_RULES AS content,
        UNNEST(content.classification) AS classification_item(value)
)
SELECT
    id,
    statement_id,
    statement_title,
    statement_text,
    statement_sources,
    file_source,
    type AS classification_type,
    confidence AS classification_confidence,
    explanation AS classification_explanation
FROM
    ExpandedClassifications
WHERE
    rn = 1;
"""

conn.sql(query)

┌───────────┬──────────────┬───────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────┬──────────────────────────────┬─────────────────────┬───────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│    id     │ statement_id │              statement_title              │                                                                                                                                                  statement_text                                                                                     

### Classify - P2 Operative Rules

True table

In [55]:
upsert_table_from_json(
    conn,
    suffix_key_pattern="classify_P2_Operative_rules|true_table",
    prefix_key_pattern="",
    table_name="RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE",
    source="cfr2sbvr_db/true/documents_true_table.json",
    key_value="classify_P2_Operative_rules",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id"
)

classify_P2_Operative_rules|true_table
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Operative_rules\' as prompt, \n        \'documents_true_table.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents_true_table.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE
Catalog Error: Table with name RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE"?
Failed to insert, trying create RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE
_query_create_table='\n        CREATE TABLE RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE 

True

In [56]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE
;
"""

conn.sql(query)

┌─────────────────────────────┬─────────────────────────────┬───────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│             id              │           prompt            │        file_source        │                                                                                                                                                                                                                        content                                                                                                                                 

Pred table

In [57]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="classify_P2_Operative_rules|llm_response_classification",
        prefix_key_pattern="",
        table_name="RAW_CLASSIFY_P2_OPERATIVE_RULES",
        source=file_path,
        key_value="classify_P2_Operative_rules",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
classify_P2_Operative_rules|llm_response_classification
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_OPERATIVE_RULES;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_OPERATIVE_RULES dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_OPERATIVE_RULES (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Operative_rules\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_OPERATIVE_RULES
Catalog Error: Table with name RAW_CLASSIFY_P2_OPERATIVE_RULES does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_OPERATIVE_RULES"?
Failed to insert, trying create RAW_CLASSIFY_P2_OPERATIVE_RULES
_query_create_table='\n        CREATE TABLE 

In [58]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_OPERATIVE_RULES
;
"""

conn.sql(query)

┌─────────────────────────────┬─────────────────────────────┬──────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

### Classify P2 - Definitional Terms

True table

In [59]:
upsert_table_from_json(
    conn,
    suffix_key_pattern="classify_P2_Definitional_terms|true_table",
    prefix_key_pattern="",
    table_name="RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE",
    source="cfr2sbvr_db/true/documents_true_table.json",
    key_value="classify_P2_Definitional_terms",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id"
)

classify_P2_Definitional_terms|true_table
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Definitional_terms\' as prompt, \n        \'documents_true_table.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents_true_table.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE
Catalog Error: Table with name RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE"?
Failed to insert, trying create RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE
_query_create_table='\n        CREATE TABLE RAW_CLASSI

True

In [60]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE
;
"""

conn.sql(query)

┌────────────────────────────────┬────────────────────────────────┬───────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│               id               │             prompt             │        file_source        │                                                                                                                                                                                                                                                          content              

Pred table

In [61]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="classify_P2_Definitional_terms|llm_response_classification",
        prefix_key_pattern="",
        table_name="RAW_CLASSIFY_P2_DEFINITIONAL_TERMS",
        source=file_path,
        key_value="classify_P2_Definitional_terms",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
classify_P2_Definitional_terms|llm_response_classification
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_DEFINITIONAL_TERMS;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_DEFINITIONAL_TERMS dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_DEFINITIONAL_TERMS (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Definitional_terms\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_DEFINITIONAL_TERMS
Catalog Error: Table with name RAW_CLASSIFY_P2_DEFINITIONAL_TERMS does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_DEFINITIONAL_TERMS"?
Failed to insert, trying create RAW_CLASSIFY_P2_DEFINITIONAL_TERMS
_query_create_tab

In [62]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_TERMS
;
"""

conn.sql(query)

┌────────────────────────────────┬────────────────────────────────┬─────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬

### Classify P2 - Definitional Names

True table

In [63]:
upsert_table_from_json(
    conn,
    suffix_key_pattern="classify_P2_Definitional_names|true_table",
    prefix_key_pattern="",
    table_name="RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE",
    source="cfr2sbvr_db/true/documents_true_table.json",
    key_value="classify_P2_Definitional_names",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id"
)

classify_P2_Definitional_names|true_table
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Definitional_names\' as prompt, \n        \'documents_true_table.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents_true_table.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE
Catalog Error: Table with name RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE"?
Failed to insert, trying create RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE
_query_create_table='\n        CREATE TABLE RAW_CLASSI

True

In [64]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE
;
"""

conn.sql(query)

┌────────────────────────────────┬────────────────────────────────┬───────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│               id               │             prompt             │        file_source        │                                                                                                                                                                 content                                                                                                                                                                  │         created_at         │
│            varchar             │            varchar             │          varchar    

Pred table

In [65]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="classify_P2_Definitional_names|llm_response_classification",
        prefix_key_pattern="",
        table_name="RAW_CLASSIFY_P2_DEFINITIONAL_NAMES",
        source=file_path,
        key_value="classify_P2_Definitional_names",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
classify_P2_Definitional_names|llm_response_classification
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_DEFINITIONAL_NAMES;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_DEFINITIONAL_NAMES dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_DEFINITIONAL_NAMES (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Definitional_names\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_DEFINITIONAL_NAMES
Catalog Error: Table with name RAW_CLASSIFY_P2_DEFINITIONAL_NAMES does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_DEFINITIONAL_NAMES"?
Failed to insert, trying create RAW_CLASSIFY_P2_DEFINITIONAL_NAMES
_query_create_tab

In [66]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_NAMES
;
"""

conn.sql(query)

┌────────────────────────────────┬────────────────────────────────┬──────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│               id               │             prompt             │         file_source          │                                                                       

### Classify P2 - Definitional Facts

True table

In [67]:
upsert_table_from_json(
    conn,
    suffix_key_pattern="classify_P2_Definitional_facts|true_table",
    prefix_key_pattern="",
    table_name="RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE",
    source="cfr2sbvr_db/true/documents_true_table.json",
    key_value="classify_P2_Definitional_facts",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id"
)

classify_P2_Definitional_facts|true_table
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Definitional_facts\' as prompt, \n        \'documents_true_table.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents_true_table.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE
Catalog Error: Table with name RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE"?
Failed to insert, trying create RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE
_query_create_table='\n        CREATE TABLE RAW_CLASSI

True

In [68]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE
;
"""

conn.sql(query)

┌────────────────────────────────┬────────────────────────────────┬───────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│               id               │             prompt             │        file_source        │                                                                                                                                                                               

Pred table

In [69]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="classify_P2_Definitional_facts|llm_response_classification",
        prefix_key_pattern="",
        table_name="RAW_CLASSIFY_P2_DEFINITIONAL_FACTS",
        source=file_path,
        key_value="classify_P2_Definitional_facts",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
classify_P2_Definitional_facts|llm_response_classification
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_CLASSIFY_P2_DEFINITIONAL_FACTS;\n        '
Trying to drop table because drop parameter is True
Table RAW_CLASSIFY_P2_DEFINITIONAL_FACTS dropped
_query_insert_data='\n    INSERT INTO RAW_CLASSIFY_P2_DEFINITIONAL_FACTS (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'classify_P2_Definitional_facts\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_CLASSIFY_P2_DEFINITIONAL_FACTS
Catalog Error: Table with name RAW_CLASSIFY_P2_DEFINITIONAL_FACTS does not exist!
Did you mean "database_v3.RAW_CLASSIFY_P2_DEFINITIONAL_FACTS"?
Failed to insert, trying create RAW_CLASSIFY_P2_DEFINITIONAL_FACTS
_query_create_tab

In [70]:
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_FACTS
;
"""

conn.sql(query)

┌────────────────────────────────┬────────────────────────────────┬─────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

## Transform

Metadata

In [71]:
query = """
SELECT *,
FROM CHECKPOINT_METADATA
WHERE PROCESS='transformation'
ORDER BY DOC_ID, DOC_TYPE DESC;
"""

conn.sql(query)

┌────────────────┬────────────┬───────────────────────────┬────────────────────────┬───────────────────────────────┬────────────────────────────┐
│    process     │ doc_source │          doc_id           │        doc_type        │          table_name           │         created_at         │
│    varchar     │  varchar   │          varchar          │        varchar         │            varchar            │  timestamp with time zone  │
├────────────────┼────────────┼───────────────────────────┼────────────────────────┼───────────────────────────────┼────────────────────────────┤
│ transformation │ pred       │ transform_Fact_Types      │ llm_response_transform │ RAW_TRANSFORM_FACT_TYPES      │ 2024-12-18 00:31:26.968-03 │
│ transformation │ pred       │ transform_Names           │ llm_response_transform │ RAW_TRANSFORM_NAMES           │ 2024-12-18 00:31:26.968-03 │
│ transformation │ pred       │ transform_Operative_Rules │ llm_response_transform │ RAW_TRANSFORM_OPERATIVE_RULES │ 2024-12

### Operative Rules

Pred table

In [72]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="transform_Operative_Rules|llm_response_transform",
        prefix_key_pattern="",
        table_name="RAW_TRANSFORM_OPERATIVE_RULES",
        source=file_path,
        key_value="transform_Operative_Rules",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
transform_Operative_Rules|llm_response_transform
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_TRANSFORM_OPERATIVE_RULES;\n        '
Trying to drop table because drop parameter is True
Table RAW_TRANSFORM_OPERATIVE_RULES dropped
_query_insert_data='\n    INSERT INTO RAW_TRANSFORM_OPERATIVE_RULES (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'transform_Operative_Rules\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_TRANSFORM_OPERATIVE_RULES
Catalog Error: Table with name RAW_TRANSFORM_OPERATIVE_RULES does not exist!
Did you mean "database_v3.RAW_TRANSFORM_OPERATIVE_RULES"?
Failed to insert, trying create RAW_TRANSFORM_OPERATIVE_RULES
_query_create_table='\n        CREATE TABLE RAW_TRANSFORM_OPERATIVE

In [73]:
query = """
SELECT *
FROM RAW_TRANSFORM_OPERATIVE_RULES
;
"""

conn.sql(query)

┌───────────────────────────┬───────────────────────────┬──────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

### Terms

Pred table

In [74]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="transform_Terms|llm_response_transform",
        prefix_key_pattern="",
        table_name="RAW_TRANSFORM_TERMS",
        source=file_path,
        key_value="transform_Terms",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
transform_Terms|llm_response_transform
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_TRANSFORM_TERMS;\n        '
Trying to drop table because drop parameter is True
Table RAW_TRANSFORM_TERMS dropped
_query_insert_data='\n    INSERT INTO RAW_TRANSFORM_TERMS (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'transform_Terms\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_TRANSFORM_TERMS
Catalog Error: Table with name RAW_TRANSFORM_TERMS does not exist!
Did you mean "database_v3.RAW_TRANSFORM_TERMS"?
Failed to insert, trying create RAW_TRANSFORM_TERMS
_query_create_table='\n        CREATE TABLE RAW_TRANSFORM_TERMS AS\n        SELECT id, \n        \'transform_Terms\' as prompt, \n        \'documents-2024-12

In [75]:
query = """
SELECT *
FROM RAW_TRANSFORM_TERMS
;
"""

conn.sql(query)

┌─────────────────┬─────────────────┬─────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

### Names

Pred table

In [76]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="transform_Names|llm_response_transform",
        prefix_key_pattern="",
        table_name="RAW_TRANSFORM_NAMES",
        source=file_path,
        key_value="transform_Names",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
transform_Names|llm_response_transform
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_TRANSFORM_NAMES;\n        '
Trying to drop table because drop parameter is True
Table RAW_TRANSFORM_NAMES dropped
_query_insert_data='\n    INSERT INTO RAW_TRANSFORM_NAMES (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'transform_Names\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_TRANSFORM_NAMES
Catalog Error: Table with name RAW_TRANSFORM_NAMES does not exist!
Did you mean "database_v3.RAW_TRANSFORM_NAMES"?
Failed to insert, trying create RAW_TRANSFORM_NAMES
_query_create_table='\n        CREATE TABLE RAW_TRANSFORM_NAMES AS\n        SELECT id, \n        \'transform_Names\' as prompt, \n        \'documents-2024-12

In [77]:
query = """
SELECT *
FROM RAW_TRANSFORM_NAMES
;
"""

conn.sql(query)

┌─────────────────┬─────────────────┬──────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────┐
│       id        │     prompt      │         file_source          │                   

### Fact types

Pred table

In [78]:
# Define the directory and pattern
directory = Path("cfr2sbvr_db/evaluation")
pattern = "documents-*.json"

# Use glob to find all files matching the pattern
files = directory.glob(pattern)

drop=True # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        suffix_key_pattern="transform_Fact_Types|llm_response_transform",
        prefix_key_pattern="",
        table_name="RAW_TRANSFORM_FACT_TYPES",
        source=file_path,
        key_value="transform_Fact_Types",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id"
    )
    drop=False # Stop dropping tables

cfr2sbvr_db/evaluation/documents-2024-12-08-1.json
transform_Fact_Types|llm_response_transform
_query_drop_table='\n        DROP TABLE IF EXISTS RAW_TRANSFORM_FACT_TYPES;\n        '
Trying to drop table because drop parameter is True
Table RAW_TRANSFORM_FACT_TYPES dropped
_query_insert_data='\n    INSERT INTO RAW_TRANSFORM_FACT_TYPES (id, prompt, file_source, content, created_at)\n    SELECT \n        id, \n        \'transform_Fact_Types\' as prompt, \n        \'documents-2024-12-08-1.json\' as file_source, \n        unnest(content) as content,\n        now() as created_at\n    FROM \n        read_json_auto("temp/documents-2024-12-08-1.json");\n    '
Trying to insert into RAW_TRANSFORM_FACT_TYPES
Catalog Error: Table with name RAW_TRANSFORM_FACT_TYPES does not exist!
Did you mean "database_v3.RAW_TRANSFORM_FACT_TYPES"?
Failed to insert, trying create RAW_TRANSFORM_FACT_TYPES
_query_create_table='\n        CREATE TABLE RAW_TRANSFORM_FACT_TYPES AS\n        SELECT id, \n        \'transfor

In [79]:
query = """
SELECT *
FROM RAW_TRANSFORM_FACT_TYPES
;
"""

conn.sql(query)

┌──────────────────────┬──────────────────────┬─────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

## Close conn

In [6]:
conn.close()