# Validation support

Evaluates the consistency of the checkpoints.

Dependencies:
- Copy the checkpoint files from evaluation and extraction to `cfr2sbvr_db`
- Python DuckDB: `pip install duckdb --upgrade`

## Google Colab

In [50]:
# Enable autoreload so imported modules are reloaded before code is executed.
%load_ext autoreload
%autoreload 2

import sys

# True when the google.colab module is loaded, i.e. the notebook runs in Google Colab.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  # Mount Google Drive, then replace any previous checkout with a fresh clone of the project.
  from google.colab import drive
  drive.mount('/content/drive')
  !rm -rf cfr2sbvr configuration checkpoint
  !git clone https://github.com/asantos2000/master-degree-santos-anderson.git cfr2sbvr
  %pip install -r cfr2sbvr/code/requirements.txt
  # Copy the local packages and the Colab configuration next to the notebook.
  !cp -r cfr2sbvr/code/src/configuration .
  !cp -r cfr2sbvr/code/src/checkpoint .
  !cp -r cfr2sbvr/code/config.colab.yaml config.yaml
  DEFAULT_CONFIG_FILE="config.yaml"
else:
  # Outside Colab the configuration file is read from the parent directory.
  DEFAULT_CONFIG_FILE="../config.yaml"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [51]:
import duckdb
import json
from pathlib import Path
import glob
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import humanize
import os
from dotenv import load_dotenv

# Local modules
import configuration.main as configuration
import logging_setup.main as logging_setup

DEV_MODE = True

# In development mode, re-import the local modules so that edits made on disk
# are picked up without restarting the kernel.
if DEV_MODE:
    import importlib

    for _local_module in (configuration, logging_setup):
        importlib.reload(_local_module)

## Settings

### Configuration

In [52]:
# Load the configuration from the file selected in the setup cell
config = configuration.load_config(DEFAULT_CONFIG_FILE)

# Load environment variables (the connection cell reads MOTHER_DUCK_TOKEN from the environment)
load_dotenv()

True

### Logging

In [53]:
# Create the notebook logger using the log directory and level from the configuration.
logger = logging_setup.setting_logging(config["DEFAULT_LOG_DIR"], config["LOG_LEVEL"])

2025-01-16 11:39:05 - INFO - Logging is set up with daily rotation.


## Functions

In [54]:
def plot_horizontal_bar_chart(data, group_col, value_col, title, x_label, y_label, figsize=(10, 4), legend_position="best", big_numbers={"factor":1000, "suffix":"mil"}):
    """Draw a horizontal lollipop chart of ``value_col`` summed per ``group_col``.

    Groups whose sum is above the mean of all sums are drawn in orange, the
    others in blue, and a dashed vertical line marks the mean. Displayed
    numbers are divided by ``big_numbers["factor"]`` and written with "." as
    thousands separator and "," as decimal separator (e.g. ``1.234,56 mil``).

    :param data: DataFrame
    :param group_col: Y values (column used to group the rows)
    :param value_col: X values (column summed per group)
    :param title: Title
    :param x_label: X label
    :param y_label: Y label
    :param figsize: Picture size
    :param legend_position: Legend position ('upper right', 'lower right', etc.), or None to position it automatically
    :param big_numbers: Dict with "factor" (divisor applied to every displayed number)
        and "suffix" (text appended after the number). It is only read, never modified.
    :return: None. The chart is displayed with ``plt.show()``.
    """
    factor = big_numbers["factor"]
    suffix = big_numbers["suffix"]

    # Python formats as 1,234.56; swapping both separators at once gives 1.234,56.
    # (Replacing only "." with "," would produce the ambiguous 1,234,56.)
    swap_separators = str.maketrans(",.", ".,")

    def format_scaled(value, decimals):
        """Return ``value / factor`` with ``decimals`` decimal places, followed by the suffix."""
        number = f"{value / factor:,.{decimals}f}".translate(swap_separators)
        return f"{number} {suffix}"

    # Group the data and sum the values
    filterd = data.groupby(group_col)[value_col].sum().reset_index()

    # Mean of the group sums, used for the highlight colour and the reference line
    avg_value = filterd[value_col].mean()

    # Figure size
    plt.figure(figsize=figsize)
    ax = plt.gca()

    # Horizontal lines connecting each marker to the Y axis
    colors = ["#E69F00" if val > avg_value else "#56B4E9" for val in filterd[value_col]]
    for group, val, color in zip(filterd[group_col], filterd[value_col], colors):
        plt.plot(
            [0, val],
            [group, group],
            color=color,
            linewidth=2,
            zorder=1
        )

    # Markers, using a colour-blind-friendly palette
    plt.scatter(
        filterd[value_col],
        filterd[group_col],
        color=colors,
        s=100,  # Marker size
        zorder=2
    )

    # Vertical reference line at the mean
    plt.axvline(
        avg_value,
        color="grey",
        linestyle="--",
        linewidth=1.5,
        label=f'Média: {format_scaled(avg_value, 2)}'
    )

    # Value labels placed directly to the right of each marker
    for group, val in zip(filterd[group_col], filterd[value_col]):
        plt.text(
            val + 0.05 * avg_value,  # Offset to the right of the marker
            group,
            format_scaled(val, 2),
            va="center",
            fontsize=10,
            zorder=3
        )

    # Light grey vertical grid lines on the X axis
    ax.xaxis.grid(color="lightgrey", linestyle="--", linewidth=0.5)

    # Custom formatter for the X axis ticks
    def format_ticks(x, pos):
        return format_scaled(x, 1)

    # Apply the formatter to the X axis
    ax.xaxis.set_major_formatter(FuncFormatter(format_ticks))

    # Title and axis labels
    plt.title(title, fontsize=16, loc="left", pad=20)
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)

    # Tighten the Y limits around the categories
    ax.set_ylim(-0.2, len(filterd[group_col]) - 0.8)

    # Hide the top, right and left spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

    # Legend and layout
    plt.legend(loc=legend_position, fontsize=10)
    plt.tight_layout()

    # Show the chart
    plt.show()


In [55]:
def upsert_table_from_json(
    conn,
    table_name,
    source,
    key_value,
    content_key,
    alias,
    drop=False,
    doc_id_key="id",
    key_pattern1="",
    key_pattern2="",
    nested=True
) -> bool:
    """Load the matching entries of a JSON file into a DuckDB table.

    The top-level keys of ``source`` that contain both ``key_pattern1`` and
    ``key_pattern2`` are selected. Their values are written as a JSON array to
    ``<DEFAULT_DATA_DIR>/temp/<source file name>`` and that file is inserted
    into ``table_name`` through ``read_json_auto``. If the insert raises
    ``duckdb.CatalogException``, the table is created from the same SELECT.

    All arguments are interpolated directly into the SQL text, so only trusted
    values must be passed.

    :param conn: DuckDB connection used to execute the statements
    :param table_name: Target table
    :param source: Path of the JSON file to read
    :param key_value: Literal stored in the ``prompt`` column
    :param content_key: Expression selecting the content to load (e.g. ``content.elements``)
    :param alias: Column name given to the loaded content
    :param drop: When True, drop ``table_name`` before loading. The drop happens
        only after matching data was found.
    :param doc_id_key: Field selected as the document id column
    :param key_pattern1: Substring a key must contain to be selected
    :param key_pattern2: Second substring a key must contain to be selected
    :param nested: When True the content is expanded with ``unnest``; when False it is stored as-is
    :return: False when no key matches or when creating the table fails with
        ``duckdb.CatalogException``; True otherwise
    :raises: Any error other than ``duckdb.CatalogException`` raised while
        reading the file or executing the statements is propagated.
    """
    logger.info(f"{table_name=}")
    logger.info(f"{source=}")
    logger.info(f"{key_value=}")
    logger.info(f"{content_key=}")
    logger.info(f"{alias=}")
    logger.info(f"{drop=}")

    _data = []
    _source_file = Path(source).name

    logger.info(f"{_source_file=}")

    directory_path = Path(f"{config['DEFAULT_DATA_DIR']}/temp")

    logger.info(f"{directory_path=}")

    # The staging file shares the source file name but lives in the temp directory
    _temp_file = directory_path / _source_file

    logger.info(f"{_temp_file=}")

    # JSON is UTF-8 and the keys contain "§"; do not rely on the platform default encoding
    with open(source, "r", encoding="utf-8") as f:
        loaded_data = json.load(f)

    keys = loaded_data.keys()

    logger.info(f"{len(keys)=}")

    # Keep the entries whose key contains both patterns
    for key in keys:
        if key_pattern1 in key and key_pattern2 in key:
            logger.info(key)
            _data.append(loaded_data[key])

    if not _data:
        logger.warning("No data found. Check the key pattern.")
        return False

    # Create the staging directory on first use; open() would otherwise fail
    directory_path.mkdir(parents=True, exist_ok=True)

    with open(_temp_file, "w", encoding="utf-8") as f:
        json.dump(_data, f, indent=4)

    if drop:
        _query_drop_table = f"""
        DROP TABLE IF EXISTS {table_name};
        """

        logger.info(f"{_query_drop_table=}")
        logger.info(f"Trying to drop table because drop parameter is {drop}")

        try:
            conn.execute(_query_drop_table)
            logger.info(f"Table {table_name} dropped")
        except duckdb.CatalogException as e:
            logger.info(e)

    unnest_clause = f"unnest({content_key}) as {alias}" if nested else f"{content_key} as {alias}"

    _query_insert_data = f"""
    INSERT INTO {table_name} ({doc_id_key}, prompt, file_source, {alias}, created_at)
    SELECT 
        {doc_id_key}, 
        '{key_value}' as prompt, 
        '{_source_file}' as file_source, 
        {unnest_clause},
        now() as created_at
    FROM 
        read_json_auto("{_temp_file}");
    """

    logger.info(f"{_query_insert_data=}")
    logger.info(f"Trying to insert into {table_name}")

    try:
        conn.execute(_query_insert_data)
        logger.info(f"Data inserted into {table_name}.")
    except duckdb.CatalogException as e:
        # Expected on the first load: the table does not exist yet, so create it from the data
        logger.info(e)
        logger.info(f"Failed to insert, trying create {table_name}")
        _query_create_table = f"""
        CREATE TABLE {table_name} AS
        SELECT {doc_id_key}, 
        '{key_value}' as prompt, 
        '{_source_file}' as file_source, 
        {unnest_clause},
        now() as 'created_at'
        FROM read_json_auto("{_temp_file}");
        """
        try:
            logger.info(f"{_query_create_table=}")
            conn.execute(_query_create_table)
            logger.info(f"Table {table_name} created and loaded.")
        except duckdb.CatalogException as e:
            logger.error(e)
            logger.error(f"Failed to create {table_name}")
            return False

    return True

## Connect database

Connect to the local database file or to the MotherDuck cloud database.

In [66]:
# Connection target used by the connection cell.
LOCAL_DB=False  # True: open the local DuckDB file; False: connect through the "md:" (MotherDuck) URL
DATABASE = "cfr2sbvr_v5"  # Name of the local database file (without the .db extension)
CLOUD_DATABASE = "cfr2sbvr_v4"  # Name of the cloud database

In [69]:
# Open the DuckDB connection: the local database file, or the cloud database when LOCAL_DB is False.
if LOCAL_DB:
    conn = duckdb.connect(f'{config["DEFAULT_APP_DIR"]}/data/{DATABASE}.db')
else:
    mother_duck_token=os.getenv("MOTHER_DUCK_TOKEN")
    if not mother_duck_token:
        # Fail early with a clear message instead of sending the text "None" as the token
        raise RuntimeError("MOTHER_DUCK_TOKEN is not set; it is required to connect to the cloud database.")
    conn = duckdb.connect(f'md:{CLOUD_DATABASE}?motherduck_token={mother_duck_token}')

# Log the database that was actually opened; the cloud name differs from the local one.
logger.info(f"Connected to {'🖥️' if LOCAL_DB else '☁️'} {DATABASE if LOCAL_DB else CLOUD_DATABASE} database")

2025-01-16 11:46:06 - INFO - Connected to ☁️ cfr2sbvr_v5 database


### Clean-up

In [70]:
# Destructive switch: when True, the clean-up loop drops every table and view it lists,
# and the metadata cell drops and recreates CHECKPOINT_METADATA.
CLEAN_UP = True

In [71]:
# List the tables visible on the current connection.
query = """
SHOW TABLES;
"""

tables = conn.sql(query).fetchall()

tables

[('CHECKPOINT_METADATA',),
 ('RAW_CLASSIFY_P1_OPERATIVE_RULES',),
 ('RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE',),
 ('RAW_CLASSIFY_P1_OPERATIVE_RULES_VW',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_FACTS',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_VW',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_NAMES',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_VW',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_TERMS',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE',),
 ('RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_VW',),
 ('RAW_CLASSIFY_P2_OPERATIVE_RULES',),
 ('RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE',),
 ('RAW_CLASSIFY_P2_OPERATIVE_RULES_VW',),
 ('RAW_CLASSIFY_VW',),
 ('RAW_ELAPSED_TIME',),
 ('RAW_ELAPSED_TIME_VW',),
 ('RAW_LLM_COMPLETION',),
 ('RAW_LLM_COMPLETION_VW',),
 ('RAW_LLM_VALIDATION',),
 ('RAW_LLM_VALIDATION_BEST_VW',),
 ('RAW_LLM_VALIDATION_VW',),
 ('RAW_SECTION',),
 ('RAW_SECTION_EXTRACTED_ELEMENTS_VW',),
 ('RAW_SECTION_P1_EXTRACTED_ELEMENTS',),
 (

In [72]:
# Collect the base tables and views of the "main" schema in the connected database
# (local or cloud name, depending on LOCAL_DB). Each row is
# (table_catalog, table_schema, table_name, table_type).
query = f"""
SELECT table_catalog, table_schema, table_name, table_type
FROM information_schema.tables
WHERE table_type in ('BASE TABLE', 'VIEW')
AND table_catalog = '{DATABASE if LOCAL_DB else CLOUD_DATABASE}'
AND table_schema = 'main';
"""

tables_or_views = conn.sql(query).fetchall()

tables_or_views

[('cfr2sbvr_v4', 'main', 'RAW_TRANSFORM_TERMS', 'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_TRANSFORM_FACT_TYPES', 'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_SECTION_P2_EXTRACTED_NOUN_TRUE', 'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_SECTION_P2_EXTRACTED_NOUN', 'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_SECTION_P1_EXTRACTED_ELEMENTS', 'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_SECTION', 'BASE TABLE'),
 ('cfr2sbvr_v4',
  'main',
  'RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE',
  'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_LLM_COMPLETION', 'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_ELAPSED_TIME', 'BASE TABLE'),
 ('cfr2sbvr_v4',
  'main',
  'RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE',
  'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_CLASSIFY_P2_OPERATIVE_RULES', 'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_CLASSIFY_P2_DEFINITIONAL_TERMS', 'BASE TABLE'),
 ('cfr2sbvr_v4',
  'main',
  'RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE',
  'BASE TABLE'),
 ('cfr2sbvr_v4', 'main', 'RAW_LLM_VALIDATION'

In [73]:
# Number of tables and views found by the previous query.
len(tables_or_views)

41

In [61]:
# Build one DROP statement per listed object; it is only logged and executed
# when the CLEAN_UP switch is on.
for table_or_view_name in tables_or_views:
    _catalog, object_schema, object_name, object_type = table_or_view_name
    drop_kind = 'TABLE' if object_type == 'BASE TABLE' else 'VIEW'

    query = f"""
    DROP {drop_kind} IF EXISTS {object_schema}.{object_name};
    """

    if not CLEAN_UP:
        continue

    logger.info(query)
    conn.sql(query)

2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.CHECKPOINT_METADATA;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P1_OPERATIVE_RULES;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P2_DEFINITIONAL_FACTS;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P2_DEFINITIONAL_NAMES;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P2_DEFINITIONAL_TERMS;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE;
    
2025-01-16 11:39:53 - INFO - 
    DROP TABLE IF EXISTS main.RAW_CLASSIFY_P2_OPERATIVE_RULES;

## Metadata

In [14]:
# Rebuild CHECKPOINT_METADATA from the metadata CSV, stamping each row with the load time.
# The statements are only executed when CLEAN_UP is True.
query = f"""
DROP TABLE IF EXISTS CHECKPOINT_METADATA;

CREATE TABLE CHECKPOINT_METADATA AS
SELECT process,
doc_source,
doc_id,
doc_type,
table_name, 
now() as 'created_at'
FROM read_csv_auto("{config['DEFAULT_APP_DIR']}/data/metadata/checkpoints_metadata.csv");
"""

if CLEAN_UP:
    conn.sql(query)

In [None]:
# Show every row of CHECKPOINT_METADATA.
query = """
SELECT *,
FROM CHECKPOINT_METADATA;
"""

conn.sql(query).fetchdf()

In [None]:
# Distinct table names registered with doc_source 'both', sorted by name.
query = """
SELECT DISTINCT TABLE_NAME,
FROM CHECKPOINT_METADATA
WHERE DOC_SOURCE = 'both'
ORDER BY 1;
"""

conn.sql(query).fetchdf()

## Dataset

CFR sections

In [None]:
# Recreate RAW_SECTION from the true-table file, keeping the entries whose key
# contains both "|section" and "§". The content is stored as-is (nested=False).
upsert_table_from_json(
    conn,
    table_name="RAW_SECTION",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="section",
    content_key="content",
    alias="content",
    drop=True,
    doc_id_key="id",
    key_pattern1="|section",
    key_pattern2="§",
    nested=False,
)

In [None]:
# Show every row of RAW_SECTION.
query = """
SELECT *,
FROM RAW_SECTION;
"""

conn.sql(query).fetchdf()

## Extract

Metadata

In [None]:
# Metadata rows of the extraction process, ordered by doc_id and then doc_type descending.
query = """
SELECT *,
FROM CHECKPOINT_METADATA
WHERE PROCESS='extraction'
ORDER BY DOC_ID, DOC_TYPE DESC;
"""

conn.sql(query).fetchdf()

### Extract elements from sections (P1)

True tables

In [None]:
# Recreate the P1 true table from the entries whose key contains both
# "_P1|true_table" and "§"; content.elements is unnested (default nested=True).
upsert_table_from_json(
    conn,
    table_name="RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="extract_p1",
    content_key="content.elements",
    alias="elements",
    drop=True,
    doc_id_key="id",
    key_pattern1="_P1|true_table",
    key_pattern2="§",
)

In [None]:
# Show every row of the P1 true table.
query = """
SELECT * FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE;
"""

conn.sql(query).fetchdf()

Predict tables

In [None]:
# Use checkpoints_extraction to evaluate the original extraction
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_extraction")
pattern = "documents-*.json"

# Find all files matching the pattern. They are sorted because Path.glob yields
# them in arbitrary order and the first file loaded recreates the table.
files = sorted(directory.glob(pattern))

# Keep requesting the drop until one file has actually been loaded: the helper
# returns before dropping when a file has no matching keys, so clearing the flag
# unconditionally would leave rows from a previous run in the table.
drop = True
for file_path in files:
    print(file_path)  # Output each file path
    loaded = upsert_table_from_json(
        conn,
        key_pattern1="_P1|llm_response",
        key_pattern2="§",
        table_name="RAW_SECTION_P1_EXTRACTED_ELEMENTS",
        source=file_path,
        key_value="extract_p1",
        drop=drop,
        content_key="content.elements",
        alias="elements",
        doc_id_key="id",
    )
    if loaded:
        drop = False  # Table was recreated; append from now on

In [None]:
# Show every row of the P1 predicted table.
query = """
SELECT *
FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS
;
"""

conn.sql(query).fetchdf()

##### Elements

In [None]:
# Number of distinct `elements` values in the P1 predicted table.
query = """
SELECT count(distinct elements) as elements_count
FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS
;
"""

conn.sql(query).fetchdf()

In [None]:
# Row count per checkpoint file and element classification.
query = """
SELECT file_source, elements.classification, count(*) as count
FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS
GROUP BY file_source, elements.classification
ORDER BY file_source, elements.classification
;
"""

conn.sql(query).fetchdf()

In [None]:
# Row count per document id and element classification.
query = """
SELECT id, elements.classification, count(*) as count
FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS
GROUP BY id, elements.classification
ORDER BY id, elements.classification
;
"""

conn.sql(query).fetchdf()

In [None]:
# Row count per document id in the P1 true table.
query = """
SELECT id, count(*) FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
GROUP BY id
ORDER BY id;
"""

conn.sql(query).fetchdf()

In [None]:
# Row count per document id in the P1 predicted table.
query = """
SELECT id, count(*) FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS
GROUP BY id
ORDER BY id;
"""

conn.sql(query).fetchdf()

##### Verb symbols

Total verb symbols extracted

In [None]:
# Total number of verb symbols in the P1 true table: the verb_symbols entry of
# each element is cast to a list of strings and expanded to one row per verb.
query = """
SELECT
    COUNT(verb) AS total_verbs
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
) AS flattened_verbs;
"""

conn.sql(query).fetchdf()

distinct verb symbols

In [None]:
# Number of distinct verb symbols in the P1 true table.
query = """
SELECT
    COUNT(DISTINCT verb) AS distinct_verbs_count
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
) AS flattened_verbs;
"""

conn.sql(query).fetchdf()

In [None]:
# Distinct (verb, doc_id, source) combinations from the P1 true table.
# The rows are kept as a list of tuples in section_p1_extracted_elements_true_values;
# the query is then executed a second time to display the result as a DataFrame.
query = """
SELECT DISTINCT
    verb,
    doc_id,
    source
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb,
        id AS doc_id,
        CAST(json_extract(elements, '$.sources') AS VARCHAR) AS source
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE
) AS distinct_combinations;
"""

section_p1_extracted_elements_true_values = conn.sql(query).fetchall()

conn.sql(query).fetchdf()


Total verb symbols extracted per document id (predicted table)

In [None]:
# Number of verb symbols per document id in the P1 predicted table.
query = """
SELECT
    id, COUNT(verb) AS total_verbs
FROM (
    SELECT
        id, UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) 
GROUP BY id;
"""

conn.sql(query).fetchdf()

##### Terms

In [None]:
# One row per term of every element in the P1 predicted table, ordered by id.
# NOTE(review): the inner UNNEST(elements) presumably expands the element struct so
# that its `terms` list becomes a column — confirm against DESCRIBE of the table.
query = """
SELECT id, term
FROM (
    SELECT id, UNNEST(terms) AS term
    FROM (
        SELECT
            id, UNNEST(elements) AS element
        FROM
            RAW_SECTION_P1_EXTRACTED_ELEMENTS
    )
)
ORDER BY id;
"""

conn.sql(query).fetchdf()

In [None]:
# Term counts per document id and term classification in the P1 predicted table.
# NOTE(review): the hasDefinition column counts rows whose terms.term is not NULL;
# no definition field is read here — confirm the column name matches the intent.
query = """
SELECT id, terms.classification AS term_classification, count(if(terms.term is NULL, NULL, 1)) as hasDefinition, COUNT(*) AS term_count
FROM (
    SELECT id, UNNEST(terms) AS terms
    FROM (
        SELECT
            id, UNNEST(elements) AS element
        FROM
            RAW_SECTION_P1_EXTRACTED_ELEMENTS
    )
)
GROUP BY id, terms.classification, if(terms.term is NULL, NULL, 1)
ORDER BY id;
"""

conn.sql(query).fetchdf()

In [None]:
# One row per term with its classification, from the P1 predicted table.
# NOTE(review): hasDefinition is 1 when terms.term is not NULL (NULL otherwise);
# it does not look at a definition field — confirm the column name matches the intent.
query = """
SELECT id,
    terms.classification AS term_classification,
    terms.term AS term, 
    if(terms.term is NULL, NULL, 1) as hasDefinition
FROM (
    SELECT id, UNNEST(terms) AS terms
    FROM (
        SELECT
            id, UNNEST(elements) AS element
        FROM
            RAW_SECTION_P1_EXTRACTED_ELEMENTS
    )
)
ORDER BY id;
"""

conn.sql(query).fetchdf()

Distinct verb symbols (predicted table)

In [None]:
# Number of distinct verb symbols in the P1 predicted table.
query = """
SELECT
    COUNT(DISTINCT verb) AS distinct_verbs_count
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) AS flattened_verbs;
"""

conn.sql(query).fetchdf()

Distinct verb symbols with doc_id and source

In [None]:
# Distinct (verb, doc_id, source) combinations from the P1 predicted table.
query = """
SELECT DISTINCT
    verb,
    doc_id,
    source
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb,
        id AS doc_id,
        CAST(json_extract(elements, '$.sources') AS VARCHAR) AS source
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) AS distinct_combinations;
"""

conn.sql(query).fetchdf()


Verb symbols with doc_id and source for evaluation

In [None]:
# All (verb, doc_id, source) rows from the P1 predicted table, duplicates included
# (no DISTINCT, despite the subquery alias). The rows are kept as a list of tuples in
# section_p1_extracted_elements_pred_values; the query is then run again for display.
query = """
SELECT
    verb,
    doc_id,
    source
FROM (
    SELECT
        UNNEST(CAST(json_extract(elements, '$.verb_symbols') AS VARCHAR[])) AS verb,
        id AS doc_id,
        CAST(json_extract(elements, '$.sources') AS VARCHAR) AS source
    FROM
        RAW_SECTION_P1_EXTRACTED_ELEMENTS
) AS distinct_combinations;
"""

section_p1_extracted_elements_pred_values = conn.sql(query).fetchall()

conn.sql(query).fetchdf()


### Extract Terms and Names definitions (P2)

True table

In [None]:
# Recreate the P2 true table from the entries whose key contains both
# "_P2|true_table" and "§"; content.terms is unnested (default nested=True).
upsert_table_from_json(
    conn,
    table_name="RAW_SECTION_P2_EXTRACTED_NOUN_TRUE",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="extract_p2",
    content_key="content.terms",
    alias="terms",
    drop=True,
    doc_id_key="id",
    key_pattern1="_P2|true_table",
    key_pattern2="§",
)

In [None]:
# Show every row of the P2 true table.
query = """
SELECT * FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE;
"""

conn.sql(query).fetchdf()

Total terms per id

In [None]:
# Total and distinct number of `terms` values per document id in the P2 true table.
query = """
SELECT
    id, COUNT(terms) AS total_terms, COUNT(DISTINCT terms) AS total_terms_distinct
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id;
"""

conn.sql(query).fetchdf()

terms

In [None]:
# One row per term of the P2 true table with its definition, a 0/1 flag telling
# whether the definition is present, and the isLocalScope field.
query = """
SELECT
    id, 
    terms.term as term,
    terms.definition as definition,
    if(terms.definition is NULL, 0, 1) as hasDefinition,
    terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
-- GROUP BY id;
"""

conn.sql(query).fetchdf()

How many terms have a definition

In [None]:
# Per document id in the P2 true table: number of terms, number of terms with a
# non-NULL definition, and the ratio between the two.
query = """
SELECT
    id, count(terms) as term, count(if(terms.definition is NULL, NULL, 1)) as hasDefinition, count(if(terms.definition is NULL, NULL, 1)) / count(terms) as ratio
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
GROUP BY id
ORDER BY id
;
"""

conn.sql(query).fetchdf()

isLocalScope per document

In [None]:
# Distribution of isLocalScope per document id in the P2 true table: the count of
# terms for each isLocalScope value and its percentage of the document's total terms.
query = """
WITH TotalCounts AS (
    SELECT
        id,
        COUNT(terms) AS total_terms
    FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
    GROUP BY id
)
SELECT
    e.id,
    COUNT(e.terms) AS term,
    e.terms.isLocalScope AS isLocalScope,
    (COUNT(e.terms) * 100.0 / tc.total_terms) AS percentage
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE e
JOIN TotalCounts tc ON e.id = tc.id
GROUP BY e.id, e.terms.isLocalScope, tc.total_terms
ORDER BY e.id, e.terms.isLocalScope;
"""

conn.sql(query).fetchdf()

In [None]:
# Rows of the P2 true table whose isLocalScope is missing (NULL).
query = """
SELECT
    id, terms, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
WHERE terms.isLocalScope is NULL
;
"""

conn.sql(query).fetchdf()

Predict table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_extraction")
pattern = "documents-*.json"

# Find all files matching the pattern. They are sorted because Path.glob yields
# them in arbitrary order and the first file loaded recreates the table.
files = sorted(directory.glob(pattern))

# Keep requesting the drop until one file has actually been loaded: the helper
# returns before dropping when a file has no matching keys, so clearing the flag
# unconditionally would leave rows from a previous run in the table.
drop = True
for file_path in files:
    logger.info(file_path)  # Output each file path
    loaded = upsert_table_from_json(
        conn,
        key_pattern1="_P2|llm_response",
        key_pattern2="§",
        table_name="RAW_SECTION_P2_EXTRACTED_NOUN",
        source=file_path,
        key_value="extract_p2",
        drop=drop,
        content_key="content.terms",
        alias="terms",
        doc_id_key="id",
    )
    if loaded:
        drop = False  # Table was recreated; append from now on

In [None]:
# Show the distinct rows of the P2 predicted table.
query = """
SELECT DISTINCT * FROM RAW_SECTION_P2_EXTRACTED_NOUN;
"""

conn.sql(query).fetchdf()

In [48]:
# Load id, file_source and terms of the P2 predicted table into the `nouns` DataFrame.
query = """
SELECT id, file_source, terms FROM RAW_SECTION_P2_EXTRACTED_NOUN;
"""

nouns = conn.sql(query).fetchdf()

In [None]:
# Rows of document '§ 275.0-2_P2' whose term text contains 'Appointed agents'.
# To narrow the result to one checkpoint, also require
# nouns['file_source'] == 'documents-2025-01-11-9.json'.
is_target_document = nouns['id'] == '§ 275.0-2_P2'
mentions_appointed_agents = nouns['terms'].apply(
    lambda term_struct: 'Appointed agents' in term_struct['term']
)
nouns[is_target_document & mentions_appointed_agents]

In [None]:
# Row count per document id in the P2 predicted table.
query = """
SELECT ID, COUNT(*)
FROM RAW_SECTION_P2_EXTRACTED_NOUN
GROUP BY id;
"""

conn.sql(query).fetchdf()

In [None]:
# Show the column names and types of the P2 predicted table.
query = """
DESCRIBE RAW_SECTION_P2_EXTRACTED_NOUN;
"""

conn.sql(query).fetchdf()

In [None]:
# Term and term classification per document id. Note that this reads the P1
# predicted table (RAW_SECTION_P1_EXTRACTED_ELEMENTS), not the P2 table.
query = """
SELECT id, terms.classification AS term_classification, terms.term AS term
FROM (
    SELECT id, UNNEST(terms) AS terms
    FROM (
        SELECT
            id, UNNEST(elements) AS element
        FROM
            RAW_SECTION_P1_EXTRACTED_ELEMENTS
    )
)
ORDER BY id;
"""

conn.sql(query).fetchdf()

Has definition ratio

In [None]:
# Per document id in the P2 predicted table: number of terms, number of terms with
# a non-NULL definition, and the ratio between the two.
query = """
SELECT
    id, count(terms) as term, count(if(terms.definition is NULL, NULL, 1)) as hasDefinition, count(if(terms.definition is NULL, NULL, 1)) / count(terms) as ratio
FROM RAW_SECTION_P2_EXTRACTED_NOUN
GROUP BY id
ORDER BY id
;
"""

conn.sql(query).fetchdf()

Terms and names classified by definition

In [None]:
# Term and a hasDefinition flag (1 when the definition is not NULL, NULL otherwise).
# NOTE(review): this reads the TRUE table although the cell sits in the predicted-table
# part of the notebook — confirm which table is intended.
query = """
SELECT
    id, terms.term as term, if(terms.definition is NULL, NULL, 1) as hasDefinition
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
;
"""

conn.sql(query).fetchdf()

Results from P1

In [None]:
# P1 results: occurrences of each (id, term, classification) in the P1 predicted table.
query = """
    SELECT 
        id,
        terms.term AS term,
        terms.classification AS term_classification,
        count(terms.term) AS term_count
    FROM (
        SELECT 
            id, UNNEST(terms) AS terms
        FROM (
            SELECT 
                id, UNNEST(elements) AS element
            FROM 
                RAW_SECTION_P1_EXTRACTED_ELEMENTS
        )
    )
    GROUP BY id, terms.term, terms.classification
    ORDER BY id, terms.term;
"""

conn.sql(query).fetchdf()

Results from P2

In [None]:
# P2 results: for each (id, term) in the P2 predicted table, how many rows carry a
# non-NULL definition.
query = """
    SELECT
        id, 
        terms.term AS term, 
        count(IF(terms.definition IS NULL, NULL, 1)) AS hasDefinition
    FROM RAW_SECTION_P2_EXTRACTED_NOUN
    GROUP BY id, terms.term
    ORDER BY id, terms.term;
"""

conn.sql(query).fetchdf()

Join results from P1 and P2

In [None]:
# Join the P1 term classifications with the P2 definition counts. The "_P1"/"_P2"
# suffix is removed from the ids so both sides share a doc_id; the LEFT JOIN keeps
# P1 terms that have no P2 match (hasDefinition is NULL for those).
query = """
WITH terms_classification AS (SELECT 
        REPLACE(id, '_P1', '') AS doc_id,
        terms.term AS term,
        terms.classification AS term_classification,
        count(terms.term) AS term_count
    FROM (
        SELECT 
            id, UNNEST(terms) AS terms
        FROM (
            SELECT 
                id, UNNEST(elements) AS element
            FROM 
                RAW_SECTION_P1_EXTRACTED_ELEMENTS
        )
    )
    GROUP BY id, terms.term, terms.classification
    ORDER BY id, terms.term),
terms_definition AS (
    SELECT
        REPLACE(id, '_P2', '') AS doc_id,
        terms.term AS term, 
        count(IF(terms.definition IS NULL, NULL, 1)) AS hasDefinition
    FROM RAW_SECTION_P2_EXTRACTED_NOUN
    GROUP BY id, terms.term
    ORDER BY id, terms.term
)
SELECT
    terms_classification.doc_id,
    terms_classification.term,
    terms_classification.term_classification,
    terms_classification.term_count,
    terms_definition.hasDefinition
FROM terms_classification  
LEFT JOIN terms_definition
ON terms_classification.doc_id = terms_definition.doc_id AND terms_classification.term = terms_definition.term;
"""

conn.sql(query).fetchdf()

Number of terms extracted in P1 (Proper and Common) and P2 (with and without definition)

> Terms summary for dissertation's tables: "Tabela 2  - Conjunto de dados ouro para validação (P1)" and "Tabela 3 – Termos e nomes com definição."

In [None]:
# Summary of the P1/P2 join: number of terms per doc_id, term classification and
# hasDefinition ('Yes' when at least one P2 row has a definition, 'No' otherwise,
# NULL when the P1 term has no P2 match because of the LEFT JOIN).
query = """
WITH terms_classification AS (SELECT 
        REPLACE(id, '_P1', '') AS doc_id,
        terms.term AS term,
        terms.classification AS term_classification,
        count(terms.term) AS term_count
    FROM (
        SELECT 
            id, UNNEST(terms) AS terms
        FROM (
            SELECT 
                id, UNNEST(elements) AS element
            FROM 
                RAW_SECTION_P1_EXTRACTED_ELEMENTS
        )
    )
    GROUP BY id, terms.term, terms.classification
    ORDER BY id, terms.term),
terms_definition AS (
    SELECT
        REPLACE(id, '_P2', '') AS doc_id,
        terms.term AS term, 
        if(count(IF(terms.definition IS NULL, NULL, 1)) > 0, 'Yes', 'No') AS hasDefinition
    FROM RAW_SECTION_P2_EXTRACTED_NOUN
    GROUP BY id, terms.term
    ORDER BY id, terms.term
)
SELECT
    terms_classification.doc_id,
    -- terms_classification.term,
    terms_classification.term_classification,
    terms_definition.hasDefinition,
    count(*) AS term_count
FROM terms_classification  
LEFT JOIN terms_definition
ON terms_classification.doc_id = terms_definition.doc_id AND terms_classification.term = terms_definition.term
GROUP BY terms_classification.doc_id, terms_classification.term_classification, terms_definition.hasDefinition
ORDER BY terms_classification.doc_id, terms_classification.term_classification, terms_definition.hasDefinition;
"""

conn.sql(query).fetchdf()

DIFFERENCE(TRUE, PRED): rows of the true table that are missing from the predicted table (computed with `EXCEPT`)

In [None]:
# Set difference (EXCEPT): (id, prompt, term, definition, isLocalScope) rows that
# appear in the P2 true table but not in the P2 predicted table.
query = """
SELECT id, prompt, term, definition, isLocalScope
FROM (
SELECT DISTINCT id, prompt, file_source, created_at, UNNEST(terms) as terms, terms.term, terms.definition, terms.isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
)
EXCEPT
SELECT DISTINCT id, prompt, term, definition, isLocalScope
FROM (
SELECT DISTINCT id, prompt, file_source, created_at, UNNEST(terms) as terms, terms.term, terms.definition, terms.isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN
);
"""

conn.sql(query).fetchdf()

DIFFERENCE(PRED, TRUE): rows of the predicted table that are missing from the true table (computed with `EXCEPT`)

In [None]:
# Set difference (EXCEPT): (id, prompt, term, definition, isLocalScope) rows that
# appear in the P2 predicted table but not in the P2 true table.
query = """
SELECT id, prompt, term, definition, isLocalScope
FROM (
SELECT DISTINCT id, prompt, file_source, created_at, UNNEST(terms) as terms, terms.term, terms.definition, terms.isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN
)
EXCEPT
SELECT DISTINCT id, prompt, term, definition, isLocalScope
FROM (
SELECT DISTINCT id, prompt, file_source, created_at, UNNEST(terms) as terms, terms.term, terms.definition, terms.isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN_TRUE
);
"""

conn.sql(query).fetchdf()

Total terms and distinct

In [None]:
# Total and distinct number of `terms` values per document id in the P2 predicted table.
query = """
SELECT
    id, COUNT(terms) AS total_terms, COUNT(DISTINCT terms) AS total_terms_distinct
FROM RAW_SECTION_P2_EXTRACTED_NOUN
GROUP BY id;
"""

conn.sql(query).fetchdf()

distinct terms

In [None]:
# One row per term of the P2 predicted table with its definition, a 0/1 flag telling
# whether the definition is present, and isLocalScope. No DISTINCT is applied, so
# repeated terms appear once per row.
query = """
SELECT
    id, terms.term as term, terms.definition as definition, if(terms.definition is NULL, 0, 1) as hasDefinition, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN
-- GROUP BY id;
"""

conn.sql(query).fetchdf()

How many terms have a definition

In [None]:
# Per document id in the P2 predicted table: number of terms, number of terms with
# a non-NULL definition, and the ratio between the two.
query = """
SELECT
    id, count(terms) as term, count(if(terms.definition is NULL, NULL, 1)) as hasDefinition, count(if(terms.definition is NULL, NULL, 1)) / count(terms) as ratio
FROM RAW_SECTION_P2_EXTRACTED_NOUN
GROUP BY id
ORDER BY id
;
"""

conn.sql(query).fetchdf()

isLocalScope per id

In [None]:
# Distribution of isLocalScope per document id in the P2 predicted table: the count
# of terms for each isLocalScope value and its percentage of the document's total terms.
query = """
WITH TotalCounts AS (
    SELECT
        id,
        COUNT(terms) AS total_terms
    FROM RAW_SECTION_P2_EXTRACTED_NOUN
    GROUP BY id
)
SELECT
    e.id,
    COUNT(e.terms) AS term,
    e.terms.isLocalScope AS isLocalScope,
    (COUNT(e.terms) * 100.0 / tc.total_terms) AS percentage
FROM RAW_SECTION_P2_EXTRACTED_NOUN e
JOIN TotalCounts tc ON e.id = tc.id
GROUP BY e.id, e.terms.isLocalScope, tc.total_terms
ORDER BY e.id, e.terms.isLocalScope;
"""

conn.sql(query).fetchdf()

In [None]:
# Rows of the P2 predicted table whose isLocalScope is missing (NULL).
query = """
SELECT
    id, terms, terms.isLocalScope as isLocalScope
FROM RAW_SECTION_P2_EXTRACTED_NOUN
WHERE terms.isLocalScope is NULL
;
"""

conn.sql(query).fetchdf()

Reset extraction tables to evaluation checkpoints.

In [None]:
# Use checkpoints_extraction to evaluate the original extraction
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_extraction")
pattern = "documents-*.json"

# Find all files matching the pattern. They are sorted because Path.glob yields
# them in arbitrary order and the first file loaded recreates the table.
files = sorted(directory.glob(pattern))

# Keep requesting the drop until one file has actually been loaded: the helper
# returns before dropping when a file has no matching keys, so clearing the flag
# unconditionally would leave rows from a previous run in the table.
drop = True
for file_path in files:
    print(file_path)  # Output each file path
    loaded = upsert_table_from_json(
        conn,
        key_pattern1="_P1|llm_response",
        key_pattern2="§",
        table_name="RAW_SECTION_P1_EXTRACTED_ELEMENTS",
        source=file_path,
        key_value="extract_p1",
        drop=drop,
        content_key="content.elements",
        alias="elements",
        doc_id_key="id",
    )
    if loaded:
        drop = False  # Table was recreated; append from now on

In [None]:
# Checkpoint files currently loaded in the P1 predicted table, sorted by name.
query = """
SELECT distinct file_source
FROM RAW_SECTION_P1_EXTRACTED_ELEMENTS
ORDER BY 1;
"""

conn.sql(query).fetchdf()

In [None]:
# Show the column names and types of the P1 predicted table.
query = """
DESCRIBE RAW_SECTION_P1_EXTRACTED_ELEMENTS;
"""

conn.sql(query).fetchdf()

## Classify

Metadata

In [None]:
# Metadata rows of the classification process, ordered by doc_id and then doc_type descending.
query = """
SELECT *,
FROM CHECKPOINT_METADATA
WHERE PROCESS='classification'
ORDER BY DOC_ID, DOC_TYPE DESC;
"""

conn.sql(query).fetchdf()

### Classify P1 - Operative Rules into top-level Witt(2012) taxonomy (P1)

True table

In [None]:
# Recreate the classification P1 true table from the entries whose key contains
# "classify_P1|true_table" (no second key pattern); content is unnested
# (default nested=True).
upsert_table_from_json(
    conn,
    table_name="RAW_CLASSIFY_P1_OPERATIVE_RULES_TRUE",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="classify_p1",
    content_key="content",
    alias="content",
    drop=True,
    doc_id_key="id",
    key_pattern1="classify_P1|true_table",
)

Predict table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_classification")
pattern = "documents-*.json"

# Find all files matching the pattern. They are sorted because Path.glob yields
# them in arbitrary order and the first file loaded recreates the table.
files = sorted(directory.glob(pattern))

# Keep requesting the drop until one file has actually been loaded: the helper
# returns before dropping when a file has no matching keys, so clearing the flag
# unconditionally would leave rows from a previous run in the table.
drop = True
for file_path in files:
    print(file_path)  # Output each file path
    loaded = upsert_table_from_json(
        conn,
        key_pattern1="classify_P1|llm_response_classification",
        table_name="RAW_CLASSIFY_P1_OPERATIVE_RULES",
        source=file_path,
        key_value="classify_p1",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    if loaded:
        drop = False  # Table was recreated; append from now on

In [None]:
# Project the fields of the `content` struct into flat columns; the
# classification field is returned as stored (not unnested).
query = """
SELECT
    content.doc_id as id,
    content.statement_id,
    content.statement_title,
    content.statement_text,
    content.statement_sources,
    content.classification
FROM RAW_CLASSIFY_P1_OPERATIVE_RULES
;
"""

conn.sql(query).fetchdf()

In [None]:
# Summarise the predicted classifications per statement and checkpoint file.
# The inner query unnests content.classification into one row per entry; the
# outer query takes MAX of type, explanation and confidence within each
# (doc_id, statement_id, statement_title, statement_sources, file_source) group.
#
# file_source is a grouping key, so it is selected once. The previous
# additional MAX(file_source) column carried the same value under the same
# name, producing a duplicate output column and an ambiguous ORDER BY.
query = """
SELECT
    content.doc_id as doc_id,
    content.statement_id as statement_id,
    content.statement_title as statement_title,
    content.statement_sources as statement_sources,
    content.file_source as file_source,
    MAX(classification.type) as classification_type,
    MAX(classification.explanation) as classification_explanation,
    MAX(classification.confidence) as classification_confidence
FROM (
    SELECT
        file_source,
        content.doc_id,
        content.statement_id,
        content.statement_sources,
        content.statement_title,
        unnest(content.classification) as classification
    FROM
        RAW_CLASSIFY_P1_OPERATIVE_RULES
) AS content
GROUP BY doc_id, statement_id, statement_title, statement_sources, file_source
ORDER BY file_source, doc_id, statement_id
"""

# Keep the rows as plain tuples for later use ...
pred_classify_p1_operative_rules = conn.sql(query).fetchall()

# ... and display the same result as a DataFrame.
conn.sql(query).fetchdf()

The best classification of all checkpoints.

In [None]:
# Pick one classification per (doc_id, statement_id) across all checkpoints:
# the CTE unnests content.classification and numbers the entries by
# confidence, highest first; the outer query keeps only the entry with rn = 1.
# NOTE(review): the window ORDER BY has no tie-breaker, so which row wins
# among entries with equal confidence is not fixed by the query.
query = """
WITH ExpandedClassifications AS (
    SELECT
        content.doc_id AS id,
        content.statement_id,
        content.statement_title,
        content.statement_text,
        content.statement_sources,
        classification_item.value.type AS type,
        classification_item.value.confidence AS confidence,
        classification_item.value.explanation AS explanation,
        content.file_source as file_source,
        ROW_NUMBER() OVER (PARTITION BY content.doc_id, content.statement_id ORDER BY classification_item.value.confidence DESC) AS rn
    FROM
        RAW_CLASSIFY_P1_OPERATIVE_RULES AS content,
        UNNEST(content.classification) AS classification_item(value)
)
SELECT
    id,
    statement_id,
    statement_title,
    statement_text,
    statement_sources,
    file_source,
    type AS classification_type,
    confidence AS classification_confidence,
    explanation AS classification_explanation
FROM
    ExpandedClassifications
WHERE
    rn = 1
ORDER BY statement_text;
"""

conn.sql(query).fetchdf()

Join with terms P1

### Classify - P2 Operative Rules

True table

In [None]:
# Load the "classify_P2_Operative_rules|true_table" entries of
# documents_true_table.json into RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE.
# drop=True is passed because there is a single source file (presumably this
# recreates the table — confirm in upsert_table_from_json).
upsert_table_from_json(
    conn,
    key_pattern1="classify_P2_Operative_rules|true_table",
    table_name="RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="classify_P2_Operative_rules",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id",
)

In [None]:
# Inspect every row of the P2 operative-rules true table.
query = """
SELECT * FROM RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE;
"""

conn.sql(query).fetchdf()

Join extract P1 with Transform

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_classification")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="classify_P2_Operative_rules|llm_response_classification",
        table_name="RAW_CLASSIFY_P2_OPERATIVE_RULES",
        source=file_path,
        key_value="classify_P2_Operative_rules",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

Check distinct pred rules match true

In [None]:
# List the distinct (id, prompt, content) rows of the true table.
# NOTE(review): the heading above talks about checking the *pred* rules
# against the true ones, but this query only reads the _TRUE table — confirm
# whether RAW_CLASSIFY_P2_OPERATIVE_RULES was meant to be queried or joined.
query = """
SELECT DISTINCT id, prompt, content
FROM RAW_CLASSIFY_P2_OPERATIVE_RULES_TRUE;
"""

conn.sql(query).fetchdf()

### Classify P2 - Definitional Terms

True table

In [None]:
# Load the "classify_P2_Definitional_terms|true_table" entries of
# documents_true_table.json into RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE.
# drop=True is passed because there is a single source file (presumably this
# recreates the table — confirm in upsert_table_from_json).
upsert_table_from_json(
    conn,
    key_pattern1="classify_P2_Definitional_terms|true_table",
    table_name="RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="classify_P2_Definitional_terms",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id",
)

In [None]:
# Inspect every row of the definitional-terms true table.
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE
;
"""

conn.sql(query).fetchdf()

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_classification")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="classify_P2_Definitional_terms|llm_response_classification",
        table_name="RAW_CLASSIFY_P2_DEFINITIONAL_TERMS",
        source=file_path,
        key_value="classify_P2_Definitional_terms",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the predicted definitional-terms classifications.
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_TERMS
;
"""

conn.sql(query).fetchdf()

### Classify P2 - Definitional Names

True table

In [None]:
# Load the "classify_P2_Definitional_names|true_table" entries of
# documents_true_table.json into RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE.
# drop=True is passed because there is a single source file (presumably this
# recreates the table — confirm in upsert_table_from_json).
upsert_table_from_json(
    conn,
    key_pattern1="classify_P2_Definitional_names|true_table",
    table_name="RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="classify_P2_Definitional_names",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id",
)

In [None]:
# Inspect every row of the definitional-names true table.
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE
;
"""

conn.sql(query).fetchdf()

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_classification")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="classify_P2_Definitional_names|llm_response_classification",
        table_name="RAW_CLASSIFY_P2_DEFINITIONAL_NAMES",
        source=file_path,
        key_value="classify_P2_Definitional_names",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the predicted definitional-names classifications.
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_NAMES
;
"""

conn.sql(query).fetchdf()

### Classify P2 - Definitional Facts

True table

In [None]:
# Load the "classify_P2_Definitional_facts|true_table" entries of
# documents_true_table.json into RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE.
# drop=True is passed because there is a single source file (presumably this
# recreates the table — confirm in upsert_table_from_json).
upsert_table_from_json(
    conn,
    key_pattern1="classify_P2_Definitional_facts|true_table",
    table_name="RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE",
    source=f"{config['DEFAULT_DATA_DIR']}/documents_true_table.json",
    key_value="classify_P2_Definitional_facts",
    drop=True,
    content_key="content",
    alias="content",
    doc_id_key="id",
)

In [None]:
# Inspect every row of the definitional-facts true table.
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_TRUE
;
"""

conn.sql(query).fetchdf()

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_classification")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="classify_P2_Definitional_facts|llm_response_classification",
        table_name="RAW_CLASSIFY_P2_DEFINITIONAL_FACTS",
        source=file_path,
        key_value="classify_P2_Definitional_facts",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the predicted definitional-facts classifications.
query = """
SELECT *
FROM RAW_CLASSIFY_P2_DEFINITIONAL_FACTS
;
"""

conn.sql(query).fetchdf()

## Transform

Metadata

In [None]:
# Show the checkpoint metadata rows recorded for the 'transformation' process.
query = """
SELECT *,
FROM CHECKPOINT_METADATA
WHERE PROCESS='transformation'
ORDER BY DOC_ID, DOC_TYPE DESC;
"""

conn.sql(query).fetchdf()

### Operative Rules

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_transform")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="transform_Operative_Rules|llm_response_transform",
        table_name="RAW_TRANSFORM_OPERATIVE_RULES",
        source=file_path,
        key_value="transform_Operative_Rules",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the transformed operative rules.
query = """
SELECT *
FROM RAW_TRANSFORM_OPERATIVE_RULES
;
"""

conn.sql(query).fetchdf()

### Terms

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_transform")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="transform_Terms|llm_response_transform",
        table_name="RAW_TRANSFORM_TERMS",
        source=file_path,
        key_value="transform_Terms",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the transformed terms.
query = """
SELECT *
FROM RAW_TRANSFORM_TERMS
;
"""

conn.sql(query).fetchdf()

### Names

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_transform")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="transform_Names|llm_response_transform",
        table_name="RAW_TRANSFORM_NAMES",
        source=file_path,
        key_value="transform_Names",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the transformed names.
query = """
SELECT *
FROM RAW_TRANSFORM_NAMES
;
"""

conn.sql(query).fetchdf()

### Fact types

Pred table

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_transform")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="transform_Fact_Types|llm_response_transform",
        table_name="RAW_TRANSFORM_FACT_TYPES",
        source=file_path,
        key_value="transform_Fact_Types",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the transformed fact types.
query = """
SELECT *
FROM RAW_TRANSFORM_FACT_TYPES
;
"""

conn.sql(query).fetchdf()

## Validation

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_evaluation")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="|llm_validation",
        table_name="RAW_LLM_VALIDATION",
        source=file_path,
        key_value="llm_validation",
        drop=drop,
        content_key="content",
        alias="content",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the loaded LLM validation results.
query = """
SELECT
*
FROM RAW_LLM_VALIDATION
;
"""

conn.sql(query).fetchdf()

In [None]:
# Flatten the validation `content` struct into one column per score/field.
# element_type is the document id with the 'validation_judge_' text removed.
query = """
SELECT
    id,
    REPLACE(id, 'validation_judge_', '') as element_type,
    file_source,
    content.doc_id,
    content.statement_id,
    content.statement,
    content.sources,
    content.semscore,
    content.similarity_score,
    content.similarity_score_confidence,
    content.transformation_accuracy,
    content.grammar_syntax_accuracy,
    content.findings,
    created_at  
FROM RAW_LLM_VALIDATION;
"""

conn.sql(query).fetchdf()

 ### Scores per id, file_source, and doc_id

 - semscore
 - similarity_score
 - similarity_score_confidence
 - transformation_accuracy
 - grammar_syntax_accuracy

In [None]:
# Average each validation score per (id, file_source, content.doc_id) group,
# with the row count and the latest created_at of each group.
query = """
SELECT
    id,
    REPLACE(id, 'validation_judge_', '') as element_type,
    file_source,
    content.doc_id,
    count(content.doc_id) as count_doc_id,
    avg(content.semscore) as avg_semscore,
    avg(content.similarity_score) as avg_similarity_score,
    avg(content.similarity_score_confidence) as avg_similarity_score_confidence,
    avg(content.transformation_accuracy) as avg_transformation_accuracy,
    avg(content.grammar_syntax_accuracy) as avg_grammar_syntax_accuracy,
    max(created_at) as last_created_at
FROM RAW_LLM_VALIDATION
group by id, file_source, content.doc_id
;
"""

conn.sql(query).fetchdf()

 ### Scores per element_type

 - semscore
 - similarity_score
 - similarity_score_confidence
 - transformation_accuracy
 - grammar_syntax_accuracy

In [None]:
# Average, minimum and maximum of every validation score per element type
# (the document id with the 'validation_judge_' text removed), rounded to
# three decimals. Each statistic gets its own avg_/min_/max_ column name;
# the min and max of semscore and similarity_score were previously mislabelled
# as avg_*, producing wrong and duplicate column names.
query = """
SELECT
    REPLACE(id, 'validation_judge_', '') as element_type,
    count(content.doc_id) as count_doc_id,
    max(created_at) as last_created_at,
    -- avg
    round(avg(content.semscore), 3) as avg_semscore,
    round(avg(content.similarity_score), 3) as avg_similarity_score,
    round(avg(content.similarity_score_confidence), 3) as avg_similarity_score_confidence,
    round(avg(content.transformation_accuracy), 3) as avg_transformation_accuracy,
    round(avg(content.grammar_syntax_accuracy), 3) as avg_grammar_syntax_accuracy,
    -- min
    round(min(content.semscore), 3) as min_semscore,
    round(min(content.similarity_score), 3) as min_similarity_score,
    round(min(content.similarity_score_confidence), 3) as min_similarity_score_confidence,
    round(min(content.transformation_accuracy), 3) as min_transformation_accuracy,
    round(min(content.grammar_syntax_accuracy), 3) as min_grammar_syntax_accuracy,
    -- max
    round(max(content.semscore), 3) as max_semscore,
    round(max(content.similarity_score), 3) as max_similarity_score,
    round(max(content.similarity_score_confidence), 3) as max_similarity_score_confidence,
    round(max(content.transformation_accuracy), 3) as max_transformation_accuracy,
    round(max(content.grammar_syntax_accuracy), 3) as max_grammar_syntax_accuracy
FROM RAW_LLM_VALIDATION
GROUP BY
    REPLACE(id, 'validation_judge_', '')
;
"""

conn.sql(query).fetchdf()

 ### Scores of all runs

 - semscore
 - similarity_score
 - similarity_score_confidence
 - transformation_accuracy
 - grammar_syntax_accuracy

In [None]:
# Average, minimum and maximum of every validation score over all rows,
# rounded to three decimals. Each statistic gets its own avg_/min_/max_
# column name; the min and max of semscore and similarity_score were
# previously mislabelled as avg_*, producing wrong and duplicate column names.
query = """
SELECT
    count(content.doc_id) as count_doc_id,
    max(created_at) as last_created_at,
    -- avg
    round(avg(content.semscore), 3) as avg_semscore,
    round(avg(content.similarity_score), 3) as avg_similarity_score,
    round(avg(content.similarity_score_confidence), 3) as avg_similarity_score_confidence,
    round(avg(content.transformation_accuracy), 3) as avg_transformation_accuracy,
    round(avg(content.grammar_syntax_accuracy), 3) as avg_grammar_syntax_accuracy,
    -- min
    round(min(content.semscore), 3) as min_semscore,
    round(min(content.similarity_score), 3) as min_similarity_score,
    round(min(content.similarity_score_confidence), 3) as min_similarity_score_confidence,
    round(min(content.transformation_accuracy), 3) as min_transformation_accuracy,
    round(min(content.grammar_syntax_accuracy), 3) as min_grammar_syntax_accuracy,
    -- max
    round(max(content.semscore), 3) as max_semscore,
    round(max(content.similarity_score), 3) as max_similarity_score,
    round(max(content.similarity_score_confidence), 3) as max_similarity_score_confidence,
    round(max(content.transformation_accuracy), 3) as max_transformation_accuracy,
    round(max(content.grammar_syntax_accuracy), 3) as max_grammar_syntax_accuracy
FROM RAW_LLM_VALIDATION
;
"""

conn.sql(query).fetchdf()

## Elapsed time

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_evaluation")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="|llm_",
        table_name="RAW_ELAPSED_TIME",
        source=file_path,
        key_value="llm_response",
        drop=drop,
        content_key="elapsed_times",
        alias="elapsed_times",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Inspect every row of the loaded elapsed-time records.
query = """
SELECT *
FROM RAW_ELAPSED_TIME
;
"""

conn.sql(query).fetchdf()

### Elapsed time by checkpoint

In [None]:
# Count, sum and average of elapsed_times per checkpoint file.
# Grouping by file_source alone yields exactly one row per checkpoint; the
# previous GROUP BY also included the unselected id column, which returned
# several indistinguishable rows for the same checkpoint.
query = """
SELECT file_source as checkpoint, count(elapsed_times) as count_elapsed_time, sum(elapsed_times) as total_elapsed_time, avg(elapsed_times) as avg_elapsed_time
FROM RAW_ELAPSED_TIME
GROUP BY file_source
;
"""

et_file = conn.sql(query).fetchdf()

et_file

In [None]:
# Chart total_elapsed_time per checkpoint from et_file.
# big_numbers overrides the helper's default {"factor": 1000, "suffix": "mil"}
# (presumably so values stay unscaled and are labelled in seconds — confirm
# in plot_horizontal_bar_chart).
plot_horizontal_bar_chart(
    et_file,
    group_col="checkpoint",
    value_col="total_elapsed_time",
    title="Soma do Tempo Total Decorrido por Checkpoint",
    x_label="Tempo Total Decorrido",
    y_label="Checkpoint",
    big_numbers={"factor": 1, "suffix": "s"},
)

### Elapsed time by doc_type

In [None]:
# Count, sum and average of elapsed_times per document id; the id column is
# exposed under the name doc_type.
query = """
SELECT id as doc_type, count(elapsed_times) as count_elapsed_time, sum(elapsed_times) as total_elapsed_time, avg(elapsed_times) as avg_elapsed_time
FROM RAW_ELAPSED_TIME
GROUP BY id
;
"""

et_doc_type = conn.sql(query).fetchdf()

et_doc_type

In [None]:
# Chart total_elapsed_time per doc_type from et_doc_type.
# big_numbers overrides the helper's default {"factor": 1000, "suffix": "mil"}
# (presumably so values stay unscaled and are labelled in seconds — confirm
# in plot_horizontal_bar_chart).
plot_horizontal_bar_chart(
    et_doc_type,
    group_col="doc_type",
    value_col="total_elapsed_time",
    title="Soma do tempo total decorrido por tipo documento",
    x_label="Tempo total decorrido",
    y_label="doc_type",
    big_numbers={"factor": 1, "suffix": "s"},
)

### Elapsed time by process

In [None]:
# Elapsed-time statistics per (process, doc_id, doc_type, doc_source).
# The CTE reduces CHECKPOINT_METADATA to its distinct
# (PROCESS, DOC_ID, DOC_TYPE, DOC_SOURCE) combinations; elapsed-time rows are
# joined to it on id = doc_id and only 'pred' and 'val' sources are kept.
# NOTE(review): if a doc_id appears under more than one metadata combination,
# its elapsed times are counted once per combination — confirm this is intended.
query = """
WITH CHECKPOINT AS (
    SELECT 
        PROCESS, 
        DOC_ID, 
        DOC_TYPE,
        DOC_SOURCE
    FROM 
        CHECKPOINT_METADATA
    GROUP BY 
        PROCESS, DOC_ID, DOC_TYPE, DOC_SOURCE
)
SELECT
    cm.process,
    cm.doc_id,
    cm.doc_type,
    cm.doc_source,
    COUNT(et.elapsed_times) AS count_elapsed_time,
    SUM(et.elapsed_times) AS total_elapsed_time,
    AVG(et.elapsed_times) AS avg_elapsed_time
FROM
    RAW_ELAPSED_TIME AS et
JOIN
    CHECKPOINT AS cm ON et.id = cm.doc_id
WHERE
    cm.doc_source in ('pred', 'val')
GROUP BY
    cm.process, cm.doc_id, cm.doc_type, cm.doc_source
ORDER BY
    cm.process, cm.doc_id, cm.doc_type, cm.doc_source;
"""

et_process = conn.sql(query).fetchdf()

et_process

In [None]:
# Chart total_elapsed_time per process from et_process.
# big_numbers overrides the helper's default {"factor": 1000, "suffix": "mil"}
# (presumably so values stay unscaled and are labelled in seconds — confirm
# in plot_horizontal_bar_chart).
plot_horizontal_bar_chart(
    et_process,
    group_col="process",
    value_col="total_elapsed_time",
    title="Soma do tempo total decorrido por processo",
    x_label="Tempo total decorrido",
    y_label="processo",
    big_numbers={"factor": 1, "suffix": "s"},
)

In [None]:
# Sum total_elapsed_time over all rows of each process and show it as a table.
et_process.groupby("process")["total_elapsed_time"].sum().reset_index()

### Average elapsed time by process

In [None]:
# Chart avg_elapsed_time per process from et_process.
# The x-axis label now says "médio" (average) to match the plotted column and
# the title; it previously said "total", copied from the total-time chart.
plot_horizontal_bar_chart(
    et_process,
    group_col="process",
    value_col="avg_elapsed_time",
    title="Tempo médio decorrido por processo",
    x_label="Tempo médio decorrido",
    y_label="processo",
    big_numbers={"factor": 1, "suffix": "s"},
)

### Elapsed time all runs

In [None]:
# Count, sum and average of elapsed_times over every loaded record
# (a single-row result).
query = """
SELECT count(elapsed_times) as count_elapsed_time, sum(elapsed_times) as total_elapsed_time, avg(elapsed_times) as avg_elapsed_time
FROM RAW_ELAPSED_TIME
;
"""

elapsed_time_big_numbers = conn.sql(query).fetchdf()

# Display the DataFrame that was just fetched instead of running the query a
# second time.
elapsed_time_big_numbers

In [None]:
# elapsed_time_big_numbers is a DataFrame, so indexing a column gives a Series.
# humanize.intword expects a number: take the scalar with .iloc[0] rather than
# relying on the implicit conversion of a one-element Series.
print(f"count_elapsed_time: {humanize.intword(elapsed_time_big_numbers['count_elapsed_time'].iloc[0])}")
print(f"total_elapsed_time: {humanize.intword(elapsed_time_big_numbers['total_elapsed_time'].iloc[0])}")
print(f"avg_elapsed_time: {humanize.intword(elapsed_time_big_numbers['avg_elapsed_time'].iloc[0])}")

## LLM completion

In [None]:
# Define the directory and pattern
directory = Path(f"{config['DEFAULT_DATA_DIR']}/checkpoints_evaluation")
pattern = "documents-*.json"

# Sort the matches so checkpoints are always loaded in the same order:
# glob yields them in filesystem-dependent order, and only the first file
# processed is loaded with drop=True.
files = sorted(directory.glob(pattern))

drop = True  # Drop the table first time
for file_path in files:
    print(file_path)  # Output each file path
    upsert_table_from_json(
        conn,
        key_pattern1="|llm_",
        table_name="RAW_LLM_COMPLETION",
        source=file_path,
        key_value="llm_response",
        drop=drop,
        content_key="completions",
        alias="completions",
        doc_id_key="id",
    )
    drop = False  # Stop dropping tables

In [None]:
# Show the column names and types of the loaded completions table.
query = """
DESCRIBE RAW_LLM_COMPLETION
;
"""

conn.sql(query).fetchdf()

In [None]:
# Flatten the `completions` struct into one column per field of interest.
# completions.id is aliased as completion_id (the name the next query uses)
# so it no longer shares the output name "id" with the document id column.
query = """
SELECT
    id,
    file_source,
    created_at,
    completions.id as completion_id,
    completions.model,
    completions.object,
    completions.system_fingerprint,
    completions.usage.completion_tokens,
    completions.usage.prompt_tokens,
    completions.usage.total_tokens
FROM RAW_LLM_COMPLETION
;
"""

conn.sql(query).fetchdf()

In [None]:
# One row per completion, enriched with the process and doc_source taken from
# the checkpoint metadata. The CTE reduces CHECKPOINT_METADATA to its distinct
# (PROCESS, DOC_ID, DOC_TYPE, DOC_SOURCE) combinations; completions are joined
# to it on id = doc_id and only 'pred' and 'val' sources are kept.
# The lines starting with "--" inside the SQL are disabled columns/clauses.
query = """
WITH CHECKPOINT AS (
    SELECT 
        PROCESS, 
        DOC_ID, 
        DOC_TYPE,
        DOC_SOURCE
    FROM 
        CHECKPOINT_METADATA
    GROUP BY 
        PROCESS, DOC_ID, DOC_TYPE, DOC_SOURCE
)
SELECT 
    id as doc_id,
    cm.process,
    cm.doc_source as doc_source,
    file_source as checkpoint, 
    created_at, 
    completions.id as completion_id,
    completions.model,
    completions.object,
    --completions.service_tier,
    completions.system_fingerprint,
    completions.usage.completion_tokens,
    completions.usage.prompt_tokens,
    completions.usage.total_tokens
    --completions.usage.completion_tokens_details,
    --completions.usage.prompt_tokens_details
FROM RAW_LLM_COMPLETION AS lc
JOIN
    CHECKPOINT AS cm ON lc.id = cm.doc_id
WHERE
    cm.doc_source in ('pred', 'val')
--GROUP BY
--    cm.process, cm.doc_id, cm.doc_type, cm.doc_source
--ORDER BY
--    cm.process, cm.doc_id, cm.doc_type, cm.doc_source;
"""

lc_process = conn.sql(query).fetchdf()

lc_process

### Tokens per process

In [None]:
# Token usage statistics per (process, doc_id, doc_type, doc_source,
# checkpoint file): number of completions, sum and average of completion,
# prompt and total tokens, and the latest created_at.
# Uses the same distinct-metadata CTE and 'pred'/'val' filter as the previous
# query.
query = """
WITH CHECKPOINT AS (
    SELECT 
        PROCESS, 
        DOC_ID, 
        DOC_TYPE,
        DOC_SOURCE
    FROM 
        CHECKPOINT_METADATA
    GROUP BY 
        PROCESS, DOC_ID, DOC_TYPE, DOC_SOURCE
)
SELECT 
    cm.process,
    cm.doc_id,
    cm.doc_type,
    cm.doc_source,
    lc.file_source as checkpoint,
    COUNT(lc.id) AS count_completions,
    SUM(lc.completions.usage.completion_tokens) AS total_completion_tokens,
    AVG(lc.completions.usage.completion_tokens) AS avg_completion_tokens,

    SUM(lc.completions.usage.prompt_tokens) AS total_prompt_tokens,
    AVG(lc.completions.usage.prompt_tokens) AS avg_prompt_tokens,

    SUM(lc.completions.usage.total_tokens) AS total_total_tokens,
    AVG(lc.completions.usage.total_tokens) AS avg_total_tokens,
    
    MAX(lc.created_at) AS last_completion
FROM RAW_LLM_COMPLETION AS lc
JOIN
    CHECKPOINT AS cm ON lc.id = cm.doc_id
WHERE
    cm.doc_source in ('pred', 'val')
GROUP BY
    cm.process, cm.doc_id, cm.doc_type, cm.doc_source, lc.file_source
ORDER BY
    cm.process, cm.doc_id, cm.doc_type, cm.doc_source, lc.file_source;
"""

et_llm_completion = conn.sql(query).fetchdf()

et_llm_completion

In [None]:
# Chart total_prompt_tokens per process from et_llm_completion.
# The title now describes token counts; it previously read
# "Tempo médio decorrido por processo", copied from the elapsed-time chart.
plot_horizontal_bar_chart(
    et_llm_completion,
    group_col="process",
    value_col="total_prompt_tokens",
    title="Qde tokens por processo",
    x_label="Qde tokens",
    y_label="processo"
)

### Average tokens per process

In [None]:
# Chart avg_total_tokens per process from et_llm_completion, using the
# helper's default figure size and number formatting.
plot_horizontal_bar_chart(
    et_llm_completion,
    group_col="process",
    value_col="avg_total_tokens",
    title="Qde tokens médio por processo",
    x_label="Qde tokens médio",
    y_label="processo"
)

### Tokens per document type

In [None]:
# Chart total_prompt_tokens per doc_id from et_llm_completion; a taller
# figure is requested than the helper's default (10, 4).
plot_horizontal_bar_chart(
    et_llm_completion,
    group_col="doc_id",
    value_col="total_prompt_tokens",
    title="Qde tokens por tipo documento",
    x_label="Qde tokens",
    y_label="doc_id",
    figsize=(12, 8)
)

### Average tokens per document type

In [None]:
# Chart avg_prompt_tokens per doc_id from et_llm_completion.
# Title and x-axis label now say "médio" (average) to match the plotted
# column; they previously duplicated the labels of the total-token chart.
plot_horizontal_bar_chart(
    et_llm_completion,
    group_col="doc_id",
    value_col="avg_prompt_tokens",
    title="Qde tokens médio por tipo documento",
    x_label="Qde tokens médio",
    y_label="doc_id",
    figsize=(12, 8)
)

### Tokens per checkpoint

In [None]:
# Chart total_prompt_tokens per checkpoint file from et_llm_completion; a
# taller figure is requested than the helper's default (10, 4).
plot_horizontal_bar_chart(
    et_llm_completion,
    group_col="checkpoint",
    value_col="total_prompt_tokens",
    title="Qde tokens por checkpoint",
    x_label="Qde tokens",
    y_label="checkpoint",
    figsize=(12, 8)
)

In [None]:
# Chart avg_total_tokens per checkpoint file from et_llm_completion; a
# taller figure is requested than the helper's default (10, 4).
plot_horizontal_bar_chart(
    et_llm_completion,
    group_col="checkpoint",
    value_col="avg_total_tokens",
    title="Tokens médio por checkpoint",
    x_label="Qde tokens",
    y_label="checkpoint",
    figsize=(12, 8)
)

### Tokens total

In [None]:
# Token usage statistics per process: number of completions, sum and average
# of completion, prompt and total tokens, and the latest created_at.
# The CTE reduces CHECKPOINT_METADATA to its distinct
# (PROCESS, DOC_ID, DOC_TYPE, DOC_SOURCE) combinations; completions are joined
# to it on id = doc_id and only 'pred' and 'val' sources are kept.
# The result has one row per process.
query = """
WITH CHECKPOINT AS (
    SELECT
        PROCESS,
        DOC_ID,
        DOC_TYPE,
        DOC_SOURCE
    FROM
        CHECKPOINT_METADATA
    GROUP BY
        PROCESS, DOC_ID, DOC_TYPE, DOC_SOURCE
)
SELECT
    cm.process,
    COUNT(lc.id) AS count_completions,
    SUM(lc.completions.usage.completion_tokens) AS total_completion_tokens,
    AVG(lc.completions.usage.completion_tokens) AS avg_completion_tokens,

    SUM(lc.completions.usage.prompt_tokens) AS total_prompt_tokens,
    AVG(lc.completions.usage.prompt_tokens) AS avg_prompt_tokens,

    SUM(lc.completions.usage.total_tokens) AS total_total_tokens,
    AVG(lc.completions.usage.total_tokens) AS avg_total_tokens,

    MAX(lc.created_at) AS last_completion
FROM RAW_LLM_COMPLETION AS lc
JOIN
    CHECKPOINT AS cm ON lc.id = cm.doc_id
WHERE
    cm.doc_source in ('pred', 'val')
GROUP BY
    cm.process
"""

token_big_numbers = conn.sql(query).fetchdf()

# Display the DataFrame that was just fetched instead of running the query a
# second time.
token_big_numbers

In [None]:
# token_big_numbers holds one row per process, so each column is a multi-row
# Series. Sum it to get the grand total before formatting; passing the Series
# itself to humanize.intword cannot be converted to a single number.
print(f"total_prompt_tokens: {humanize.intword(token_big_numbers['total_prompt_tokens'].sum())}")
print(f"total_completion_tokens: {humanize.intword(token_big_numbers['total_completion_tokens'].sum())}")
print(f"total_total_tokens: {humanize.intword(token_big_numbers['total_total_tokens'].sum())}")

## Create views

The main use of the views is exploratory analysis.

In [74]:
from natsort import natsorted

DB_OBJECTS="db_objects_v4"

# Number of view definition files executed by this cell. Bound unconditionally
# so that later cells can read it even when CLEAN_UP is disabled.
files_processed = 0

if CLEAN_UP:
    # Folder holding one `*_VW.sql` DDL file per view.
    directory = Path(f"{config['DEFAULT_APP_DIR']}/data/{DB_OBJECTS}")
    pattern = "*_VW.sql"

    # File names carry a numeric prefix (10_, 20_, ...); natural sorting runs
    # them in that numeric order.
    # NOTE(review): presumably the order matters because later views depend on
    # earlier ones — confirm.
    for file_path in natsorted(directory.glob(pattern)):
        logger.info(f"Processing file {file_path}")
        # Explicit encoding: do not depend on the platform default.
        ddl_query = file_path.read_text(encoding="utf-8")
        logger.debug(ddl_query)
        conn.sql(ddl_query)
        # Count a file only after its DDL ran without raising.
        files_processed += 1

2025-01-16 11:47:30 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/10_RAW_SECTION_EXTRACTED_ELEMENTS_VW.sql


2025-01-16 11:47:31 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/20_RAW_SECTION_P1_EXTRACTED_ELEMENTS_VW.sql
2025-01-16 11:47:31 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/30_RAW_SECTION_P2_EXTRACTED_NOUN_VW.sql
2025-01-16 11:47:31 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/40_RAW_CLASSIFY_P1_OPERATIVE_RULES_VW.sql
2025-01-16 11:47:32 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/50_RAW_CLASSIFY_P2_OPERATIVE_RULES_VW.sql
2025-01-16 11:47:32 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/60_RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_VW.sql
2025-01-16 11:47:32 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/70_RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_VW.sql
2025-01-16 11:47:33 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/80_RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_VW.sql
2025-01-16 11:47:33 - INFO - Processing file ../cfr2sbvr_inspect/data/db_objects_v4/90_RAW_CLASSIFY_VW.sql
20

In [75]:
# List the `*_VW` views registered in the active database (local or cloud)
# and, when the views were (re)created in this run, check that their number
# matches the number of DDL files processed.
#
# In LIKE, `_` matches any single character; it is escaped with `!` so that
# only names ending in a literal "_VW" are selected.
query = f"""
SELECT * FROM duckdb_views
  WHERE database_name = '{DATABASE if LOCAL_DB else CLOUD_DATABASE}'
  AND schema_name = 'main'
  AND view_name LIKE '%!_VW' ESCAPE '!';
"""

# Query the catalog once; reuse the frame for both the check and the display.
views_in_catalog = conn.sql(query).fetchdf()
views_created = len(views_in_catalog)

# `files_processed` is only meaningful when the create-views cell actually
# executed the DDL files, i.e. when CLEAN_UP is enabled.
if CLEAN_UP:
    assert views_created == files_processed, f"Number of views created {views_created} is different from expected {files_processed}"

views_in_catalog


Unnamed: 0,database_name,database_oid,schema_name,schema_oid,view_name,view_oid,comment,tags,internal,temporary,column_count,sql
0,cfr2sbvr_v4,1315,main,3616,RAW_TRANSFORM_NAMES_VW,3655,,{},False,False,20,CREATE VIEW RAW_TRANSFORM_NAMES_VW AS SELECT T...
1,cfr2sbvr_v4,1315,main,3616,RAW_CLASSIFY_P2_DEFINITIONAL_FACTS_VW,3641,,{},False,False,14,CREATE VIEW RAW_CLASSIFY_P2_DEFINITIONAL_FACTS...
2,cfr2sbvr_v4,1315,main,3616,RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_VW,3642,,{},False,False,14,CREATE VIEW RAW_CLASSIFY_P2_DEFINITIONAL_NAMES...
3,cfr2sbvr_v4,1315,main,3616,RAW_CLASSIFY_P2_OPERATIVE_RULES_VW,3644,,{},False,False,14,CREATE VIEW RAW_CLASSIFY_P2_OPERATIVE_RULES_VW...
4,cfr2sbvr_v4,1315,main,3616,RAW_CLASSIFY_VW,3645,,{},False,False,18,CREATE VIEW RAW_CLASSIFY_VW AS (((SELECT CLASS...
5,cfr2sbvr_v4,1315,main,3616,RAW_ELAPSED_TIME_VW,3646,,{},False,False,4,"CREATE VIEW RAW_ELAPSED_TIME_VW AS SELECT id, ..."
6,cfr2sbvr_v4,1315,main,3616,RAW_LLM_VALIDATION_BEST_VW,3648,,{},False,False,26,CREATE VIEW RAW_LLM_VALIDATION_BEST_VW AS SELE...
7,cfr2sbvr_v4,1315,main,3616,RAW_LLM_COMPLETION_VW,3647,,{},False,False,7,CREATE VIEW RAW_LLM_COMPLETION_VW AS SELECT id...
8,cfr2sbvr_v4,1315,main,3616,RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_VW,3643,,{},False,False,14,CREATE VIEW RAW_CLASSIFY_P2_DEFINITIONAL_TERMS...
9,cfr2sbvr_v4,1315,main,3616,RAW_LLM_VALIDATION_VW,3649,,{},False,False,28,CREATE VIEW RAW_LLM_VALIDATION_VW AS SELECT VA...


Test: count the records of each table and view.

In [76]:
# Smoke test: log how many records each entry of `tables_or_views` holds.
for table_or_view_name in tables_or_views:
    # Positions 1, 2 and 3 of each entry are read as schema, name and type.
    object_schema, object_name, object_type = (
        table_or_view_name[1],
        table_or_view_name[2],
        table_or_view_name[3],
    )

    query = f"""
    SELECT COUNT(1) FROM {object_schema}.{object_name};
    """

    # COUNT yields a single row with a single column.
    count_rows = conn.sql(query).fetchall()
    record_count = count_rows[0][0]

    logger.info(f"{object_name} - Record(s): {record_count}")

2025-01-16 11:47:47 - INFO - RAW_TRANSFORM_TERMS - Record(s): 280
2025-01-16 11:47:47 - INFO - RAW_TRANSFORM_FACT_TYPES - Record(s): 160
2025-01-16 11:47:47 - INFO - RAW_SECTION_P2_EXTRACTED_NOUN_TRUE - Record(s): 84
2025-01-16 11:47:47 - INFO - RAW_SECTION_P2_EXTRACTED_NOUN - Record(s): 705
2025-01-16 11:47:47 - INFO - RAW_SECTION_P1_EXTRACTED_ELEMENTS - Record(s): 215
2025-01-16 11:47:47 - INFO - RAW_SECTION - Record(s): 3
2025-01-16 11:47:48 - INFO - RAW_SECTION_P1_EXTRACTED_ELEMENTS_TRUE - Record(s): 22
2025-01-16 11:47:48 - INFO - RAW_LLM_COMPLETION - Record(s): 1210
2025-01-16 11:47:48 - INFO - RAW_ELAPSED_TIME - Record(s): 1210
2025-01-16 11:47:48 - INFO - RAW_CLASSIFY_P2_DEFINITIONAL_TERMS_TRUE - Record(s): 28
2025-01-16 11:47:48 - INFO - RAW_CLASSIFY_P2_OPERATIVE_RULES - Record(s): 60
2025-01-16 11:47:48 - INFO - RAW_CLASSIFY_P2_DEFINITIONAL_TERMS - Record(s): 280
2025-01-16 11:47:49 - INFO - RAW_CLASSIFY_P2_DEFINITIONAL_NAMES_TRUE - Record(s): 5
2025-01-16 11:47:49 - INFO - R

## Close connection

In [77]:
# Close the database connection used throughout the notebook; any cell that
# calls `conn` must run before this one.
conn.close()