## Dict lookup

In [55]:
def dict_lookup(data, search_value, search_key):
    results = []

    # Split the search_key to navigate through nested dictionaries
    keys = search_key.split(".")

    for item in data:
        # Start with the current item
        current = item

        # Navigate through the keys
        for key in keys:
            if isinstance(current, list):
                # If current is a list, iterate through it
                current = [sub_item.get(key) for sub_item in current if isinstance(sub_item, dict)]
            elif isinstance(current, dict) and key in current:
                # If current is a dict, navigate to the next level
                current = current[key]
            else:
                current = None
                break

        # Check if the final value matches the search value
        if current == search_value or (isinstance(current, list) and search_value in current):
            results.append(item)

    return results

In [76]:
a = [
    {
        "name": "John",
        "age": 30,
        "city": "New York",
        "adresses": [
            {"street": "Main St", "number": 123},
            {"street": "Broadway", "number": 456},
        ],
    },
    {
        "name": "Jane",
        "age": 25,
        "city": "Los Angeles",
        "adresses": [
            {"street": "Sunset Blvd", "number": 789},
            {"street": "Hollywood Blvd", "number": 101},
        ],
        "institutions": [
            {"name": "UCLA", "department": "Computer Science"},
            {"name": "USC", "department": "Physics"},
        ]
    },
    {
        "name": "Alice",
        "age": 35,
        "city": "Chicago",
        "adresses": [
            {"street": "Michigan Ave", "number": 303},
            {"street": "Sunset Blvd", "number": 123},
        ],
    },
    {
        "name": "Mary",
        "age": 45,
        "city": "Las Vegas",
        "adresses": [
            {"street": "Las Vegas Blvd", "number": 999},
            {"street": "Fremont St", "number": 101},
        ],
        "institutions": [
            {"name": "UNLV", "department": "Mathematics"},
            {"name": "CSN", "department": "Biology"},
        ]
    },
    {
        "name": "Bob",
        "age": 40,
        "city": "San Francisco"
    }
]

In [None]:
b = [
    {
        "name": "Bob",
        "age": 41,
        "city": "San Francisco"
    },
    {
        "name": "John",
        "age": 30,
        "city": "New York",
        "adresses": [
            {"street": "Main St", "number": 123},
            {"street": "Broadway", "number": 456},
        ],
    },
    {
        "name": "John Carpenter",
        "age": 22,
        "city": "Wayne",
        "adresses": [
            {"street": "Main St", "number": 123},
            {"street": "Broadway", "number": 456},
        ],
    },
    {
        "name": "Jane",
        "age": 25,
        "city": "Los Angeles",
        "adresses": [
            {"street": "Sunset Blvd", "number": 789},
            {"street": "Hollywood Blvd", "number": 101},
            {"street": "Rodeo Dr", "number": 202},
        ],
        "institutions": [
            {"name": "UCLA", "department": "Computer Science"},
            {"name": "USC", "department": "Mathematics"},
        ]
    },
    {
        "name": "Alice",
        "age": 35,
        "city": "Chicago",
        "adresses": [
            {"street": "Michigan Ave", "number": 303},
            {"street": "Sunset Blvd", "number": 123},
        ],
        "institutions": [
            {"name": "University of Chicago", "department": "Computer Science"},
            {"name": "Northwestern University", "department": "Mathematics"},
        ]
    },
]

In [182]:
from deepdiff import DeepDiff, DeepSearch, grep
from pprint import pprint

In [180]:
results = DeepSearch(b, "John", verbose_level=2, match_string=True)

results

{'matched_values': {"root[1]['name']": 'John'}}

In [173]:
list(results["matched_values"].keys())

["root[1]['name']"]

In [188]:
results = DeepSearch(b, "John", verbose_level=2, match_string=False)

results

{'matched_values': {"root[1]['name']": 'John',
  "root[2]['name']": 'John Carpenter'}}

In [185]:
import re

def extract_index_and_key(s):
    # Define a regular expression pattern to match the index and key
    pattern = r"root\[(\d+)\]\['(\w+)'\]"
    
    # Use re.match to find the pattern in the string
    match = re.match(pattern, s)
    
    if match:
        # Extract the index and key from the match groups
        index = int(match.group(1))
        key = match.group(2)
        return index, key
    else:
        raise ValueError("String format is incorrect")

In [195]:
for item in list(results["matched_values"].keys()):
    #print(item | grep(r"root\[(\d+)\]\['(\w+)'\]", use_regexp=True))
    idx, key = extract_index_and_key(item)
    print(b[idx]) #.get(key))

{'name': 'John', 'age': 30, 'city': 'New York', 'adresses': [{'street': 'Main St', 'number': 123}, {'street': 'Broadway', 'number': 456}]}
{'name': 'John Carpenter', 'age': 22, 'city': 'Wayne', 'adresses': [{'street': 'Main St', 'number': 123}, {'street': 'Broadway', 'number': 456}]}


In [199]:
diff

{'dictionary_item_added': {"root['institutions']": [{'name': 'University of Chicago',
    'department': 'Computer Science'},
   {'name': 'Northwestern University', 'department': 'Mathematics'}]}}

In [204]:
for a_item in a:
    b_item = dict_lookup(b, a_item["name"], "name") or [None]
    #print(f"{a_item=}")
    #print(f"{b_item[0]=}")
    diff = DeepDiff(a_item, b_item[0], ignore_string_case=True, ignore_order=True, verbose_level=2)
    print(diff)
    if diff:
        if "values_changed" in diff:    
            for item in list(diff["values_changed"].keys()):
                print(item)
        elif "type_changes" in diff:
            for item in list(diff["type_changes"].keys()):
                print(item)
        elif "dictionary_item_added" in diff:
            for item in list(diff["dictionary_item_added"].keys()):
                print(item)
        elif "iterable_item_added" in diff:
            for item in list(diff["iterable_item_added"].keys()):
                print(item)
            #print(extract_index_and_key(item))

{}
{'values_changed': {"root['institutions'][1]['department']": {'new_value': 'mathematics', 'old_value': 'physics'}}, 'iterable_item_added': {"root['adresses'][2]": {'street': 'Rodeo Dr', 'number': 202}}}
root['institutions'][1]['department']
{'dictionary_item_added': {"root['institutions']": [{'name': 'University of Chicago', 'department': 'Computer Science'}, {'name': 'Northwestern University', 'department': 'Mathematics'}]}}
root['institutions']
{'type_changes': {'root': {'old_type': <class 'dict'>, 'new_type': <class 'NoneType'>, 'old_value': {'name': 'Mary', 'age': 45, 'city': 'Las Vegas', 'adresses': [{'street': 'Las Vegas Blvd', 'number': 999}, {'street': 'Fremont St', 'number': 101}], 'institutions': [{'name': 'UNLV', 'department': 'Mathematics'}, {'name': 'CSN', 'department': 'Biology'}]}, 'new_value': None}}}
root
{'values_changed': {"root['age']": {'new_value': 41, 'old_value': 40}}}
root['age']


In [142]:
result = dict_lookup(b, "John", "name")

result

[{'name': 'John',
  'age': 30,
  'city': 'New York',
  'adresses': [{'street': 'Main St', 'number': 123},
   {'street': 'Broadway', 'number': 456}]}]

In [143]:
result = dict_lookup(b, "Sunset Blvd", "adresses.street")

result

[{'name': 'Jane',
  'age': 25,
  'city': 'Los Angeles',
  'adresses': [{'street': 'Sunset Blvd', 'number': 789},
   {'street': 'Hollywood Blvd', 'number': 101},
   {'street': 'Rodeo Dr', 'number': 202}],
  'institutions': [{'name': 'UCLA', 'department': 'Computer Science'},
   {'name': 'USC', 'department': 'Mathematics'}]},
 {'name': 'Alice',
  'age': 35,
  'city': 'Chicago',
  'adresses': [{'street': 'Michigan Ave', 'number': 303},
   {'street': 'Sunset Blvd', 'number': 123}],
  'institutions': [{'name': 'University of Chicago',
    'department': 'Computer Science'},
   {'name': 'Northwestern University', 'department': 'Mathematics'}]}]

## DuckDB

References:

- https://medium.com/@anshubantra/using-duckdb-in-python-a-comprehensive-guide-d14bc0b06546

In [2]:
import duckdb

In [2]:
query = """
SELECT name, age, city, addr.value->>'street' AS street, addr.value->>'number' AS number
  FROM read_json_auto('a_table.json') AS a_table,
  LATERAL UNNEST(a_table.adresses) AS addr(value)
  WHERE street LIKE '%Blvd%';
"""

In [3]:
duckdb.sql(query).show()

┌─────────┬───────┬─────────────┬────────────────┬─────────┐
│  name   │  age  │    city     │     street     │ number  │
│ varchar │ int64 │   varchar   │    varchar     │ varchar │
├─────────┼───────┼─────────────┼────────────────┼─────────┤
│ Jane    │    25 │ Los Angeles │ Sunset Blvd    │ 789     │
│ Jane    │    25 │ Los Angeles │ Hollywood Blvd │ 101     │
│ Mary    │    45 │ Las Vegas   │ Las Vegas Blvd │ 999     │
│ Alice   │    35 │ Chicago     │ Sunset Blvd    │ 123     │
└─────────┴───────┴─────────────┴────────────────┴─────────┘



In [4]:
duckdb.read_json("a_table.json") 

┌─────────┬───────┬───────────────┬─────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────┐
│  name   │  age  │     city      │                                      adresses                                       │                                      institutions                                      │
│ varchar │ int64 │    varchar    │                       struct(street varchar, number bigint)[]                       │                      struct("name" varchar, department varchar)[]                      │
├─────────┼───────┼───────────────┼─────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────┤
│ John    │    30 │ New York      │ [{'street': Main St, 'number': 123}, {'street': Broadway, 'number': 456}]           │ NULL                              

In [5]:
duckdb.sql(query).fetchall()

[('Jane', 25, 'Los Angeles', 'Sunset Blvd', '789'),
 ('Jane', 25, 'Los Angeles', 'Hollywood Blvd', '101'),
 ('Mary', 45, 'Las Vegas', 'Las Vegas Blvd', '999'),
 ('Alice', 35, 'Chicago', 'Sunset Blvd', '123')]

In [6]:
res = duckdb.read_json("a_table.json").fetchdf()

In [7]:
dict_res = res.to_dict(orient="records")

In [8]:
dict_res[0]["adresses"]

array([{'street': 'Main St', 'number': 123},
       {'street': 'Broadway', 'number': 456}], dtype=object)

In [9]:
from collections import defaultdict

dd = defaultdict(list)

ddict_res = res.to_dict('records', into=dd)


In [10]:
ddict_res[0]["name"]

'John'

### Multiple files

In [48]:
import json

with open("cfr2sbvr_db/documents-2024-12-08-1.json", "r") as f:
    data = json.load(f)

In [49]:
keys = data.keys()

print(len(keys))

51


In [50]:
keys

dict_keys(['§ 275.0-2|section', '§ 275.0-5|section', '§ 275.0-7|section', 'prompt-extract_P1|prompt', 'prompt-extract_P2|prompt', '§ 275.0-2_P1|llm_response', '§ 275.0-2_P2|llm_response', '§ 275.0-5_P1|llm_response', '§ 275.0-5_P2|llm_response', '§ 275.0-7_P1|llm_response', '§ 275.0-7_P2|llm_response', 'classify_P1|true_table', 'classify_P2_Operative_rules|true_table', 'classify_P2_Definitional_terms|true_table', 'classify_P2_Definitional_names|true_table', 'classify_P2_Definitional_facts|true_table', 'classify_P1|llm_response_classification', 'prompt-classify_P2_operative_rule_batch1_Party rules|prompt', 'prompt-classify_P2_operative_rule_batch1_Activity rules|prompt', 'prompt-classify_P2_operative_rule_batch2_Activity rules|prompt', 'classify_P2_Operative_rules|llm_response_classification', 'prompt-classify_P2_term_batch1_Definitional rules|prompt', 'prompt-classify_P2_term_batch2_Definitional rules|prompt', 'prompt-classify_P2_term_batch3_Definitional rules|prompt', 'prompt-classify

### Extract elements P1

In [51]:
# _P1|llm_response
data_extracted_elements = []

for key in keys:
    if key.endswith("_P1|llm_response"):
        print(key)
        data_extracted_elements.append(data[key])

§ 275.0-2_P1|llm_response
§ 275.0-5_P1|llm_response
§ 275.0-7_P1|llm_response


In [66]:
import json

# _P1|llm_response
_data = []
_key_pattern = "_P1|llm_response"
_table_name = "EXTRACTED_ELEMENTS"
_json_file_name = "temp/data_extracted_elements.json"

with open("cfr2sbvr_db/documents-2024-12-08-1.json", "r") as f:
    loaded_data = json.load(f)

keys = loaded_data.keys()

for key in keys:
    if key.endswith(_key_pattern):
        print(key)
        _data.append(loaded_data[key])

with open(_json_file_name, "w") as f:
    json.dump(_data, f, indent=4)

_query_drop_table = f"""
DROP TABLE {_table_name};
"""

print(f"{_query_drop_table=}")

try:
    duckdb.sql(_query_drop_table)
except duckdb.CatalogException as e:
    print(e)

# _query_create_table = f"""
# CREATE TABLE {_table_name} AS
#     SELECT *
#     FROM read_json_auto("{_json_file_name}");
# """

_query_create_table = f"""
CREATE TABLE {_table_name} AS
  SELECT id, 
  'extract_p1' as prompt, 
  '{_json_file_name}' as file_source, 
  unnest(content.elements) as elements,
  now() as 'created_at'
  FROM read_json_auto("{_json_file_name}");
"""

print(f"{_query_create_table=}")

duckdb.sql(_query_create_table)

§ 275.0-2_P1|llm_response
§ 275.0-5_P1|llm_response
§ 275.0-7_P1|llm_response
_query_drop_table='\nDROP TABLE EXTRACTED_ELEMENTS;\n'
Catalog Error: Table with name EXTRACTED_ELEMENTS does not exist!
Did you mean "ALL_DOCS"?
_query_create_table='\nCREATE TABLE EXTRACTED_ELEMENTS AS\n  SELECT id, \n  \'extract_p1\' as prompt, \n  \'temp/data_extracted_elements.json\' as file_source, \n  unnest(content.elements) as elements,\n  now() as \'created_at\'\n  FROM read_json_auto("temp/data_extracted_elements.json");\n'


In [63]:
query = f"""
SELECT * FROM {_table_name};
"""

duckdb.sql(query)

┌──────────────┬────────────┬───────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [45]:
duckdb.read_json("cfr2sbvr_db/*.json")

┌──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────

In [20]:
duckdb.read_json("cfr2sbvr_db/*.json") .describe()

┌─────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬

In [8]:
query_insert = """
CREATE TABLE ALL_DOCS AS
    SELECT *
    FROM read_json_auto("cfr2sbvr_db/*.json");
"""

duckdb.sql(query_insert)

In [12]:
meta_query = "SHOW TABLE ALL_DOCS;"

tbl_struct = duckdb.sql(meta_query).fetchall()



In [13]:
tbl_struct

[('§ 275.0-2|section',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('§ 275.0-5|section',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('§ 275.0-7|section',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('prompt-extract_P1|prompt',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('prompt-extract_P2|prompt',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('§ 275.0-2_P1|llm_response',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" STRUCT(section VARCHAR, summary VARCHAR, elements STRUCT(id BIGINT, title VARCHAR, "statement" VARCHAR, terms STRUCT(term VARCHAR, classi

In [16]:
doc = "§ 275.0-2_P1|llm_response"
query_document = f"""
SELECT UNNEST("{doc}") FROM ALL_DOCS;
"""

tbl_doc = duckdb.sql(query_document).fetchall()

In [17]:
tbl_doc



[('§ 275.0-2_P1',
  'llm_response',
  {'section': '§ 275.0-2',
   'summary': "This section outlines the procedures for serving legal documents such as process, pleadings, or other papers on non-resident investment advisers, general partners, and managing agents. It specifies how service can be made through appointed agents and the role of the Commission's Secretary in forwarding documents. It also provides definitions for terms like 'Managing agent', 'Non-resident', and 'Principal office and place of business'.",
   'elements': [{'id': 1,
     'title': 'Service of process on non-resident entities',
     'statement': 'A person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents.',
     'terms': [{'term': 'Person',
       'classification': 'Common Noun',
       'confidence': 0.9,
       'reason': 'The term is a general 

### Multiple files

In [None]:
import json

with open("cfr2sbvr_db/documents-2024-12-08-1.json", "r") as f:
    data = json.load(f)

In [None]:
keys = data.keys()

print(len(keys))

51


In [None]:
keys

dict_keys(['§ 275.0-2|section', '§ 275.0-5|section', '§ 275.0-7|section', 'prompt-extract_P1|prompt', 'prompt-extract_P2|prompt', '§ 275.0-2_P1|llm_response', '§ 275.0-2_P2|llm_response', '§ 275.0-5_P1|llm_response', '§ 275.0-5_P2|llm_response', '§ 275.0-7_P1|llm_response', '§ 275.0-7_P2|llm_response', 'classify_P1|true_table', 'classify_P2_Operative_rules|true_table', 'classify_P2_Definitional_terms|true_table', 'classify_P2_Definitional_names|true_table', 'classify_P2_Definitional_facts|true_table', 'classify_P1|llm_response_classification', 'prompt-classify_P2_operative_rule_batch1_Party rules|prompt', 'prompt-classify_P2_operative_rule_batch1_Activity rules|prompt', 'prompt-classify_P2_operative_rule_batch2_Activity rules|prompt', 'classify_P2_Operative_rules|llm_response_classification', 'prompt-classify_P2_term_batch1_Definitional rules|prompt', 'prompt-classify_P2_term_batch2_Definitional rules|prompt', 'prompt-classify_P2_term_batch3_Definitional rules|prompt', 'prompt-classify

### Extract elements P1

In [None]:
# _P1|llm_response
data_extracted_elements = []

for key in keys:
    if key.endswith("_P1|llm_response"):
        print(key)
        data_extracted_elements.append(data[key])

§ 275.0-2_P1|llm_response
§ 275.0-5_P1|llm_response
§ 275.0-7_P1|llm_response


In [None]:
import json

# _P1|llm_response
_data = []
_key_pattern = "_P1|llm_response"
_table_name = "EXTRACTED_ELEMENTS"
_json_file_name = "temp/data_extracted_elements.json"

with open("cfr2sbvr_db/documents-2024-12-08-1.json", "r") as f:
    loaded_data = json.load(f)

keys = loaded_data.keys()

for key in keys:
    if key.endswith(_key_pattern):
        print(key)
        _data.append(loaded_data[key])

with open(_json_file_name, "w") as f:
    json.dump(_data, f, indent=4)

_query_drop_table = f"""
DROP TABLE {_table_name};
"""

print(f"{_query_drop_table=}")

try:
    duckdb.sql(_query_drop_table)
except duckdb.CatalogException as e:
    print(e)

# _query_create_table = f"""
# CREATE TABLE {_table_name} AS
#     SELECT *
#     FROM read_json_auto("{_json_file_name}");
# """

_query_create_table = f"""
CREATE TABLE {_table_name} AS
  SELECT id, 
  'extract_p1' as prompt, 
  '{_json_file_name}' as file_source, 
  unnest(content.elements) as elements,
  now() as 'created_at'
  FROM read_json_auto("{_json_file_name}");
"""

print(f"{_query_create_table=}")

duckdb.sql(_query_create_table)

§ 275.0-2_P1|llm_response
§ 275.0-5_P1|llm_response
§ 275.0-7_P1|llm_response
_query_drop_table='\nDROP TABLE EXTRACTED_ELEMENTS;\n'
Catalog Error: Table with name EXTRACTED_ELEMENTS does not exist!
Did you mean "ALL_DOCS"?
_query_create_table='\nCREATE TABLE EXTRACTED_ELEMENTS AS\n  SELECT id, \n  \'extract_p1\' as prompt, \n  \'temp/data_extracted_elements.json\' as file_source, \n  unnest(content.elements) as elements,\n  now() as \'created_at\'\n  FROM read_json_auto("temp/data_extracted_elements.json");\n'


In [None]:
query = f"""
SELECT * FROM {_table_name};
"""

duckdb.sql(query)

┌──────────────┬────────────┬───────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [None]:
duckdb.read_json("cfr2sbvr_db/*.json")

┌──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────

In [None]:
duckdb.read_json("cfr2sbvr_db/*.json") .describe()

┌─────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬

In [None]:
query_insert = """
CREATE TABLE ALL_DOCS AS
    SELECT *
    FROM read_json_auto("cfr2sbvr_db/*.json");
"""

duckdb.sql(query_insert)

In [None]:
meta_query = "SHOW TABLE ALL_DOCS;"

tbl_struct = duckdb.sql(meta_query).fetchall()

In [None]:
tbl_struct

[('§ 275.0-2|section',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('§ 275.0-5|section',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('§ 275.0-7|section',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('prompt-extract_P1|prompt',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('prompt-extract_P2|prompt',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" VARCHAR, elapsed_times JSON, completions JSON)',
  'YES',
  None,
  None,
  None),
 ('§ 275.0-2_P1|llm_response',
  'STRUCT(id VARCHAR, "type" VARCHAR, "content" STRUCT(section VARCHAR, summary VARCHAR, elements STRUCT(id BIGINT, title VARCHAR, "statement" VARCHAR, terms STRUCT(term VARCHAR, classi

In [None]:
doc = "§ 275.0-2_P1|llm_response"
query_document = f"""
SELECT UNNEST("{doc}") FROM ALL_DOCS;
"""

tbl_doc = duckdb.sql(query_document).fetchall()

In [None]:
tbl_doc



[('§ 275.0-2_P1',
  'llm_response',
  {'section': '§ 275.0-2',
   'summary': "This section outlines the procedures for serving legal documents such as process, pleadings, or other papers on non-resident investment advisers, general partners, and managing agents. It specifies how service can be made through appointed agents and the role of the Commission's Secretary in forwarding documents. It also provides definitions for terms like 'Managing agent', 'Non-resident', and 'Principal office and place of business'.",
   'elements': [{'id': 1,
     'title': 'Service of process on non-resident entities',
     'statement': 'A person may serve process, pleadings, or other papers on a non-resident investment adviser, or on a non-resident general partner or non-resident managing agent of an investment adviser by serving any or all of its appointed agents.',
     'terms': [{'term': 'Person',
       'classification': 'Common Noun',
       'confidence': 0.9,
       'reason': 'The term is a general 

## Number of verb_symbols

In [76]:
query = """
SELECT
      unnest(json_extract(data.content, '$.elements[*].verb_symbols'), recursive := true) AS verb_symbols
  FROM
      read_json_auto("temp/data_extracted_elements.json") AS data;
"""

In [77]:
duckdb.sql(query)

┌────────────────────────────────────────────────────────────────────────────────────────────┐
│                                        verb_symbols                                        │
│                                            json                                            │
├────────────────────────────────────────────────────────────────────────────────────────────┤
│ ["serve","by serving"]                                                                     │
│ ["serve","by furnishing","with"]                                                           │
│ ["will forward","by","at"]                                                                 │
│ ["certifies","was served","forwarded","constitutes"]                                       │
│ ["means","directs","manages","participates"]                                               │
│ ["means","resides","is incorporated","has"]                                                │
│ ["has"]                                         

In [74]:
query = """
SELECT
    COUNT(verb) AS total_verbs
FROM (
    SELECT
        UNNEST(CAST(json_extract(verb_array, '$[*]') AS VARCHAR[])) AS verb
    FROM (
        SELECT
            json_extract(data.content, '$.elements[*].verb_symbols') AS verb_lists
        FROM
            read_json_auto("temp/data_extracted_elements.json") AS data
    ) AS nested_lists,
    LATERAL (
        SELECT UNNEST(CAST(verb_lists AS JSON[])) AS verb_array
    )
) AS flattened_verbs;
"""

In [75]:
duckdb.sql(query)

┌─────────────┐
│ total_verbs │
│    int64    │
├─────────────┤
│          61 │
└─────────────┘

## True table

In [None]:
import json

def drop_create_table_from_json(suffix_key_pattern, prefix_key_pattern, table_name, source, json_file_name, key_value) -> bool:

    _data = []

    with open(source, "r") as f:
        loaded_data = json.load(f)

    keys = loaded_data.keys()

    for key in keys:
        if key.startswith(prefix_key_pattern) and key.endswith(suffix_key_pattern):
            print(key)
            _data.append(loaded_data[key])

    with open(_json_file_name, "w") as f:
        json.dump(_data, f, indent=4)

    _query_drop_table = f"""
    DROP TABLE {_table_name};
    """

    print(f"{_query_drop_table=}")

    try:
        duckdb.sql(_query_drop_table)
    except duckdb.CatalogException as e:
        print(e)
        return False

    _query_create_table = f"""
    CREATE TABLE {table_name} AS
    SELECT id, 
    '{key_value}' as prompt, 
    '{json_file_name}' as file_source, 
    unnest(content.elements) as elements,
    now() as 'created_at'
    FROM read_json_auto("{json_file_name}");
    """

    print(f"{_query_create_table=}")

    try:
        duckdb.sql(_query_create_table)
    except duckdb.CatalogException as e:
        print(e)
        return False

    return True

In [None]:
drop_create_table_from_json(
    suffix_key_pattern="_P1|true_table",
    prefix_key_pattern="§",
    table_name="EXTRACTED_ELEMENTS_TRUE",
    source="cfr2sbvr_db/documents_true_table.json",
    json_file_name="temp/data_extracted_elements_true.json",
    key_value="extract_p1"
)

In [85]:
query = """
SELECT * FROM EXTRACTED_ELEMENTS_TRUE;
"""

duckdb.sql(query)

┌──────────────┬────────────┬────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [84]:
query = """
SELECT
    COUNT(verb) AS total_verbs
FROM (
    SELECT
        UNNEST(CAST(json_extract(verb_array, '$[*]') AS VARCHAR[])) AS verb
    FROM (
        SELECT
            json_extract(data.content, '$.elements[*].verb_symbols') AS verb_lists
        FROM
            read_json_auto("temp/data_extracted_elements_true.json") AS data
    ) AS nested_lists,
    LATERAL (
        SELECT UNNEST(CAST(verb_lists AS JSON[])) AS verb_array
    )
) AS flattened_verbs;
"""

duckdb.sql(query)

┌─────────────┐
│ total_verbs │
│    int64    │
├─────────────┤
│          61 │
└─────────────┘

## Queries

```sql
CREATE TABLE EXTRACTED_ELEMENTS AS
  SELECT id, 'extract_p1' as prompt, '/Users/u009165/Documents/scorecars/lab/temp/data_extracted_elements.json' as file_source,  unnest(content.elements) as elements FROM read_json_auto("temp/data_extracted_elements.json");


SELECT
·     id,
·     'extract_p1' AS prompt,
·     '/Users/u009165/Documents/scorecars/lab/temp/data_extracted_elements.json' AS file_source,
·     elements.title AS title  -- Extracting the title from elements
· FROM
·     read_json_auto("temp/data_extracted_elements.json"),
‣     unnest(content.elements) AS elements;

SELECT 
    data.id, 
    'extract_p1' AS prompt, 
    '/Users/u009165/Documents/scorecars/lab/temp/data_extracted_elements.json' AS file_source,  
    elem->>'title' AS title,        -- Extracting title from the unnested element
    elem->>'statement' AS statement  -- Extracting statement from the unnested element
FROM 
    read_json_auto("temp/data_extracted_elements.json") AS data,
    unnest(data.content.elements) AS elem; 

SELECT 
    data.id, 
    'extract_p1' AS prompt, 
    '/Users/u009165/Documents/scorecars/lab/temp/data_extracted_elements.json' AS file_source,  
    elem 
FROM 
    read_json_auto("temp/data_extracted_elements.json") AS data,
    unnest(data.content.elements) AS elem; 

  
SELECT 
    data.id, 
    'extract_p1' AS prompt, 
    '/Users/u009165/Documents/scorecars/lab/temp/data_extracted_elements.json' AS file_source,  
    unnest(json_extract(data.content, '$.elements[*].title')) AS titles,
    unnest(json_extract(data.content, '$.elements[*].statement')) AS statements,
    unnest(json_extract(data.content, '$.elements[*].terms')) AS terms,
    unnest(json_extract(data.content, '$.elements[*].verb_symbols')) AS verb_symbols,
    unnest(json_extract(data.content, '$.elements[*].verb_symbols_extracted_confidence')) AS verb_symbols_extracted_confidence,
    unnest(json_extract(data.content, '$.elements[*].classification')) AS classification,
    unnest(json_extract(data.content, '$.elements[*].confidence')) AS confidence,
    unnest(json_extract(data.content, '$.elements[*].reason')) AS reason,
    unnest(json_extract(data.content, '$.elements[*].sources')) AS sources
FROM 
    read_json_auto("temp/data_extracted_elements.json") AS data;

SELECT 
    unnest(json_extract(data.content, '$.elements[*].verb_symbols'), max_depth := 2) AS verb_symbols
FROM 
    read_json_auto("temp/data_extracted_elements.json") AS data;
```