# Search

## Deepsearch, Deepdiff

In [1]:
def dict_lookup(data, search_value, search_key):
    results = []

    # Split the search_key to navigate through nested dictionaries
    keys = search_key.split(".")

    for item in data:
        # Start with the current item
        current = item

        # Navigate through the keys
        for key in keys:
            if isinstance(current, list):
                # If current is a list, iterate through it
                current = [sub_item.get(key) for sub_item in current if isinstance(sub_item, dict)]
            elif isinstance(current, dict) and key in current:
                # If current is a dict, navigate to the next level
                current = current[key]
            else:
                current = None
                break

        # Check if the final value matches the search value
        if current == search_value or (isinstance(current, list) and search_value in current):
            results.append(item)

    return results

In [2]:
a = [
    {
        "name": "John",
        "age": 30,
        "city": "New York",
        "adresses": [
            {"street": "Main St", "number": 123},
            {"street": "Broadway", "number": 456},
        ],
    },
    {
        "name": "Jane",
        "age": 25,
        "city": "Los Angeles",
        "adresses": [
            {"street": "Sunset Blvd", "number": 789},
            {"street": "Hollywood Blvd", "number": 101},
        ],
        "institutions": [
            {"name": "UCLA", "department": "Computer Science"},
            {"name": "USC", "department": "Physics"},
        ]
    },
    {
        "name": "Alice",
        "age": 35,
        "city": "Chicago",
        "adresses": [
            {"street": "Michigan Ave", "number": 303},
            {"street": "Sunset Blvd", "number": 123},
        ],
    },
    {
        "name": "Mary",
        "age": 45,
        "city": "Las Vegas",
        "adresses": [
            {"street": "Las Vegas Blvd", "number": 999},
            {"street": "Fremont St", "number": 101},
        ],
        "institutions": [
            {"name": "UNLV", "department": "Mathematics"},
            {"name": "CSN", "department": "Biology"},
        ]
    },
    {
        "name": "Bob",
        "age": 40,
        "city": "San Francisco"
    }
]

In [3]:
b = [
    {
        "name": "Bob",
        "age": 41,
        "city": "San Francisco"
    },
    {
        "name": "John",
        "age": 30,
        "city": "New York",
        "adresses": [
            {"street": "Main St", "number": 123},
            {"street": "Broadway", "number": 456},
        ],
    },
    {
        "name": "John Carpenter",
        "age": 22,
        "city": "Wayne",
        "adresses": [
            {"street": "Main St", "number": 123},
            {"street": "Broadway", "number": 456},
        ],
    },
    {
        "name": "Jane",
        "age": 25,
        "city": "Los Angeles",
        "adresses": [
            {"street": "Sunset Blvd", "number": 789},
            {"street": "Hollywood Blvd", "number": 101},
            {"street": "Rodeo Dr", "number": 202},
        ],
        "institutions": [
            {"name": "UCLA", "department": "Computer Science"},
            {"name": "USC", "department": "Mathematics"},
        ]
    },
    {
        "name": "Alice",
        "age": 35,
        "city": "Chicago",
        "adresses": [
            {"street": "Michigan Ave", "number": 303},
            {"street": "Sunset Blvd", "number": 123},
        ],
        "institutions": [
            {"name": "University of Chicago", "department": "Computer Science"},
            {"name": "Northwestern University", "department": "Mathematics"},
        ]
    },
]

In [4]:
from deepdiff import DeepDiff, DeepSearch, grep
from pprint import pprint

In [5]:
results = DeepSearch(b, "John", verbose_level=2, match_string=True)

results

{'matched_values': {"root[1]['name']": 'John'}}

In [6]:
list(results["matched_values"].keys())

["root[1]['name']"]

In [7]:
results = DeepSearch(b, "John", verbose_level=2, match_string=False)

results

{'matched_values': {"root[1]['name']": 'John',
  "root[2]['name']": 'John Carpenter'}}

In [8]:
import re

def extract_index_and_key(s):
    # Define a regular expression pattern to match the index and key
    pattern = r"root\[(\d+)\]\['(\w+)'\]"
    
    # Use re.match to find the pattern in the string
    match = re.match(pattern, s)
    
    if match:
        # Extract the index and key from the match groups
        index = int(match.group(1))
        key = match.group(2)
        return index, key
    else:
        raise ValueError("String format is incorrect")

In [9]:
for item in list(results["matched_values"].keys()):
    #print(item | grep(r"root\[(\d+)\]\['(\w+)'\]", use_regexp=True))
    idx, key = extract_index_and_key(item)
    print(b[idx]) #.get(key))

{'name': 'John', 'age': 30, 'city': 'New York', 'adresses': [{'street': 'Main St', 'number': 123}, {'street': 'Broadway', 'number': 456}]}
{'name': 'John Carpenter', 'age': 22, 'city': 'Wayne', 'adresses': [{'street': 'Main St', 'number': 123}, {'street': 'Broadway', 'number': 456}]}


In [11]:
for a_item in a:
    b_item = dict_lookup(b, a_item["name"], "name") or [None]
    #print(f"{a_item=}")
    #print(f"{b_item[0]=}")
    diff = DeepDiff(a_item, b_item[0], ignore_string_case=True, ignore_order=True, verbose_level=2)
    print(diff)
    if diff:
        if "values_changed" in diff:    
            for item in list(diff["values_changed"].keys()):
                print(item)
        elif "type_changes" in diff:
            for item in list(diff["type_changes"].keys()):
                print(item)
        elif "dictionary_item_added" in diff:
            for item in list(diff["dictionary_item_added"].keys()):
                print(item)
        elif "iterable_item_added" in diff:
            for item in list(diff["iterable_item_added"].keys()):
                print(item)
            #print(extract_index_and_key(item))

{}
{'values_changed': {"root['institutions'][1]['department']": {'new_value': 'mathematics', 'old_value': 'physics'}}, 'iterable_item_added': {"root['adresses'][2]": {'street': 'Rodeo Dr', 'number': 202}}}
root['institutions'][1]['department']
{'dictionary_item_added': {"root['institutions']": [{'name': 'University of Chicago', 'department': 'Computer Science'}, {'name': 'Northwestern University', 'department': 'Mathematics'}]}}
root['institutions']
{'type_changes': {'root': {'old_type': <class 'dict'>, 'new_type': <class 'NoneType'>, 'old_value': {'name': 'Mary', 'age': 45, 'city': 'Las Vegas', 'adresses': [{'street': 'Las Vegas Blvd', 'number': 999}, {'street': 'Fremont St', 'number': 101}], 'institutions': [{'name': 'UNLV', 'department': 'Mathematics'}, {'name': 'CSN', 'department': 'Biology'}]}, 'new_value': None}}}
root
{'values_changed': {"root['age']": {'new_value': 41, 'old_value': 40}}}
root['age']


In [12]:
result = dict_lookup(b, "John", "name")

result

[{'name': 'John',
  'age': 30,
  'city': 'New York',
  'adresses': [{'street': 'Main St', 'number': 123},
   {'street': 'Broadway', 'number': 456}]}]

In [13]:
result = dict_lookup(b, "Sunset Blvd", "adresses.street")

result

[{'name': 'Jane',
  'age': 25,
  'city': 'Los Angeles',
  'adresses': [{'street': 'Sunset Blvd', 'number': 789},
   {'street': 'Hollywood Blvd', 'number': 101},
   {'street': 'Rodeo Dr', 'number': 202}],
  'institutions': [{'name': 'UCLA', 'department': 'Computer Science'},
   {'name': 'USC', 'department': 'Mathematics'}]},
 {'name': 'Alice',
  'age': 35,
  'city': 'Chicago',
  'adresses': [{'street': 'Michigan Ave', 'number': 303},
   {'street': 'Sunset Blvd', 'number': 123}],
  'institutions': [{'name': 'University of Chicago',
    'department': 'Computer Science'},
   {'name': 'Northwestern University', 'department': 'Mathematics'}]}]

## DuckDB

References:

- https://medium.com/@anshubantra/using-duckdb-in-python-a-comprehensive-guide-d14bc0b06546

In [14]:
import duckdb

In [15]:
query = """
SELECT name, age, city, addr.value->>'street' AS street, addr.value->>'number' AS number
  FROM read_json_auto('a_table.json') AS a_table,
  LATERAL UNNEST(a_table.adresses) AS addr(value)
  WHERE street LIKE '%Blvd%';
"""

In [16]:
duckdb.sql(query).show()

┌─────────┬───────┬─────────────┬────────────────┬─────────┐
│  name   │  age  │    city     │     street     │ number  │
│ varchar │ int64 │   varchar   │    varchar     │ varchar │
├─────────┼───────┼─────────────┼────────────────┼─────────┤
│ Jane    │    25 │ Los Angeles │ Sunset Blvd    │ 789     │
│ Jane    │    25 │ Los Angeles │ Hollywood Blvd │ 101     │
│ Mary    │    45 │ Las Vegas   │ Las Vegas Blvd │ 999     │
│ Alice   │    35 │ Chicago     │ Sunset Blvd    │ 123     │
└─────────┴───────┴─────────────┴────────────────┴─────────┘



In [17]:
duckdb.read_json("a_table.json") 

┌─────────┬───────┬───────────────┬─────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────┐
│  name   │  age  │     city      │                                      adresses                                       │                                      institutions                                      │
│ varchar │ int64 │    varchar    │                       struct(street varchar, number bigint)[]                       │                      struct("name" varchar, department varchar)[]                      │
├─────────┼───────┼───────────────┼─────────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────┤
│ John    │    30 │ New York      │ [{'street': Main St, 'number': 123}, {'street': Broadway, 'number': 456}]           │ NULL                              

In [18]:
duckdb.sql(query).fetchall()

[('Jane', 25, 'Los Angeles', 'Sunset Blvd', '789'),
 ('Jane', 25, 'Los Angeles', 'Hollywood Blvd', '101'),
 ('Mary', 45, 'Las Vegas', 'Las Vegas Blvd', '999'),
 ('Alice', 35, 'Chicago', 'Sunset Blvd', '123')]

In [19]:
res = duckdb.read_json("a_table.json").fetchdf()

In [20]:
dict_res = res.to_dict(orient="records")

In [21]:
dict_res[0]["adresses"]

array([{'street': 'Main St', 'number': 123},
       {'street': 'Broadway', 'number': 456}], dtype=object)

In [22]:
from collections import defaultdict

dd = defaultdict(list)

ddict_res = res.to_dict('records', into=dd)


In [23]:
ddict_res[0]["name"]

'John'