Skip to content

Commit

Permalink
feat: use sqlite's native JSON support in findv2 method
Browse files Browse the repository at this point in the history
  • Loading branch information
newgene committed Jun 19, 2023
1 parent c123b38 commit a2c5013
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 38 deletions.
2 changes: 1 addition & 1 deletion biothings/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def process_inspect(source_name, mode, limit, merge, logger, do_validate, output
converters, mode = btinspect.get_converters(mode)
for m in mode:
inspected.setdefault(m, {})
cur = src_cols.find()
cur = src_cols.findv2()
res = btinspect.inspect_docs(
cur,
mode=mode,
Expand Down
10 changes: 8 additions & 2 deletions biothings/cli/web_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,13 @@ async def get(self, slug):
if limit:
limit = int(limit)
start = int(start)
entries, total_hit = src_cols.find_with_count(query_params, start=start, limit=limit)
# entries, total_hit = src_cols.find_with_count(query_params, start=start, limit=limit)
entries, total_hit = src_cols.findv2(
query_params, start=start, limit=limit, return_total=True, return_list=True
)
else:
entries, total_hit = src_cols.find_with_count(query_params)
# entries, total_hit = src_cols.find_with_count(query_params)
entries, total_hit = src_cols.findv2(query_params, return_total=True)
if not entries:
entries = []

Expand All @@ -95,7 +99,9 @@ def get_example_queries(db, table_space):
out = {}
for table in table_space:
col = db[table]
print("Counting documents...", end="", flush=True)
total_cnt = col.count()
print(total_cnt)
n = 5
i = random.randint(0, min(1000, total_cnt - n))
random_docs = [
Expand Down
5 changes: 5 additions & 0 deletions biothings/utils/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def to_json_file(data, fobj, indent=False, sort_keys=False):
fobj.write(json_str)


# define aliases close to json.loads and json.dumps for convenience
json_loads = load_json
json_dumps = to_json


def to_yaml(data, stream=None, Dumper=yaml.SafeDumper, default_flow_style=False):
# Author: Cyrus Afrasiabi

Expand Down
97 changes: 62 additions & 35 deletions biothings/utils/sqlite3.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from biothings.utils.dataload import update_dict_recur
from biothings.utils.dotfield import parse_dot_fields
from biothings.utils.hub_db import IDatabase
from biothings.utils.serializer import json_loads

config = None

Expand Down Expand Up @@ -203,7 +204,7 @@ def find_one(self, *args, **kwargs):
.fetchone()
)
if strdoc:
return json.loads(strdoc[0])
return json_loads(strdoc[0])
else:
return None
else:
Expand All @@ -213,47 +214,73 @@ def find_one(self, *args, **kwargs):
else:
return self.find(find_one=True)

def find(self, *args, **kwargs):
def findv2(self, *args, **kwargs):
"""This is a new version of find() that uses json feature of sqlite3, will replace find in the future"""
start = kwargs.get("start", 0)
limit = kwargs.get("limit", 10)
return_total = kwargs.get("return_total", False) # return (results, total) tuple if True, default False
return_list = kwargs.get("return_list", False) # return list instead of generator if True, default False
conn = self.get_conn()
tbl_name = self.colname

results = []
print(0, args, kwargs)

print(0.1, conn, self.db, self.db.dbfile)
if args and len(args) == 1 and isinstance(args[0], dict) and len(args[0]) > 0:
# it's key/value search, let's iterate
for doc in self.get_conn().execute("SELECT document FROM %s" % self.colname).fetchall():
found = []
doc = json.loads(doc[0])
for k, v in args[0].items():
_found = find_value_in_doc(k, v, doc)
found.append(_found)
if all(found):
if "find_one" in kwargs:
return doc
else:
results.append(doc)
if "limit" in kwargs:
start = kwargs.get("start", 0)
end = start + kwargs.get("limit", 0)
return results[start:end]
return results
# it's a key/value search, args[0] looks like {"a.b": "test", "a.b.c": "value"}
sub_queries = []
for k, v in args[0].items():
if "*" in v or "?" in v:
_v = v.replace("*", "%").replace("?", "_")
_v = f"LIKE '{_v}'"
else:
_v = f"= '{v}'"
if "." in k:
# nested field name like a.b.c, we will use json_tree.fullkey to match
k = k.replace(".", "%.%")
k = f"$.%{k}%"
where = f"(json_tree.fullkey LIKE '{k}' AND json_tree.value {_v})"
sub_query = f"SELECT _id FROM {tbl_name}, json_tree({tbl_name}.document) WHERE {where}"
else:
# just a top level field, we will use ->> operator to match
where = f"(document->>'{k}' {_v})"
sub_query = f"SELECT _id FROM {tbl_name} WHERE {where}"
sub_queries.append(sub_query)
if sub_queries:
if len(sub_queries) == 1:
query = sub_queries[0].replace("SELECT _id FROM", "SELECT document FROM")
else:
query = f"SELECT _id FROM ({sub_queries[0]}) AS subq0"
for i, sub_query in enumerate(sub_queries[1:]):
query += f" INNER JOIN ({sub_queries[i+1]}) AS subq{i+1} USING (_id)"
query = f"SELECT document FROM {tbl_name} WHERE _id IN ({query})"
elif not args or len(args) == 1 and len(args[0]) == 0:
# nothing or empty dict
results = [
json.loads(doc[0])
for doc in self.get_conn().execute("SELECT document FROM %s" % self.colname).fetchall()
]
if "limit" in kwargs:
start = kwargs.get("start", 0)
end = start + kwargs.get("limit", 0)
return results[start:end]
return results
query = f"SELECT document FROM {tbl_name}"
else:
raise NotImplementedError("find: args=%s kwargs=%s" % (repr(args), repr(kwargs)))

def find_with_count(self, *args, **kwargs):
# include limit and offset
_query = query + f" LIMIT {limit} OFFSET {start}"
print(1, _query)
results = (json_loads(doc[0]) for doc in conn.execute(_query)) # results is a generator
if return_list:
results = list(results)
if return_total:
# get total count without limit and offset
total = conn.execute(query.replace("SELECT document FROM", "SELECT COUNT(*) FROM")).fetchone()[0]
return results, total
else:
return results

def find(self, *args, **kwargs):
results = []
if args and len(args) == 1 and isinstance(args[0], dict) and len(args[0]) > 0:
# it's key/value search, let's iterate
for doc in self.get_conn().execute("SELECT document FROM %s" % self.colname).fetchall():
found = []
doc = json.loads(doc[0])
doc = json_loads(doc[0])
for k, v in args[0].items():
_found = find_value_in_doc(k, v, doc)
found.append(_found)
Expand All @@ -265,19 +292,19 @@ def find_with_count(self, *args, **kwargs):
if "limit" in kwargs:
start = kwargs.get("start", 0)
end = start + kwargs.get("limit", 0)
return results[start:end], len(results)
return results, len(results)
return results[start:end]
return results
elif not args or len(args) == 1 and len(args[0]) == 0:
# nothing or empty dict
results = [
json.loads(doc[0])
json_loads(doc[0])
for doc in self.get_conn().execute("SELECT document FROM %s" % self.colname).fetchall()
]
if "limit" in kwargs:
start = kwargs.get("start", 0)
end = start + kwargs.get("limit", 0)
return results[start:end], len(results)
return results, len(results)
return results[start:end]
return results
else:
raise NotImplementedError("find: args=%s kwargs=%s" % (repr(args), repr(kwargs)))

Expand Down

0 comments on commit a2c5013

Please sign in to comment.