Skip to content

Commit

Permalink
Add Bundled, Configurable Parsers (#191)
Browse files Browse the repository at this point in the history
* Add a comment for unrelated method

* placeholder for parser factory

* update PARSER_FACTORY placeholder

* change template so that the parser import depends on whether the parser comes from a factory

* implement two parser factory parsers

- ndjson parser
- json array parser
  • Loading branch information
zcqian committed Oct 4, 2021
1 parent c773381 commit 8e044af
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 5 deletions.
39 changes: 39 additions & 0 deletions biothings/hub/dataplugin/assistant.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from string import Template
from yapf.yapflib import yapf_api

import orjson

from biothings.utils.hub_db import get_data_plugin, get_src_dump, get_src_master
from biothings.utils.common import rmdashfr, get_class_from_classpath
from biothings.utils.loggers import get_logger
Expand Down Expand Up @@ -133,6 +135,17 @@ def load_plugin(self):
self.invalidate_plugin("Missing plugin folder '%s'" % df)

def get_code_for_mod_name(self, mod_name):
"""
Returns string literal and name of function, given a path
Args:
mod_name: string with module name and function name, separated by colon
Returns:
Tuple[str, str]: containing
- indented string literal for the function specified
- name of the function
"""
try:
mod, funcname = map(str.strip, mod_name.split(":"))
except ValueError as e:
Expand Down Expand Up @@ -255,6 +268,32 @@ def get_uploader_dynamic_class(self, uploader_section, metadata):
mod, func = uploader_section.get("parser").split(":")
confdict["PARSER_MOD"] = mod
confdict["PARSER_FUNC"] = func
if uploader_section.get('parser_kwargs'):
parser_kwargs_serialized = orjson.dumps(
uploader_section['parser_kwargs']
).decode('utf-8')
confdict["PARSER_FACTORY_CODE"] = textwrap.dedent(f'''
# get json
import orjson
# Setup parser to parser factory
from {mod} import {func} as parser_factory
parser_kwargs_serialized = r\'\'\'
{parser_kwargs_serialized}
\'\'\' # I am not 100 percent certain this works
parser_kwargs = orjson.loads(parser_kwargs_serialized)
parser_func = parser_factory(**parser_kwargs)
''')
else:
confdict["PARSER_FACTORY_CODE"] = textwrap.dedent(f'''
# when code is exported, import becomes relative
try:
from {self.plugin_name}.{mod} import {func} as parser_func
except ImportError:
from .{mod} import {func} as parser_func
''')
except ValueError:
raise AssistantException("'parser' must be defined as 'module:parser_func' but got: '%s'" %
uploader_section["parser"])
Expand Down
7 changes: 2 additions & 5 deletions biothings/hub/dataplugin/uploader.py.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@ biothings.config_for_app(config)

import biothings.hub.dataload.uploader

# when code is exported, import becomes relative
try:
from $SRC_NAME.$PARSER_MOD import $PARSER_FUNC as parser_func
except ImportError:
from .$PARSER_MOD import $PARSER_FUNC as parser_func

$PARSER_FACTORY_CODE

$IMPORT_IDCONVERTER_FUNC

Expand Down
70 changes: 70 additions & 0 deletions biothings/utils/parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pathlib
from typing import Callable, Generator, Iterable, Optional

import orjson


def ndjson_parser(patterns: Optional[Iterable[str]] = None) \
        -> Callable[[str], Generator[dict, None, None]]:
    """
    Create NDJSON Parser given filename patterns

    For use with manifest.json based plugins.

    Caveat: each non-blank line must be a complete JSON document that
    orjson.loads accepts (files are read as bytes). Whitespace-only lines
    are skipped; any other malformed line propagates the orjson error.

    Args:
        patterns: glob-compatible patterns for filenames, like *.ndjson,
            data*.ndjson. A single string is treated as one pattern.

    Returns:
        parser_func: Generator function that takes in a data_folder and
            yields documents from the NDJSON files in that folder matching
            the filename patterns.

    Raises:
        TypeError: if patterns is not provided.
    """
    if patterns is None:
        raise TypeError("Must provide keyword argument patterns to "
                        "match files for NDJSON Parser")
    if isinstance(patterns, str):
        # A bare string would otherwise be iterated character by character,
        # turning "*.ndjson" into the patterns "*", ".", "n", ...
        patterns = (patterns,)
    else:
        # Materialize once so a one-shot iterable (e.g. a generator) is
        # still available every time the returned parser is called.
        patterns = tuple(patterns)

    def ndjson_parser_func(data_folder):
        work_dir = pathlib.Path(data_folder)
        for pattern in patterns:
            for filename in work_dir.glob(pattern):
                with open(filename, 'rb') as f:
                    for line in f:
                        # Tolerate blank separator / trailing lines instead
                        # of failing inside orjson.loads.
                        if not line.strip():
                            continue
                        yield orjson.loads(line)

    return ndjson_parser_func


def json_array_parser(patterns: Optional[Iterable[str]] = None) \
        -> Callable[[str], Generator[dict, None, None]]:
    """
    Create JSON Array Parser given filename patterns

    For use with manifest.json based plugins. The data comes in a JSON file
    whose top-level value is a JSON array containing multiple documents.

    Args:
        patterns: glob-compatible patterns for filenames, like *.json,
            data*.json. A single string is treated as one pattern.

    Returns:
        parser_func: Generator function that takes in a data_folder and
            yields each element of the top-level array of every file in
            that folder matching the filename patterns.

    Raises:
        TypeError: if patterns is not provided.
        RuntimeError: (raised by parser_func) if a matched file's top-level
            JSON value is not an array.
    """
    if patterns is None:
        raise TypeError("Must provide keyword argument patterns to "
                        "match files for JSON Array Parser")
    if isinstance(patterns, str):
        # A bare string would otherwise be iterated character by character.
        patterns = (patterns,)
    else:
        # Materialize once so a one-shot iterable is still available every
        # time the returned parser is called.
        patterns = tuple(patterns)

    def json_array_parser_func(data_folder):
        work_dir = pathlib.Path(data_folder)
        for pattern in patterns:
            for filename in work_dir.glob(pattern):
                # Read bytes so decoding is left to orjson rather than the
                # platform's locale encoding.
                with open(filename, 'rb') as f:
                    data = orjson.loads(f.read())
                # iter() also succeeds on objects (dict) and strings, which
                # would silently yield keys / characters; require a list.
                if not isinstance(data, list):
                    raise RuntimeError(f"{filename} does not contain a valid "
                                       "JSON Array")
                # The file is already closed here, so no handle is held
                # while the consumer processes documents.
                yield from data

    return json_array_parser_func

0 comments on commit 8e044af

Please sign in to comment.