-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Bundled, Configurable Parsers (#191)
* Add a comment for unrelated method * placeholder for parser factory * update PARSER_FACTORY placeholder * change template so that the parser import depends on if the parser comes from a factory * implement two parser factory parsers - ndjson parser - json array parser
- Loading branch information
Showing
3 changed files
with
111 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import pathlib | ||
from typing import Callable, Generator, Iterable, Optional | ||
|
||
import orjson | ||
|
||
|
||
def ndjson_parser(patterns: Optional[Iterable[str]] = None) \
        -> Callable[[str], Generator[dict, None, None]]:
    """
    Create NDJSON Parser given filename patterns

    For use with manifest.json based plugins.

    Caveat: every non-blank line must be a valid JSON document. Lines that
    are empty or contain only whitespace (for example a trailing newline at
    the end of the file) are skipped instead of being passed to the decoder.

    Args:
        patterns: glob-compatible patterns for filenames, like *.ndjson,
            data*.ndjson. A single pattern given as a plain string is
            treated as a one-element collection.

    Returns:
        parser_func: Generator function that takes in a data_folder and
            yields documents from the NDJSON files in that folder matching
            the filename patterns

    Raises:
        TypeError: if patterns is not provided (is None).
    """
    if patterns is None:
        raise TypeError("Must provide keyword argument patterns to "
                        "match files for NDJSON Parser")

    # Snapshot the patterns once. A bare string would otherwise be iterated
    # character by character, and a one-shot iterable (e.g. a generator)
    # would be exhausted after the first call of the returned parser.
    if isinstance(patterns, str):
        file_patterns = (patterns,)
    else:
        file_patterns = tuple(patterns)

    def ndjson_parser_func(data_folder):
        work_dir = pathlib.Path(data_folder)
        for pattern in file_patterns:
            for filename in work_dir.glob(pattern):
                # Binary mode: orjson decodes UTF-8 bytes itself, so no
                # locale-dependent text decoding is involved.
                with open(filename, 'rb') as f:
                    for line in f:
                        if not line.strip():
                            # Blank line, nothing to decode.
                            continue
                        yield orjson.loads(line)

    return ndjson_parser_func
|
||
|
||
def json_array_parser(patterns: Optional[Iterable[str]] = None) \
        -> Callable[[str], Generator[dict, None, None]]:
    """
    Create JSON Array Parser given filename patterns

    For use with manifest.json based plugins. Each matched file must hold a
    single JSON document whose top-level value is a JSON array containing
    multiple documents.

    Args:
        patterns: glob-compatible patterns for filenames, like *.json,
            data*.json. A single pattern given as a plain string is treated
            as a one-element collection.

    Returns:
        parser_func: Generator function that takes in a data_folder and
            yields each element of the top-level array of every file in
            that folder matching the filename patterns

    Raises:
        TypeError: if patterns is not provided (is None).
        RuntimeError: raised by the returned parser function when a matched
            file's top-level JSON value is not an array.
    """
    if patterns is None:
        raise TypeError("Must provide keyword argument patterns to "
                        "match files for JSON Array Parser")

    # Snapshot the patterns once. A bare string would otherwise be iterated
    # character by character, and a one-shot iterable (e.g. a generator)
    # would be exhausted after the first call of the returned parser.
    if isinstance(patterns, str):
        file_patterns = (patterns,)
    else:
        file_patterns = tuple(patterns)

    # Named differently from the factory so the inner function does not
    # shadow json_array_parser inside this scope.
    def json_array_parser_func(data_folder):
        work_dir = pathlib.Path(data_folder)
        for pattern in file_patterns:
            for filename in work_dir.glob(pattern):
                # Binary mode: orjson decodes UTF-8 bytes itself, so no
                # locale-dependent text decoding is involved.
                with open(filename, 'rb') as f:
                    data = orjson.loads(f.read())
                # A JSON object or string is iterable in Python too, so an
                # iter() probe would wrongly accept them and yield dict keys
                # or single characters; require an actual list.
                if not isinstance(data, list):
                    raise RuntimeError(f"{filename} does not contain a valid "
                                       "JSON Array")
                yield from data

    return json_array_parser_func