Merge pull request #54 from bxparks/develop
merge 1.2 into master
bxparks committed Oct 28, 2020
2 parents 6e2c62d + 9120c81 commit 0f63dd0
Showing 7 changed files with 121 additions and 20 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,9 @@
# Changelog

* Unreleased
+* 1.2 (2020-10-27)
+    * Print full path of nested JSON elements in error messages (See #52;
+      thanks abroglesc@).
* 1.1 (2020-07-10)
* Add `--ignore_invalid_lines` to ignore parsing errors on invalid lines
and continue processing. Fixes
2 changes: 1 addition & 1 deletion Makefile
@@ -6,7 +6,7 @@ tests:
python3 -m unittest

flake8:
-	flake8 . \
+	flake8 bigquery_schema_generator \
--count \
--ignore W503 \
--show-source \
5 changes: 4 additions & 1 deletion README.md
@@ -12,7 +12,7 @@ $ generate-schema < file.data.json > file.schema.json
$ generate-schema --input_format csv < file.data.csv > file.schema.json
```

-Version: 1.1 (2020-07-10)
+Version: 1.2 (2020-10-27)

Changelog: [CHANGELOG.md](CHANGELOG.md)

@@ -723,6 +723,7 @@ now requires Python 3.6 or higher, I think mostly due to the use of f-strings.

I have tested it on:

+* Ubuntu 20.04, Python 3.8.5
* Ubuntu 18.04, Python 3.7.7
* Ubuntu 18.04, Python 3.6.7
* Ubuntu 17.10, Python 3.6.3
@@ -745,6 +746,8 @@ and 3.8.
* Sanitizing of column names to valid BigQuery characters and length by Jon
Warghed (jonwarghed@).
* Bug fix in `--sanitize_names` by Riccardo M. Cefala (riccardomc@).
+* Print full path of nested JSON elements in error messages, by Austin Brogle
+  (abroglesc@).


## License
89 changes: 72 additions & 17 deletions bigquery_schema_generator/generate_schema.py
@@ -200,7 +200,10 @@ def deduce_schema(self, file):

# Deduce the schema from this given data record.
if isinstance(json_object, dict):
-            self.deduce_schema_for_line(json_object, schema_map)
+            self.deduce_schema_for_line(
+                json_object=json_object,
+                schema_map=schema_map,
+            )
elif isinstance(json_object, Exception):
self.log_error(
f'Record could not be parsed: Exception: {json_object}')
@@ -218,20 +221,35 @@ def deduce_schema(self, file):

return schema_map, self.error_logs

-    def deduce_schema_for_line(self, json_object, schema_map):
+    def deduce_schema_for_line(self, json_object, schema_map, base_path=None):
"""Figures out the BigQuery schema for the given 'json_object' and
updates 'schema_map' with the latest info. A 'schema_map' entry of type
'soft' is a provisional entry that can be overwritten by a subsequent
'soft' or 'hard' entry. If both the old and new have the same type,
then they must be compatible.
+        'base_path' is the string representing the current path within the
+        nested record that leads to this specific entry.
"""
for key, value in json_object.items():
schema_entry = schema_map.get(key)
-            new_schema_entry = self.get_schema_entry(key, value)
-            schema_map[key] = self.merge_schema_entry(schema_entry,
-                                                      new_schema_entry)
+            new_schema_entry = self.get_schema_entry(
+                key=key,
+                value=value,
+                base_path=base_path,
+            )
+            schema_map[key] = self.merge_schema_entry(
+                old_schema_entry=schema_entry,
+                new_schema_entry=new_schema_entry,
+                base_path=base_path,
+            )

-    def merge_schema_entry(self, old_schema_entry, new_schema_entry):
+    def merge_schema_entry(
+            self,
+            old_schema_entry,
+            new_schema_entry,
+            base_path=None,
+    ):
"""Merges the 'new_schema_entry' into the 'old_schema_entry' and return
a merged schema entry. Recursively merges in sub-fields as well.
@@ -240,6 +258,10 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
returned as the new schema_entry. Returns None if the field should
be removed from the schema due to internal consistency errors.
+        'base_path' is the string representing the current path within the
+        nested record that leads to this specific entry. This is used during
+        error logging.
An Exception is thrown if an unexpected programming error is detected.
The calling routine should stop processing the file.
"""
@@ -310,50 +332,71 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
new_fields = new_info['fields']
for key, new_entry in new_fields.items():
old_entry = old_fields.get(key)
-                old_fields[key] = self.merge_schema_entry(old_entry, new_entry)
+                new_base_path = json_full_path(base_path, old_name)
+                old_fields[key] = self.merge_schema_entry(
+                    old_schema_entry=old_entry,
+                    new_schema_entry=new_entry,
+                    base_path=new_base_path,
+                )
return old_schema_entry

+        full_old_name = json_full_path(base_path, old_name)
+        full_new_name = json_full_path(base_path, new_name)

# For all other types, the old_mode must be the same as the new_mode. It
# might seem reasonable to allow a NULLABLE {primitive_type} to be
# upgraded to a REPEATED {primitive_type}, but currently 'bq load' does
# not support that so we must also follow that rule.
if old_mode != new_mode:
self.log_error(
f'Ignoring non-RECORD field with mismatched mode: '
-                f'old=({old_status},{old_name},{old_mode},{old_type}); '
-                f'new=({new_status},{new_name},{new_mode},{new_type})')
+                f'old=({old_status},{full_old_name},{old_mode},{old_type}); '
+                f'new=({new_status},{full_new_name},{new_mode},{new_type})')
return None

# Check that the converted types are compatible.
candidate_type = convert_type(old_type, new_type)
if not candidate_type:
self.log_error(
f'Ignoring field with mismatched type: '
-                f'old=({old_status},{old_name},{old_mode},{old_type}); '
-                f'new=({new_status},{new_name},{new_mode},{new_type})')
+                f'old=({old_status},{full_old_name},{old_mode},{old_type}); '
+                f'new=({new_status},{full_new_name},{new_mode},{new_type})')
return None

new_info['type'] = candidate_type
return new_schema_entry

-    def get_schema_entry(self, key, value):
+    def get_schema_entry(self, key, value, base_path=None):
"""Determines the 'schema_entry' of the (key, value) pair. Calls
deduce_schema_for_line() recursively if the value is another object
instead of a primitive (this will happen only for JSON input file).
+        'base_path' is the string representing the current path within the
+        nested record that leads to this specific entry.
"""
value_mode, value_type = self.infer_bigquery_type(value)
if not value_mode or not value_type:
return None

if value_type == 'RECORD':
+            new_base_path = json_full_path(base_path, key)
# recursively figure out the RECORD
fields = OrderedDict()
if value_mode == 'NULLABLE':
-                self.deduce_schema_for_line(value, fields)
+                self.deduce_schema_for_line(
+                    json_object=value,
+                    schema_map=fields,
+                    base_path=new_base_path,
+                )
else:
for val in value:
-                    self.deduce_schema_for_line(val, fields)
-            # yapf: disable
+                    self.deduce_schema_for_line(
+                        json_object=val,
+                        schema_map=fields,
+                        base_path=new_base_path,
+                    )
+
+            # yapf: disable
schema_entry = OrderedDict([
('status', 'hard'),
('filled', True),
@@ -539,7 +582,8 @@ def flatten_schema(self, schema_map):
keep_nulls=self.keep_nulls,
sorted_schema=self.sorted_schema,
infer_mode=self.infer_mode,
-            sanitize_names=self.sanitize_names)
+            sanitize_names=self.sanitize_names,
+        )

def run(self, input_file=sys.stdin, output_file=sys.stdout):
"""Read the data records from the input_file and print out the BigQuery
@@ -745,6 +789,17 @@ def flatten_schema_map(
return schema


+def json_full_path(base_path, key):
+    """Return the dot-separated JSON full path to a particular key.
+    e.g. 'server.config.port'. Column names in CSV files are never nested,
+    so this will always return `key`.
+    """
+    if base_path is None or base_path == "":
+        return key
+    else:
+        return f'{base_path}.{key}'


def main():
# Configure command line flags.
parser = argparse.ArgumentParser(
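To make the new path tracking concrete, here is a minimal standalone sketch (not part of this commit) of how `base_path` threads through the recursion. The `json_full_path()` helper copies the behavior of the function added above; `walk()` is a hypothetical stand-in that only mirrors the recursion pattern of `deduce_schema_for_line()` and `get_schema_entry()`.

```python
def json_full_path(base_path, key):
    # Same behavior as the helper added in this commit: dot-join the path
    # components unless there is no base path yet.
    if base_path is None or base_path == "":
        return key
    return f'{base_path}.{key}'


def walk(json_object, base_path=None):
    # Hypothetical walker mirroring deduce_schema_for_line(): each nested
    # dict (a RECORD in BigQuery terms) extends the dotted path.
    for key, value in json_object.items():
        path = json_full_path(base_path, key)
        if isinstance(value, dict):
            walk(value, base_path=path)
        else:
            print(path)


walk({"dest_machine": {"port": 80}, "name": "x"})
# prints:
#   dest_machine.port
#   name
```

This is why the error messages in `merge_schema_entry()` can now report `dest_machine.port` instead of the ambiguous bare `port`.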
2 changes: 1 addition & 1 deletion bigquery_schema_generator/version.py
@@ -1 +1 @@
-__version__ = '1.1'
+__version__ = '1.2'
10 changes: 10 additions & 0 deletions tests/test_generate_schema.py
@@ -22,6 +22,7 @@
from bigquery_schema_generator.generate_schema import SchemaGenerator
from bigquery_schema_generator.generate_schema import is_string_type
from bigquery_schema_generator.generate_schema import convert_type
+from bigquery_schema_generator.generate_schema import json_full_path
from .data_reader import DataReader


@@ -414,6 +415,15 @@ def test_run_with_invalid_input_throws_exception(self):
with self.assertRaises(Exception):
generator.run(input, output)

+    def test_json_full_path(self):
+        self.assertEqual('port', json_full_path(None, 'port'))
+        self.assertEqual('port', json_full_path("", 'port'))
+
+        # 'base_path' should never be '0', but if it is, do something reasonable.
+        self.assertEqual('0.port', json_full_path(0, 'port'))
+
+        self.assertEqual('server.port', json_full_path('server', 'port'))


class TestFromDataFile(unittest.TestCase):
"""Read the test case data from TESTDATA_FILE and verify that the expected
30 changes: 30 additions & 0 deletions tests/testdata.txt
@@ -732,6 +732,36 @@ SCHEMA
]
END

+# Incompatible types error printing full path
+# given
+DATA
+{"source_machine":{"port":80},"dest_machine":{"port":80}}
+{"source_machine":{"port":80},"dest_machine":{"port":"http-port"}}
+ERRORS
+2: Ignoring field with mismatched type: old=(hard,dest_machine.port,NULLABLE,INTEGER); new=(hard,dest_machine.port,NULLABLE,STRING)
+SCHEMA
+[
+  {
+    "fields": [],
+    "mode": "NULLABLE",
+    "name": "dest_machine",
+    "type": "RECORD"
+  },
+  {
+    "fields": [
+      {
+        "mode": "NULLABLE",
+        "name": "port",
+        "type": "INTEGER"
+      }
+    ],
+    "mode": "NULLABLE",
+    "name": "source_machine",
+    "type": "RECORD"
+  }
+]
+END

# Simple CSV file
DATA csv
name,surname,age
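For reference, the new testdata case can be reproduced against the library directly. This is a hedged sketch assuming the default JSON input mode and the `deduce_schema()` API shown in this diff; the exact structure of each error-log entry is not spelled out here.

```python
from io import StringIO

from bigquery_schema_generator.generate_schema import SchemaGenerator

# The same two records as the testdata case above; the second record
# changes dest_machine.port from an INTEGER to a STRING.
lines = StringIO(
    '{"source_machine":{"port":80},"dest_machine":{"port":80}}\n'
    '{"source_machine":{"port":80},"dest_machine":{"port":"http-port"}}\n'
)

generator = SchemaGenerator()
schema_map, error_logs = generator.deduce_schema(lines)
for error in error_logs:
    # Each entry should mention the full path 'dest_machine.port'.
    print(error)
```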
