From 4deb6a47f4b8b8a46043a45ac572a4d138793954 Mon Sep 17 00:00:00 2001 From: Luigi Mori Date: Fri, 7 Dec 2018 14:34:40 +0100 Subject: [PATCH 01/14] Adds support for quoted types and use pythonic console_script Signed-off-by: Luigi Mori --- bigquery_schema_generator/generate_schema.py | 20 ++++++++++++++++++++ scripts/generate-schema | 1 - setup.py | 9 +++++++-- 3 files changed, 27 insertions(+), 3 deletions(-) delete mode 100755 scripts/generate-schema diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index 84be658..bb7d2a4 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -55,6 +55,9 @@ class SchemaGenerator: # Detect a TIME field of the form [H]H:[M]M:[S]S[.DDDDDD] TIME_MATCHER = re.compile(r'^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$') + INTEGER_MATCHER = re.compile(r'[-]?^\d+$') + FLOAT_MATCHER = re.compile(r'[-]?^\d+\.\d+$') + def __init__(self, keep_nulls=False, debugging_interval=1000, @@ -240,6 +243,7 @@ def get_schema_entry(self, key, value): object, instead of a primitive. 
""" value_mode, value_type = self.infer_bigquery_type(value) + if value_type == 'RECORD': # recursively figure out the RECORD fields = OrderedDict() @@ -326,6 +330,12 @@ def infer_value_type(self, value): return 'DATE' elif self.TIME_MATCHER.match(value): return 'TIME' + elif self.INTEGER_MATCHER.match(value): + return 'QINTEGER' # quoted integer + elif self.FLOAT_MATCHER.match(value): + return 'QFLOAT' # quoted float + elif value.lower() in ['true', 'false']: + return 'QBOOLEAN' # quoted boolean else: return 'STRING' # Python 'bool' is a subclass of 'int' so we must check it first @@ -412,8 +422,16 @@ def convert_type(atype, btype): return atype if atype == 'INTEGER' and btype == 'FLOAT': return 'FLOAT' + if atype == 'QINTEGER' and btype == 'QFLOAT': + return 'QFLOAT' if atype == 'FLOAT' and btype == 'INTEGER': return 'FLOAT' + if atype == 'QFLOAT' and btype == 'QINTEGER': + return 'QFLOAT' + if atype in ['QINTEGER', 'QFLOAT', 'QBOOLEAN'] and btype == 'STRING': + return 'STRING' + if atype == 'STRING' and btype in ['QINTEGER', 'QFLOAT', 'QBOOLEAN']: + return 'STRING' if is_string_type(atype) and is_string_type(btype): return 'STRING' return None @@ -466,6 +484,8 @@ def flatten_schema_map(schema_map, keep_nulls=False): else: # Recursively flatten the sub-fields of a RECORD entry. 
new_value = flatten_schema_map(value, keep_nulls) + elif key == 'type' and value in ['QINTEGER', 'QFLOAT', 'QBOOLEAN']: + new_value = value[1:] else: new_value = value new_info[key] = new_value diff --git a/scripts/generate-schema b/scripts/generate-schema deleted file mode 100755 index 3865080..0000000 --- a/scripts/generate-schema +++ /dev/null @@ -1 +0,0 @@ -python3 -m bigquery_schema_generator.generate_schema "$@" diff --git a/setup.py b/setup.py index 65d7ed7..f230034 100644 --- a/setup.py +++ b/setup.py @@ -22,5 +22,10 @@ author_email='brian@xparks.net', license='Apache 2.0', packages=['bigquery_schema_generator'], - scripts=['scripts/generate-schema'], - python_requires='~=3.5') + python_requires='~=3.5', + entry_points={ + 'console_scripts': [ + 'generate-schema = bigquery_schema_generator.generate_schema:main' + ] + } +) From 88f567dcc8ec17c418caa77839e7176d03f7ff3a Mon Sep 17 00:00:00 2001 From: Luigi Mori Date: Fri, 7 Dec 2018 14:43:51 +0100 Subject: [PATCH 02/14] Adds fix for string types Signed-off-by: Luigi Mori --- bigquery_schema_generator/generate_schema.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index bb7d2a4..671490e 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -440,8 +440,7 @@ def convert_type(atype, btype): def is_string_type(thetype): """Returns true if the type is one of: STRING, TIMESTAMP, DATE, or TIME.""" - return (thetype == 'STRING' or thetype == 'TIMESTAMP' or - thetype == 'DATE' or thetype == 'TIME') + return thetype in ['STRING', 'TIMESTAMP', 'DATE', 'TIME', 'QINTEGER', 'QFLOAT', 'QBOOLEAN'] def flatten_schema_map(schema_map, keep_nulls=False): From 47aa01438219ce4d9861c357193d315db4457cc3 Mon Sep 17 00:00:00 2001 From: Luigi Mori Date: Fri, 7 Dec 2018 15:00:52 +0100 Subject: [PATCH 03/14] Tuned Date Regex Signed-off-by: Luigi Mori --- 
bigquery_schema_generator/generate_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index 671490e..b96ee8a 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -50,7 +50,7 @@ class SchemaGenerator: r'(([+-]\d{1,2}(:\d{1,2})?)|Z)?$') # Detect a DATE field of the form YYYY-[M]M-[D]D. - DATE_MATCHER = re.compile(r'^\d{4}-\d{1,2}-\d{1,2}$') + DATE_MATCHER = re.compile(r'^\d{4}-(?:0[1-9]|1[012])-(?:0[1-9]|[12][0-9]|3[01])$') # Detect a TIME field of the form [H]H:[M]M:[S]S[.DDDDDD] TIME_MATCHER = re.compile(r'^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$') From cc927d3d9639b16641d79381e43991cf27e9c6e2 Mon Sep 17 00:00:00 2001 From: Luigi Mori Date: Fri, 7 Dec 2018 23:48:14 +0100 Subject: [PATCH 04/14] Improved DATE regex and modified tests Signed-off-by: Luigi Mori --- bigquery_schema_generator/generate_schema.py | 2 +- tests/test_generate_schema.py | 1 + tests/testdata.txt | 79 ++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index b96ee8a..d8e66e5 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -50,7 +50,7 @@ class SchemaGenerator: r'(([+-]\d{1,2}(:\d{1,2})?)|Z)?$') # Detect a DATE field of the form YYYY-[M]M-[D]D. 
- DATE_MATCHER = re.compile(r'^\d{4}-(?:0[1-9]|1[012])-(?:0[1-9]|[12][0-9]|3[01])$') + DATE_MATCHER = re.compile(r'^\d{4}-(?:[1-9]|0[1-9]|1[012])-(?:[1-9]|0[1-9]|[12][0-9]|3[01])$') # Detect a TIME field of the form [H]H:[M]M:[S]S[.DDDDDD] TIME_MATCHER = re.compile(r'^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$') diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py index b239b3b..5e9fe15 100755 --- a/tests/test_generate_schema.py +++ b/tests/test_generate_schema.py @@ -81,6 +81,7 @@ def test_date_matcher_valid(self): def test_date_matcher_invalid(self): self.assertFalse(SchemaGenerator.DATE_MATCHER.match('17-05-22')) self.assertFalse(SchemaGenerator.DATE_MATCHER.match('2017-111-22')) + self.assertFalse(SchemaGenerator.DATE_MATCHER.match('1988-00-00')) def test_time_matcher_valid(self): self.assertTrue(SchemaGenerator.TIME_MATCHER.match('12:33:01')) diff --git a/tests/testdata.txt b/tests/testdata.txt index 977f619..6b0d4d3 100644 --- a/tests/testdata.txt +++ b/tests/testdata.txt @@ -477,3 +477,82 @@ SCHEMA } ] END + +# QINTEGER, QFLOAT, QBOOLEAN +DATA +{ "qi" : "1", "qf": "1", "qb": "true" } +{ "qi" : "2", "qf": "1.1", "qb": "True" } +{ "qi" : "3", "qf": "2", "qb": "false" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qi", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "qf", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "qb", + "type": "BOOLEAN" + } +] +END + +# From STRING to [QINTEGER, QFLOAT, QBOOLEAN] = STRING +DATA +{ "qi" : "foo", "qf": "bar", "qb": "foo2" } +{ "qi" : "2", "qf": "1.1", "qb": "True" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qi", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "qf", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "qb", + "type": "STRING" + } +] +END + +# QINTEGER -> QFLOAT -> STRING +DATA +{ "qn" : "1" } +{ "qn" : "1.1" } +{ "qn" : "test" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qn", + "type": "STRING" + } +] +END + +# QBOOLEAN -> STRING 
+DATA +{ "qb" : "true" } +{ "qb" : "False" } +{ "qb" : "test" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qb", + "type": "STRING" + } +] +END From 04fe116ea275a514e7f6844ef9e23d74ac808c00 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Fri, 7 Dec 2018 22:22:08 -0800 Subject: [PATCH 05/14] Add conversion between quoted types and their equivalent unquoted types (e.g. QINTEGER + INTEGER), and cross conversions (QINTEGER + FLOAT, or QFLOAT + INTEGER); add extensive unit tests to cover all combinations of conversions --- bigquery_schema_generator/generate_schema.py | 70 ++++++++++++--- tests/test_generate_schema.py | 60 ++++++++++++- tests/testdata.txt | 95 +++++++++++++++++++- 3 files changed, 209 insertions(+), 16 deletions(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index d8e66e5..5c033b8 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -413,34 +413,80 @@ def run(self): def convert_type(atype, btype): """Return the compatible type between 'atype' and 'btype'. Return 'None' - if there is no compatible type. Type conversions are: - - * INTEGER, FLOAT => FLOAT - * DATE, TIME, TIMESTAMP, STRING => STRING + if there is no compatible type. 
Type conversions (in order of precedence) + are: + + * type + type => type + * [Q]BOOLEAN + [Q]BOOLEAN => BOOLEAN + * [Q]INTEGER + [Q]INTEGER => INTEGER + * [Q]FLOAT + [Q]FLOAT => FLOAT + * QINTEGER + QFLOAT = QFLOAT + * QFLOAT + QINTEGER = QFLOAT + * [Q]INTEGER + [Q]FLOAT => FLOAT (except QINTEGER + QFLOAT) + * [Q]FLOAT + [Q]INTEGER => FLOAT (except QFLOAT + QINTEGER) + * (DATE, TIME, TIMESTAMP, QBOOLEAN, QINTEGER, QFLOAT, STRING) + + (DATE, TIME, TIMESTAMP, QBOOLEAN, QINTEGER, QFLOAT, STRING) => STRING """ + # type + type => type if atype == btype: return atype - if atype == 'INTEGER' and btype == 'FLOAT': + + # [Q]BOOLEAN + [Q]BOOLEAN => BOOLEAN + if atype == 'BOOLEAN' and btype == 'QBOOLEAN': + return 'BOOLEAN' + if atype == 'QBOOLEAN' and btype == 'BOOLEAN': + return 'BOOLEAN' + + # [Q]INTEGER + [Q]INTEGER => INTEGER + if atype == 'QINTEGER' and btype == 'INTEGER': + return 'INTEGER' + if atype == 'INTEGER' and btype == 'QINTEGER': + return 'INTEGER' + + # [Q]FLOAT + [Q]FLOAT => FLOAT + if atype == 'QFLOAT' and btype == 'FLOAT': + return 'FLOAT' + if atype == 'FLOAT' and btype == 'QFLOAT': return 'FLOAT' + + # QINTEGER + QFLOAT => QFLOAT if atype == 'QINTEGER' and btype == 'QFLOAT': return 'QFLOAT' - if atype == 'FLOAT' and btype == 'INTEGER': - return 'FLOAT' + + # QFLOAT + QINTEGER => QFLOAT if atype == 'QFLOAT' and btype == 'QINTEGER': return 'QFLOAT' - if atype in ['QINTEGER', 'QFLOAT', 'QBOOLEAN'] and btype == 'STRING': - return 'STRING' - if atype == 'STRING' and btype in ['QINTEGER', 'QFLOAT', 'QBOOLEAN']: - return 'STRING' + + # [Q]INTEGER + [Q]FLOAT => FLOAT (except QINTEGER + QFLOAT => QFLOAT) + if atype == 'INTEGER' and btype == 'FLOAT': + return 'FLOAT' + if atype == 'INTEGER' and btype == 'QFLOAT': + return 'FLOAT' + if atype == 'QINTEGER' and btype == 'FLOAT': + return 'FLOAT' + + # [Q]FLOAT + [Q]INTEGER => FLOAT (except # QFLOAT + QINTEGER => QFLOAT) + if atype == 'FLOAT' and btype == 'INTEGER': + return 'FLOAT' + if atype == 'FLOAT' and btype 
== 'QINTEGER': + return 'FLOAT' + if atype == 'QFLOAT' and btype == 'INTEGER': + return 'FLOAT' + + # All remaining combination of: + # (DATE, TIME, TIMESTAMP, QBOOLEAN, QINTEGER, QFLOAT, STRING) + + # (DATE, TIME, TIMESTAMP, QBOOLEAN, QINTEGER, QFLOAT, STRING) => STRING if is_string_type(atype) and is_string_type(btype): return 'STRING' + return None def is_string_type(thetype): """Returns true if the type is one of: STRING, TIMESTAMP, DATE, or TIME.""" - return thetype in ['STRING', 'TIMESTAMP', 'DATE', 'TIME', 'QINTEGER', 'QFLOAT', 'QBOOLEAN'] + return thetype in ['STRING', 'TIMESTAMP', 'DATE', 'TIME', + 'QINTEGER', 'QFLOAT', 'QBOOLEAN'] def flatten_schema_map(schema_map, keep_nulls=False): diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py index 5e9fe15..8b37dfc 100755 --- a/tests/test_generate_schema.py +++ b/tests/test_generate_schema.py @@ -103,8 +103,11 @@ def test_infer_value_type(self): generator.infer_value_type('2018-02-08T12:34:56')) self.assertEqual('STRING', generator.infer_value_type('abc')) self.assertEqual('BOOLEAN', generator.infer_value_type(True)) + self.assertEqual('QBOOLEAN', generator.infer_value_type('True')) self.assertEqual('INTEGER', generator.infer_value_type(1)) + self.assertEqual('QINTEGER', generator.infer_value_type('2')) self.assertEqual('FLOAT', generator.infer_value_type(2.0)) + self.assertEqual('QFLOAT', generator.infer_value_type('3.0')) self.assertEqual('RECORD', generator.infer_value_type({ 'a': 1, 'b': 2 @@ -236,16 +239,64 @@ def test_infer_array_type(self): def test_convert_type(self): # no conversion needed + self.assertEqual('BOOLEAN', convert_type('BOOLEAN', 'BOOLEAN')) self.assertEqual('INTEGER', convert_type('INTEGER', 'INTEGER')) self.assertEqual('FLOAT', convert_type('FLOAT', 'FLOAT')) self.assertEqual('STRING', convert_type('STRING', 'STRING')) - self.assertEqual('BOOLEAN', convert_type('BOOLEAN', 'BOOLEAN')) self.assertEqual('DATE', convert_type('DATE', 'DATE')) self.assertEqual('RECORD', 
convert_type('RECORD', 'RECORD')) - # conversions + # quoted and unquoted versions of the same type + self.assertEqual('BOOLEAN', convert_type('BOOLEAN', 'QBOOLEAN')) + self.assertEqual('BOOLEAN', convert_type('QBOOLEAN', 'BOOLEAN')) + self.assertEqual('INTEGER', convert_type('INTEGER', 'QINTEGER')) + self.assertEqual('INTEGER', convert_type('QINTEGER', 'INTEGER')) + self.assertEqual('FLOAT', convert_type('FLOAT', 'QFLOAT')) + self.assertEqual('FLOAT', convert_type('QFLOAT', 'FLOAT')) + + # [Q]INTEGER and [Q]FLOAT conversions self.assertEqual('FLOAT', convert_type('INTEGER', 'FLOAT')) + self.assertEqual('FLOAT', convert_type('INTEGER', 'QFLOAT')) + self.assertEqual('FLOAT', convert_type('QINTEGER', 'FLOAT')) + self.assertEqual('QFLOAT', convert_type('QINTEGER', 'QFLOAT')) self.assertEqual('FLOAT', convert_type('FLOAT', 'INTEGER')) + self.assertEqual('FLOAT', convert_type('FLOAT', 'QINTEGER')) + self.assertEqual('FLOAT', convert_type('QFLOAT', 'INTEGER')) + self.assertEqual('QFLOAT', convert_type('QFLOAT', 'QINTEGER')) + + # quoted and STRING conversions + self.assertEqual('STRING', convert_type('STRING', 'QBOOLEAN')) + self.assertEqual('STRING', convert_type('STRING', 'QINTEGER')) + self.assertEqual('STRING', convert_type('STRING', 'QFLOAT')) + self.assertEqual('STRING', convert_type('QBOOLEAN', 'STRING')) + self.assertEqual('STRING', convert_type('QINTEGER', 'STRING')) + self.assertEqual('STRING', convert_type('QFLOAT', 'STRING')) + + # quoted and DATE conversions + self.assertEqual('STRING', convert_type('DATE', 'QBOOLEAN')) + self.assertEqual('STRING', convert_type('DATE', 'QINTEGER')) + self.assertEqual('STRING', convert_type('DATE', 'QFLOAT')) + self.assertEqual('STRING', convert_type('QBOOLEAN', 'DATE')) + self.assertEqual('STRING', convert_type('QINTEGER', 'DATE')) + self.assertEqual('STRING', convert_type('QFLOAT', 'DATE')) + + # quoted and TIME conversions + self.assertEqual('STRING', convert_type('TIME', 'QBOOLEAN')) + self.assertEqual('STRING', 
convert_type('TIME', 'QINTEGER')) + self.assertEqual('STRING', convert_type('TIME', 'QFLOAT')) + self.assertEqual('STRING', convert_type('QBOOLEAN', 'TIME')) + self.assertEqual('STRING', convert_type('QINTEGER', 'TIME')) + self.assertEqual('STRING', convert_type('QFLOAT', 'TIME')) + + # quoted and TIMESTAMP conversions + self.assertEqual('STRING', convert_type('TIMESTAMP', 'QBOOLEAN')) + self.assertEqual('STRING', convert_type('TIMESTAMP', 'QINTEGER')) + self.assertEqual('STRING', convert_type('TIMESTAMP', 'QFLOAT')) + self.assertEqual('STRING', convert_type('QBOOLEAN', 'TIMESTAMP')) + self.assertEqual('STRING', convert_type('QINTEGER', 'TIMESTAMP')) + self.assertEqual('STRING', convert_type('QFLOAT', 'TIMESTAMP')) + + # DATE, TIME, and TIMESTAMP conversions self.assertEqual('STRING', convert_type('DATE', 'TIME')) self.assertEqual('STRING', convert_type('DATE', 'TIMESTAMP')) self.assertEqual('STRING', convert_type('DATE', 'STRING')) @@ -255,6 +306,11 @@ def test_convert_type(self): # no conversion possible self.assertEqual(None, convert_type('INTEGER', 'BOOLEAN')) + self.assertEqual(None, convert_type('QINTEGER', 'BOOLEAN')) + self.assertEqual(None, convert_type('INTEGER', 'QBOOLEAN')) + self.assertEqual(None, convert_type('FLOAT', 'BOOLEAN')) + self.assertEqual(None, convert_type('QFLOAT', 'BOOLEAN')) + self.assertEqual(None, convert_type('FLOAT', 'QBOOLEAN')) self.assertEqual(None, convert_type('FLOAT', 'STRING')) self.assertEqual(None, convert_type('STRING', 'BOOLEAN')) self.assertEqual(None, convert_type('BOOLEAN', 'DATE')) diff --git a/tests/testdata.txt b/tests/testdata.txt index 6b0d4d3..a03cad3 100644 --- a/tests/testdata.txt +++ b/tests/testdata.txt @@ -480,9 +480,9 @@ END # QINTEGER, QFLOAT, QBOOLEAN DATA -{ "qi" : "1", "qf": "1", "qb": "true" } +{ "qi" : "1", "qf": "1.0", "qb": "true" } { "qi" : "2", "qf": "1.1", "qb": "True" } -{ "qi" : "3", "qf": "2", "qb": "false" } +{ "qi" : "3", "qf": "2.0", "qb": "false" } SCHEMA [ { @@ -503,6 +503,49 @@ SCHEMA ] 
END +# QINTEGER, QFLOAT, QBOOLEAN -> INTEGER, FLOAT, BOOLEAN +DATA +{ "qi" : "1", "qf": "1.0", "qb": "true" } +{ "qi" : 2, "qf": 2.0, "qb": false } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qi", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "qf", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "qb", + "type": "BOOLEAN" + } +] +END + +# mixed [Q]INTEGER, [Q]FLOAT +DATA +{ "qf_i" : "1.0", "qi_f": "2" } +{ "qf_i" : 1.1, "qi_f": 2.1 } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qf_i", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "qi_f", + "type": "FLOAT" + } +] +END + # From STRING to [QINTEGER, QFLOAT, QBOOLEAN] = STRING DATA { "qi" : "foo", "qf": "bar", "qb": "foo2" } @@ -556,3 +599,51 @@ SCHEMA } ] END + +# DATE, TIME, DATETIME +DATA +{ "qd" : "2018-12-07", "qt": "21:52:00", "qdt": "2018-12-07T21:52:00-08:00" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qd", + "type": "DATE" + }, + { + "mode": "NULLABLE", + "name": "qt", + "type": "TIME" + }, + { + "mode": "NULLABLE", + "name": "qdt", + "type": "TIMESTAMP" + } +] +END + +# DATE, TIME, DATETIME + [QINTEGER, QFLOAT, QBOOLEAN] => STRING +DATA +{ "qd" : "2018-12-07", "qt": "21:52:00", "qdt": "2018-12-07T21:52:00-08:00" } +{ "qd" : "1", "qt": "1.1", "qdt": "true" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qd", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "qt", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "qdt", + "type": "STRING" + } +] +END + From 4e101d8cdbb4144474b0cab58b1e3e82b4a46c0e Mon Sep 17 00:00:00 2001 From: Luigi Mori Date: Tue, 11 Dec 2018 13:25:18 +0100 Subject: [PATCH 06/14] Fix negative number support in Q REs and adds unit tests Signed-off-by: Luigi Mori --- bigquery_schema_generator/generate_schema.py | 4 ++-- tests/test_generate_schema.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index 
5c033b8..5bb1a79 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -55,8 +55,8 @@ class SchemaGenerator: # Detect a TIME field of the form [H]H:[M]M:[S]S[.DDDDDD] TIME_MATCHER = re.compile(r'^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$') - INTEGER_MATCHER = re.compile(r'[-]?^\d+$') - FLOAT_MATCHER = re.compile(r'[-]?^\d+\.\d+$') + INTEGER_MATCHER = re.compile(r'^[-]?\d+$') + FLOAT_MATCHER = re.compile(r'^[-]?\d+\.\d+$') def __init__(self, keep_nulls=False, diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py index 8b37dfc..03ee8c8 100755 --- a/tests/test_generate_schema.py +++ b/tests/test_generate_schema.py @@ -104,10 +104,15 @@ def test_infer_value_type(self): self.assertEqual('STRING', generator.infer_value_type('abc')) self.assertEqual('BOOLEAN', generator.infer_value_type(True)) self.assertEqual('QBOOLEAN', generator.infer_value_type('True')) + self.assertEqual('QBOOLEAN', generator.infer_value_type('False')) + self.assertEqual('QBOOLEAN', generator.infer_value_type('true')) + self.assertEqual('QBOOLEAN', generator.infer_value_type('false')) self.assertEqual('INTEGER', generator.infer_value_type(1)) self.assertEqual('QINTEGER', generator.infer_value_type('2')) + self.assertEqual('QINTEGER', generator.infer_value_type('-1000')) self.assertEqual('FLOAT', generator.infer_value_type(2.0)) self.assertEqual('QFLOAT', generator.infer_value_type('3.0')) + self.assertEqual('QFLOAT', generator.infer_value_type('-5.4')) self.assertEqual('RECORD', generator.infer_value_type({ 'a': 1, 'b': 2 From 6c26248b36d1be4355389ce2a812d42b30571e41 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Fri, 7 Dec 2018 10:22:48 -0800 Subject: [PATCH 07/14] README.md: fix typo 'theirs' to 'their' --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b1961f6..d4a30e7 100644 --- a/README.md +++ b/README.md @@ -238,7 +238,7 @@ $ generate-schema --debugging_interval 50 
< file.data.json > file.schema.json Instead of printing out the BigQuery schema, the `--debugging_map` prints out the bookkeeping metadata map which is used internally to keep track of the -various fields and theirs types that were inferred using the data file. This +various fields and their types that were inferred using the data file. This flag is intended to be used for debugging. ``` From d559594a6ec759301eccffa118cd77b7a2121ad9 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Fri, 7 Dec 2018 10:26:37 -0800 Subject: [PATCH 08/14] README.md: change 'downgrades to STRING' to 'upgrades to STRING' since STRING is the super type of TIME, DATE, TIMESTAMP; makes this usage consistent with other parts of that section --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d4a30e7..3cc339f 100644 --- a/README.md +++ b/README.md @@ -282,7 +282,7 @@ compatibility rules implemented by **bq load**: upgraded to a `FLOAT` * the reverse does not happen, once a field is a `FLOAT`, it will remain a `FLOAT` -* conflicting `TIME`, `DATE`, `TIMESTAMP` types downgrades to `STRING` +* conflicting `TIME`, `DATE`, `TIMESTAMP` types upgrades to `STRING` * if a field is determined to have one type of "time" in one record, then subsequently a different "time" type, then the field will be assigned a `STRING` type From e06987ff2752899c1c0601674adf5f06856a1992 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Tue, 11 Dec 2018 11:50:32 -0800 Subject: [PATCH 09/14] README.md: add documentation about additional type inference of strings which are semantically INTEGER or FLOAT --- README.md | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3cc339f..fed1749 100644 --- a/README.md +++ b/README.md @@ -109,9 +109,9 @@ This is essentially what the `generate-schema` command does. 
**3) Python script** -If you retrieved this code from its [GitHub -repository](https://github.com/bxparks/bigquery-schema-generator), then you can invoke -the Python script directly: +If you retrieved this code from its +[GitHub repository](https://github.com/bxparks/bigquery-schema-generator), +then you can invoke the Python script directly: ``` $ ./generate_schema.py < file.data.json > file.schema.json ``` @@ -121,21 +121,33 @@ $ ./generate_schema.py < file.data.json > file.schema.json The resulting schema file can be given to the **bq load** command using the `--schema` flag: ``` + $ bq load --source_format NEWLINE_DELIMITED_JSON \ --ignore_unknown_values \ --schema file.schema.json \ mydataset.mytable \ file.data.json ``` - where `mydataset.mytable` is the target table in BigQuery. -A useful flag for **bq load** is `--ignore_unknown_values`, which causes **bq load** -to ignore fields in the input data which are not defined in the schema. When -`generate_schema.py` detects an inconsistency in the definition of a particular -field in the input data, it removes the field from the schema definition. -Without the `--ignore_unknown_values`, the **bq load** fails when the -inconsistent data record is read. +For debugging purposes, here is the equivalent `bq load` command using schema +autodetection: + +``` +$ bq load --source_format NEWLINE_DELIMITED_JSON \ + --ignore_unknown_values \ + --autodetect \ + mydataset.mytable \ + file.data.json +``` + +A useful flag for `bq load` is `--ignore_unknown_values`, which causes `bq +load` to ignore fields in the input data which are not defined in the schema. +When `generate_schema.py` detects an inconsistency in the definition of a +particular field in the input data, it removes the field from the schema +definition. Without the `--ignore_unknown_values`, the `bq load` fails when +the inconsistent data record is read. Another useful flag during development and +debugging is `--replace` which replaces any existing BigQuery table. 
After the BigQuery table is loaded, the schema can be retrieved using: @@ -299,6 +311,10 @@ compatibility rules implemented by **bq load**: * we follow the same logic as **bq load** and always infer these as `TIMESTAMP` +The BigQuery loader also looks inside strings to determine if they are actually +INTEGER or FLOAT types instead. Luigi Mori (jtschichold@) added additional logic +to replicate the type conversion logic used by `bq load` for these strings. + ## Examples Here is an example of a single JSON data record on the STDIN (the `^D` below @@ -392,9 +408,10 @@ tested it on: * Ubuntu 16.04, Python 3.5.2 * MacOS 10.13.2, [Python 3.6.4](https://www.python.org/downloads/release/python-364/) -## Author +## Authors -Created by Brian T. Park (brian@xparks.net). +* Created by Brian T. Park (brian@xparks.net). +* Additional type inference logic by Luigi Mori (jtschichold@). ## License From 2a03c91efbc05e27365cc539bd709501cabb191b Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 17 Dec 2018 10:39:33 -0800 Subject: [PATCH 10/14] Beautify generate_schema.py through yapf3 formatter --- bigquery_schema_generator/generate_schema.py | 35 +++++++++++--------- tests/test_generate_schema.py | 12 ++++--- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index 5bb1a79..7a6675b 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -50,7 +50,8 @@ class SchemaGenerator: r'(([+-]\d{1,2}(:\d{1,2})?)|Z)?$') # Detect a DATE field of the form YYYY-[M]M-[D]D. 
- DATE_MATCHER = re.compile(r'^\d{4}-(?:[1-9]|0[1-9]|1[012])-(?:[1-9]|0[1-9]|[12][0-9]|3[01])$') + DATE_MATCHER = re.compile( + r'^\d{4}-(?:[1-9]|0[1-9]|1[012])-(?:[1-9]|0[1-9]|[12][0-9]|3[01])$') # Detect a TIME field of the form [H]H:[M]M:[S]S[.DDDDDD] TIME_MATCHER = re.compile(r'^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$') @@ -139,8 +140,8 @@ def deduce_schema_for_line(self, json_object, schema_map): schema_entry = schema_map.get(key) try: new_schema_entry = self.get_schema_entry(key, value) - merged_schema_entry = self.merge_schema_entry(schema_entry, - new_schema_entry) + merged_schema_entry = self.merge_schema_entry( + schema_entry, new_schema_entry) except Exception as e: self.log_error(str(e)) continue @@ -203,8 +204,8 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry): elif old_mode == 'REPEATED' and new_mode == 'NULLABLE': # TODO: Maybe remove this warning output. It was helpful during # development, but maybe it's just natural. - self.log_error('Leaving schema for "%s" as REPEATED RECORD' % - old_name) + self.log_error( + 'Leaving schema for "%s" as REPEATED RECORD' % old_name) # RECORD type needs a recursive merging of sub-fields. 
We merge into # the 'old_schema_entry' which assumes that the 'old_schema_entry' @@ -244,6 +245,7 @@ def get_schema_entry(self, key, value): """ value_mode, value_type = self.infer_bigquery_type(value) + # yapf: disable if value_type == 'RECORD': # recursively figure out the RECORD fields = OrderedDict() @@ -288,6 +290,7 @@ def get_schema_entry(self, key, value): ('name', key), ('type', value_type), ]))]) + # yapf: enable return schema_entry def infer_bigquery_type(self, node_value): @@ -304,8 +307,8 @@ def infer_bigquery_type(self, node_value): array_type = self.infer_array_type(node_value) if not array_type: raise Exception( - "All array elements must be the same compatible type: %s" - % node_value) + "All array elements must be the same compatible type: %s" % + node_value) # Disallow array of special types (with '__' not supported). # EXCEPTION: allow (REPEATED __empty_record) ([{}]) because it is @@ -331,11 +334,11 @@ def infer_value_type(self, value): elif self.TIME_MATCHER.match(value): return 'TIME' elif self.INTEGER_MATCHER.match(value): - return 'QINTEGER' # quoted integer + return 'QINTEGER' # quoted integer elif self.FLOAT_MATCHER.match(value): - return 'QFLOAT' # quoted float + return 'QFLOAT' # quoted float elif value.lower() in ['true', 'false']: - return 'QBOOLEAN' # quoted boolean + return 'QBOOLEAN' # quoted boolean else: return 'STRING' # Python 'bool' is a subclass of 'int' so we must check it first @@ -485,8 +488,9 @@ def convert_type(atype, btype): def is_string_type(thetype): """Returns true if the type is one of: STRING, TIMESTAMP, DATE, or TIME.""" - return thetype in ['STRING', 'TIMESTAMP', 'DATE', 'TIME', - 'QINTEGER', 'QFLOAT', 'QBOOLEAN'] + return thetype in [ + 'STRING', 'TIMESTAMP', 'DATE', 'TIME', 'QINTEGER', 'QFLOAT', 'QBOOLEAN' + ] def flatten_schema_map(schema_map, keep_nulls=False): @@ -496,8 +500,8 @@ def flatten_schema_map(schema_map, keep_nulls=False): data. 
""" if not isinstance(schema_map, dict): - raise Exception("Unexpected type '%s' for schema_map" % - type(schema_map)) + raise Exception( + "Unexpected type '%s' for schema_map" % type(schema_map)) # Build the BigQuery schema from the internal 'schema_map'. schema = [] @@ -575,7 +579,8 @@ def main(): default=1000) parser.add_argument( '--debugging_map', - help='Print the metadata schema_map instead of the schema for debugging', + help= + 'Print the metadata schema_map instead of the schema for debugging', action="store_true") args = parser.parse_args() diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py index 03ee8c8..54988d2 100755 --- a/tests/test_generate_schema.py +++ b/tests/test_generate_schema.py @@ -195,8 +195,9 @@ def test_infer_array_type(self): self.assertEqual('FLOAT', generator.infer_array_type([1.0, 2.0])) self.assertEqual('BOOLEAN', generator.infer_array_type([True, False])) self.assertEqual('STRING', generator.infer_array_type(['a', 'b'])) - self.assertEqual( - 'DATE', generator.infer_array_type(['2018-02-09', '2018-02-10'])) + self.assertEqual('DATE', + generator.infer_array_type( + ['2018-02-09', '2018-02-10'])) self.assertEqual('TIME', generator.infer_array_type(['10:44:00', '10:44:01'])) self.assertEqual('TIMESTAMP', @@ -210,8 +211,9 @@ def test_infer_array_type(self): self.assertEqual('__empty_array__', generator.infer_array_type([[]])) # Mixed TIME, DATE, TIMESTAMP converts to STRING - self.assertEqual( - 'STRING', generator.infer_array_type(['2018-02-09', '10:44:00'])) + self.assertEqual('STRING', + generator.infer_array_type(['2018-02-09', + '10:44:00'])) self.assertEqual('STRING', generator.infer_array_type( ['2018-02-09T11:00:00', '10:44:00'])) @@ -328,6 +330,7 @@ def test_is_string_type(self): self.assertTrue(is_string_type('TIME')) def test_sort_schema(self): + # yapf: disable unsorted = [{ "mode": "REPEATED", "name": "a", @@ -347,7 +350,6 @@ def test_sort_schema(self): "type": "STRING" }] - # yapf: disable 
expected = [ OrderedDict([ ("mode", "REPEATED"), From ec6febb9728f465967f28688e7f15ec685900676 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 17 Dec 2018 10:40:04 -0800 Subject: [PATCH 11/14] Update comments in testdata.txt for consistency --- tests/testdata.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testdata.txt b/tests/testdata.txt index a03cad3..df75626 100644 --- a/tests/testdata.txt +++ b/tests/testdata.txt @@ -527,7 +527,7 @@ SCHEMA ] END -# mixed [Q]INTEGER, [Q]FLOAT +# [Q]INTEGER + [Q]FLOAT -> FLOAT DATA { "qf_i" : "1.0", "qi_f": "2" } { "qf_i" : 1.1, "qi_f": 2.1 } @@ -546,7 +546,7 @@ SCHEMA ] END -# From STRING to [QINTEGER, QFLOAT, QBOOLEAN] = STRING +# STRING + [QINTEGER, QFLOAT, QBOOLEAN] -> STRING DATA { "qi" : "foo", "qf": "bar", "qb": "foo2" } { "qi" : "2", "qf": "1.1", "qb": "True" } @@ -623,7 +623,7 @@ SCHEMA ] END -# DATE, TIME, DATETIME + [QINTEGER, QFLOAT, QBOOLEAN] => STRING +# DATE, TIME, DATETIME + [QINTEGER, QFLOAT, QBOOLEAN] -> STRING DATA { "qd" : "2018-12-07", "qt": "21:52:00", "qdt": "2018-12-07T21:52:00-08:00" } { "qd" : "1", "qt": "1.1", "qdt": "true" } From 063505a840b15abddaf5089d8f4e538e62fb35da Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 17 Dec 2018 10:55:39 -0800 Subject: [PATCH 12/14] README.md: Add examples of QBOOLEAN, QINTEGER, and QFLOAT types --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fed1749..9b03283 100644 --- a/README.md +++ b/README.md @@ -311,9 +311,11 @@ compatibility rules implemented by **bq load**: * we follow the same logic as **bq load** and always infer these as `TIMESTAMP` -The BigQuery loader also looks inside strings to determine if they are actually -INTEGER or FLOAT types instead. Luigi Mori (jtschichold@) added additional logic -to replicate the type conversion logic used by `bq load` for these strings. 
+The BigQuery loader looks inside string values to determine if they are actually +BOOLEAN, INTEGER or FLOAT types instead. In other words, `"True"` is considered +a BOOLEAN type, `"1"` is considered an INTEGER type, and `"2.1"` is considered a +FLOAT type. Luigi Mori (jtschichold@) added additional logic to replicate the +type conversion logic used by `bq load` for these strings. ## Examples @@ -403,6 +405,7 @@ took 77s on a Dell Precision M4700 laptop with an Intel Core i7-3840QM CPU @ This project was initially developed on Ubuntu 17.04 using Python 3.5.3. I have tested it on: +* Ubuntu 18.04, Python 3.6.7 * Ubuntu 17.10, Python 3.6.3 * Ubuntu 17.04, Python 3.5.3 * Ubuntu 16.04, Python 3.5.2 From 1f4679fbe46c86b64a57fcfbefdca112bb635b94 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 17 Dec 2018 10:57:53 -0800 Subject: [PATCH 13/14] CHANGELOG.md: change to 4-space indents for MD files per GitHub guide --- CHANGELOG.md | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f74ae59..e6dc34b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,29 +1,31 @@ # Changelog * 0.2.1 (2018-07-18) - * Add `anonymizer.py` script to create anonymized data files for benchmarking. - * Add benchmark numbers to README.md. - * Add `DEVELOPER.md` file to record how to upload to PyPI. - * Fix some minor warnings from pylint3. + * Add `anonymizer.py` script to create anonymized data files for + benchmarking. + * Add benchmark numbers to README.md. + * Add `DEVELOPER.md` file to record how to upload to PyPI. + * Fix some minor warnings from pylint3. * 0.2.0 (2018-02-10) - * Add support for `DATE` and `TIME` types. - * Update type conversion rules to be more compatible with **bq load**. - * Allow `DATE`, `TIME` and `TIMESTAMP` to gracefully degrade to `STRING`. - * Allow type conversions of elements within arrays - (e.g. 
array of `INTEGER` and `FLOAT`, or array of mixed `DATE`, `TIME`, or - `TIMESTAMP` elements). - * Better detection of invalid values (e.g. arrays of arrays). + * Add support for `DATE` and `TIME` types. + * Update type conversion rules to be more compatible with **bq load**. + * Allow `DATE`, `TIME` and `TIMESTAMP` to gracefully degrade to + `STRING`. + * Allow type conversions of elements within arrays + (e.g. array of `INTEGER` and `FLOAT`, or array of mixed `DATE`, + `TIME`, or `TIMESTAMP` elements). + * Better detection of invalid values (e.g. arrays of arrays). * 0.1.6 (2018-01-26) - * Pass along command line arguments to `generate-schema`. + * Pass along command line arguments to `generate-schema`. * 0.1.5 (2018-01-25) - * Updated installation instructions for MacOS. + * Updated installation instructions for MacOS. * 0.1.4 (2018-01-23) - * Attempt #3 to fix exception during pip3 install. + * Attempt #3 to fix exception during pip3 install. * 0.1.3 (2018-01-23) - * Attempt #2 to fix exception during pip3 install. + * Attempt #2 to fix exception during pip3 install. * 0.1.2 (2018-01-23) - * Attemp to fix exception during pip3 install. Didn't work. Pulled. + * Attempt to fix exception during pip3 install. Didn't work. Pulled. * 0.1.1 (2018-01-03) - * Install `generate-schema` script in `/usr/local/bin` + * Install `generate-schema` script in `/usr/local/bin` * 0.1 (2018-01-02) - * Iniitial release to PyPI. + * Initial release to PyPI. From 1678335842d7536464d0fa113ca3d9cfd6ff1745 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 17 Dec 2018 10:58:33 -0800 Subject: [PATCH 14/14] Bump version to 0.3 --- CHANGELOG.md | 6 ++++++ README.md | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6dc34b..b203f02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +* 0.3 (2018-12-17) + * Tighten TIMESTAMP and DATE validation (thanks jtschichold@). 
+ * Inspect the internals of STRING values to infer BOOLEAN, INTEGER or FLOAT + types (thanks jtschichold@). + * Handle conversion of these string types when mixed with their non-quoted + equivalents, matching the conversion logic followed by 'bq load'. * 0.2.1 (2018-07-18) * Add `anonymizer.py` script to create anonymized data files for benchmarking. diff --git a/README.md b/README.md index 9b03283..28c7c4c 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Usage: $ generate-schema < file.data.json > file.schema.json ``` -Version: 0.2.1 (2018-07-18) +Version: 0.3 (2018-12-17) ## Background diff --git a/setup.py b/setup.py index f230034..4541557 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ long_description = 'BigQuery schema generator.' setup(name='bigquery-schema-generator', - version='0.2.1', + version='0.3', description='BigQuery schema generator', long_description=long_description, url='https://github.com/bxparks/bigquery-schema-generator',