diff --git a/.gitignore b/.gitignore index 749ccda..cb20bb7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,22 @@ __pycache__/ *.py[cod] *$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/README.md b/README.md index 992644c..aab10ef 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,10 @@ When the auto-detect feature is used, the BigQuery data importer examines only the first 100 records of the input data. In many cases, this is sufficient because the data records were dumped from another database and the exact schema of the source table was known. However, for data extracted from a service -(e.g. using a REST API) the record fields were organically at later dates. In -this case, the first 100 records do not contain fields which are present in -later records. The **bq load** auto-detection fails and the data fails to -load. +(e.g. using a REST API) the record fields could have been organically added +at later dates. In this case, the first 100 records do not contain fields which +are present in later records. The **bq load** auto-detection fails and the data +fails to load. The **bq load** tool does not support the ability to process the entire dataset to determine a more accurate schema. This script fills in that gap. It @@ -119,12 +119,22 @@ With the ``keep_nulls``, the resulting schema file will be: ] ``` +Example: + +``` +$ generate_schema.py --keep_nulls < file.data.json > file.schema.json +``` + #### Debugging Interval By default, the `generate_schema.py` script prints a short progress message every 1000 lines of input data. This interval can be changed using the `--debugging_interval` flag. 
+``` +$ generate_schema.py --debugging_interval 1000 < file.data.json > file.schema.json +``` + #### Debugging Map Instead of printing out the BigQuery schema, the `--debugging_map` prints out @@ -132,6 +142,10 @@ the bookkeeping metadata map which is used internally to keep track of the various fields and theirs types that was inferred using the data file. This flag is intended to be used for debugging. +``` +$ generate_schema.py --debugging_map < file.data.json > file.schema.json +``` + ## Examples Here is an example of a single JSON data record on the STDIN: @@ -195,36 +209,6 @@ $ cat file.schema.json ] ``` -## Unit Tests - -Instead of embeddeding the input data records and the expected schema file into -the `test_generate_schema.py` file, we placed them into the `testdata.txt` -file. This has two advantages: - -* we can more easily update the input and output data records, and -* the `testdata.txt` data could be reused for versions written in other languages - -The output of `test_generate_schema.py` should look something like this: -``` ----------------------------------------------------------------------- -Ran 4 tests in 0.002s - -OK -Test chunk 1: First record: { "s": null, "a": [], "m": {} } -Test chunk 2: First record: { "s": null, "a": [], "m": {} } -Test chunk 3: First record: { "s": "string", "b": true, "i": 1, "x": 3.1, "t": "2017-05-22T17:10:00-07:00" } -Test chunk 4: First record: { "a": [1, 2], "r": { "r0": "r0", "r1": "r1" } } -Test chunk 5: First record: { "s": "string", "x": 3.2, "i": 3, "b": true, "a": [ "a", 1] } -Test chunk 6: First record: { "a": [1, 2] } -Test chunk 7: First record: { "r" : { "a": [1, 2] } } -Test chunk 8: First record: { "i": 1 } -Test chunk 9: First record: { "i": null } -Test chunk 10: First record: { "i": 3 } -Test chunk 11: First record: { "i": [1, 2] } -Test chunk 12: First record: { "r" : { "i": 3 } } -Test chunk 13: First record: { "r" : [{ "i": 4 }] } -``` - ## System Requirements This project was developed on 
Ubuntu 17.04 using Python 3.5. It is likely diff --git a/bigquery_schema_generator/__init__.py b/bigquery_schema_generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py similarity index 99% rename from generator/generate_schema.py rename to bigquery_schema_generator/generate_schema.py index fe10550..e8de388 100755 --- a/generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # # Copyright 2017 Brian T. Park # @@ -18,7 +18,7 @@ Unlike the BigQuery importer which uses only the first 100 records, this script uses all available records in the data file. -Usage: generator_schema.py [-h] [flags ...] < file.data.json > file.schema.json +Usage: generate_schema.py [-h] [flags ...] < file.data.json > file.schema.json * file.data.json is a newline-delimited JSON data file, one JSON object per line. * file.schema.json is the schema definition of the table. diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4c10e44 --- /dev/null +++ b/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup + +# Convert README.md to README.rst because PyPI does not support Markdown. +try: + import pypandoc + long_description = pypandoc.convert('README.md', 'rst') +except OSError: + with open('README.md', encoding="utf-8") as f: + long_description = f.read() + +setup(name='bigquery-schema-generator', + version='0.1', + description='BigQuery schema generator', + long_description=long_description, + url='https://github.com/bxparks/bigquery-schema-generator', + author='Brian T. 
Park', + author_email='brian@xparks.net', + license='Apache 2.0', + packages=['bigquery_schema_generator'], + python_requires='~=3.5') diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..7423592 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,31 @@ +# Tests + +Instead of embedding the input data records and the expected schema into +the `test_generate_schema.py` file, we placed them into the `testdata.txt` +file which is parsed by the unit test program. This has two advantages: + +* we can more easily update the input and output data records, and +* the `testdata.txt` data can be reused for versions written in other languages + +The output of `test_generate_schema.py` should look something like this: +``` +---------------------------------------------------------------------- +Ran 4 tests in 0.002s + +OK +Test chunk 1: First record: { "s": null, "a": [], "m": {} } +Test chunk 2: First record: { "s": null, "a": [], "m": {} } +Test chunk 3: First record: { "s": "string", "b": true, "i": 1, "x": 3.1, "t": "2017-05-22T17:10:00-07:00" } +Test chunk 4: First record: { "a": [1, 2], "r": { "r0": "r0", "r1": "r1" } } +Test chunk 5: First record: { "s": "string", "x": 3.2, "i": 3, "b": true, "a": [ "a", 1] } +Test chunk 6: First record: { "a": [1, 2] } +Test chunk 7: First record: { "r" : { "a": [1, 2] } } +Test chunk 8: First record: { "i": 1 } +Test chunk 9: First record: { "i": null } +Test chunk 10: First record: { "i": 3 } +Test chunk 11: First record: { "i": [1, 2] } +Test chunk 12: First record: { "r" : { "i": 3 } } +Test chunk 13: First record: { "r" : [{ "i": 4 }] } +``` + + diff --git a/generator/data_reader.py b/tests/data_reader.py similarity index 99% rename from generator/data_reader.py rename to tests/data_reader.py index b40aec6..ceb5fe8 100755 --- a/generator/data_reader.py +++ b/tests/data_reader.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # # Copyright 2017 Brian T. 
Park # @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Parses the 'testdata.txt' date file used by the 'generate_schema_test.py' +Parses the 'testdata.txt' data file used by the 'test_generate_schema.py' program. Usage: diff --git a/generator/test_generate_schema.py b/tests/test_generate_schema.py similarity index 97% rename from generator/test_generate_schema.py rename to tests/test_generate_schema.py index 61d7b1c..ce1db3e 100755 --- a/generator/test_generate_schema.py +++ b/tests/test_generate_schema.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # # Copyright 2017 Brian T. Park # @@ -18,9 +18,9 @@ import os import json from collections import OrderedDict +from bigquery_schema_generator.generate_schema import SchemaGenerator +from bigquery_schema_generator.generate_schema import sort_schema from data_reader import DataReader -from generate_schema import SchemaGenerator -from generate_schema import sort_schema class TestSchemaGenerator(unittest.TestCase): diff --git a/generator/testdata.txt b/tests/testdata.txt similarity index 100% rename from generator/testdata.txt rename to tests/testdata.txt