[BEAM-2405] Write to BQ using the streaming API #3288
sb2nov wants to merge 2 commits into apache:master from
Conversation
|
R: @chamikaramj PTAL |
| request = bigquery.BigqueryTablesInsertRequest( | ||
| projectId=project_id, datasetId=dataset_id, table=table) | ||
| response = self.client.tables.Insert(request) | ||
| logging.info("Created the table with id %s", table_id) |
There was a problem hiding this comment.
logging.debug ?
"Created a table"
|
|
||
| def __init__(self, table_id, dataset_id, project_id, batch_size, schema, | ||
| create_disposition, write_disposition, client): | ||
| self.table_id = table_id |
There was a problem hiding this comment.
Please add a doc comment.
| self._rows_buffer = [] | ||
| # Transform the table schema into a bigquery.TableSchema instance. | ||
| if isinstance(self.schema, basestring): | ||
| # TODO(silviuc): Should add a regex-based validation of the format. |
There was a problem hiding this comment.
Are we still hoping to do this (TODO) ?
| if isinstance(self.schema, basestring): | ||
| # TODO(silviuc): Should add a regex-based validation of the format. | ||
| table_schema = bigquery.TableSchema() | ||
| schema_list = [s.strip(' ') for s in self.schema.split(',')] |
| field_schema = bigquery.TableFieldSchema() | ||
| field_schema.name = field_name | ||
| field_schema.type = field_type | ||
| field_schema.mode = 'NULLABLE' |
There was a problem hiding this comment.
Do we support other modes ?
There was a problem hiding this comment.
not in the string schema input
| create_disposition=self.create_disposition, | ||
| write_disposition=self.write_disposition, | ||
| client=self.test_client) | ||
| return pcoll | 'Write to BQ' >> ParDo(bigquery_write_fn) |
There was a problem hiding this comment.
BigQuery instead of BQ here ?
| client.tables.Get.return_value = bigquery.Table( | ||
| tableReference=bigquery.TableReference( | ||
| projectId='project_id', datasetId='dataset_id', tableId='table_id')) | ||
| client.tabledata.InsertAll.return_value = \ |
There was a problem hiding this comment.
I believe we usually use () instead of \ for line breaks.
| self.assertTrue(client.tables.Get.called) | ||
| self.assertTrue(client.tables.Insert.called) | ||
|
|
||
| def test_dofn_client_process_flush_not_called(self): |
There was a problem hiding this comment.
A better name might be "test_dofn_client_process_performs_batching".
| fn.finish_bundle() | ||
| # InsertRows called in finish bundle | ||
| self.assertTrue(client.tabledata.InsertAll.called) | ||
|
|
There was a problem hiding this comment.
Also, add a test that writes zero records.
| schema_list = [s.strip(' ') for s in self.schema.split(',')] | ||
| for field_and_type in schema_list: | ||
| field_name, field_type = field_and_type.split(':') | ||
| field_schema = bigquery.TableFieldSchema() |
There was a problem hiding this comment.
Please add tests for schema handling logic here.
sb2nov
left a comment
There was a problem hiding this comment.
Thanks for the review.
| request = bigquery.BigqueryTablesInsertRequest( | ||
| projectId=project_id, datasetId=dataset_id, table=table) | ||
| response = self.client.tables.Insert(request) | ||
| logging.info("Created the table with id %s", table_id) |
| self.create_disposition = create_disposition | ||
| self.write_disposition = write_disposition | ||
| self._rows_buffer = [] | ||
| self._max_batch_size = batch_size or 500 |
There was a problem hiding this comment.
This is based on what Java had
| self._rows_buffer = [] | ||
| # Transform the table schema into a bigquery.TableSchema instance. | ||
| if isinstance(self.schema, basestring): | ||
| # TODO(silviuc): Should add a regex-based validation of the format. |
| if isinstance(self.schema, basestring): | ||
| # TODO(silviuc): Should add a regex-based validation of the format. | ||
| table_schema = bigquery.TableSchema() | ||
| schema_list = [s.strip(' ') for s in self.schema.split(',')] |
There was a problem hiding this comment.
I would have just liked to deprecate this string schema thing and asked everyone to create a table schema object but that is a larger change
| field_schema = bigquery.TableFieldSchema() | ||
| field_schema.name = field_name | ||
| field_schema.type = field_type | ||
| field_schema.mode = 'NULLABLE' |
There was a problem hiding this comment.
not in the string schema input
| fn.finish_bundle() | ||
| # InsertRows called in finish bundle | ||
| self.assertTrue(client.tabledata.InsertAll.called) | ||
|
|
|
|
||
| def __init__(self, table_id, dataset_id, project_id, batch_size, schema, | ||
| create_disposition, write_disposition, client): | ||
| self.table_id = table_id |
| if isinstance(self.schema, basestring): | ||
| # TODO(silviuc): Should add a regex-based validation of the format. | ||
| table_schema = bigquery.TableSchema() | ||
| schema_list = [s.strip(' ') for s in self.schema.split(',')] |
| schema_list = [s.strip(' ') for s in self.schema.split(',')] | ||
| for field_and_type in schema_list: | ||
| field_name, field_type = field_and_type.split(':') | ||
| field_schema = bigquery.TableFieldSchema() |
| class WriteToBigQuery(PTransform): | ||
|
|
||
| def __init__(self, table, dataset=None, project=None, schema=None, | ||
| create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, |
|
LGTM. Thanks. |
Be sure to do all of the following to help us incorporate your contribution
quickly and easily:
Make your pull request title in the form [BEAM-<Jira issue #>] Description of pull request. Run mvn clean verify. Replace <Jira issue #> in the title with the actual Jira issue number, if there is one.
Individual Contributor License Agreement.
This should be ready for review so that we can test it with streaming pipelines
There are a few follow-up items after this PR: