[BEAM-3342] Create a Cloud Bigtable IO connector for Python #8457
@@ -37,16 +37,20 @@
 """
 from __future__ import absolute_import

+from collections import namedtuple
+
 import apache_beam as beam
+from apache_beam.io import iobase
 from apache_beam.metrics import Metrics
+from apache_beam.transforms import core
 from apache_beam.transforms.display import DisplayDataItem

 try:
   from google.cloud.bigtable import Client
 except ImportError:
-  pass
+  Client = None

-__all__ = ['WriteToBigTable']
+__all__ = ['ReadFromBigTable', 'WriteToBigTable']


 class _BigTableWriteFn(beam.DoFn):
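The change from `pass` to `Client = None` in the except branch gives downstream code a sentinel to check when google-cloud-bigtable is not installed. A minimal sketch of how such a guard is commonly consumed, for illustration only (the test name and message are not from this diff):

import unittest

@unittest.skipIf(Client is None, 'GCP Bigtable dependencies are not installed')
class BigtableIOTest(unittest.TestCase):
  def test_round_trip(self):
    # Tests that need a real Bigtable Client would go here.
    pass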
@@ -67,27 +71,27 @@ def __init__(self, project_id, instance_id, table_id):
       table_id(str): GCP Table to write the `DirectRows`
     """
     super(_BigTableWriteFn, self).__init__()
-    self.beam_options = {'project_id': project_id,
-                         'instance_id': instance_id,
-                         'table_id': table_id}
+    self._beam_options = {'project_id': project_id,
+                          'instance_id': instance_id,
+                          'table_id': table_id}
     self.table = None
     self.batcher = None
     self.written = Metrics.counter(self.__class__, 'Written Row')

   def __getstate__(self):
-    return self.beam_options
+    return self._beam_options

   def __setstate__(self, options):
-    self.beam_options = options
+    self._beam_options = options
     self.table = None
     self.batcher = None
     self.written = Metrics.counter(self.__class__, 'Written Row')

   def start_bundle(self):
     if self.table is None:
-      client = Client(project=self.beam_options['project_id'])
-      instance = client.instance(self.beam_options['instance_id'])
-      self.table = instance.table(self.beam_options['table_id'])
+      client = Client(project=self._beam_options['project_id'])
+      instance = client.instance(self._beam_options['instance_id'])
+      self.table = instance.table(self._beam_options['table_id'])
     self.batcher = self.table.mutations_batcher()

   def process(self, row):
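For context, the batcher created in start_bundle is what the rest of the write DoFn drives; the hunks above and below only show fragments of process and finish_bundle. A rough sketch of those bodies, assuming the google-cloud-bigtable MutationsBatcher API (mutate and flush), not the exact code in this file:

  def process(self, row):
    self.written.inc()
    # Each element is expected to be a google.cloud.bigtable.row.DirectRow.
    self.batcher.mutate(row)

  def finish_bundle(self):
    # Push any mutations still buffered in the batcher before the bundle ends.
    self.batcher.flush()
    self.batcher = None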
@@ -107,11 +111,11 @@ def finish_bundle(self):
     self.batcher = None

   def display_data(self):
-    return {'projectId': DisplayDataItem(self.beam_options['project_id'],
+    return {'projectId': DisplayDataItem(self._beam_options['project_id'],
                                          label='Bigtable Project Id'),
-            'instanceId': DisplayDataItem(self.beam_options['instance_id'],
+            'instanceId': DisplayDataItem(self._beam_options['instance_id'],
                                           label='Bigtable Instance Id'),
-            'tableId': DisplayDataItem(self.beam_options['table_id'],
+            'tableId': DisplayDataItem(self._beam_options['table_id'],
                                        label='Bigtable Table Id')
            }
@@ -122,22 +126,145 @@ class WriteToBigTable(beam.PTransform):
   A PTransform that write a list of `DirectRow` into the Bigtable Table

   """
-  def __init__(self, project_id=None, instance_id=None,
-               table_id=None):
+  def __init__(self, project_id=None, instance_id=None, table_id=None):
     """ The PTransform to access the Bigtable Write connector
     Args:
       project_id(str): GCP Project of to write the Rows
       instance_id(str): GCP Instance to write the Rows
       table_id(str): GCP Table to write the `DirectRows`
     """
     super(WriteToBigTable, self).__init__()
-    self.beam_options = {'project_id': project_id,
-                         'instance_id': instance_id,
-                         'table_id': table_id}
+    self._beam_options = {'project_id': project_id,
+                          'instance_id': instance_id,
+                          'table_id': table_id}

   def expand(self, pvalue):
-    beam_options = self.beam_options
+    beam_options = self._beam_options
     return (pvalue
             | beam.ParDo(_BigTableWriteFn(beam_options['project_id'],
                                           beam_options['instance_id'],
                                           beam_options['table_id'])))
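For orientation, a minimal usage sketch of the existing write transform; the project, instance, and table names are placeholders, and the row construction assumes the google.cloud.bigtable.row.DirectRow API:

import apache_beam as beam
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud.bigtable.row import DirectRow

def _make_row(i):
  # One DirectRow per element; set_cell is part of the Bigtable client API.
  row = DirectRow(row_key=('key-%05d' % i).encode())
  row.set_cell('cf1', b'field1', ('value-%d' % i).encode())
  return row

with beam.Pipeline() as p:
  _ = (p
       | beam.Create(range(10))
       | beam.Map(_make_row)
       | WriteToBigTable(project_id='my-project',
                         instance_id='my-instance',
                         table_id='my-table'))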
+
+
+class _BigtableReadFn(beam.DoFn):
+  """ Creates the connector that can read rows for Beam pipeline
+
+  Args:
+    project_id(str): GCP Project ID
+    instance_id(str): GCP Instance ID
+    table_id(str): GCP Table ID
+
+  """
+
+  def __init__(self, project_id, instance_id, table_id, filter_=b''):
+    """ Constructor of the Read connector of Bigtable
+
+    Args:
+      project_id: [str] GCP Project to read the Rows from
+      instance_id: [str] GCP Instance to read the Rows from
+      table_id: [str] GCP Table to read the Rows from
+      filter_: [RowFilter] Filter to apply to columns in a row.
+    """
+    super(self.__class__, self).__init__()
+    self._initialize({'project_id': project_id,
+                      'instance_id': instance_id,
+                      'table_id': table_id,
+                      'filter_': filter_})
+
+  def __getstate__(self):
+    return self._beam_options
+
+  def __setstate__(self, options):
+    self._initialize(options)
+
+  def _initialize(self, options):
+    self._beam_options = options
+    self.table = None
+    self.sample_row_keys = None
+    self.row_count = Metrics.counter(self.__class__.__name__, 'Rows read')
+
+  def start_bundle(self):
+    if self.table is None:
+      self.table = Client(project=self._beam_options['project_id'])\
+          .instance(self._beam_options['instance_id'])\
+          .table(self._beam_options['table_id'])

Review nit on the backslash continuations in start_bundle: PEP 8 recommends using parentheses for formatting instead of backslashes.
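A minimal sketch of the same chain in the parenthesized style the reviewer suggests (behavior unchanged):

      self.table = (Client(project=self._beam_options['project_id'])
                    .instance(self._beam_options['instance_id'])
                    .table(self._beam_options['table_id']))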
+
+  def process(self, element, **kwargs):
+    for row in self.table.read_rows(start_key=element.start_position,
+                                    end_key=element.end_position,
+                                    filter_=self._beam_options['filter_']):
+      self.row_count.inc()
+      yield row
+
+  def display_data(self):
+    return {'projectId': DisplayDataItem(self._beam_options['project_id'],
+                                         label='Bigtable Project Id'),
+            'instanceId': DisplayDataItem(self._beam_options['instance_id'],
+                                          label='Bigtable Instance Id'),
+            'tableId': DisplayDataItem(self._beam_options['table_id'],
+                                       label='Bigtable Table Id'),
+            'filter_': DisplayDataItem(str(self._beam_options['filter_']),
+                                       label='Bigtable Filter')
+            }
+
+
+class ReadFromBigTable(beam.PTransform):
+  def __init__(self, project_id, instance_id, table_id, filter_=b''):
+    """ The PTransform to access the Bigtable Read connector
+
+    Args:
+      project_id: [str] GCP Project to read the Rows from
+      instance_id: [str] GCP Instance to read the Rows from
+      table_id: [str] GCP Table to read the Rows from
+      filter_: [RowFilter] Filter to apply to columns in a row.
+    """
+    super(self.__class__, self).__init__()
+    self._beam_options = {'project_id': project_id,
+                          'instance_id': instance_id,
+                          'table_id': table_id,
+                          'filter_': filter_}
+
+  def __getstate__(self):
+    return self._beam_options
+
+  def __setstate__(self, options):
+    self._beam_options = options
+
+  def expand(self, pbegin):
+    from apache_beam.transforms import util
+
+    beam_options = self._beam_options
+    table = Client(project=beam_options['project_id'])\
+        .instance(beam_options['instance_id'])\
+        .table(beam_options['table_id'])
+    sample_row_keys = list(table.sample_row_keys())
+
+    if len(sample_row_keys) > 1 and sample_row_keys[0].row_key != b'':
+      SampleRowKey = namedtuple("SampleRowKey", "row_key offset_bytes")
+      first_key = SampleRowKey(b'', 0)
+      sample_row_keys.insert(0, first_key)
+      sample_row_keys = list(sample_row_keys)
+
+    def split_source(unused_impulse):
+      bundles = []
+      for i in range(1, len(sample_row_keys)):
+        key_1 = sample_row_keys[i - 1].row_key
+        key_2 = sample_row_keys[i].row_key
+        size = sample_row_keys[i].offset_bytes \
+            - sample_row_keys[i - 1].offset_bytes
+        bundles.append(iobase.SourceBundle(size, None, key_1, key_2))
+
+      from random import shuffle
+      # Shuffle randomizes reading locations for better performance
+      shuffle(bundles)
+      return bundles

Review discussion on split_source:

This method doesn't need to exist. I would think that you could use [...]

This is the way it's implemented in Apache Beam iobase.Read(PTransform). The FlatMap needs a callable object to process the elements in parallel, and the [...]

Got it. Would it make sense to call this [...]

I noticed it too. The [...]

I had a chat with Cham, and he rightly suggested it would be good to have unit testing of this splitting function, as it could cause data being dropped if it has any bugs, so maybe make it static and add tests?

+1 on a unit test.

Review discussion on the bundle loop and sample_row_keys:

In the case that sample_row_keys is length 1, [...]. What you may want to do is only do the Reshuffle + FlatMap if sample_row_keys is > 1. That worked for me.

If [...], maybe line 242 should read as [...]. Also, I don't see a place where [, ""] is appended to sample_row_keys.

This might have been implemented incorrectly. 'sample_row_keys' is implemented in the outer scope, but 'split_source' is used as input to a FlatMap transform (hence split_source will become a function wrapping a DoFn). Instead of using FlatMap here, implement a DoFn class and provide whatever state it needs to perform the splitting to the constructor of the DoFn. Also note that this class (with its state) has to be picklable. One difference between the Direct and Dataflow runners is that the Dataflow runner pickles DoFn objects to send to remote workers. That might explain why your code doesn't work for Dataflow.

@chamikaramj It might, but then how could it work a few months back? The code below works now, with Dataflow, albeit from a standalone file, meaning that pickling is probably not an issue: [...] When packaged, it becomes more like this: [...] The latter breaks with Dataflow while still running under Direct. As you can see, the logic is nearly identical, suggesting that some magic might happen during [un]packaging.

What do you mean by "when packaged"? Note that if you are trying to utilize other modules that are not included with the SDK, you have to build a local tarball before testing (python setup.py sdist). When running, include the SDK tarball with your module using the --sdk_location option, since the default tarball downloaded by Dataflow will not have your module. For example, for wordcount, [...]

"When packaged" means when packed into a local tarball, precisely via [...]. So far we've been using [...], but I don't disagree, we could try specifying the [...]

Review discussion on the size computation from sample row keys:

Will this work with variable key sizes?

The [...]

To @mf2199's point, sample row keys is a Cloud Bigtable core concept and is intended to be used this way. See https://github.com/googleapis/googleapis/blob/3c04bbedf1a1913507eca6dcb7ef9a1d849998f4/google/bigtable/v1/bigtable_service.proto#L47 for more detail.
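Responding to the unit-test suggestion above, a rough sketch (not part of this PR) of how the splitting logic could be lifted out of expand so it can be tested directly; the helper and test names are illustrative, and it assumes the namedtuple and iobase imports already present in this module:

def _split_into_bundles(sample_row_keys):
  # Turn each pair of consecutive sample row keys into one SourceBundle,
  # weighted by the byte offset difference between the two keys.
  bundles = []
  for i in range(1, len(sample_row_keys)):
    key_1 = sample_row_keys[i - 1].row_key
    key_2 = sample_row_keys[i].row_key
    size = sample_row_keys[i].offset_bytes - sample_row_keys[i - 1].offset_bytes
    bundles.append(iobase.SourceBundle(size, None, key_1, key_2))
  return bundles

def test_split_covers_all_ranges():
  # Synthetic keys: the produced ranges must tile [b'', b'k2'] with no gaps.
  SampleRowKey = namedtuple('SampleRowKey', 'row_key offset_bytes')
  keys = [SampleRowKey(b'', 0), SampleRowKey(b'k1', 100), SampleRowKey(b'k2', 250)]
  bundles = _split_into_bundles(keys)
  assert [(b.start_position, b.stop_position) for b in bundles] == [
      (b'', b'k1'), (b'k1', b'k2')]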
+
+    return (pbegin
+            | core.Impulse()
+            | 'Split' >> core.FlatMap(split_source)
+            | util.Reshuffle()
+            | 'Read Bundles' >> beam.ParDo(
+                _BigtableReadFn(project_id=beam_options['project_id'],
+                                instance_id=beam_options['instance_id'],
+                                table_id=beam_options['table_id'],
+                                filter_=beam_options['filter_'])))
Review comment on the filter_ default: filter_ should probably default to None, since that is the default for read_rows. If you pass in an empty bytestring like you do here, you receive an error, because bytestrings do not have a to_pb function as expected of RowFilter type objects.

+1
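A minimal sketch of the suggested signature and of passing an explicit RowFilter; it assumes the google.cloud.bigtable.row_filters module, and the default change itself is only a reviewer suggestion, not part of this diff:

from google.cloud.bigtable.row_filters import CellsColumnLimitFilter

class ReadFromBigTable(beam.PTransform):
  # Suggested signature: default filter_ to None, which is what
  # table.read_rows(filter_=...) treats as "no filter".
  def __init__(self, project_id, instance_id, table_id, filter_=None):
    ...

# Callers that want server-side filtering pass a RowFilter instance,
# e.g. keep only the most recent cell per column:
rows = (p | ReadFromBigTable(project_id='my-project',      # placeholder
                             instance_id='my-instance',    # placeholder
                             table_id='my-table',
                             filter_=CellsColumnLimitFilter(1)))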