[BEAM-3342] Create a Cloud Bigtable IO connector for Python #8457
nit: PEP 8 recommends using parentheses for line continuation instead of backslashes.
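For reference, the two continuation styles look like this (the expression is just an illustration):

```python
# Backslash continuation (works, but PEP 8 discourages it):
total = 1 + 2 + \
        3 + 4

# Implicit continuation inside parentheses (preferred by PEP 8):
total = (1 + 2 +
         3 + 4)
```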
You don't need to define this method.
Totally, "inherited" from attempts to implement a splittable DoFn. Removed.
This method doesn't need to exist. I would think that you could use bundles.
This is the way it's implemented in Apache Beam's iobase.Read(PTransform). The FlatMap needs a callable object to process the elements in parallel, and split_source makes up that callable. I'd also suggest we use a similar naming convention for better unification/readability. What do you think?
Got it. Would it make sense to call this _split_source?
I noticed it too. The iobase.Read version doesn't have the underscore. It's a question of whether we prefer the "proper" underscored way or the "unified" non-underscored one.
I had a chat with Cham, and he rightly suggested it would be good to have unit testing of this splitting function, as it could cause data to be dropped if it has any bugs. So maybe make it static, and add tests?
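A unit test along these lines could pin down the splitting behavior (the helper, its name, and its signature are hypothetical, just to show the shape of the check):

```python
def split_into_ranges(sample_row_keys):
    # Hypothetical splitting helper under test; names are illustrative.
    # Prepends/appends b"" so the open-ended head and tail are covered.
    keys = [b""] + list(sample_row_keys) + [b""]
    return [(keys[i], keys[i + 1]) for i in range(len(keys) - 1)]

def test_no_rows_dropped():
    ranges = split_into_ranges([b"k1", b"k2"])
    # The ranges must cover the open-ended head and tail of the table...
    assert ranges[0][0] == b"" and ranges[-1][1] == b""
    # ...and be contiguous, so no row can fall between two ranges.
    for prev, cur in zip(ranges, ranges[1:]):
        assert prev[1] == cur[0]

test_no_rows_dropped()
```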
+1 on a unit test.
In the case that sample_row_keys has length 1, split_source will return an empty list and no data will be processed from the table. In addition, the documentation for table (https://googleapis.dev/python/bigtable/latest/table.html) says "The table might have contents before the first row key in the list and after the last one", which I don't think is taken into consideration here. What you may want to do is only do the Reshuffle + FlatMap if len(sample_row_keys) > 1. That worked for me.
["", "") means read the entire table; "" is used as a placeholder.
split_source must return shuffle([["", <key1>), [<key1>, <key2>), ..., [<keyN>, "")])
If sample_row_keys() on line 240 returns a list of length one, then line 250 (in split_source) will be skipped (as it is trying to iterate over range(1, 1)), and shuffle will be called as shuffle([]) (which my Python shell returns as None). Maybe line 242 should read as if len(sample_row_keys) >= 1 and sample_row_keys[0].row_key != b''.
Also, I don't see a place where [, ""] is appended to sample_row_keys.
This might have been implemented incorrectly. 'sample_row_keys' is computed in the outer scope but 'split_source' is used as input to a FlatMap transform (hence split_source will become a function wrapping a DoFn). Instead of using FlatMap here, implement a DoFn class and provide whatever state it needs to perform splitting to the constructor of the DoFn. Also note that this class (with its state) has to be picklable.
One difference between the Direct and Dataflow runners is that the Dataflow runner pickles DoFn objects to send to remote workers. That might explain why your code doesn't work on Dataflow.
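The suggested pattern might look roughly like this. The class and attribute names are hypothetical; in real code the class would subclass apache_beam.DoFn, which is omitted here so the sketch stays self-contained:

```python
import pickle

class SplitRangesDoFn:
    """Sketch of the suggested pattern: keep the state needed for
    splitting in the constructor, so the Dataflow runner can pickle
    the whole object and ship it to remote workers."""

    def __init__(self, table_id):
        self.table_id = table_id  # picklable state only

    def process(self, key_range):
        start, end = key_range
        # Real code would read rows in [start, end) from self.table_id;
        # here we just echo the work item to keep the sketch runnable.
        yield (self.table_id, start, end)

# The whole object (with its state) survives a pickle round trip,
# which is what the Dataflow runner requires of a DoFn:
fn = pickle.loads(pickle.dumps(SplitRangesDoFn("my-table")))
```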
@chamikaramj It might, but then how could it work a few months back? The code below works now, with Dataflow, albeit from a standalone file, meaning that pickling is probably not an issue:
When packaged, it becomes more like this:
The latter breaks with Dataflow while still running under Direct. As you can see, the logic is nearly identical, suggesting that some magic might happen during [un]packaging.
What do you mean by "when packaged"?
Note that if you are trying to utilize other modules that are not included with the SDK, you have to build a local tarball before testing:
python setup.py sdist
When running, include the SDK tarball with your module using the --sdk_location option (since the default tarball downloaded by Dataflow will not have your module). For example, for wordcount:
python -m apache_beam.examples.wordcount --input <input file path in GCS> --output <output path in GCS> --runner DataflowRunner --temp_location <temp location in GCS> --project <project> --sdk_location <local path to beam>/beam/sdks/python/dist/apache-beam-2.17.0.dev0.tar.gz
"When packaged" means when packed into a local tarball, precisely via python setup.py sdist.
So far we've been using the extra_package option, and it seems to work fine for the write part, and at some point it worked for the read part too. The main functional difference between the write and read tests is that for the write part the rows are generated locally [within the test script] and then sent to workers without intermediate steps, while during the read sequence we first try splitting the database based on the sample row keys within the Dataflow engine, and then read the chunks of rows, as you may see from the sample script above.
But I don't disagree, we could try specifying the --sdk_location option and see what happens.