## append.py

In [None]:
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

# define a function for transforming the data 
class AppendDoFn(beam.DoFn):
    """Append a fixed greeting to every input line."""

    def process(self, element):
        """Emit ``element`` with " - Hello World!" appended.

        Beam iterates over whatever ``process`` returns and emits each item
        as a separate output element. Returning the bare string would
        therefore emit one element per character, so the result is yielded
        as a single item instead.

        Args:
            element: one line of input text.

        Yields:
            The input line followed by " - Hello World!".
        """
        yield element + " - Hello World!"
        
# Command-line parameters: where the text is read from and written to.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input',
    dest='input',
    default='gs://dataflow-samples/shakespeare/kinglear.txt',
)
parser.add_argument(
    '--output',
    dest='output',
    default='gs://dsp_model_store/shakespeare/kinglear.txt',
)
# Anything the parser does not recognise is passed on as pipeline options.
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

# Pipeline graph: read lines -> append greeting -> write lines.
p = beam.Pipeline(options=pipeline_options)
lines = p | 'read' >> ReadFromText(known_args.input)
appended = lines | 'append' >> beam.ParDo(AppendDoFn())
appended | 'write' >> WriteToText(known_args.output)

# Execute the pipeline and block until it has finished.
result = p.run()
result.wait_until_finish()


## dataflow_read.py

In [2]:
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions

import os
from apache_beam.options.pipeline_options import GoogleCloudOptions

# Unrecognised command-line arguments are forwarded to Beam as pipeline options.
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

# Reading from BigQuery fails with "Missing executing project information"
# when no project is configured. If --project was not supplied, fall back to
# the GOOGLE_CLOUD_PROJECT environment variable (when it is set) so the cell
# can also run outside a command line that passes --project.
gcp_options = pipeline_options.view_as(GoogleCloudOptions)
if not gcp_options.project:
    env_project = os.environ.get('GOOGLE_CLOUD_PROJECT')
    if env_project:
        gcp_options.project = env_project

class ApplyDoFn(beam.DoFn):
    """Debugging step that echoes every record it receives to stdout."""

    def process(self, element):
        """Print ``element``; nothing is returned."""
        print(element)
        return None


# Standard-SQL query: 100 randomly ordered rows from the public natality table.
query = """
select *
from `bigquery-public-data.samples.natality`
order by rand()
limit 100
"""

# Pipeline graph: BigQuery read -> ApplyDoFn.
p = beam.Pipeline(options=pipeline_options)
data = (
    p
    | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=query, use_standard_sql=True)
    )
)
scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())

# Launch the job and wait for it to complete.
result = p.run()
result.wait_until_finish()

ERROR:root:Exception at bundle <apache_beam.runners.direct.bundle_factory._Bundle object at 0x7fb004550f50>, due to an exception.
 Traceback (most recent call last):
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py", line 343, in call
    finish_state)
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py", line 383, in attempt_call
    result = evaluator.finish_bundle()
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 319, in finish_bundle
    with self._source.reader() as reader:
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py", line 495, in reader
    kms_key=self.kms_key)
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py", line 864, in __init__
    'Missing executing project information. Please use the --project '
RuntimeError: Missing executi

RuntimeError: Missing executing project information. Please use the --project command line option to specify it.

## Query Natality

In [16]:
from google.cloud import bigquery
client = bigquery.Client()

# Feature columns plus the target (weight_pounds, aliased to "weight");
# mother_married is converted from a boolean to 1/0.
sql = """
SELECT year, plurality, apgar_5min, 
       mother_age, father_age,    
       gestation_weeks, ever_born
       ,case when mother_married = true 
             then 1 else 0 end as mother_married
       ,weight_pounds as weight
  FROM  `bigquery-public-data.samples.natality`
  limit 10000
"""

# Run the query, load the result into a DataFrame, and replace missing
# values with 0.
query_job = client.query(sql)
natalityDF = query_job.to_dataframe().fillna(0)
natalityDF.head()

Unnamed: 0,year,plurality,apgar_5min,mother_age,father_age,gestation_weeks,ever_born,mother_married,weight
0,1970,0.0,0.0,37,46,38.0,8,1,7.62579
1,1971,1.0,0.0,43,47,38.0,12,1,7.438397
2,1972,1.0,0.0,46,48,41.0,13,1,8.437091
3,1972,1.0,0.0,38,34,99.0,10,1,7.374463
4,1973,1.0,0.0,42,49,99.0,10,1,5.81359


## Train and Save Model

In [20]:
from sklearn.linear_model import LinearRegression
import pickle
from google.cloud import storage

# Fit a linear model: columns 1..7 of the frame are the features and
# 'weight' is the target.
model = LinearRegression()
model.fit(natalityDF.iloc[:,1:8], natalityDF['weight'])

# Pickle through a context manager so the file is flushed and closed
# before it is uploaded below.
with open("natality.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

# Save to GCS
bucket = storage.Client().get_bucket('dsp_model_store')
blob = bucket.blob('natality/sklearn-linear')
blob.upload_from_filename('natality.pkl')


## Test Model Loading 

In [12]:

from google.cloud import storage
import pickle 

# Fetch the pickled model from GCS and deserialise it.
bucket = storage.Client().get_bucket('dsp_model_store')
blob = bucket.get_blob('natality/sklearn-linear')
payload = blob.download_as_string()
model = pickle.loads(payload)
# Bare expression so the notebook displays the loaded model.
model


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Prediction Pipeline 

In [26]:
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.io.gcp.bigquery import parse_table_schema_from_json
import json

# Standard-SQL query feeding the prediction pipeline: the model's feature
# columns and the actual weight, plus a per-row timestamp ("time") and a
# generated UUID ("guid") that the downstream steps use to label each
# prediction. Limited to 100 rows.
query = """
    SELECT year, plurality, apgar_5min, 
    mother_age, father_age,    
       gestation_weeks, ever_born
       ,case when mother_married = true 
          then 1 else 0 end as mother_married
      ,weight_pounds as weight
      ,current_timestamp as time
      ,GENERATE_UUID() as guid
    FROM `bigquery-public-data.samples.natality` 
    limit 100    
"""

class ApplyDoFn(beam.DoFn):
    """Score each input row with the pickled model stored in GCS.

    Third-party modules are imported inside ``__init__`` and kept on the
    instance. The model is downloaded on the first call to ``process`` and
    reused for every later element.
    """

    # Feature columns passed to the model. These are the same columns the
    # previous positional slice ``iloc[:, 1:8]`` selected given the column
    # order of the query; naming them removes the dependency on the key
    # order of the incoming row dict.
    _FEATURES = ['plurality', 'apgar_5min', 'mother_age', 'father_age',
                 'gestation_weeks', 'ever_born', 'mother_married']

    def __init__(self):
        super().__init__()
        self._model = None
        from google.cloud import storage
        import pandas as pd
        import pickle as pkl
        self._storage = storage
        self._pkl = pkl
        self._pd = pd

    def process(self, element):
        """Predict the weight for one row.

        Args:
            element: dict-like row containing the feature columns plus
                'guid' and 'time'.

        Returns:
            A one-item list holding a dict with keys 'guid', 'weight'
            (the prediction) and 'time' (stringified).
        """
        if self._model is None:
            # Lazy load: fetch and unpickle the model only once per instance.
            # NOTE: unpickling executes code from the blob; the bucket must
            # only ever contain trusted model files.
            bucket = self._storage.Client().get_bucket('dsp_model_store')
            blob = bucket.get_blob('natality/sklearn-linear')
            self._model = self._pkl.loads(blob.download_as_string())

        # Turn the row dict into a single-row DataFrame; missing values -> 0.
        new_x = self._pd.DataFrame.from_dict(element, orient = "index").transpose().fillna(0)
        weight = self._model.predict(new_x[self._FEATURES])[0]
        # Emit a plain Python float rather than a NumPy scalar.
        return [ { 'guid': element['guid'], 'weight': float(weight), 'time': str(element['time']) } ]

# Layout of the output table: one row per prediction.
schema = parse_table_schema_from_json(json.dumps({
    'fields': [
        {'name': 'guid', 'type': 'STRING'},
        {'name': 'weight', 'type': 'FLOAT64'},
        {'name': 'time', 'type': 'STRING'},
    ],
}))

class PublishDoFn(beam.DoFn):
    """Write each prediction to Datastore as a 'natality-guid' entity."""

    def __init__(self):
        super().__init__()
        from google.cloud import datastore
        self._ds = datastore
        # Created on first use in process() and then reused, instead of
        # building a new client for every element.
        self._client = None

    def process(self, element):
        """Store one prediction, keyed by its guid.

        Args:
            element: dict with keys 'guid', 'weight' and 'time'.
        """
        if self._client is None:
            self._client = self._ds.Client()
        client = self._client
        key = client.key('natality-guid', element['guid'])
        entity = self._ds.Entity(key)
        entity['weight'] = element['weight']
        entity['time'] = element['time']
        client.put(entity)

import os
from apache_beam.options.pipeline_options import GoogleCloudOptions

# Unrecognised command-line arguments are forwarded to Beam as pipeline options.
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

# Reading from BigQuery fails with "Missing executing project information"
# when no project is configured. If --project was not supplied, fall back to
# the GOOGLE_CLOUD_PROJECT environment variable (when it is set).
gcp_options = pipeline_options.view_as(GoogleCloudOptions)
if not gcp_options.project:
    env_project = os.environ.get('GOOGLE_CLOUD_PROJECT')
    if env_project:
        gcp_options.project = env_project

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
data = p | 'Read from BigQuery' >> beam.io.Read(
       beam.io.BigQuerySource(query=query, use_standard_sql=True))
scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())

# Branch 1: append the predictions to the dsp_demo.weight_preds table,
# creating the table if it does not exist yet.
scored | 'Save to BigQuery' >> beam.io.Write(beam.io.BigQuerySink(
                'weight_preds', 'dsp_demo', schema = schema,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

# Branch 2: publish the same predictions to Datastore.
scored | 'Create entities' >> beam.ParDo(PublishDoFn())

# run the pipeline
result = p.run()
result.wait_until_finish()


ERROR:root:Exception at bundle <apache_beam.runners.direct.bundle_factory._Bundle object at 0x7f0e173ee960>, due to an exception.
 Traceback (most recent call last):
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py", line 343, in call
    finish_state)
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/executor.py", line 383, in attempt_call
    result = evaluator.finish_bundle()
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/runners/direct/transform_evaluator.py", line 319, in finish_bundle
    with self._source.reader() as reader:
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py", line 495, in reader
    kms_key=self.kms_key)
  File "/home/ec2-user/.local/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery_tools.py", line 864, in __init__
    'Missing executing project information. Please use the --project '
RuntimeError: Missing executi

RuntimeError: Missing executing project information. Please use the --project command line option to specify it.

## Read from Datastore

In [25]:

from google.cloud import datastore
# Query the 'natality-guid' kind and show only the first entity returned.
client = datastore.Client()
query = client.query(kind='natality-guid')

query_iter = query.fetch()
entity = next(iter(query_iter), None)
if entity is not None:
    print(entity)



<Entity('natality-guid', '0046cdef-6a0f-4586-86ec-4b995cfc7c4e') {'weight': 7.9434742419056, 'time': '2019-12-15 03:00:06.319496 UTC'}>
