# 7.1 Apache Beam

## append.py

In [1]:
%%file scripts/append.py

import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

# define a function for transforming the data 
class AppendDoFn(beam.DoFn):
    """Beam DoFn that emits every input element with a fixed suffix appended."""

    def process(self, element):
        """Yield ``element`` followed by the greeting suffix (one output per input)."""
        suffix = " - Hello World!"
        yield element + suffix
        
# set up pipeline parameters 
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input',
    dest='input',
    default='gs://dataflow-samples/shakespeare/kinglear.txt',
)
parser.add_argument(
    '--output',
    dest='output',
    default='gs://dsp_model_store_00/shakespeare/kinglear.txt',
)
# Anything argparse does not recognise is handed to Beam as pipeline options.
known_args, pipeline_args = parser.parse_known_args()
pipeline_options = PipelineOptions(pipeline_args)

# define the pipeline steps: read lines -> append suffix -> write lines
p = beam.Pipeline(options=pipeline_options)
(
    p
    | 'read' >> ReadFromText(known_args.input)
    | 'append' >> beam.ParDo(AppendDoFn())
    | 'write' >> WriteToText(known_args.output)
)

# run the pipeline and block until it has finished
result = p.run()
result.wait_until_finish()

Writing scripts/append.py


# 7.2 Batch Model Pipeline

## dataflow_read.py

In [62]:
%%file scripts/dataflow_read.py

import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# No script-specific flags are declared; every command-line argument is
# forwarded to Beam as a pipeline option.
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args()
pipeline_options = PipelineOptions(pipeline_args)

# Pull five randomly ordered rows from the public natality sample table.
query = """
    SELECT
        *
    FROM
        `bigquery-public-data.samples.natality`
    ORDER BY
        RAND()
    LIMIT
        5
"""

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
# ReadFromBigQuery is the supported replacement for the deprecated
# beam.io.Read(beam.io.BigQuerySource(...)) form.
# NOTE: per the Beam docs it stages query results in GCS and falls back to the
# pipeline's temp_location when no gcs_location is given, so run this script
# with --temp_location gs://<bucket>/<path>.
data = p | 'Read from BigQuery' >> beam.io.ReadFromBigQuery(
    query=query,
    use_standard_sql=True
)
# Print each row dict; the resulting PCollection is not used any further.
data | 'Print' >> beam.Map(print)

# run the pipeline
result = p.run()
result.wait_until_finish()

Overwriting scripts/dataflow_read.py


## Model Training

In [4]:
from google.cloud import bigquery
client = bigquery.Client()

sql = """
    SELECT 
        year,
        plurality, 
        apgar_5min,
        mother_age, 
        father_age,    
        gestation_weeks, 
        ever_born,
        CASE WHEN mother_married = true THEN 1 ELSE 0 END AS mother_married,
        weight_pounds AS weight
    FROM
        `bigquery-public-data.samples.natality`
    ORDER BY
        RAND()
    LIMIT
        10000
"""

# Run the query, pull the result into pandas, then zero-fill missing values.
query_job = client.query(sql)
natality_df = query_job.to_dataframe()
natality_df = natality_df.fillna(0)
natality_df.head()

Unnamed: 0,year,plurality,apgar_5min,mother_age,father_age,gestation_weeks,ever_born,mother_married,weight
0,1984,1.0,10.0,18,22,38.0,1.0,1,7.251004
1,1980,1.0,9.0,16,17,99.0,1.0,0,5.436599
2,1983,1.0,8.0,21,26,38.0,2.0,1,6.311835
3,1985,1.0,9.0,30,30,39.0,3.0,1,8.000575
4,1979,1.0,10.0,17,99,39.0,1.0,0,6.876218


In [5]:
import joblib
from sklearn.linear_model import LinearRegression
from google.cloud import storage

# fit and pickle a model 
model = LinearRegression()
# Every column except 'weight' is a predictor; 'weight' is the target.
features = natality_df.drop(columns='weight')
target = natality_df['weight']
model.fit(features, target)
joblib.dump(model, 'natality.pkl')

# Upload the pickled model to the GCS model store
gcs_client = storage.Client()
bucket = gcs_client.get_bucket('dsp_model_store_00')
blob = bucket.blob('natality/sklearn-linear')
blob.upload_from_filename('natality.pkl')

In [6]:
import joblib
from google.cloud import storage

# Fetch the pickled model back from GCS and load it to confirm the round trip.
gcs_client = storage.Client()
bucket = gcs_client.get_bucket('dsp_model_store_00')
blob = bucket.get_blob('natality/sklearn-linear')
blob.download_to_filename('sklearn-linear')
model = joblib.load('sklearn-linear')
model

LinearRegression()

## BigQuery-Datastore Publish

In [1]:
%%file scripts/apply.py

import json
import argparse
import joblib
import pandas as pd
import apache_beam as beam

from google.cloud import storage
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
from apache_beam.io.gcp.datastore.v1new.types import Key
from apache_beam.io.gcp.datastore.v1new.types import Entity
from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore

# Scoring query (standard SQL) over the public natality sample table, capped
# at 100 rows. The first eight selected columns are the ones ApplyDoFn feeds
# to the model; `weight` is the recorded birth weight, `time` is the query
# timestamp and `guid` is a UUID generated per row.
query = """
    SELECT 
        year,
        plurality, 
        apgar_5min,
        mother_age, 
        father_age,    
        gestation_weeks, 
        ever_born,
        CASE WHEN mother_married = true THEN 1 
             ELSE 0
        END AS mother_married,
        weight_pounds AS weight,
        CURRENT_TIMESTAMP AS time,
        GENERATE_UUID() AS guid
    FROM
        `bigquery-public-data.samples.natality`
    LIMIT
        100
"""

class ApplyDoFn(beam.DoFn):
    """Score natality rows with the scikit-learn model stored in GCS.

    Each input element is a dict holding one row of the scoring query. For
    every row the DoFn yields a dict with the row's ``guid``, the predicted
    ``weight`` and the row's ``time`` rendered as a string.
    """

    # Model inputs, selected by name so that scoring does not depend on the
    # key order of the incoming row dict.
    # NOTE(review): must match the columns (and order) the model was fitted on.
    FEATURES = ['year', 'plurality', 'apgar_5min', 'mother_age',
                'father_age', 'gestation_weeks', 'ever_born',
                'mother_married']

    def __init__(self):
        super().__init__()
        # Loaded in setup() (or lazily in process()), not here.
        self.model = None

    def setup(self):
        """Beam lifecycle hook: load the model before elements are processed."""
        self._load_model()

    def _load_model(self):
        """Download the pickled model from GCS and load it with joblib."""
        bucket = storage.Client().get_bucket('dsp_model_store_00')
        blob = bucket.get_blob('natality/sklearn-linear')
        blob.download_to_filename('sklearn-linear')
        self.model = joblib.load('sklearn-linear')

    def process(self, element):
        """Predict the weight for one row dict and yield the result record."""
        # Fallback for runners or Beam versions that do not invoke setup().
        if self.model is None:
            self._load_model()

        # One-row frame restricted to the feature columns; missing values
        # become 0.
        new_x = pd.DataFrame([element], columns=self.FEATURES).fillna(0)
        weight = self.model.predict(new_x)[0]
        # Emit a plain Python float so the sinks never see a numpy scalar.
        yield {'guid': element['guid'],
               'weight': float(weight),
               'time': str(element['time'])}

# Schema of the prediction records, described as BigQuery JSON field specs.
_schema_fields = [
    {'name': 'guid', 'type': 'STRING'},
    {'name': 'weight', 'type': 'FLOAT64'},
    {'name': 'time', 'type': 'STRING'},
]
schema = parse_table_schema_from_json(json.dumps({'fields': _schema_fields}))

class CreateEntityDoFn(beam.DoFn):
    """Turn a prediction dict into a Datastore entity of kind 'natality-guid'."""

    def process(self, element):
        """Yield one Entity keyed by the record's guid, carrying weight and time."""
        key_path = ['natality-guid', element['guid']]
        entity = Entity(Key(key_path))
        properties = {name: element[name] for name in ('weight', 'time')}
        entity.set_properties(properties)
        yield entity

# set up pipeline options
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args()
pipeline_options = PipelineOptions(pipeline_args)
gcp_options = pipeline_options.view_as(GoogleCloudOptions)
project = gcp_options.project

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)

# source: rows of the scoring query, then one prediction per row
read_rows = beam.io.ReadFromBigQuery(query=query, use_standard_sql=True)
data = p | 'Read from BigQuery' >> read_rows
scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())

# sink 1: append the predictions to the BigQuery table
write_predictions = beam.io.WriteToBigQuery(
    table='weight_preds',
    dataset='dsp_demo',
    schema=schema,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
)
scored | 'Save to BigQuery' >> write_predictions

# sink 2: the same predictions as Datastore entities
entities = scored | 'Create Entities' >> beam.ParDo(CreateEntityDoFn())
entities | 'Save to Datastore' >> WriteToDatastore(project)

# run the pipeline and block until it has finished
result = p.run()
result.wait_until_finish()

Overwriting scripts/apply.py


## datastore_read.py

In [65]:
%%file scripts/datastore_read.py

import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

# No script-specific flags: every command-line argument goes to Beam.
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args()
pipeline_options = PipelineOptions(pipeline_args)
gcp_options = pipeline_options.view_as(GoogleCloudOptions)
project = gcp_options.project

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)

# read at most five entities of kind 'natality-guid' and print each one
datastore_query = Query('natality-guid', project, limit=5)
data = p | 'Read from Datastore' >> ReadFromDatastore(query=datastore_query)
scored = data | 'Print' >> beam.Map(print)

# run the pipeline and block until it has finished
result = p.run()
result.wait_until_finish()

Overwriting scripts/datastore_read.py


In [66]:
from google.cloud import datastore
client = datastore.Client()
query = client.query(kind='natality-guid')

# Print only the first entity returned for the kind, if there is one.
query_iter = query.fetch()
first_entity = next(iter(query_iter), None)
if first_entity is not None:
    print(first_entity)

<Entity('natality-guid', '008b402d-32ff-4b14-aba0-3a4b2d712363') {'time': '2020-09-28 10:03:14.660679+00:00', 'weight': 7.449097035666264}>
