In [None]:
#@title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Apache Beam RunInference for scikit-learn

<button>
  <a href="https://beam.apache.org/documentation/sdks/python-machine-learning/">
    <img src="https://beam.apache.org/images/favicon.ico" alt="Open the docs" height="16"/>
    Beam RunInference
  </a>
</button>

In this notebook, we walk through the use of the RunInference transform for [scikit-learn](https://scikit-learn.org/), also called sklearn.
Beam [RunInference](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.RunInference) has implementations of its [ModelHandler](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.ModelHandler) specifically for scikit-learn.

Choose a model handler for your implementation:
* The [numpy model handler](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.sklearn_inference.html#apache_beam.ml.inference.sklearn_inference.SklearnModelHandlerNumpy)
* The [pandas dataframes model handler](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.sklearn_inference.html#apache_beam.ml.inference.sklearn_inference.SklearnModelHandlerPandas)

These ModelHandlers, coupled with RunInference, take care of batching, vectorization, and optimizing predictions for your pipeline or model.

This notebook illustrates common RunInference patterns such as the following:
*   Generating predictions.
*   Post processing results after RunInference.
*   Inference with multiple models in the same pipeline.

The linear regression models used in these samples are trained on data that corresponds to the 5 and 10 times tables; that is, `y = 5x` and `y = 10x` respectively.

## Setup

1. Install dependencies for Beam.
1. Authenticate with Google Cloud.
1. Specify your project and bucket. These are needed to save and load models.

In [5]:
!pip install google-api-core --quiet
!pip install google-cloud-pubsub google-cloud-bigquery-storage --quiet
!pip install apache-beam[gcp,dataframe] --quiet

## About scikit-learn Versions

scikit-learn is a build dependency of Apache Beam. If you need a different version of sklearn, install it with `%pip install scikit-learn==<version>`.

In [6]:
# Setup step 2: authenticate with Google Cloud using the Colab auth helper.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import auth
auth.authenticate_user()

In [7]:
import os

# Constants
# Replace both placeholders before running: `project` is used for the BigQuery
# client, dataset and table spec, and `bucket` for the pipeline temp_location.
project = "<your GCP project>"
bucket = "<your GCP bucket>"

# set the project to avoid warnings.
os.environ['GOOGLE_CLOUD_PROJECT'] = project

# NOTE(review): these file names say "torch" / ".pt" although this notebook is
# about scikit-learn, and no visible cell reads them — presumably carried over
# from another notebook; confirm before renaming or removing.
save_model_dir_multiply_five = 'five_times_table_torch.pt'
save_model_dir_multiply_ten = 'ten_times_table_torch.pt'

In [42]:
import pickle
from sklearn import linear_model
from typing import Tuple

import numpy as np
import apache_beam as beam

from apache_beam.ml.inference.sklearn_inference import ModelFileType
from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy
from apache_beam.ml.inference.base import KeyedModelHandler
from apache_beam.ml.inference.base import PredictionResult
from apache_beam.ml.inference.base import RunInference
from apache_beam.options.pipeline_options import PipelineOptions


## Create the Data and the scikit-learn Model.
This cell performs the following steps:
1. Create the data to train the scikit-learn linear regression model.
2. Train the linear regression model.
3. Save the scikit-learn model using `pickle`.

In [15]:
# Training set for the 5-times table: inputs 0..99 as a float32 column vector,
# targets y = 5x with the same (n_samples, 1) shape.
x = np.arange(100, dtype=np.float32)[:, np.newaxis]
y = 5 * x

# fit() returns the fitted estimator, so construct and train in one expression.
regression = linear_model.LinearRegression().fit(x, y)

# Persist the trained model with pickle so it can be loaded again by file name.
sklearn_model_filename = 'sklearn_5x_model.pkl'
with open(sklearn_model_filename, mode='wb') as model_file:
    pickle.dump(regression, model_file)

### scikit-learn RunInference pipeline.

1. Define the scikit-learn model handler that accepts an array_like object as input.
2. Read the data from BigQuery.
3. Use the scikit-learn trained model and the scikit-learn RunInference transform on unkeyed data.

In [22]:
# Upgrade the BigQuery client library before it is imported further down.
# The recorded output shows pip reporting version conflicts with pandas-gbq
# and google-cloud-storage after this upgrade.
%pip install --upgrade google-cloud-bigquery --quiet

[K     |████████████████████████████████| 211 kB 4.1 MB/s 
[K     |████████████████████████████████| 76 kB 5.4 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.13.3 requires google-cloud-bigquery[bqstorage,pandas]<2.0.0dev,>=1.11.1, but you have google-cloud-bigquery 3.3.2 which is incompatible.
google-cloud-storage 1.18.1 requires google-resumable-media<0.5.0dev,>=0.3.1, but you have google-resumable-media 2.3.3 which is incompatible.[0m
[?25h

In [24]:
# Set the gcloud CLI's core/project property; IPython substitutes the Python
# variable `project` for $project before running the shell command.
!gcloud config set project $project

Updated property [core/project].


In [32]:
# Populate the BigQuery table that the RunInference pipelines read from.

from google.cloud import bigquery

client = bigquery.Client(project=project)

# Make sure the dataset_id is unique in your project.
dataset_id = '{project}.maths'.format(project=project)
dataset = bigquery.Dataset(dataset_id)

# Modify the location based on your project configuration.
dataset.location = 'US'
# exists_ok=True keeps this cell re-runnable when the dataset already exists.
dataset = client.create_dataset(dataset, exists_ok=True)

# Table name in the BigQuery dataset.
table_name = 'maths_problems_1'

# Fully qualified `project.dataset.table` id, derived from dataset_id so that
# the CREATE and INSERT statements always address the same table instead of
# the INSERT depending on the client's default project.
table_id = '{dataset_id}.{table}'.format(dataset_id=dataset_id, table=table_name)

# CREATE OR REPLACE resets the table on each run, so re-running the cell does
# not append duplicate rows. The id is backtick-quoted so that project ids
# containing characters such as dashes are parsed as a single table path.
query = """
    CREATE OR REPLACE TABLE
      `{table_id}` ( key STRING OPTIONS(description="A unique key for the maths problem"),
    value FLOAT64 OPTIONS(description="Our maths problem" ) );
    INSERT INTO `{table_id}`
    VALUES
      ("first_example", 105.00),
      ("second_example", 108.00),
      ("third_example", 1000.00),
      ("fourth_example", 1013.00)
""".format(table_id=table_id)

create_job = client.query(query)
# Wait for the multi-statement job to finish before the table is read.
create_job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fec32790750>

In [33]:
# SklearnModelHandlerNumpy accepts only unkeyed examples.
# Pass model_file_type=ModelFileType.JOBLIB instead if the model was
# serialized with joblib.
sklearn_model_handler = SklearnModelHandlerNumpy(
    model_uri=sklearn_model_filename,
    model_file_type=ModelFileType.PICKLE,
)

# from_dictionary is a classmethod, so it is called on the class directly.
pipeline_options = PipelineOptions.from_dictionary(
    {'temp_location': f'gs://{bucket}/tmp'}
)
pipeline = beam.Pipeline(options=pipeline_options)

# Define BigQuery table specification.
table_name = 'maths_problems_1'
table_spec = f'{project}:maths.{table_name}'

with pipeline as p:
  # Each BigQuery row arrives as a dict; keep only the numeric column,
  # wrapped in a list so every example is a one-feature vector.
  bq_rows = p | "ReadFromBQ" >> beam.io.ReadFromBigQuery(table=table_spec)
  examples = bq_rows | "ExtractInputs" >> beam.Map(lambda row: [row['value']])
  predictions = (
      examples
      | "RunInferenceSklearn" >> RunInference(model_handler=sklearn_model_handler)
  )
  # Print each PredictionResult as it is produced.
  predictions | beam.Map(print)

PredictionResult(example=[105.0], inference=array([525.]))
PredictionResult(example=[1000.0], inference=array([5000.]))
PredictionResult(example=[108.0], inference=array([540.]))
PredictionResult(example=[1013.0], inference=array([5065.]))


### Sklearn RunInference on keyed inputs.
1. Wrap the `SklearnModelHandlerNumpy` object in a `KeyedModelHandler` to handle keyed data.
2. Read the data from BigQuery.
3. Use the Sklearn trained model and the Sklearn RunInference transform on keyed data.

In [37]:
# Pass model_file_type=ModelFileType.JOBLIB instead if the model was
# serialized with joblib.
sklearn_model_handler = SklearnModelHandlerNumpy(
    model_uri=sklearn_model_filename,
    model_file_type=ModelFileType.PICKLE,
)
# KeyedModelHandler wraps the numpy handler so (key, example) tuples are accepted.
keyed_sklearn_model_handler = KeyedModelHandler(sklearn_model_handler)

# from_dictionary is a classmethod, so it is called on the class directly.
pipeline_options = PipelineOptions.from_dictionary(
    {'temp_location': f'gs://{bucket}/tmp'}
)
pipeline = beam.Pipeline(options=pipeline_options)

with pipeline as p:
  # Turn each BigQuery row dict into a (key, [value]) pair so the key
  # travels with its example through inference.
  bq_rows = p | "ReadFromBQ" >> beam.io.ReadFromBigQuery(table=table_spec)
  keyed_examples = (
      bq_rows
      | "ExtractInputs" >> beam.Map(lambda row: (row['key'], [row['value']]))
  )
  keyed_predictions = (
      keyed_examples
      | "RunInferenceSklearn" >> RunInference(model_handler=keyed_sklearn_model_handler)
  )
  # Print each (key, PredictionResult) tuple as it is produced.
  keyed_predictions | beam.Map(print)

('first_example', PredictionResult(example=[105.0], inference=array([525.])))
('third_example', PredictionResult(example=[1000.0], inference=array([5000.])))
('second_example', PredictionResult(example=[108.0], inference=array([540.])))
('fourth_example', PredictionResult(example=[1013.0], inference=array([5065.])))
