# Module 09: Train sklearn classifier on Ray, batch predict with BQML

In [1]:
#pip install skl2onnx==1.15



In [5]:
import time
import numpy as np
import joblib
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray
import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.air import CheckpointConfig, ScalingConfig
from ray.util.joblib import register_ray
from ray import train
import tempfile

from google.cloud import bigquery, storage, aiplatform as vertex_ai


## [1] Train model with sklearn on Ray on Vertex

The pipeline here is slightly different in order to accommodate an sklearn-onnx limitation: it does not support imputers for categorical values. So we impute missing categorical values before training.

In [4]:
# Declare, initialize
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
REGION="us-central1"

aiplatform.init(project='ray-of-sunshine', location='us-central1')
RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires-cluster"

import joblib, sys
sys.modules['sklearn.externals.joblib'] = joblib
from ray.util.joblib import register_ray
register_ray()

RUNTIME_ENV = {
  "pip": [
      "google-cloud-aiplatform[ray]==1.40.0",
      "ray[data]==2.4.0",
      "ray[train]==2.4.0",
      "ray[tune]==2.4.0",
      "scikit-learn==1.2.2",
      "google-cloud-bigquery",
      "google-cloud-aiplatform",
      "joblib",
      "pandas<2.0.0"
  ],
}
ray.shutdown()
ray.init(address=RAY_ADDRESS,runtime_env=RUNTIME_ENV)

# The below statement will parallelize all code placed below it
with joblib.parallel_backend('ray'):

  # Column listing
  numerical_columns_list=["culmen_length_mm","culmen_depth_mm","flipper_length_mm","body_mass_g"]
  categorical_columns_list=["island", "sex"]

  # Read training data from BigQuery
  client = bigquery.Client()
  source_df = client.query("SELECT * FROM `ray_lab_ds.penguins_curated`").to_dataframe()

  # Features
  X = source_df.drop(columns = ['species'])

  # sklearn-onnx does not support imputer for categorical variables, so, we need to handle them ahead of time
  # Impute missing values with most frequent value where not null
  island_most_frequent=X["island"].describe()["top"]
  X["island"].fillna(island_most_frequent, inplace=True)

  sex_most_frequent=X["sex"].describe()["top"]
  X["sex"].fillna("missing", inplace=True)

  # Label
  Y = source_df['species']

  # Split into train and test data
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 123)


  # Preprocessing of numerical data
  numerical_transformer = SimpleImputer(strategy='mean')
  numerical_scaler = MinMaxScaler()

  # Preprocessing for categorical data
  categorical_preprocessing_pipe = Pipeline(steps=[
      #('cat_col_imputer', SimpleImputer(strategy='most_frequent')),
      ('cat_col_onehotencoder', OneHotEncoder(handle_unknown='ignore'))
  ])

  # Bundle preprocessing for numerical imputer and categorical preprocessing pipeline
  preprocessor = ColumnTransformer(
      transformers=[
          ('num_col_imputer', numerical_transformer, numerical_columns_list),
          ('cat_col_preprocessor', categorical_preprocessing_pipe, categorical_columns_list)
      ])

  random_forest_model = RandomForestClassifier(n_estimators=10)

  # Bundle preprocessing and modeling code in a pipeline
  penguin_training_pipeline = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('scaler', numerical_scaler),
      ('model', random_forest_model)])

  penguin_training_pipeline.fit(X_train, Y_train)

  # Testing
  penguin_predictions = penguin_training_pipeline.predict(X_test)
  penguin_predictions

  print('Accuracy : ', accuracy_score(Y_test, penguin_predictions))
  print('F1 Score : ', f1_score(Y_test, penguin_predictions, average = 'weighted'))
  print('Precision : ', precision_score(Y_test, penguin_predictions , average = 'weighted'))
  print('Recall : ', recall_score(Y_test, penguin_predictions, average = 'weighted'))

[Ray on Vertex AI]: Cluster State = State.RUNNING
Accuracy :  1.0
F1 Score :  1.0
Precision :  1.0
Recall :  1.0


## [2] Persist the model to ONNX format in Cloud Storage

In [27]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from pathlib import Path as path
import os

# Directory the notebook is running from; the ONNX file is written here.
root_path = path.cwd()
# Disable zipmap as it is not supported in BigQuery ML.
options = {id(penguin_training_pipeline): {'zipmap': False}}

# Define input features. scikit-learn does not store information about the
# training dataset. It is not always possible to retrieve the number of features
# or their types. That's why the function needs another argument called initial_types.
# Each feature is declared as its own single-column input: strings for the
# categorical columns, floats for the numerical ones.
initial_types = [
    ('island', StringTensorType([None, 1])),
    ('culmen_length_mm', FloatTensorType([None, 1])),
    ('culmen_depth_mm', FloatTensorType([None, 1])),
    ('flipper_length_mm', FloatTensorType([None, 1])),
    ('body_mass_g', FloatTensorType([None, 1])),
    ('sex', StringTensorType([None, 1])),
]
onnx_model_filename="penguin_species_predictor_pop_onnx.onnx"

# Convert the model.
onnx_model = convert_sklearn(penguin_training_pipeline, onnx_model_filename, initial_types=initial_types, options=options,target_opset=8)

# And save.
# The filename is joined without a leading '/': os.path.join discards every earlier
# component when a later one is absolute, which would place the file at the
# filesystem root instead of under root_path.
onnx_model_local_fqp=os.path.join(root_path, onnx_model_filename)

with open(onnx_model_local_fqp, 'wb') as f:
  f.write(onnx_model.SerializeToString())

In [28]:
# Destination in Cloud Storage: a per-project bucket plus an object prefix for the ONNX artifact
bucket_id=f"ray_lab_model_bucket_{PROJECT_NBR}"
bucket_path = "penguin_classifer_model_onnx"
model_bucket_fq_gcs_uri="{}/{}".format(bucket_id,bucket_path)

# Object name is "<prefix>/<onnx file name>" inside the bucket
onnx_blob_name = '{}/{}'.format(bucket_path, onnx_model_filename)

# Push the locally saved ONNX file to the bucket
gcs_client = storage.Client()
bucket = gcs_client.bucket(bucket_id)
blob = bucket.blob(onnx_blob_name)
blob.upload_from_filename(onnx_model_local_fqp)

## [3] Import ONNX model into BQML

In [29]:
%%bigquery

-- Create (or replace) the BigQuery ML model `ray_lab_ds.penguin_species_predictor`
-- from the ONNX file(s) found under the Cloud Storage prefix given in MODEL_PATH.
-- NOTE(review): the bucket name below embeds a literal project number, while the
-- upload cell builds the bucket name from PROJECT_NBR; confirm the two match
-- for the project this notebook is run in.
 CREATE OR REPLACE MODEL `ray_lab_ds.penguin_species_predictor`
  OPTIONS (MODEL_TYPE='ONNX',
   MODEL_PATH='gs://ray_lab_model_bucket_567162267085/penguin_classifer_model_onnx/*')

Query is running:   0%|          |

## [4] Batch predict with BQML

### Run batch predictions

In [37]:
 %%bigquery

-- Score a small sample of the public penguins table with the imported model and
-- show the actual species next to the predicted one.
WITH PREDICTIONS AS(
 SELECT *
   FROM ML.PREDICT(MODEL ray_lab_ds.penguin_species_predictor,
     -- REGEXP_EXTRACT keeps only the first run of word characters of `species`.
     -- LIMIT 5 has no ORDER BY, so which five rows are scored is presumably not fixed.
     (SELECT REGEXP_EXTRACT(species, r'\w+') species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, sex
      FROM bigquery-public-data.ml_datasets.penguins LIMIT 5))
-- `label` and `probabilities` are not selected by the inner query, so they come
-- from the ML.PREDICT output; `species` is the value passed through from the input.
) SELECT island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, sex, species as label_species, label as predicted_species, probabilities
  FROM PREDICTIONS


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,label_species,predicted_species,probabilities
0,Dream,36.6,18.4,184.0,3475.0,FEMALE,Adelie,Adelie,"[1.0000001192092896, 0.0, 0.0]"
1,Dream,39.8,19.1,184.0,4650.0,MALE,Adelie,Adelie,"[0.9000000953674316, 0.0, 0.10000000149011612]"
2,Dream,40.9,18.9,184.0,3900.0,MALE,Adelie,Adelie,"[1.0000001192092896, 0.0, 0.0]"
3,Dream,46.5,17.9,192.0,3500.0,FEMALE,Chinstrap,Chinstrap,"[0.0, 1.0000001192092896, 0.0]"
4,Dream,37.3,16.8,192.0,3000.0,FEMALE,Adelie,Adelie,"[1.0000001192092896, 0.0, 0.0]"
