# Interactive Distributed Scikit-Learn with Ray on Vertex AI

## [0] Adjust the Ray version in Colab to match that of the Ray cluster

In [1]:
# Uncomment when running in Colab so the local Ray client matches the Ray version of the cluster (2.4.0)
#!pip install ray==2.4.0



## [1] Train model

In [2]:
import time
import numpy as np
import joblib
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray
import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.air import CheckpointConfig, ScalingConfig
from ray.util.joblib import register_ray
from ray import train
import tempfile

from google.cloud import bigquery, storage, aiplatform as vertex_ai


In [3]:
# Declare, initialize
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
REGION="us-central1"

aiplatform.init(project='ray-of-sunshine', location='us-central1')
RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires"

numerical_columns_list=["culmen_length_mm","culmen_depth_mm","flipper_length_mm","body_mass_g"]
categorical_columns_list=["island", "sex"]
feature_columns_list = ["island","culmen_length_mm","culmen_depth_mm","flipper_length_mm","body_mass_g","sex"]

import joblib, sys
sys.modules['sklearn.externals.joblib'] = joblib
from ray.util.joblib import register_ray
register_ray()

RUNTIME_ENV = {
  "pip": [
      "google-cloud-aiplatform[ray]==1.40.0",
      "ray[data]==2.4.0",
      "ray[train]==2.4.0",
      "ray[tune]==2.4.0",
      "scikit-learn==1.2.2",
      "google-cloud-bigquery",
      "google-cloud-aiplatform",
      "joblib",
      "pandas<2.0.0"
  ],
}
ray.shutdown()
ray.init(address=RAY_ADDRESS,runtime_env=RUNTIME_ENV)

[Ray on Vertex AI]: Cluster State = State.RUNNING


0,1
Python version:,3.10.13
Ray version:,2.4.0
Vertex SDK version:,1.44.0
Dashboard:,c1068d42e788a11d-dot-us-central1.aiplatform-training.googleusercontent.com
Interactive Terminal Uri:,e21427b331ff6335-dot-us-central1.aiplatform-training.googleusercontent.com
Cluster Name:,ray-kicking-tires


In [4]:
def fnReturnTrainingPipeline_old():
  """Build the unfitted penguin-species pipeline, selecting columns by name.

  Columns are picked with the module-level ``numerical_columns_list`` and
  ``categorical_columns_list``, so the data passed to ``fit``/``predict`` must
  be a DataFrame carrying those column names.

  Returns:
    An unfitted sklearn ``Pipeline`` with the steps 'preprocessor', 'scaler'
    and 'model'.
  """
  # Categorical branch: fill gaps with the most frequent value, then one-hot encode
  categorical_steps = [
      ('cat_col_imputer', SimpleImputer(strategy='most_frequent')),
      ('cat_col_onehotencoder', OneHotEncoder(handle_unknown='ignore')),
  ]
  categorical_branch = Pipeline(steps=categorical_steps)

  # Numerical branch: fill gaps with the column mean
  numerical_branch = SimpleImputer(strategy='mean')

  # Route each column group through its own branch
  column_preprocessor = ColumnTransformer(transformers=[
      ('num_col_imputer', numerical_branch, numerical_columns_list),
      ('cat_col_preprocessor', categorical_branch, categorical_columns_list),
  ])

  # The scaler runs on everything the preprocessor emits, then feeds the forest
  return Pipeline(steps=[
      ('preprocessor', column_preprocessor),
      ('scaler', MinMaxScaler()),
      ('model', RandomForestClassifier(n_estimators=10)),
  ])

In [99]:
def fnReturnTrainingPipeline(n_estimators=10, random_state=None):
  """Build the unfitted penguin-species pipeline, selecting columns by position.

  Columns are addressed by their integer position within
  ``feature_columns_list`` rather than by name, so the fitted pipeline also
  accepts inputs that carry no column names (presumably the plain row lists
  sent to the serving endpoint -- confirm against the serving container).
  The input columns must therefore be in ``feature_columns_list`` order.

  Args:
    n_estimators: Number of trees in the random forest (default 10).
    random_state: Seed forwarded to ``RandomForestClassifier``; ``None``
      (default) keeps the unseeded behaviour, pass an int for reproducible fits.

  Returns:
    An unfitted sklearn ``Pipeline`` with the steps 'preprocessor', 'scaler'
    and 'model'.
  """
  # Derive the positional indices from the declared column lists instead of
  # hard-coding them: numerical -> [1, 2, 3, 4], categorical -> [0, 5].
  numerical_column_positions = [
      feature_columns_list.index(column_name) for column_name in numerical_columns_list
  ]
  categorical_column_positions = [
      feature_columns_list.index(column_name) for column_name in categorical_columns_list
  ]

  # Preprocessing of numerical data
  numerical_transformer = SimpleImputer(strategy='mean')
  numerical_scaler = MinMaxScaler()

  # Preprocessing for categorical data
  categorical_preprocessing_pipe = Pipeline(steps=[
      ('cat_col_imputer', SimpleImputer(strategy='most_frequent')),
      ('cat_col_onehotencoder', OneHotEncoder(handle_unknown='ignore'))
  ])

  # Bundle preprocessing for numerical imputer and categorical preprocessing pipeline
  preprocessor = ColumnTransformer(
      transformers=[
          ('num_col_imputer', numerical_transformer, numerical_column_positions),
          ('cat_col_preprocessor', categorical_preprocessing_pipe, categorical_column_positions)
      ])

  random_forest_model = RandomForestClassifier(
      n_estimators=n_estimators, random_state=random_state)

  # Bundle preprocessing and modeling code in a pipeline
  penguin_training_pipeline = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('scaler', numerical_scaler),
      ('model', random_forest_model)])

  return penguin_training_pipeline

In [100]:
# Read training data from BigQuery (plain I/O, so it stays outside the Ray joblib backend)
client = bigquery.Client()
source_df = client.query("SELECT * FROM `ray_lab_ds.penguins_curated`").to_dataframe()

# Features, selected explicitly so their order matches the positional column
# indices used by the pipeline regardless of the table's column order
X = source_df[feature_columns_list]

# Label
Y = source_df['species']

# Split into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 123)

# Preprocessing + model pipeline (unfitted)
penguin_training_pipeline = fnReturnTrainingPipeline()

# Only joblib-backed scikit-learn work inside this block is dispatched to the Ray cluster
with joblib.parallel_backend('ray'):

  # Fit the model
  penguin_training_pipeline.fit(X_train, Y_train)

  # Testing
  penguin_predictions = penguin_training_pipeline.predict(X_test)

print('Accuracy : ', accuracy_score(Y_test, penguin_predictions))
print('F1 Score : ', f1_score(Y_test, penguin_predictions, average = 'weighted'))
print('Precision : ', precision_score(Y_test, penguin_predictions , average = 'weighted'))
print('Recall : ', recall_score(Y_test, penguin_predictions, average = 'weighted'))

Accuracy :  0.9767441860465116
F1 Score :  0.9767441860465116
Precision :  0.9767441860465116
Recall :  0.9767441860465116


## [2] Persist the model to Cloud Storage

In [101]:
import joblib

# Serialise the fitted pipeline to a local joblib file
model_filename = "model.joblib"
local_path = model_filename
joblib.dump(penguin_training_pipeline, local_path)

# Destination in Cloud Storage: <bucket_id>/<bucket_path>/<model_filename>
bucket_id = f"ray_lab_model_bucket_{PROJECT_NBR}"
bucket_path = "penguin_species_predictor_ray"
model_bucket_fq_gcs_uri = f"{bucket_id}/{bucket_path}"

# Copy the local artifact into the bucket
gcs_client = storage.Client()
bucket = gcs_client.bucket(bucket_id)
blob = bucket.blob(f"{bucket_path}/{model_filename}")
blob.upload_from_filename(model_filename)

## [3] Register the model into Vertex AI Model Registry

In [102]:
# Serving settings for the registered model
MODEL_NAME = model_filename
DEPLOY_COMPUTE = "n1-standard-4"
DEPLOY_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest"

# Folder in Cloud Storage that holds the uploaded model artifact
model_artifact_uri = f"gs://{model_bucket_fq_gcs_uri}"

# Upload the artifact folder as a model in the Vertex AI Model Registry
registered_model_reference = vertex_ai.Model.upload(
    artifact_uri=model_artifact_uri,
    serving_container_image_uri=DEPLOY_IMAGE_URI,
    display_name="penguin_species_predictor_ray_model",
)

registered_model_reference

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/567162267085/locations/us-central1/models/8967829041885216768/operations/3209145721861177344
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/567162267085/locations/us-central1/models/8967829041885216768@1
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/567162267085/locations/us-central1/models/8967829041885216768@1')


<google.cloud.aiplatform.models.Model object at 0x7f1d395d80a0> 
resource name: projects/567162267085/locations/us-central1/models/8967829041885216768

## [4] Online serving

### 4.1. Create a Vertex AI model serving endpoint resource

In [106]:
# Create an endpoint resource in the lab project/region; the model is deployed to it in a later cell
endpoint_settings = {
    "display_name": "penguins_species_predictor_ray_endpoint_resource",
    "project": PROJECT_ID,
    "location": REGION,
}
endpoint = vertex_ai.Endpoint.create(**endpoint_settings)

print(endpoint)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/567162267085/locations/us-central1/endpoints/2736787795627474944/operations/2652951167880921088
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/567162267085/locations/us-central1/endpoints/2736787795627474944
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/567162267085/locations/us-central1/endpoints/2736787795627474944')


<google.cloud.aiplatform.models.Endpoint object at 0x7f1d5afbdd50> 
resource name: projects/567162267085/locations/us-central1/endpoints/2736787795627474944


### 4.2. Deploy the model to the endpoint

Take a power nap while the model deploys; this step may take an hour.

In [107]:
# Show the endpoint handle that the model is about to be deployed to
print(endpoint)

<google.cloud.aiplatform.models.Endpoint object at 0x7f1d5afbdd50> 
resource name: projects/567162267085/locations/us-central1/endpoints/2736787795627474944


In [108]:
# Show the registered model handle that is about to be deployed
print(registered_model_reference)

<google.cloud.aiplatform.models.Model object at 0x7f1d395d80a0> 
resource name: projects/567162267085/locations/us-central1/models/8967829041885216768


In [109]:
# Machine type backing the deployed model
DEPLOY_COMPUTE = "n1-standard-4"

# Deploy the registered model onto the endpoint created above
deployment_options = {
    "model": registered_model_reference,
    "deployed_model_display_name": "penguins_species_predictor_ray_model_ep",
    "machine_type": DEPLOY_COMPUTE,
}
endpoint.deploy(**deployment_options)

INFO:google.cloud.aiplatform.models:Deploying Model projects/567162267085/locations/us-central1/models/8967829041885216768 to Endpoint : projects/567162267085/locations/us-central1/endpoints/2736787795627474944
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/567162267085/locations/us-central1/endpoints/2736787795627474944/operations/9216947624773419008
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/567162267085/locations/us-central1/endpoints/2736787795627474944


## [5] Online predictions

### 5.1. Get some penguin data to run predictions on

We will take the first 3 records of the test split, together with their actual species labels.

In [147]:
# First three held-out rows: features and their true labels, side by side
prediction_ready_features = X_test.head(3)
prediction_ready_label = Y_test.head(3)
test_recordset_df = pd.concat(
    [prediction_ready_features, prediction_ready_label],
    axis="columns",
)
test_recordset_df

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
309,Torgersen,42.8,18.5,195.0,4250.0,MALE,Adelie
234,Biscoe,42.6,13.7,213.0,4950.0,FEMALE,Gentoo
143,Biscoe,44.5,14.3,216.0,4100.0,,Gentoo


### 5.2. Test Prediction 1

In [150]:
# Grab the first record (kept as a one-row DataFrame by slicing rather than indexing)
test_record_1_df=test_recordset_df.iloc[0:1]
test_record_1_df

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
309,Torgersen,42.8,18.5,195.0,4250.0,MALE,Adelie


In [160]:
# Drop species label: it is the last column of the record, so keep every column but the final one
test_features_1_df = test_record_1_df.iloc[:,:-1]

In [152]:
# Predicting directly with the in-memory model pipeline we fitted (no call to the Vertex AI endpoint)
penguin_training_pipeline.predict(test_features_1_df)

array(['Adelie'], dtype=object)

In [161]:
# Send the feature row to the deployed Vertex AI endpoint as a plain list of values
instances = test_features_1_df.values.tolist()
prediction = endpoint.predict(instances=instances)
# One instance was sent, so the first entry of `predictions` is its predicted species
print('The species predicted for the Penguin attributes is {}'.format(prediction.predictions[0]))

The species predicted for the Penguin attributes is Adelie


### 5.3. Test Prediction 2

In [162]:
# Grab the second record (kept as a one-row DataFrame by slicing rather than indexing)
test_record_2_df=test_recordset_df.iloc[1:2]
test_record_2_df

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
234,Biscoe,42.6,13.7,213.0,4950.0,FEMALE,Gentoo


In [165]:
# Drop species label: it is the last column of the record, so keep every column but the final one
test_features_2_df = test_record_2_df.iloc[:,:-1]

In [166]:
# Send the feature row to the deployed Vertex AI endpoint as a plain list of values
instances = test_features_2_df.values.tolist()
prediction = endpoint.predict(instances=instances)
# One instance was sent, so the first entry of `predictions` is its predicted species
print('The species predicted for the Penguin attributes is {}'.format(prediction.predictions[0]))

The species predicted for the Penguin attributes is Gentoo


### 5.4. Test Prediction 3

In [167]:
# Grab the third record (kept as a one-row DataFrame by slicing rather than indexing)
test_record_3_df=test_recordset_df.iloc[2:3]
test_record_3_df

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
143,Biscoe,44.5,14.3,216.0,4100.0,,Gentoo


In [169]:
# Drop species label: it is the last column of the record, so keep every column but the final one
test_features_3_df = test_record_3_df.iloc[:,:-1]

In [170]:
# Send the feature row to the deployed Vertex AI endpoint as a plain list of values
instances = test_features_3_df.values.tolist()
prediction = endpoint.predict(instances=instances)
# One instance was sent, so the first entry of `predictions` is its predicted species
print('The species predicted for the Penguin attributes is {}'.format(prediction.predictions[0]))

The species predicted for the Penguin attributes is Gentoo


This concludes the lab module. Proceed to the lab guide of the [next module](https://github.com/anagha-google/ray-labs/blob/main/01-sklearn/module-08-ray-train-sklearn-serve-batch-bqml-README.md).