# Interactive Distributed Scikit-Learn with Ray on Vertex AI

In [98]:
import time
import numpy as np
import joblib
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray
import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.air import CheckpointConfig, ScalingConfig
from ray.util.joblib import register_ray
from ray import train
import tempfile

from google.cloud import bigquery, storage, aiplatform as vertex_ai


## 1. Train model

In [99]:
# Declare, initialize
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
REGION="us-central1"

aiplatform.init(project='ray-of-sunshine', location='us-central1')
RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires-cluster"

numerical_columns_list=["culmen_length_mm","culmen_depth_mm","flipper_length_mm","body_mass_g"]
categorical_columns_list=["island", "sex"]
feature_columns_list = ["island","culmen_length_mm","culmen_depth_mm","flipper_length_mm","body_mass_g","sex"]

import joblib, sys
sys.modules['sklearn.externals.joblib'] = joblib
from ray.util.joblib import register_ray
register_ray()

RUNTIME_ENV = {
  "pip": [
      "google-cloud-aiplatform[ray]==1.40.0",
      "ray[data]==2.4.0",
      "ray[train]==2.4.0",
      "ray[tune]==2.4.0",
      "scikit-learn==1.2.2",
      "google-cloud-bigquery",
      "google-cloud-aiplatform",
      "joblib",
      "pandas<2.0.0"
  ],
}
ray.shutdown()
ray.init(address=RAY_ADDRESS,runtime_env=RUNTIME_ENV)

[Ray on Vertex AI]: Cluster State = State.RUNNING


0,1
Python version:,3.10.13
Ray version:,2.4.0
Vertex SDK version:,1.39.0
Dashboard:,755d3a0b41a330d0-dot-us-central1.aiplatform-training.googleusercontent.com
Interactive Terminal Uri:,87d1c5fdddbfd3fe-dot-us-central1.aiplatform-training.googleusercontent.com
Cluster Name:,ray-kicking-tires-cluster


In [None]:
def fnReturnTrainingPipeline():

  # Preprocessing of numerical data
  numerical_transformer = SimpleImputer(strategy='mean')
  numerical_scaler = MinMaxScaler()

  # Preprocessing for categorical data
  categorical_preprocessing_pipe = Pipeline(steps=[
      ('cat_col_imputer', SimpleImputer(strategy='most_frequent')),
      ('cat_col_onehotencoder', OneHotEncoder(handle_unknown='ignore'))
  ])

  # Bundle preprocessing for numerical imputer and categorical preprocessing pipeline
  preprocessor = ColumnTransformer(
      transformers=[
          ('num_col_imputer', numerical_transformer, numerical_columns_list),
          ('cat_col_preprocessor', categorical_preprocessing_pipe, categorical_columns_list)
      ])

  random_forest_model = RandomForestClassifier(n_estimators=10)

  # Bundle preprocessing and modeling code in a pipeline
  penguin_training_pipeline = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('scaler', numerical_scaler),
      ('model', random_forest_model)])

  return penguin_training_pipeline

In [100]:

# The below statement will parallelize all code placed below it
with joblib.parallel_backend('ray'):

  # Read training data from BigQuery
  client = bigquery.Client()
  source_df = client.query("SELECT * FROM `ray_lab_ds.penguins_curated`").to_dataframe()

  # Features
  X = source_df.drop(columns = ['species'])

  # Label
  Y = source_df['species']

  # Split into train and test data
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 123)

  # Model
  random_forest_model = RandomForestClassifier(n_estimators=10)

  # Preprocessor pipeline
  penguin_training_pipeline = fnReturnTrainingPipeline()

  # Fit the model
  penguin_training_pipeline.fit(X_train, Y_train)

  # Testing
  penguin_predictions = penguin_training_pipeline.predict(X_test)
  penguin_predictions

  print('Accuracy : ', accuracy_score(Y_test, penguin_predictions))
  print('F1 Score : ', f1_score(Y_test, penguin_predictions, average = 'weighted'))
  print('Precision : ', precision_score(Y_test, penguin_predictions , average = 'weighted'))
  print('Recall : ', recall_score(Y_test, penguin_predictions, average = 'weighted'))

Accuracy :  1.0
F1 Score :  1.0
Precision :  1.0
Recall :  1.0


## 2. Persist the model to Cloud Storage

In [None]:
import joblib

# Model file name
model_filename = "penguin_classifier.joblib"

# Persist locally
local_path = model_filename
joblib.dump(random_forest_model, local_path)

# Upload model artifact to Cloud Storage
bucket_id=f"ray_lab_model_bucket_{PROJECT_NBR}"
bucket_path = "penguin_classifer_model"
model_bucket_fq_gcs_uri="{}/{}".format(bucket_id,bucket_path)


# Upload the model to GCS
bucket = storage.Client().bucket(bucket_id)
blob = bucket.blob('{}/{}'.format(
    bucket_path,
    model_filename))
blob.upload_from_filename(model_filename)

## 3. Register the model into Vertex AI Model Registry

In [None]:
DEPLOY_IMAGE_URI="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest"
MODEL_NAME = model_filename
DEPLOY_COMPUTE = "n1-standard-4"

registered_model_reference = vertex_ai.Model.upload(
    display_name=MODEL_NAME,
    serving_container_image_uri=DEPLOY_IMAGE_URI,
    artifact_uri="gs://{}".format(model_bucket_fq_gcs_uri)
)

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/567162267085/locations/us-central1/models/874540553616752640/operations/5846310067430752256
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/567162267085/locations/us-central1/models/874540553616752640@1
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/567162267085/locations/us-central1/models/874540553616752640@1')


## 4. Online serving

### 4.1. Create a Vertex AI model serving endpoint

In [None]:
endpoint = vertex_ai.Endpoint.create(
    display_name="penguins_species_predictor_er",
    project=PROJECT_ID,
    location=REGION
)

print(endpoint)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/567162267085/locations/us-central1/endpoints/7334457239823974400/operations/3540467058217058304
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/567162267085/locations/us-central1/endpoints/7334457239823974400
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/567162267085/locations/us-central1/endpoints/7334457239823974400')


<google.cloud.aiplatform.models.Endpoint object at 0x78180db011e0> 
resource name: projects/567162267085/locations/us-central1/endpoints/7334457239823974400


### 4.2. Deploy the model to the endpoint

In [None]:
DEPLOY_COMPUTE = "n1-standard-4"

response = endpoint.deploy(
    model=registered_model_reference,
    deployed_model_display_name="penguins_species_predictor",
    machine_type=DEPLOY_COMPUTE,
)
print(endpoint)

INFO:google.cloud.aiplatform.models:Deploying Model projects/567162267085/locations/us-central1/models/874540553616752640 to Endpoint : projects/567162267085/locations/us-central1/endpoints/7334457239823974400
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/567162267085/locations/us-central1/endpoints/7334457239823974400/operations/6837664935405682688
INFO:google.cloud.aiplatform.models:Endpoint model deployed. Resource name: projects/567162267085/locations/us-central1/endpoints/7334457239823974400


<google.cloud.aiplatform.models.Endpoint object at 0x78180db011e0> 
resource name: projects/567162267085/locations/us-central1/endpoints/7334457239823974400


## 5. Online predictions

### 5.2. Get some penguin data to run predictions on

We will take 3 records

In [101]:
(prediction_ready_features, prediction_ready_label) = (X_test.head(3), Y_test.head(3))
pd.concat([prediction_ready_features, prediction_ready_label], axis=1)

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
309,Torgersen,42.1,19.1,195.0,4000.0,MALE,Adelie
234,Biscoe,42.6,13.7,213.0,4950.0,FEMALE,Gentoo
143,Biscoe,49.6,15.0,216.0,4750.0,MALE,Gentoo


### 5.3. Prediction 1

In [102]:
pd.concat([prediction_ready_features, prediction_ready_label], axis=1).iloc[0:1]

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
309,Torgersen,42.1,19.1,195.0,4000.0,MALE,Adelie


In [103]:
df = prediction_ready_features.iloc[0:1]
display(df)

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
309,Torgersen,42.1,19.1,195.0,4000.0,MALE


In [104]:
# Predicting directly with model we created
penguin_training_pipeline.predict(df)

array(['Adelie'], dtype=object)

In [91]:
prediction_df=endpoint.predict(instances=[[0.0,39.8,19.1,184.0,4650.0,0.0]])
species_encoded=round(prediction_df.predictions[0])
print('The species predicted for the Penguin attributes is {}'.format([species_encoded])[0])

InternalServerError: 500 {"detail":"The following exception has occurred: ValueError. Arguments: ('X has 6 features, but RandomForestClassifier is expecting 10 features as input.',)."}

In [49]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

transformer = make_column_transformer(
  (OneHotEncoder(), ['island', 'sex']),
  remainder='passthrough')

transformed = transformer.fit_transform(df)

transformed_df = pd.DataFrame(
  transformed, columns=['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex'])
display(transformed_df.head())

ValueError: Shape of passed values is (1, 7), indices imply (1, 6)

In [85]:
# Prediciting with the endpoint
penguin_processor_pipeline.fit(prediction_ready_features.iloc[0:1])
penguin_processor_pipeline.transform(prediction_ready_features.iloc[1:2])


prediction_df=endpoint.predict(instances=[penguin_processor_pipeline.transform(prediction_ready_features.iloc[1:2])])
species_encoded=round(prediction_df.predictions[0])
print('The species predicted for the Penguin attributes is {}'.format([species_encoded])[0])

ValueError: Unable to coerce value: array([[ 5.0e-01, -5.4e+00,  1.8e+01,  9.5e+02, -1.0e+00, -1.0e+00]])

In [None]:
prediction_ready_feature_list=penguin_processor_pipeline.transform(prediction_ready_features.iloc[1:2]).tolist()
prediction_ready_feature_list

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Predicting via online prediction service
prediction_df=endpoint.predict(instances=[prediction_ready_feature_list])
species_encoded=round(prediction_df.predictions[0])
print('The species predicted for the Penguin attributes is {}'.format([species_encoded])[0])

### Test record 2

In [None]:
pd.concat([predictions_features, predictions_label], axis=1).iloc[1:2]

### Test record 3

In [None]:
pd.concat([predictions_features, predictions_label], axis=1).iloc[2:3]

In [21]:
prediction_df=endpoint.predict(instances=[["Dream",39.8,19.1,184.0,4650.0,"MALE"]])
species_encoded=round(prediction_df.predictions[0])
print('The species predicted for the Penguin attributes is {}'.format([species_encoded])[0])

InternalServerError: 500 {"detail":"The following exception has occurred: ValueError. Arguments: (\"could not convert string to float: 'Dream'\",)."}

In [None]:
prediction_df=endpoint.predict(instances=[[0,0.000000,0.285714,0.271186,0.097222,0]])
species_encoded=round(prediction_df.predictions[0])
print('The species predicted for the Penguin attributes is {}'.format(le.inverse_transform([species_encoded])[0]))

In [None]:
prediction_df=endpoint.predict(instances=[[0,0.000000,0.285714,0.271186,0.097222,0]])
species_encoded=round(prediction_df.predictions[0])
print('The species predicted for the Penguin attributes is {}'.format(le.inverse_transform([species_encoded])[0]))