# Interactive Distributed Scikit-Learn with Ray on Vertex AI

In [2]:
import time
import numpy as np
import joblib
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray
import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.air import CheckpointConfig, ScalingConfig
from ray.util.joblib import register_ray


In [3]:
# Declare, initialize
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
REGION="us-central1"

aiplatform.init(project='ray-of-sunshine', location='us-central1')
RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires-cluster"

In [6]:
import sys

import joblib
from ray.util.joblib import register_ray

# Alias the legacy `sklearn.externals.joblib` module path to the standalone
# joblib module, so imports of the old path resolve to it.
sys.modules['sklearn.externals.joblib'] = joblib

# pip requirements handed to Ray as the runtime environment for this session.
RUNTIME_ENV = {
    "pip": [
        "google-cloud-aiplatform[ray]==1.40.0",
        "ray[data]==2.4.0",
        "ray[train]==2.4.0",
        "ray[tune]==2.4.0",
        "scikit-learn==1.2.2",
        "google-cloud-bigquery",
        "google-cloud-aiplatform",
        "joblib",
        "pandas<2.0.0",
    ],
}

# Register Ray with joblib so the 'ray' backend name can be selected later.
register_ray()

# Drop any connection left over from a previous run before attaching to the
# cluster at RAY_ADDRESS.
ray.shutdown()
ray.init(address=RAY_ADDRESS, runtime_env=RUNTIME_ENV)

# Train and evaluate a random-forest penguin species classifier.
# Data loading and pipeline construction run locally in the notebook process;
# only the joblib-backed work inside the `with` block further down is
# dispatched through the Ray backend.

# Column listing
numerical_columns_list = ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g"]
categorical_columns_list = ["island", "sex"]

# Read training data from BigQuery
client = bigquery.Client()
source_df = client.query("SELECT * FROM `ray_lab_ds.penguins_curated`").to_dataframe()

# Features
X = source_df.drop(columns=['species'])

# Label
Y = source_df['species']

# Split into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=123)

# Preprocessing of numerical data: impute missing values, then scale.
# The scaler lives in this numeric-only pipeline so that it never sees the
# one-hot encoded categorical columns (MinMaxScaler rejects sparse input).
numerical_transformer = SimpleImputer(strategy='mean')
numerical_scaler = MinMaxScaler()
numerical_preprocessing_pipe = Pipeline(steps=[
    ('num_col_imputer', numerical_transformer),
    ('num_col_scaler', numerical_scaler),
])

# Preprocessing for categorical data
categorical_preprocessing_pipe = Pipeline(steps=[
    ('cat_col_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_col_onehotencoder', OneHotEncoder(handle_unknown='ignore')),
])

# Bundle the numerical and categorical preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num_col_preprocessor', numerical_preprocessing_pipe, numerical_columns_list),
        ('cat_col_preprocessor', categorical_preprocessing_pipe, categorical_columns_list),
    ])

random_forest_model = RandomForestClassifier(n_estimators=10)

# Bundle preprocessing and modeling code in a pipeline
penguin_training_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', random_forest_model),
])

# Only joblib-parallelised work (e.g. the forest's per-tree fit/predict) is
# routed to Ray by this context; everything else runs as ordinary Python.
with joblib.parallel_backend('ray'):
  penguin_training_pipeline.fit(X_train, Y_train)

  # Testing
  penguin_predictions = penguin_training_pipeline.predict(X_test)

print('Accuracy : ', accuracy_score(Y_test, penguin_predictions))
print('F1 Score : ', f1_score(Y_test, penguin_predictions, average = 'weighted'))
print('Precision : ', precision_score(Y_test, penguin_predictions , average = 'weighted'))
print('Recall : ', recall_score(Y_test, penguin_predictions, average = 'weighted'))

[Ray on Vertex AI]: Cluster State = State.RUNNING
Accuracy :  1.0
F1 Score :  1.0
Precision :  1.0
Recall :  1.0
