# Module 6: Job API submission of distributed Scikit-Learn on Ray on Vertex AI

In this module, we will submit the same training job from Module 5 to the Ray cluster through the Ray Jobs API.

## [0] Adjust Ray version in Colab to match cluster as needed

In [1]:
# Optional: uncomment to install Ray 2.4.0 locally (the same version pinned in this lab's requirements.txt).
#!pip install ray==2.4.0

## [1] Local setup to submit job to Ray cluster

In [2]:
from pathlib import Path as path

# Local layout for this lab: <cwd>/ray_lab_local_dir/code
root_path = path.cwd()
ray_lab_local_dir = root_path.joinpath("ray_lab_local_dir")
script_path = ray_lab_local_dir.joinpath("code")

# Create both directory levels; nothing happens if they already exist.
script_path.mkdir(exist_ok=True, parents=True)

## [2] Create the training script and persist locally

In [3]:
# Ask the gcloud CLI for the active project ID. `!cmd` captures the command's
# output as a list of lines, so element [0] is the first line printed.
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]

# Look up the project number that corresponds to PROJECT_ID.
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
# Region value substituted for the _REPLACE_REGION_ placeholder in the script template.
REGION="us-central1"

# Template for the job's entrypoint script (written to task.py further down).
# The _REPLACE_PROJECT_ID_, _REPLACE_PROJECT_NBR_ and _REPLACE_REGION_ tokens
# are placeholders that get substituted before the file is written.
training_script = """
import time, sys, joblib
import numpy as np, pandas as pd
import ray

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from google.cloud import bigquery, aiplatform
from google.cloud.aiplatform.preview import vertex_ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig, CheckpointConfig, ScalingConfig
from ray.util.joblib import register_ray

RAY_ADDRESS="vertex_ray://projects/_REPLACE_PROJECT_NBR_/locations/_REPLACE_REGION_/persistentResources/ray-kicking-tires"

sys.modules['sklearn.externals.joblib'] = joblib

aiplatform.init(project="_REPLACE_PROJECT_ID_", location="_REPLACE_REGION_")
register_ray()

ray.shutdown()
ray.init()

# Inside this context, scikit-learn operations that parallelize via joblib are
# dispatched to Ray; all other statements run normally in this script
with joblib.parallel_backend('ray'):

  # Column listing
  numerical_columns_list=["culmen_length_mm","culmen_depth_mm","flipper_length_mm","body_mass_g"]
  categorical_columns_list=["island", "sex"]

  # Read training data from BigQuery
  client = bigquery.Client(project="_REPLACE_PROJECT_ID_")
  source_df = client.query("SELECT * FROM `ray_lab_ds.penguins_curated`").to_dataframe()

  # Features
  X = source_df.drop(columns = ['species'])

  # Label
  Y = source_df['species']

  # Split into train and test data
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 123)

  # Preprocessing of numerical data
  numerical_transformer = SimpleImputer(strategy='mean')
  numerical_scaler = MinMaxScaler()

  # Preprocessing for categorical data
  categorical_preprocessing_pipe = Pipeline(steps=[
      ('cat_col_imputer', SimpleImputer(strategy='most_frequent')),
      ('cat_col_onehotencoder', OneHotEncoder(handle_unknown='ignore'))
  ])

  # Bundle preprocessing for numerical imputer and categorical preprocessing pipeline
  preprocessor = ColumnTransformer(
      transformers=[
          ('num_col_imputer', numerical_transformer, numerical_columns_list),
          ('cat_col_preprocessor', categorical_preprocessing_pipe, categorical_columns_list)
      ])

  # Seeded like train_test_split above so repeated runs report the same metrics
  random_forest_model = RandomForestClassifier(n_estimators=10, random_state=123)

  # Bundle preprocessing and modeling code in a pipeline
  penguin_training_pipeline = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('scaler', numerical_scaler),
      ('model', random_forest_model)])

  penguin_training_pipeline.fit(X_train, Y_train)

  # Testing
  penguin_predictions = penguin_training_pipeline.predict(X_test)

  print('Accuracy : ', accuracy_score(Y_test, penguin_predictions))
  print('F1 Score : ', f1_score(Y_test, penguin_predictions, average = 'weighted'))
  print('Precision : ', precision_score(Y_test, penguin_predictions , average = 'weighted'))
  print('Recall : ', recall_score(Y_test, penguin_predictions, average = 'weighted'))
"""

# Fill in the template placeholders with this project's ID, number and region.
training_script = training_script.replace("_REPLACE_PROJECT_ID_", PROJECT_ID)
training_script = training_script.replace("_REPLACE_PROJECT_NBR_", PROJECT_NBR)
training_script = training_script.replace("_REPLACE_REGION_", REGION)

# Persist the rendered script as task.py; the `with` block closes the file
# on exit, so no explicit close() call is needed afterwards.
with open(script_path / "task.py", "w") as f:
    f.write(training_script)

## [3] Create the requirements file and persist locally

In [4]:
# Package list written to requirements.txt, which the job submission cell
# passes to Ray through the runtime_env "pip" setting.
requirements = """
google-cloud-aiplatform[ray]==1.40.0
ray[data]==2.4.0
ray[train]==2.4.0
ray[tune]==2.4.0
scikit-learn==1.2.2
google-cloud-bigquery
google-cloud-aiplatform
joblib
pandas<2.0.0
db-dtypes
"""

# The `with` block closes the file on exit, so no explicit close() call is needed.
with open(ray_lab_local_dir / "requirements.txt", "w") as f:
    f.write(requirements)

## [4] Submit script to Ray cluster using jobs API & poll for completion

In [5]:
from google.cloud import aiplatform as vertex_ai
import random, string, time, ray
from ray.job_submission import JobSubmissionClient, JobStatus
import vertex_ray
from vertex_ray import Resources
from datetime import datetime

# Re-derive the project ID and project number from the gcloud CLI. `!cmd`
# captures the command's output as a list of lines; [0] takes the first line.
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
# Region used for vertex_ai.init and for the cluster resource name built in this cell.
REGION="us-central1"
# Names of the Ray cluster and of the staging bucket used by this lab.
CLUSTER_NAME = "ray-kicking-tires"
AIP_BUCKET_URI = f"gs://ray_lab_log_bucket_{PROJECT_NBR}/"

# Resource name of the cluster's persistent resource.
RAY_CLUSTER_RESOURCE_NAME = 'projects/{}/locations/{}/persistentResources/{}'.format(
    PROJECT_NBR,
    REGION,
    CLUSTER_NAME,
)

vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=AIP_BUCKET_URI,
)

# Job submission client addressed at the cluster through the vertex_ray:// scheme.
ray_job_address = "vertex_ray://{}".format(RAY_CLUSTER_RESOURCE_NAME)
ray_client = JobSubmissionClient(ray_job_address)

[Ray on Vertex AI]: Cluster State = State.RUNNING


In [6]:
# Values interpolated into the job's entrypoint command and submission ID.
EXPERIMENT_NAME = "penguin_species_predictor"
RAY_JOB_LOGGING_URI = f"gs://ray_lab_log_bucket_{PROJECT_NBR}/logs"
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# Random four-character suffix (lowercase letters and digits) appended after the timestamp.
suffix_alphabet = string.ascii_lowercase + string.digits
job_id_suffix = "".join(random.choices(suffix_alphabet, k=4))

submission_id = f"penguin-trainer-{TIMESTAMP}-{job_id_suffix}"
entrypoint = f"python3 task.py --experiment-name={EXPERIMENT_NAME} --num-workers=2 --logging-dir={RAY_JOB_LOGGING_URI}"

# "pip" points at the local requirements.txt; "working_dir" is the local
# folder where task.py was written.
runtime_env = {
    "pip": {"packages": str(ray_lab_local_dir / "requirements.txt")},
    "working_dir": str(script_path),
}

job_id = ray_client.submit_job(
    submission_id=submission_id,
    entrypoint=entrypoint,
    runtime_env=runtime_env,
)

INFO:ray.dashboard.modules.dashboard_sdk:Uploading package gcs://_ray_pkg_8d0add2c0100ab76.zip.


In [7]:
# Poll the job once a minute until it reaches a terminal state.
while True:
    job_status = ray_client.get_job_status(job_id)
    if job_status == JobStatus.SUCCEEDED:
        print("Job succeeded!")
        break
    if job_status == JobStatus.FAILED:
        print("Job failed!")
        break
    if job_status == JobStatus.STOPPED:
        # STOPPED is terminal too; without this branch a stopped job would
        # keep the loop printing "Job is running..." forever.
        print("Job stopped!")
        break
    print("Job is running...")
    time.sleep(60)

Job is running...
Job is running...
Job succeeded!


## [5] Model metrics in the job logs

Navigate to the Ray Dashboard of your Ray on Vertex AI cluster.

1.   Click on the jobs tab
2.   Click on the job associated with your submission
3.   Click on the logs link
4.   Click on the driver log
5.   You should see the model metrics



This concludes the lab module. Proceed to the lab guide for the [next module](https://github.com/anagha-google/ray-labs/blob/main/01-sklearn/module-07-ray-train-sklearn-serve-online-vertex-endpoint-README.md).