# Module 04: Primer on parallelizing Scikit-Learn RandomizedSearchCV on Ray

This RandomizedSearchCV sample is adapted from the Ray documentation and shows how to parallelize scikit-learn with Ray; in this lab it runs on a Ray on Vertex AI cluster.



Ray documentation: https://docs.ray.io/en/latest/ray-more-libs/joblib.html

## Part 1. Sample in interactive mode

In [1]:
# %pip install ray==2.4.0  # Uncomment if needed: the Ray version in Colab must match the version on the Ray on Vertex AI cluster


In [2]:
import ray
import sklearn

# Report the locally installed library versions.
sklearn_version = sklearn.__version__
ray_version = ray.__version__
print('The scikit-learn version is {}.'.format(sklearn_version))
print('The ray version is {}.'.format(ray_version))


The scikit-learn version is 1.2.2.
The ray version is 2.4.0.


In [3]:
import numpy as np
import joblib

from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.util.joblib import register_ray

from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray

In [4]:
# Declare, initialize
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
REGION="us-central1"
RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires"

In [5]:
# Initialize the Vertex AI SDK with the project and region configured above.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
)

### 1.1. **Without** Ray's parallelization
Takes ~5 minutes

In [6]:
# Baseline: run the randomized hyperparameter search with scikit-learn alone
# (no Ray backend), so all 1500 fits execute in the local runtime.
RANDOM_STATE = 42  # fixed seed so every run of this cell samples the same candidates

digits = load_digits()
param_space = {
    'C': np.logspace(-6, 6, 30),
    'gamma': np.logspace(-8, 8, 30),
    'tol': np.logspace(-4, -1, 30),
    'class_weight': [None, 'balanced'],
}
model = SVC(kernel='rbf')
search = RandomizedSearchCV(
    model,
    param_space,
    cv=5,
    n_iter=300,
    verbose=10,
    random_state=RANDOM_STATE,
)
search.fit(digits.data, digits.target)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV 1/5; 1/300] START C=22122.1629107045, class_weight=None, gamma=85.31678524172814, tol=0.014873521072935119
[CV 1/5; 1/300] END C=22122.1629107045, class_weight=None, gamma=85.31678524172814, tol=0.014873521072935119;, score=0.100 total time=   0.3s
[CV 2/5; 1/300] START C=22122.1629107045, class_weight=None, gamma=85.31678524172814, tol=0.014873521072935119
[CV 2/5; 1/300] END C=22122.1629107045, class_weight=None, gamma=85.31678524172814, tol=0.014873521072935119;, score=0.100 total time=   0.4s
[CV 3/5; 1/300] START C=22122.1629107045, class_weight=None, gamma=85.31678524172814, tol=0.014873521072935119
[CV 3/5; 1/300] END C=22122.1629107045, class_weight=None, gamma=85.31678524172814, tol=0.014873521072935119;, score=0.103 total time=   0.3s
[CV 4/5; 1/300] START C=22122.1629107045, class_weight=None, gamma=85.31678524172814, tol=0.014873521072935119
[CV 4/5; 1/300] END C=22122.1629107045, class_weight=None, gamma=8

### 1.2. **With** Ray's parallelization in interactive mode
Takes <= 1 minute

In [7]:
# Randomized hyperparameter search whose fits are dispatched to the Ray
# cluster through joblib's 'ray' backend.
RANDOM_STATE = 42  # fixed seed so every run of this cell samples the same candidates

digits = load_digits()
param_space = {
    'C': np.logspace(-6, 6, 30),
    'gamma': np.logspace(-8, 8, 30),
    'tol': np.logspace(-4, -1, 30),
    'class_weight': [None, 'balanced'],
}
model = SVC(kernel='rbf')
search = RandomizedSearchCV(
    model,
    param_space,
    cv=5,
    n_iter=300,
    verbose=10,
    random_state=RANDOM_STATE,
)

# Define runtime env: pip packages requested for this Ray session.
# Each package is listed once; google-cloud-aiplatform is pinned via its [ray] extra.
RUNTIME_ENV = {
  "pip": [
      "google-cloud-aiplatform[ray]==1.40.0",
      "ray[data]==2.4.0",
      "ray[train]==2.4.0",
      "scikit-learn==1.2.2",
      "joblib",
      "pandas<2.0.0"
  ],
}

# Drop any existing Ray connection, then connect to the cluster with the
# requisite dependencies.
ray.shutdown()
ray.init(address=RAY_ADDRESS, runtime_env=RUNTIME_ENV)

# Register Ray as a joblib backend and run the fits through it.
register_ray()
with joblib.parallel_backend('ray'):
    search.fit(digits.data, digits.target)

[Ray on Vertex AI]: Cluster State = State.RUNNING




Fitting 5 folds for each of 300 candidates, totalling 1500 fits




[2m[36m(PoolActor pid=56944)[0m [CV 2/5; 1/300] START C=0.005298316906283708, class_weight=None, gamma=2212216.29107045, tol=0.02395026619987486
[2m[36m(PoolActor pid=56896)[0m [CV 4/5; 3/300] START C=28.072162039411758, class_weight=balanced, gamma=1.6102620275609392e-06, tol=0.00221221629107045
[2m[36m(PoolActor pid=57002)[0m [CV 5/5; 2/300] START C=0.00011721022975334806, class_weight=balanced, gamma=0.0002592943797404667, tol=0.03856620421163472
[2m[36m(PoolActor pid=10618, ip=10.126.0.5)[0m [CV 5/5; 1/300] START C=0.005298316906283708, class_weight=None, gamma=2212216.29107045, tol=0.02395026619987486
[2m[36m(PoolActor pid=10616, ip=10.126.0.5)[0m [CV 4/5; 1/300] START C=0.005298316906283708, class_weight=None, gamma=2212216.29107045, tol=0.02395026619987486
[2m[36m(PoolActor pid=10615, ip=10.126.0.5)[0m [CV 1/5; 1/300] START C=0.005298316906283708, class_weight=None, gamma=2212216.29107045, tol=0.02395026619987486
[2m[36m(PoolActor pid=9800, ip=10.126.0.4)[0m

## Part 2. The same sample using the Ray Jobs API

In [8]:
from datetime import datetime
import random, string, time
from pathlib import Path as path

from google.cloud import aiplatform as vertex_ai
import ray, vertex_ray
from vertex_ray import Resources
from ray.job_submission import JobStatus, JobSubmissionClient

In [10]:
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_ID = project_id_output[0]
PROJECT_NBR = project_nbr_output[0]
REGION="us-central1"

root_path = path.cwd()
ray_lab_local_dir = root_path / "ray_lab_local_dir"
script_path = ray_lab_local_dir / "src"
script_path.mkdir(parents=True, exist_ok=True)

CLUSTER_NAME="ray-kicking-tires"
AIP_BUCKET_URI = f"gs://ray_lab_log_bucket_{PROJECT_NBR}/"
RAY_CLUSTER_RESOURCE_NAME='projects/{}/locations/{}/persistentResources/{}'.format(PROJECT_NBR,REGION,CLUSTER_NAME)
EXPERIMENT_NAME = "RandomizedSearchCV-Distributed-Sklearn"
RAY_JOB_LOGGING_URI = f"gs://ray_lab_log_bucket_{PROJECT_NBR}/logs"
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
JOB_SUFFIX = "".join(random.choices(string.ascii_lowercase + string.digits, k=4))

vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=AIP_BUCKET_URI)
ray_client = JobSubmissionClient("vertex_ray://{}".format(RAY_CLUSTER_RESOURCE_NAME))

[Ray on Vertex AI]: Cluster State = State.RUNNING


### 2.1. Create the training script and persist locally

In [11]:
# Source code of the job entry point, kept as a string so it can be written to
# disk and shipped to the cluster. The script builds the same
# RandomizedSearchCV over the digits dataset as Part 1, calls ray.init() with
# no address, registers Ray as a joblib backend and fits inside
# joblib.parallel_backend('ray').
# NOTE(review): the script imports aiplatform, vertex_ray, RuntimeEnv,
# RunConfig, CheckpointConfig and ScalingConfig but never uses them, and it
# imports register_ray twice.
training_script = """
from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray
import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.air import CheckpointConfig, ScalingConfig
from ray.util.joblib import register_ray

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
digits = load_digits()
param_space = {
    'C': np.logspace(-6, 6, 30),
    'gamma': np.logspace(-8, 8, 30),
    'tol': np.logspace(-4, -1, 30),
    'class_weight': [None, 'balanced'],
}
model = SVC(kernel='rbf')
search = RandomizedSearchCV(model, param_space, cv=5, n_iter=300, verbose=10)


ray.shutdown()
ray.init()

import joblib
from ray.util.joblib import register_ray
register_ray()
with joblib.parallel_backend('ray'):
    search.fit(digits.data, digits.target)
"""

# Persist the script as task.py inside script_path. The `with` block closes
# the file on exit, so no explicit close() call is needed.
with open(script_path / "task.py", "w") as f:
    f.write(training_script)

### 2.2. Create the requirements.txt file and persist locally

In [12]:
# Contents of the pip requirements file that the job's runtime environment
# points to.
requirements = """
google-cloud-aiplatform[ray]==1.40.0
ray[data]==2.4.0
ray[train]==2.4.0
ray[tune]==2.4.0
scikit-learn==1.2.2
google-cloud-bigquery
google-cloud-aiplatform
joblib
pandas<2.0.0
"""

# Persist as requirements.txt inside ray_lab_local_dir. The `with` block
# closes the file on exit, so no explicit close() call is needed.
with open(ray_lab_local_dir / "requirements.txt", "w") as f:
    f.write(requirements)

### 2.3. Submit the job

In [13]:
# Unique submission ID built from the timestamp and random suffix defined earlier.
JOB_SUBMISSION_ID = f"ray-job-{TIMESTAMP}-{JOB_SUFFIX}"

# Command run on the cluster from the uploaded working directory.
# NOTE(review): task.py as written in section 2.1 does not parse these flags — confirm whether they are needed.
job_entrypoint = f"python3 task.py --experiment-name={EXPERIMENT_NAME} --num-workers=2 --logging-dir={RAY_JOB_LOGGING_URI}"

# Runtime environment: pip packages come from the local requirements.txt and
# the script directory is passed as the job's working directory.
job_runtime_env = {
    "pip": {"packages": str(ray_lab_local_dir / "requirements.txt")},
    "working_dir": str(script_path),
}

job_id = ray_client.submit_job(
    submission_id=JOB_SUBMISSION_ID,
    entrypoint=job_entrypoint,
    runtime_env=job_runtime_env,
)

2024-03-27 21:10:20,491	INFO dashboard_sdk.py:317 -- Uploading package gcs://_ray_pkg_45bf254d5b193b8c.zip.
2024-03-27 21:10:20,494	INFO packaging.py:520 -- Creating a file package for local directory '/content/ray_lab_local_dir/src'.


### 2.4. Poll for results

In [14]:
# Poll the submitted job until it reaches a terminal state.
# STOPPED is handled as well as SUCCEEDED and FAILED; without it, a job that
# was stopped would keep this loop polling forever.
POLL_INTERVAL_SECONDS = 60

while True:
    job_status = ray_client.get_job_status(job_id)
    if job_status == JobStatus.SUCCEEDED:
        print("Job succeeded!")
        break
    if job_status == JobStatus.FAILED:
        print("Job failed!")
        break
    if job_status == JobStatus.STOPPED:
        print("Job was stopped!")
        break
    # Any other status (e.g. pending or running): wait and check again.
    print("Job is running...")
    time.sleep(POLL_INTERVAL_SECONDS)

Job is running...




Job is running...
Job succeeded!


This concludes the module. Proceed to the lab guide for the [next module](https://github.com/anagha-google/ray-labs/blob/main/01-sklearn/module-05-ray-train-sklearn-interactive-README.md).