# Module 2: Primer on ray.data with BigQuery

This module covers reading from and writing to BigQuery with ray.data.

Docs:

https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/bigquery-integration


In [36]:
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]

project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]

RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires-cluster"

print('PROJECT_ID: ', PROJECT_ID)
print('PROJECT_NBR: ', PROJECT_NBR)
print('RAY_ADDRESS:', RAY_ADDRESS)

import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.util.joblib import register_ray

from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray
from vertex_ray import BigQueryDatasource


PROJECT_ID:  ray-of-sunshine
PROJECT_NBR:  567162267085
RAY_ADDRESS: vertex_ray://projects/567162267085/locations/us-central1/persistentResources/ray-kicking-tires-cluster


In [37]:
# Initialise the Vertex AI SDK; called without arguments, so the SDK's defaults apply.
aiplatform.init()

# RAY_ADDRESS is defined once in the setup cell at the top of the notebook and is
# reused here as-is, so the cluster address lives in a single place.

# Packages passed to Ray as the pip runtime environment for this connection.
runtime_env = {
    "pip": ["google-cloud-aiplatform[ray]", "ipywidgets>=8"],
}

# Shut down any Ray connection left over from an earlier run so this cell can be re-executed.
ray.shutdown()
ray.init(address=RAY_ADDRESS, runtime_env=runtime_env)



[Ray on Vertex AI]: Cluster State = State.RUNNING


0,1
Python version:,3.10.13
Ray version:,2.4.0
Vertex SDK version:,1.39.0
Dashboard:,755d3a0b41a330d0-dot-us-central1.aiplatform-training.googleusercontent.com
Interactive Terminal Uri:,87d1c5fdddbfd3fe-dot-us-central1.aiplatform-training.googleusercontent.com
Cluster Name:,ray-kicking-tires-cluster


## 1. Read from BigQuery

In [None]:
@ray.remote
def fntReadBQPrintPenguinSchema():
  """Read a 10-row sample of the public penguins table and return its schema.

  Declared as a Ray remote function, so the BigQuery read is issued from the
  task rather than from the notebook kernel.

  Returns:
    The value of ``Dataset.schema()`` for the sampled rows.
  """
  source_dataset = "bigquery-public-data.ml_datasets"
  sample_query = f"SELECT * from {source_dataset}.penguins LIMIT 10"
  # Requested read parallelism, passed straight through to read_datasource.
  requested_parallelism = 4

  sample_ds = ray.data.read_datasource(
      BigQueryDatasource(),
      query=sample_query,
      parallelism=requested_parallelism,
  )
  schema = sample_ds.schema()
  return schema

# Submit the task and block until the schema comes back; as the cell's last
# expression it is displayed as the cell output.
ray.get(fntReadBQPrintPenguinSchema.remote())



[2m[36m(_get_read_tasks pid=14681, ip=10.126.0.4)[0m [Ray on Vertex AI]: Created streams: 1
[2m[36m(_get_read_tasks pid=14681, ip=10.126.0.4)[0m [Ray on Vertex AI]: The number of streams created by the BigQuery Storage Read API is less than the requested parallelism due to the size of the dataset.


species: string
island: string
culmen_length_mm: double
culmen_depth_mm: double
flipper_length_mm: double
body_mass_g: double
sex: string

In [None]:
@ray.remote
def fntReadBQPrintPenguin():
  """Read a 10-row sample of the public penguins table and return a few rows.

  Declared as a Ray remote function, so the BigQuery read is issued from the
  task rather than from the notebook kernel.

  Returns:
    The result of ``Dataset.take(4)`` on the sampled rows.
  """
  bq_dataset = "bigquery-public-data.ml_datasets"
  bq_read_parallelism = 4
  bq_query = f"SELECT * from {bq_dataset}.penguins LIMIT 10"

  penguin_ds = ray.data.read_datasource(
      BigQueryDatasource(),
      parallelism=bq_read_parallelism,
      query=bq_query
  )
  return penguin_ds.take(4)

# This call must sit at cell level. When indented under the function it lands
# after the `return`, is unreachable, and the task is never submitted.
ray.get(fntReadBQPrintPenguin.remote())

## 2. Write to BigQuery

Let's read the public dataset and write a copy of it to our own BigQuery dataset.

In [49]:
@ray.remote
def fntCreateClonePenguins():
  """Copy a 10-row sample of the public penguins table to ``ray_lab_ds.penguin_copy``.

  Declared as a Ray remote function. The sample is read with a BigQuery query
  and then written back out through ``write_datasource``.

  Returns:
    The literal string ``"Completed"`` once the write call has returned.
  """
  source_dataset = "bigquery-public-data.ml_datasets"
  sample_query = f"SELECT * from {source_dataset}.penguins LIMIT 10"
  # Requested read parallelism, passed straight through to read_datasource.
  requested_parallelism = 4

  # Step 1: load the sample rows from the public dataset.
  sample_ds = ray.data.read_datasource(
      BigQueryDatasource(),
      query=sample_query,
      parallelism=requested_parallelism,
  )

  # Step 2: write those rows to the destination given as "ray_lab_ds.penguin_copy".
  sample_ds.write_datasource(
      BigQueryDatasource(),
      dataset="ray_lab_ds.penguin_copy",
  )
  return "Completed"

# Submit the task and block until the copy has finished.
ray.get(fntCreateClonePenguins.remote())



[2m[36m(_get_read_tasks pid=16641, ip=10.126.0.4)[0m [Ray on Vertex AI]: Created streams: 1
[2m[36m(_get_read_tasks pid=16641, ip=10.126.0.4)[0m [Ray on Vertex AI]: The number of streams created by the BigQuery Storage Read API is less than the requested parallelism due to the size of the dataset.
[2m[36m(_do_write pid=16641, ip=10.126.0.4)[0m [Ray on Vertex AI]: Dataset ray_lab_ds already exists. The table will be overwritten if it already exists.
[2m[36m(_do_write pid=16641, ip=10.126.0.4)[0m [Ray on Vertex AI]: Writing 1 blocks
[2m[36m(_write_single_block pid=16729, ip=10.126.0.4)[0m [Ray on Vertex AI]: Starting to write 10 rows


'Completed'

[2m[36m(_write_single_block pid=16729, ip=10.126.0.4)[0m [Ray on Vertex AI]: Finished writing 10 rows
