# Module 1: Storage Primer

This module covers reading and writing from storage systems with ray.data

## 1. Cloud Storage - SKIP THIS SECTION - DOES NOT WORK

In [9]:
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]

project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]

RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires-cluster"

print('PROJECT_ID: ', PROJECT_ID)
print('PROJECT_NBR: ', PROJECT_NBR)
print('RAY_ADDRESS:', RAY_ADDRESS)

import ray
from ray.runtime_env import RuntimeEnv
from ray.air.config import RunConfig
from ray.util.joblib import register_ray

from google.cloud import aiplatform
from google.cloud.aiplatform.preview import vertex_ray

IRIS_DATA_CSV_SRC_GCS_FQ_URI=f"gs://ray_lab_data_bucket_{PROJECT_NBR}/sample-input-data/iris.csv"
IRIS_DATA_PARQUET_TARGET_GCS_FQ_URI=f"gs://ray_lab_data_bucket_{PROJECT_NBR}/sample-output-data/iris.parquet"

print('IRIS_DATA_CSV_SRC_GCS_FQ_URI:', IRIS_DATA_CSV_SRC_GCS_FQ_URI)
print('IRIS_DATA_PARQUET_TARGET_GCS: ', IRIS_DATA_PARQUET_TARGET_GCS_FQ_URI)

PROJECT_ID:  ray-of-sunshine
PROJECT_NBR:  567162267085
RAY_ADDRESS: vertex_ray://projects/567162267085/locations/us-central1/persistentResources/ray-kicking-tires-cluster
IRIS_DATA_CSV_SRC_GCS_FQ_URI: gs://ray_lab_data_bucket_567162267085/sample-input-data/iris.csv
IRIS_DATA_PARQUET_TARGET_GCS:  gs://ray_lab_data_bucket_567162267085/sample-output-data/iris.parquet


In [None]:
# This is already installed in Colab RTT auto-created by Ray.
#pip install gcsfs

### 1.1. Read CSV data in GCS

In [10]:
! gsutil cat $IRIS_DATA_CSV_SRC_GCS_FQ_URI | head -2

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
1,5.1,3.5,1.4,0.2,Iris-setosa


In [11]:
from typing import Dict
import numpy as np
import vertex_ray
import ray
import pyarrow
from pyarrow._gcsfs import GcsFileSystem
import ray

ray.shutdown()
runtime_env = {
    "pip":
       ["google-cloud-aiplatform[ray]", "pyarrow<7.0.0"]
  }

ray.init(address=RAY_ADDRESS, runtime_env=runtime_env)

# Read GCS & print 5 records
iris_raw_ds = ray.data.read_csv(
    paths=IRIS_DATA_CSV_SRC_GCS_FQ_URI,
    filesystem=GcsFileSystem(),
)
iris_raw_ds.show(5)

[Ray on Vertex AI]: Cluster State = State.RUNNING


Error in data channel:
Queue filler thread failed to join before timeout: 10
2024-02-27 20:15:47,053	ERROR dataclient.py:323 -- Unrecoverable error in data channel.


ConnectionError: Failed during this or a previous request. Exception that broke the connection: <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.FAILED_PRECONDITION
	details = "No module named 'pyarrow._gcsfs'"
	debug_error_string = "UNKNOWN:Error received from peer ipv4:10.126.0.5:10001 {grpc_message:"No module named \'pyarrow._gcsfs\'", grpc_status:9, created_time:"2024-02-27T20:15:47.052687668+00:00"}"
>

In [13]:
iris_raw_ds.schema()

NameError: name 'iris_raw_ds' is not defined

### 1.2. Transform and write as Parquet to GCS

In [None]:
from typing import Dict
import numpy as np
import ray


ds = ray.data.read_csv(IRIS_DATA_CSV_SRC_GCS_FQ_URI)

# Apply functions to transform data. Ray Data executes transformations in parallel.
def compute_area(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
    length = batch["petal length (cm)"]
    width = batch["petal width (cm)"]
    batch["petal area (cm^2)"] = length * width
    return batch

transformed_ds = ds.map_batches(compute_area)

# Iterate over batches of data.
for batch in transformed_ds.iter_batches(batch_size=4):
    print(batch)

# Save dataset contents to on-disk files or cloud storage.
transformed_ds.write_parquet(IRIS_DATA_PARQUET_TARGET_GCS_FQ_URI)

## 2. BigQuery

In [19]:
import ray
from google.cloud import aiplatform
from vertex_ray import BigQueryDatasource

aiplatform.init()

RAY_ADDRESS=f"vertex_ray://projects/{PROJECT_NBR}/locations/us-central1/persistentResources/ray-kicking-tires-cluster"

runtime_env = {
    "pip":
       ["google-cloud-aiplatform[ray]", "ipywidgets>=8"
        ]
  }
ray.shutdown()
ray.init(address=RAY_ADDRESS, runtime_env=runtime_env)

dataset = "bigquery-public-data.ml_datasets"
parallelism = 4
query = f"SELECT * from {dataset}.penguins LIMIT 10"

print("Query:", query)

ds = ray.data.read_datasource(
    BigQueryDatasource(),
    parallelism=parallelism,
    query=query
)
ds.show(10)

[Ray on Vertex AI]: Cluster State = State.RUNNING
Query: SELECT * from bigquery-public-data.ml_datasets.penguins LIMIT 10


2024-02-27 20:25:53,481	INFO streaming_executor.py:83 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[DoRead]
2024-02-27 20:25:53,482	INFO streaming_executor.py:84 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)


[2m[36m(_get_read_tasks pid=333824)[0m [Ray on Vertex AI]: Created streams: 1
[2m[36m(_get_read_tasks pid=333824)[0m [Ray on Vertex AI]: The number of streams created by the BigQuery Storage Read API is less than the requested parallelism due to the size of the dataset.


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

RaySystemError: System error: Ray has not been started yet. You can start Ray with 'ray.init()'.