# UAT to ensure Kubeflow pipelines can access Spark 

This notebook verifies that Kubeflow pipeline stages are able to run Spark jobs. It creates a component that runs a trivial Spark job to count the vowels in a sample text, and then a pipeline that executes this component.

The expected outcome is that the Spark job inside the component runs successfully; this is verified by polling the run status. The component asserts that the Spark result is correct, so a successful run means that the assertion passed. If the assertion fails, the run ends in the FAILED state.

This notebook requires a Kubeflow + Spark setup to be deployed beforehand.

### Install necessary packages and dependencies

In [None]:
# Please check the requirements.in file for more details
!python3 -m pip install -r requirements.txt

### Import required packages

In [None]:
import os

import kfp

from kfp import dsl, kubernetes
from tenacity import retry, stop_after_attempt, wait_exponential

### Define literals

In [None]:
# Name of the KFP experiment under which the run is created and later looked up.
EXPERIMENT_NAME = "Count the number of vowels in a given string"
# OCI image used as the base image of the Spark test component.
CHARMED_SPARK_OCI_IMAGE = "ghcr.io/canonical/charmed-spark:3.5.5-22.04_edge"

### Prepare proxy wrapper utilities if applicable

In [None]:
# Proxy settings are read from the environment only when all three variables
# are present and non-empty; otherwise all three constants are None.
if all(os.environ.get(var) for var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY")):
    HTTP_PROXY, HTTPS_PROXY, NO_PROXY = (
        os.environ["HTTP_PROXY"],
        os.environ["HTTPS_PROXY"],
        os.environ["NO_PROXY"],
    )
else:
    HTTP_PROXY = HTTPS_PROXY = NO_PROXY = None


def add_proxy(obj, http_proxy=HTTP_PROXY, https_proxy=HTTPS_PROXY, no_proxy=NO_PROXY):
    """Set the proxy environment variables on a pipeline task.

    Each proxy value is set under both its lower-case and upper-case
    variable name. The defaults are the module-level proxy constants as they
    were when this function was defined.

    Args:
        obj: Object exposing a chainable ``set_env_variable(name=..., value=...)``
            method (a PipelineTask).
        http_proxy: Value for ``http_proxy`` and ``HTTP_PROXY``.
        https_proxy: Value for ``https_proxy`` and ``HTTPS_PROXY``.
        no_proxy: Value for ``no_proxy`` and ``NO_PROXY``.

    Returns:
        Whatever the last ``set_env_variable`` call in the chain returns.
    """
    # Order matches the sequence in which the variables are applied.
    env_vars = (
        ("http_proxy", http_proxy),
        ("https_proxy", https_proxy),
        ("HTTP_PROXY", http_proxy),
        ("HTTPS_PROXY", https_proxy),
        ("no_proxy", no_proxy),
        ("NO_PROXY", no_proxy),
    )
    task = obj
    for env_name, env_value in env_vars:
        # Each call is made on the result of the previous one (chained calls).
        task = task.set_env_variable(name=env_name, value=env_value)
    return task


def proxy_envs_set() -> bool:
    """Return True only when all three module-level proxy settings are truthy."""
    return bool(HTTP_PROXY and HTTPS_PROXY and NO_PROXY)

### Create Spark test component

In [None]:
@dsl.component(
    base_image=CHARMED_SPARK_OCI_IMAGE,
)
def spark_test_component() -> None:
    # Counts the vowels of a sample text with a Spark job and asserts that the
    # result equals a plain-Python count of the same text.
    # All imports live inside the function body, as in the original component.
    import logging
    import os
    from operator import add
    from spark8t.session import SparkSession

    def count_vowels(text: str) -> int:
        """Return how many characters of the text are vowels, ignoring case."""
        return sum(1 for letter in text if letter.lower() in "aeiou")

    sample_text = """Canonical's Charmed Data Platform solution for Apache Spark runs Spark jobs on your Kubernetes cluster.
    You can get started right away with MicroK8s - the mightiest tiny Kubernetes distro around! 
    The spark-client snap simplifies the setup process to run Spark jobs against your Kubernetes cluster. 
    Spark on Kubernetes is a complex environment with many moving parts.
    Sometimes, small mistakes can take a lot of time to debug and figure out.
    """

    # Both variables must be present in the component's environment;
    # a missing one raises KeyError.
    service_account = os.environ["SPARK_SERVICE_ACCOUNT"]
    namespace = os.environ["SPARK_NAMESPACE"]

    with SparkSession(
        app_name="CountVowels",
        namespace=namespace,
        username=service_account,
    ) as session:
        # Count vowels per line in Spark, then add the per-line counts together.
        # NOTE(review): the 2 is presumably the number of partitions - confirm.
        per_line_counts = session.sparkContext.parallelize(sample_text.splitlines(), 2).map(
            count_vowels
        )
        n = per_line_counts.reduce(add)
        # The Spark result must match the locally computed count.
        assert n == count_vowels(sample_text)

    logging.warning(f"The number of vowels in the string is {n}")

### Create a KF Pipeline (without proxy)

In [None]:
@dsl.pipeline(name="spark-test-pipeline")
def spark_pipeline():
    # Single-step pipeline: run the Spark test component and label its pod.
    # NOTE(review): presumably this label is what gives the pod access to
    # Spark - confirm against the deployment.
    spark_task = spark_test_component()
    kubernetes.pod_metadata.add_pod_label(
        spark_task, label_key="access-spark-pipeline", label_value="true"
    )

### Create a KF Pipeline (with proxy)

In [None]:
@dsl.pipeline(name="spark-test-pipeline")
def spark_pipeline_proxy():
    # Same single-step pipeline as the non-proxy variant, except that the
    # proxy environment variables are set on the task before it is labelled.
    component_task = spark_test_component()
    proxied_task = add_proxy(component_task)
    kubernetes.pod_metadata.add_pod_label(
        proxied_task, label_key="access-spark-pipeline", label_value="true"
    )

### Create KFP client and run the pipeline

In [None]:
client = kfp.Client()

# Submit the proxy-enabled pipeline when the proxy environment variables are
# set, and the plain pipeline otherwise. Caching is disabled for the run.
run = client.create_run_from_pipeline_func(
    spark_pipeline_proxy if proxy_envs_set() else spark_pipeline,
    arguments={},
    experiment_name=EXPERIMENT_NAME,
    enable_caching=False,
)

# See the experiment and run status
client.list_experiments().experiments
client.get_run(run.run_id).state

### Define assertion to check that the run was successful

In [None]:
@retry(
    wait=wait_exponential(multiplier=2, min=1, max=10),
    stop=stop_after_attempt(30),
    reraise=True,
)
def assert_run_succeeded(client, run_id):
    """Assert that the given KFP run is in the SUCCEEDED state.

    The check is retried by the ``retry`` decorator (up to 30 attempts with
    exponential wait) whenever it raises, so this effectively polls the run
    until it succeeds. The last error is re-raised once attempts run out.

    Args:
        client: KFP client used to fetch the run.
        run_id: Identifier of the run to check.

    Raises:
        AssertionError: If the run state is anything other than "SUCCEEDED".
    """
    current_state = client.get_run(run_id).state
    succeeded = current_state == "SUCCEEDED"
    assert succeeded, f"KFP run in {current_state} state."

### Assert that the run was successful

In [None]:
# Fetch the KFP experiment by name to ensure it exists
client.get_experiment(experiment_name=EXPERIMENT_NAME)

# Check the run state, retrying until it reports SUCCEEDED or the retry
# attempts are exhausted
assert_run_succeeded(client, run.run_id)