In [None]:
# Fill in the placeholders of jupyter-spark-service.yaml and apply the result
# in the namespace this pod runs in.
#   <NOTEBOOK_NAME> -> pod hostname with its last "-"-separated field removed
#   <NAMESPACE>     -> namespace read from the mounted service-account files
!sed -e "s/<NOTEBOOK_NAME>/$(hostname | awk -F '-' '{OFS=FS;NF=NF-1;print $0}')/g" \
     -e "s/<NAMESPACE>/$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)/g" \
     jupyter-spark-service.yaml \
  | kubectl apply -n "$(cat  /var/run/secrets/kubernetes.io/serviceaccount/namespace)" -f -

In [None]:
# Apply spark-user.yaml in the namespace this pod runs in (the namespace is
# read from the mounted service-account files).
!kubectl apply --filename spark-user.yaml --namespace "$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)"

In [None]:
# Show the Java runtime found on PATH.
# `-version` is accepted by every Java release; `--version` only exists from
# Java 9 onwards and makes a Java 8 launcher exit with an error.
!java -version

In [None]:
# Show the version of the `python` executable found on PATH.
!python -V

In [None]:
# Install PySpark 3.2.1 from conda-forge, non-interactively and with reduced output.
# pip alternative: !pip install pyspark==3.2.1 --user -q
!conda install --channel conda-forge --yes --quiet pyspark=3.2.1

In [None]:
import pyspark
import os
from pyspark import SparkConf

In [None]:
# Settings shared by the Spark configuration cell below.

# Service-account token mounted into this pod; it is handed to Spark as the
# OAuth token. Surrounding whitespace is stripped so a stray newline can never
# become part of the credential.
with open("/var/run/secrets/kubernetes.io/serviceaccount/token", "r") as f:
    auth_token = f.read().strip()
if not auth_token:
    raise RuntimeError(
        "The service-account token file "
        "/var/run/secrets/kubernetes.io/serviceaccount/token is empty"
    )

# Kubernetes API server address, falling back to the in-cluster DNS name and
# the default HTTPS port when the environment variables are absent.
master_ip = os.getenv("KUBERNETES_SERVICE_HOST", "kubernetes.default.svc")
master_port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "443")
master_url = f"k8s://https://{master_ip}:{master_port}"

container_image = "datamechanics/spark:3.2.1-hadoop-3.3.1-java-8-scala-2.12-python-3.8-dm17"

# The namespace and notebook name are the 3rd and 4th "/"-separated fields of
# NB_PREFIX, i.e. a value shaped like "/<segment>/<namespace>/<notebook-name>".
# NOTE(review): presumably injected by the notebook platform - confirm.
_nb_prefix = os.getenv("NB_PREFIX", "")
_nb_prefix_parts = _nb_prefix.split("/")
if len(_nb_prefix_parts) < 4 or not _nb_prefix_parts[2] or not _nb_prefix_parts[3]:
    raise RuntimeError(
        "NB_PREFIX must look like '/<segment>/<namespace>/<notebook-name>', "
        f"got {_nb_prefix!r}"
    )
namespace = _nb_prefix_parts[2]
hostname = _nb_prefix_parts[3]

service_account = "spark-user"
app_name = f"{hostname}-app"
# NOTE(review): presumably the name of the Service created from
# jupyter-spark-service.yaml - verify against that manifest.
driver_host = f"{hostname}-spark-svc"

# MinIO connection details; each is None when its environment variable is unset.
minio_access_key = os.getenv("AWS_ACCESS_KEY_ID")
minio_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
minio_url = os.getenv("MINIO_ENDPOINT_URL")

In [None]:
import warnings

from pyspark.sql import SparkSession

# Spark properties for the session started from this notebook. Every value is
# written as a string because SparkConf stores values as strings anyway.
spark_settings = {
    # Extra jar resolved at start-up (S3A filesystem support).
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.1",
    "spark.kubernetes.namespace": namespace,
    "spark.kubernetes.authenticate.driver.serviceAccountName": service_account,
    # NOTE(review): the Spark-on-Kubernetes docs say that in client mode the
    # token is read from spark.kubernetes.authenticate.oauthToken - confirm
    # whether this key is the intended one.
    "spark.kubernetes.authenticate.driver.oauthToken": auth_token,
    "spark.kubernetes.container.image": container_image,
    "spark.kubernetes.allocation.batch.size": "5",
    # NOTE(review): not found in the Spark configuration docs;
    # spark.executor.instances below looks like the effective setting - confirm.
    "spark.kubernetes.executor.instances": "1",
    "spark.executor.instances": "1",
    # Listen on all interfaces, but advertise the name in driver_host so that
    # executors connect back through it on the fixed ports below.
    "spark.driver.bindAddress": "0.0.0.0",
    "spark.driver.host": driver_host,
    "spark.driver.port": "37371",
    "spark.blockManager.port": "6060",
    "spark.pyspark.python": "/opt/spark/python",
    "spark.sql.repl.eagerEval.enabled": "true",
}

# Hadoop S3A options for MinIO. They are applied with the "spark.hadoop."
# prefix, which is how Spark copies properties into the Hadoop configuration.
s3a_settings = {
    "fs.s3a.access.key": minio_access_key,
    "fs.s3a.secret.key": minio_secret_key,
    "fs.s3a.endpoint": minio_url,
    "fs.s3a.path.style.access": "true",
    "fs.s3a.connection.ssl.enabled": "false",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}

# SparkConf.set() stringifies its value, so an unset environment variable would
# otherwise be stored as the literal text "None". Skip those and say so.
missing_s3a = sorted(key for key, value in s3a_settings.items() if value is None)
if missing_s3a:
    warnings.warn(
        "MinIO settings missing from the environment, not configured: "
        + ", ".join(missing_s3a)
    )

conf = SparkConf().setAppName(app_name).setMaster(master_url)
for key, value in spark_settings.items():
    conf.set(key, value)
for key, value in s3a_settings.items():
    if value is not None:
        conf.set(f"spark.hadoop.{key}", value)

spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

In [None]:
# Smoke test: build a one-column DataFrame, sum it on Spark and bring the
# single result back into the notebook.
numbers = [1, 2, 3, 4, 5]

# zip() over a single list yields 1-tuples, i.e. one row per number.
df = spark.createDataFrame(list(zip(numbers)), schema=["number"])

# Aggregate the whole frame into one row holding the sum of "number".
sum_df = df.agg({"number": "sum"})

# That row has exactly one field; unpack it.
(sum_value,) = sum_df.first()

print("Sum of numbers:", sum_value)

In [None]:
# Read the semicolon-delimited sample file through the s3a:// filesystem,
# treating the first line as the header and letting Spark infer column types.
csv_reader = (
    spark.read
    .option("delimiter", ";")
    .option("header", True)
    .option("inferSchema", True)
)
df = csv_reader.csv("s3a://tests/sample.csv")
df.show()

In [None]:
# Stop the Spark session created earlier in this notebook.
spark.stop()

In [None]:
# Print the path of the `python` executable resolved from PATH.
!which python

In [None]:
# Print the path of the `java` executable resolved from PATH.
!which java