# query.ipynb
Playground notebook for running ad-hoc queries and exploring data.

In [1]:
from pyspark.sql import SparkSession
from spark_config import spark_config_bronze, spark_config_silver, spark_config_gold, spark_config_minio

In [2]:
# Discard any session left over from an earlier run of this notebook, so the
# settings below are applied to a brand-new session instead of a reused one.
SparkSession.builder.getOrCreate().stop()

# Name the application and point it at the standalone cluster master.
# The builder methods return the builder itself, so the calls can be chained.
builder = SparkSession.builder.appName("query").master("spark://spark-master:7077")

# Apply every configuration helper to the builder, in the same order as before.
# NOTE(review): the helpers live in spark_config (not shown here); presumably
# they register the bronze/silver/gold catalogs and the MinIO storage settings.
for configure in (
    spark_config_bronze,
    spark_config_silver,
    spark_config_gold,
    spark_config_minio,
):
    configure(builder)

spark = builder.getOrCreate()

25/02/27 07:11:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Preview 10 rows of the bronze page_load_v1 table. There is no ORDER BY, so
# which 10 rows come back is arbitrary.
# Plain (non-f) string: the query interpolates nothing, and an f-string would
# treat any literal `{`/`}` later added to the SQL as a placeholder.
df = spark.sql("""
    SELECT *
    FROM bronze.data_platform_example.page_load_v1 LIMIT 10
""")

df.show()

25/02/27 07:11:47 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+--------------------+--------------------+
|            metadata|             payload|
+--------------------+--------------------+
|{page_load, 2025-...|{Chrome, /contact...|
|{page_load, 2025-...|{NULL, /cart, Tyr...|
|{page_load, 2025-...|{Safari, /home, A...|
|{page_load, 2025-...|{Safari, /contact...|
|{page_load, 2025-...|{Firefox, /home, ...|
|{page_load, 2025-...|{Safari, /product...|
|{page_load, 2025-...|{Edge, /home, Joh...|
|{page_load, 2025-...|{Safari, /home, D...|
|{page_load, 2025-...|{Safari, /home, C...|
|{page_load, 2025-...|{Edge, /contact, ...|
+--------------------+--------------------+



In [4]:
# Pull the nested metadata.* and payload.* fields out of the bronze table as
# flat columns, limited to 10 rows.
# Plain (non-f) string: the query interpolates nothing, and an f-string would
# treat any literal `{`/`}` later added to the SQL as a placeholder.
df = spark.sql("""
   SELECT metadata.name, metadata.version, metadata.timestamp, payload.page, payload.user_name, payload.browser 
   FROM bronze.data_platform_example.page_load_v1 limit 10
""")

df.show()

+---------+-------+--------------------+---------+---------------+-------+
|     name|version|           timestamp|     page|      user_name|browser|
+---------+-------+--------------------+---------+---------------+-------+
|page_load|     v1|2025-02-21T00:11:...| /contact|  Tyrone Turner| Chrome|
|page_load|     v1|2025-02-21T00:26:...|    /cart|  Tyrone Turner|   NULL|
|page_load|     v1|2025-02-21T17:08:...|    /home|Andrew Stafford| Safari|
|page_load|     v1|2025-02-21T22:39:...| /contact|  Cassidy Moody| Safari|
|page_load|     v1|2025-02-21T19:27:...|    /home|   Rachel Evans|Firefox|
|page_load|     v1|2025-02-21T22:06:...|/products|  Cassidy Moody| Safari|
|page_load|     v1|2025-02-21T11:46:...|    /home|   Johnny Stone|   Edge|
|page_load|     v1|2025-02-21T14:07:...|    /home|  Dennis Warner| Safari|
|page_load|     v1|2025-02-24T15:33:...|    /home|  Cassidy Moody| Safari|
|page_load|     v1|2025-02-24T08:17:...| /contact|Andrew Stafford|   Edge|
+---------+-------+------

25/02/27 07:19:32 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/02/27 07:19:33 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [7]:
# Count all rows in the bronze page_load_v1 table.
# Plain (non-f) string: the query interpolates nothing, and an f-string would
# treat any literal `{`/`}` later added to the SQL as a placeholder.
df = spark.sql("""
    SELECT COUNT(*)
    FROM bronze.data_platform_example.page_load_v1
""")
df.show()

+--------+
|count(1)|
+--------+
|    9389|
+--------+



In [8]:
# Look at the silver page_load table. The query has no LIMIT, but show()
# prints only the first 20 rows by default.
# Plain (non-f) string: the query interpolates nothing, and an f-string would
# treat any literal `{`/`}` later added to the SQL as a placeholder.
df = spark.sql("""
    SELECT *
    FROM silver.data_platform_example.page_load
""")
df.show()

+----------+-------------+--------------------+---------+--------------------+-------+
|event_name|event_version|            event_ts|     page|           user_name|browser|
+----------+-------------+--------------------+---------+--------------------+-------+
| page_load|           v1|2025-02-09 23:29:...|    /cart|     Jessica Johnson|   Edge|
| page_load|           v1|2025-02-14 14:37:...|/products|      Carlos Watkins| Safari|
| page_load|           v1|2025-02-14 01:01:...| /contact|   Christopher Adams| Safari|
| page_load|           v1|2025-02-13 19:04:...|/products|       Cody Martinez|   Edge|
| page_load|           v1|2025-02-13 05:48:...|/checkout|     Erica Wilkinson|   Edge|
| page_load|           v1|2025-02-11 13:22:...| /contact|       Adam Stephens|Firefox|
| page_load|           v1|2025-02-14 14:36:...|   /about|           Amy Jones| Safari|
| page_load|           v1|2025-02-09 16:28:...|    /home|           Amy Jones|Firefox|
| page_load|           v1|2025-02-13 11:41:

In [9]:
# Count all rows in the silver page_load table.
# Plain (non-f) string: the query interpolates nothing, and an f-string would
# treat any literal `{`/`}` later added to the SQL as a placeholder.
df = spark.sql("""
    SELECT COUNT(*)
    FROM silver.data_platform_example.page_load
""")
df.show()

+--------+
|count(1)|
+--------+
|    2527|
+--------+



In [18]:
# Read the gold page_loads_per_day table, sorted by page and then by date.
# show() prints only the first 20 rows by default.
# Plain (non-f) string: the query interpolates nothing, and an f-string would
# treat any literal `{`/`}` later added to the SQL as a placeholder.
df = spark.sql("""
    SELECT *
    FROM gold.data_platform_example.page_loads_per_day
    ORDER BY page, date
""")
df.show()

+-------------------+------+---------------+
|               date|  page|page_load_count|
+-------------------+------+---------------+
|2025-02-08 00:00:00|/about|             21|
|2025-02-09 00:00:00|/about|             21|
|2025-02-10 00:00:00|/about|             25|
|2025-02-11 00:00:00|/about|             19|
|2025-02-12 00:00:00|/about|             28|
|2025-02-13 00:00:00|/about|             20|
|2025-02-14 00:00:00|/about|             27|
|2025-02-15 00:00:00|/about|              1|
|2025-02-20 00:00:00|/about|             14|
|2025-02-21 00:00:00|/about|             11|
|2025-02-22 00:00:00|/about|             16|
|2025-02-23 00:00:00|/about|             17|
|2025-02-24 00:00:00|/about|              4|
|2025-02-25 00:00:00|/about|             10|
|2025-02-26 00:00:00|/about|             14|
|2025-02-27 00:00:00|/about|              7|
|2025-02-08 00:00:00| /cart|             11|
|2025-02-09 00:00:00| /cart|             29|
|2025-02-10 00:00:00| /cart|             25|
|2025-02-1

In [19]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()