In [None]:
from pyspark.sql import SparkSession
import os


# packages = "org.apache.iceberg:iceberg-spark:1.10.0,org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.0,org.apache.iceberg:iceberg-spark-extensions-4.0_2.13:1.10.0,com.amazonaws:aws-java-sdk-bundle:1.12.791,software.amazon.awssdk:bundle:2.34.0,org.apache.hadoop:hadoop-aws:3.4.1"
# Deployment-specific settings. Each one can be overridden with an environment
# variable; the fallbacks are the values this notebook originally hard-coded,
# so running it without any variables set behaves exactly as before.
WAREHOUSE = (
    os.environ.get("ICEBERG_WAREHOUSE")
    or "s3a://data-on-eks-spark-logs-20251001184655839600000005/iceberg-warehouse/"
)
# DEBUG is extremely verbose; export SPARK_LOG_LEVEL=WARN (or INFO) for quieter runs.
LOG_LEVEL = os.environ.get("SPARK_LOG_LEVEL") or "DEBUG"

# Build (or reuse) the session. `spark_catalog` is configured as an Iceberg
# SparkCatalog of type "glue" and made the default catalog, so table names in
# later cells resolve against it without an explicit catalog prefix.
spark = (
    SparkSession.builder
    .appName("IcebergInspector")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "glue")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.defaultCatalog", "spark_catalog")
    .config("spark.sql.catalog.spark_catalog.warehouse", WAREHOUSE)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Optional, currently disabled: set the catalog's io-impl to Iceberg's S3FileIO.
    # .config("spark.sql.catalog.spark_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .getOrCreate()
)

spark.sparkContext.setLogLevel(LOG_LEVEL)

In [None]:
# Run the catalog-inspection statements in order, printing each result.
# "use data_on_eks" switches the current namespace, so it must come before
# "show tables" for that listing to refer to data_on_eks.
for statement in ("show databases", "use data_on_eks", "show tables"):
    spark.sql(statement).show()

# Optional follow-ups, left disabled:
# spark.sql("SELECT * FROM cat_locations_raw").show()
# spark.sql("CALL spark_catalog.system.compute_table_stats('spark_catalog.default_database.cat_interactions_sink')")

+----------------+--------------------+-----------+
|       namespace|           tableName|isTemporary|
+----------------+--------------------+-----------+
|default_database|   cat_locations_raw|      false|
|default_database|visitor_checkins_raw|      false|
|default_database|    cat_wellness_raw|      false|
|default_database|     cafe_orders_raw|      false|
|default_database|cat_interactions_raw|      false|
+----------------+--------------------+-----------+

+--------------------+--------------------+----------------+
|          event_time|              cat_id|        location|
+--------------------+--------------------+----------------+
|2025-09-30T01:50:...|1b2fbc8d-43d7-45f...|    cat_tree_top|
|2025-09-30T01:50:...|c6897dc0-f452-48f...|   counter_stool|
|2025-09-30T01:50:...|5d51794d-6ffd-47e...|   reading_chair|
|2025-09-30T01:50:...|6ff62c72-5ed6-475...| feeding_station|
|2025-09-30T01:50:...|eb540a05-1c62-487...|southwest_window|
|2025-09-30T01:50:...|af5555e1-23b7-47b...|

In [None]:
# Select every column from the `files` metadata table of
# spark_catalog.data_on_eks.cat_locations_raw and print the first rows.
files_metadata_sql = """
SELECT
    *
FROM
    spark_catalog.data_on_eks.cat_locations_raw.files
"""
files_metadata = spark.sql(files_metadata_sql)
files_metadata.show()

+--------------------+----------+
|              cat_id|like_count|
+--------------------+----------+
|d10aa663-8181-4bd...|         2|
|faedb29e-347f-4fb...|         2|
|df02677b-fe73-44a...|         2|
|06903bbb-d833-47d...|         2|
|639b5649-0afa-48b...|         2|
|e61d0fb2-7bf1-403...|         2|
|4211416e-0fe0-408...|         2|
|f9f699ef-c31d-406...|         2|
|0d92ae1a-4063-48d...|         2|
|9aac6583-d104-41a...|         2|
+--------------------+----------+



In [9]:

# Top 20 visitors by total spend, alongside how many cat interactions each had.
#   visitor_spending     - SUM(total_amount) per visitor_id from cafe_orders_raw
#   visitor_interactions - COUNT(*) per visitor_id from cat_interactions_raw
# The inner join keeps only visitors present in both tables.
# Table names are unqualified, so they resolve against the session's current
# catalog/namespace.
# visitor_id is a secondary sort key: without it, rows with equal total_spent
# could come back in any order, making the LIMIT 20 cut-off non-reproducible.
top_spenders_sql = """
WITH visitor_spending AS (
  SELECT
    visitor_id,
    SUM(total_amount) AS total_spent
  FROM
    cafe_orders_raw
  GROUP BY
    visitor_id
),
visitor_interactions AS (
  SELECT
    visitor_id,
    COUNT(*) AS interaction_count
  FROM
    cat_interactions_raw
  GROUP BY
    visitor_id
)
SELECT
  s.visitor_id,
  s.total_spent,
  i.interaction_count
FROM
  visitor_spending s
JOIN
  visitor_interactions i ON s.visitor_id = i.visitor_id
ORDER BY
  s.total_spent DESC,
  s.visitor_id
LIMIT 20
"""
spark.sql(top_spenders_sql).show()

+--------------------+-----------+-----------------+
|          visitor_id|total_spent|interaction_count|
+--------------------+-----------+-----------------+
|1cf9d09b-e3ac-4c2...|     121.02|               56|
|c769a763-38b0-4f0...|      99.93|               58|
|cb3d930c-4e45-449...|      95.92|               34|
|6aafcf40-bd7b-4dc...|      85.53|               25|
|6f919bde-89f7-44f...|      78.51|               52|
|95c88c73-b615-4df...|      71.61|               10|
|2a189d80-6cd9-43f...|      71.39|               30|
|91c12fb9-fe73-4ae...|      68.16|               45|
|fbd34de9-41f1-44b...|      66.61|               45|
|0bd40fb6-0a1f-4cf...|      61.65|               26|
|d30eb388-ec93-474...|      61.57|               28|
|468e0326-627a-493...|      61.10|               33|
|c2ba9287-2dc6-48b...|      57.41|               27|
|3f74fa8f-53d1-42f...|      57.07|               33|
|736b8943-1d8e-477...|      56.78|               38|
|0b432d2a-7bcb-42c...|      55.71|            