In [None]:
# --- Environment-specific settings -------------------------------------
# Bucket in MinIO that backs the Iceberg warehouse.
data_bucket_name_in_minio = "mhmm"

# Host addresses of the MinIO and Kafka services.
# NOTE(review): presumably container IPs on a Docker network; confirm they
# match the running environment before executing the notebook.
minio_ip = "172.18.0.4"
kafka_ip = "172.18.0.6"

# Kafka topic to consume; the destination table name is derived from it.
sensor_topic = "temp_producer"

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

In [3]:
# Nessie server REST endpoint used as the Iceberg catalog.
CATALOG_URI = "http://nessie:19120/api/v1"
# Warehouse root: the MinIO bucket, addressed with the s3:// scheme.
WAREHOUSE = f"s3://{data_bucket_name_in_minio}/"
# HTTP endpoint of the MinIO server (port 9000).
STORAGE_URI = f"http://{minio_ip}:9000"

In [None]:

# Maven coordinates handed to spark.jars.packages: the Kafka source,
# the Iceberg runtime, the Nessie SQL extensions, and the AWS SDK v2
# bundle / URL-connection client.
_spark_packages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1",
    "software.amazon.awssdk:bundle:2.24.8",
    "software.amazon.awssdk:url-connection-client:2.24.8",
]

# SQL session extensions for Iceberg and Nessie.
_sql_extensions = [
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
]

# Build (or reuse) the session. The catalog is registered under the name
# "nessie", so tables are addressed as `nessie.<table>`.
spark = (
    SparkSession.builder
    .appName("KafkaSparkIceberg")
    .config("spark.jars.packages", ",".join(_spark_packages))
    .config("spark.sql.extensions", ",".join(_sql_extensions))
    # Iceberg catalog backed by Nessie, pinned to the "main" reference,
    # with no authentication.
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .config("spark.sql.catalog.nessie.uri", CATALOG_URI)
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.authentication.type", "NONE")
    # Storage: warehouse location, MinIO endpoint, and the S3 file IO class.
    .config("spark.sql.catalog.nessie.warehouse", WAREHOUSE)
    .config("spark.sql.catalog.nessie.s3.endpoint", STORAGE_URI)
    .config("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .getOrCreate()
)

print("✅ Spark Kafka-Nessie setup ready.")


In [5]:
# Schema applied by from_json to the Kafka message value:
# a string `source`, a double `temp`, and a `timestamp`.
schema = (
    StructType()
    .add("source", StringType())
    .add("temp", DoubleType())
    .add("timestamp", TimestampType())
)

In [7]:
# Destination table name: the topic name with a "_data" suffix.
table_name_in_nessie = f"{sensor_topic}_data"

In [None]:
# spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie." + table_name_in_nessie)

In [None]:
# Start from a clean slate: remove the destination table if it already exists.
# NOTE(review): the streaming checkpoint directory (/workspace/tables) is not
# cleared here; confirm whether it should be reset together with the table.
spark.sql(f"DROP TABLE IF EXISTS nessie.{table_name_in_nessie}")


In [None]:
print("check 1")
# Source: subscribe to the sensor topic on the Kafka broker as a stream.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_ip + ":9092")
    .option("subscribe", sensor_topic)
    # Applies only when the query starts without previously checkpointed
    # offsets; a resumed query continues from its checkpoint.
    .option("startingOffsets", "earliest")  # "earliest" or "latest"
    # .option("failOnDataLoss", "false")
    .load()
)

# Kafka delivers `value` as binary: cast it to a string, parse the JSON
# with `schema`, then flatten the parsed struct into top-level columns.
parsed_df = (
    kafka_df.selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
)

# Keep only readings with a positive temperature. Rows whose `temp` is
# null (for example payloads that from_json could not parse) are dropped
# as well, because the comparison evaluates to null.
processed_df = parsed_df.filter(col("temp") > 0)

print("check 2")
# Sink: append the filtered stream to the Iceberg table in the Nessie
# catalog, one micro-batch every 60 seconds.
# Fix: the sink previously wrote `parsed_df`, so the filter above was
# computed but never applied to the data that reached the table.
query = (
    processed_df.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", "/workspace/tables")  # required by the streaming sink
    .trigger(processingTime="60 seconds")
    .toTable("nessie." + table_name_in_nessie)
)

# Blocks this cell until the query stops or fails, so "check 3" is printed
# only after the stream has terminated (e.g. the kernel is interrupted).
query.awaitTermination()
print("check 3")


In [None]:
# spark.read.table("nessie.sensor_yo").show()


In [None]:
# print("check 1")
# # Read data from Kafka topic
# kafka_df = (
#     spark.readStream
#     .format("kafka")
#     .option("kafka.bootstrap.servers", kafka_ip + ":9092")
#     .option("subscribe", sensor_topic)
#     .option("startingOffsets", "earliest")  # or "earliest/latest"
#     # .option("failOnDataLoss", "false")
#     .load()
# )

# # Convert the binary Kafka `value` column into structured JSON
# parsed_df = (
#     kafka_df.selectExpr("CAST(value AS STRING)")
#     .select(from_json(col("value"), schema).alias("data"))
#     .select("data.*")
# )

# # Optional transformation (filter, aggregate, etc.)
# processed_df = parsed_df.filter(col("temp") > 0)

# print("check 2")
# # Write stream to console for testing
# query = (
#     processed_df.writeStream
#     .format("console")
#     .outputMode("append")
#     .option("truncate", False)
#     .start()
# )

# print("check 3")


In [13]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()