# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Session-level settings, applied to the builder in the order listed.
_session_settings = {
    "spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    "spark.sql.streaming.forceDeleteTempCheckpointLocation": "true",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.sql.shuffle.partitions": "2",
}

# Local session with two worker threads, named "KafkaStreamReader".
_session_builder = SparkSession.builder.appName("KafkaStreamReader").master("local[2]")
for _setting, _setting_value in _session_settings.items():
    _session_builder = _session_builder.config(_setting, _setting_value)

spark = _session_builder.getOrCreate()

# Raise the log threshold to WARN for this Spark context.
spark.sparkContext.setLogLevel("WARN")


# Batch-read the "csv-data" topic once to fetch a single sample message that
# the JSON schema is inferred from. Records with a null value (e.g. Kafka
# tombstones) are skipped because they carry no JSON to inspect.
_sample_row = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "csv-data")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
    .select(col("value").cast("string").alias("value"))
    .filter(col("value").isNotNull())
    .first()
)

# first() yields None when no row matched; fail here with an explicit message
# instead of an AttributeError on `.value`.
if _sample_row is None:
    raise RuntimeError(
        "Cannot infer a JSON schema: topic 'csv-data' contains no message "
        "with a non-null value"
    )

sample_json = _sample_row.value

# schema_of_json returns the schema of the sample as a DDL string, which
# from_json accepts directly. NOTE: the schema reflects this one message only,
# so fields missing from the sample will not be parsed from later messages.
auto_schema = (
    spark.range(1)
    .select(schema_of_json(lit(sample_json)).alias("schema"))
    .collect()[0]["schema"]
)

# Source settings for the streaming read: broker address, subscribed topic,
# and the offset to start consuming from.
_kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "csv-data",
    "startingOffsets": "earliest",
}

# Unbounded DataFrame of raw Kafka records.
df = spark.readStream.format("kafka").options(**_kafka_source_options).load()

# Column expressions for the first projection over the raw Kafka records.
_record_key = col("key").cast("string")
_payload = from_json(col("value").cast("string"), auto_schema).alias("parsed_value")
_kafka_ts = col("timestamp").alias("kafka_timestamp")

# Decode key and value, keeping the Kafka metadata columns alongside.
_decoded_df = df.select(
    _record_key,
    _payload,
    _kafka_ts,
    col("partition"),
    col("offset"),
)

# Flatten the parsed struct into top-level columns next to the metadata.
parsed_df = _decoded_df.select(
    "key",
    "parsed_value.*",
    "kafka_timestamp",
    "offset",
    "partition",
)

# Choose the area column used for price-per-area depending on the listing type:
#   Garage                   -> TotalArea
#   Land                     -> LotSize
#   Office / Building / Hotel -> GrossArea
#   anything else            -> LivingArea
_listing_type = col("Type")
_area_by_type = (
    when(_listing_type == "Garage", col("TotalArea"))
    .when(_listing_type == "Land", col("LotSize"))
    .when(_listing_type.isin("Office", "Building", "Hotel"), col("GrossArea"))
    .otherwise(col("LivingArea"))
)

df_with_area = parsed_df.withColumn("area_for_price", _area_by_type)

# Keep listings that have a price of at least 50,000 and a usable area of at
# least 20.
#
# The null check is on `area_for_price`, the per-type area selected upstream,
# rather than on `LivingArea`: requiring `LivingArea` would discard every
# Garage / Land / Office / Building / Hotel row that has no living area, even
# though a different area column is used for those types.
filtered_df = df_with_area.filter(
    col("Price").isNotNull()
    & col("area_for_price").isNotNull()
    & (col("Price") >= 50_000)
    & (col("area_for_price") >= 20)
)

# Per-listing price divided by the area selected for that listing.
_price_per_area = col("price") / col("area_for_price")

# Aggregate per (type, district): mean price, mean area, number of listings
# and mean price per unit of area; sorted by district, then type.
grouped_df = (
    filtered_df
    .groupBy("type", "district")
    .agg(
        avg("Price").alias("avg_price"),
        avg("area_for_price").alias("avg_area"),
        count("*").alias("total_ads"),
        avg(_price_per_area).alias("avg_price_for_sqt"),
    )
    .orderBy("district", "type")
)

# Console sink that re-emits the complete aggregated result on a 30-second
# processing-time trigger.
_console_writer = (
    grouped_df.writeStream
    .format("console")
    .outputMode("complete")
    .trigger(processingTime="30 seconds")
)

query = _console_writer.start()

# Block until the streaming query terminates.
query.awaitTermination()
