In [None]:
import os
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType
import pandas as pd

# ========== Spark & Kafka Setup ==========
# PYSPARK_SUBMIT_ARGS is consumed when PySpark launches its JVM, so the Kafka
# connector package is requested here, before any session exists.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 pyspark-shell'
)
findspark.init()

# === 1. Spark Initialization ===
# Reuse an already-running session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("CybersecurityLogAnalysis")
    .getOrCreate()
)

# === 2. Define Kafka Log Schema ===
# Schema of the JSON payload carried in each Kafka message. Every field,
# including the numeric-looking port and byte counters, is read as a string;
# cast downstream if numeric operations are needed.
log_schema = (
    StructType()
    .add("timestamp", StringType())
    .add("source_ip", StringType())
    .add("destination_ip", StringType())
    .add("protocol", StringType())
    .add("port", StringType())
    .add("threat", StringType())
    .add("user_agent", StringType())
    .add("location", StringType())
    .add("bytes_sent", StringType())
    .add("bytes_received", StringType())
)

# === 3. Read Kafka Stream ===
# Subscribe to the "logs" topic on the kafka:29092 broker, picking up only
# messages that arrive after the query starts ("latest" offsets).
raw_logs = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "logs")
    .option("startingOffsets", "latest")
    .load()
)

# Decode the message value to text, parse it as JSON against log_schema, and
# flatten the resulting struct into one top-level column per schema field.
logs = (
    raw_logs
    .selectExpr("CAST(value AS STRING) as json_str")
    .select(from_json(col("json_str"), log_schema).alias("data"))
    .select("data.*")
)

# === 4. Process Each Micro-Batch ===
def process_batch(batch_df, batch_id):
    """Print the size and full contents of one streaming micro-batch.

    Written to be used as a ``foreachBatch`` callback.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; used only in the log line.
    """
    # count() and show() are two separate actions. Without caching, each one
    # recomputes the micro-batch from its source, so the batch is persisted
    # for the duration of this callback.
    batch_df.persist()
    try:
        record_count = batch_df.count()
        print(f"📦 Processing batch {batch_id} with {record_count} records")
        batch_df.show(truncate=False)
    finally:
        # Always release the cached data, even if an action fails, so cached
        # batches do not pile up across triggers.
        batch_df.unpersist()

# === 5. Start Streaming Query ===
# Every 30 seconds, hand the newly arrived rows to process_batch. Progress is
# tracked in the checkpoint directory below.
query = (
    logs.writeStream
    .trigger(processingTime="30 seconds")
    .foreachBatch(process_batch)
    .outputMode("append")
    .option("checkpointLocation", "/tmp/checkpoints/log_analysis")
    .start()
)

try:
    # Block this cell until the query stops or fails.
    query.awaitTermination()
finally:
    # If the wait is interrupted (e.g. the notebook cell is stopped) or the
    # query fails, make sure the streaming query is shut down instead of being
    # left running in the background.
    query.stop()

📦 Processing batch 36 with 131 records
+-------------------+---------------+---------------+--------+----+----------+---------------------+--------+----------+--------------+
|timestamp          |source_ip      |destination_ip |protocol|port|threat    |user_agent           |location|bytes_sent|bytes_received|
+-------------------+---------------+---------------+--------+----+----------+---------------------+--------+----------+--------------+
|2025-05-24 22:28:17|64.138.66.19   |62.52.8.67     |UDP     |8080|none      |Mozilla/5.0          |ZA      |2917      |1701          |
|2025-05-24 22:28:18|249.228.230.58 |196.235.92.137 |ICMP    |22  |none      |curl/7.68.0          |FR      |4604      |3580          |
|2025-05-24 22:28:19|88.148.175.239 |70.105.246.17  |ICMP    |53  |ddos      |Mozilla/5.0          |RU      |2291      |1601          |
|2025-05-24 22:28:20|97.164.114.221 |129.41.165.157 |UDP     |53  |malware   |curl/7.68.0          |IN      |3012      |2719          |
|2025-