# Spark Streaming et fenêtres

Use case : Wind Power Smart Monitor

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

# Build the Spark configuration step by step: application name, standalone
# master URL, the extra connector packages (hadoop-aws and the Kafka source),
# and the number of shuffle partitions.
conf = SparkConf()
conf.setAppName('SparkApp')
conf.setMaster('spark://spark:7077')
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3")
conf.set("spark.sql.shuffle.partitions", "10")

# Reuse an already-running SparkContext when there is one; otherwise start a
# new one with the configuration above.
sc = SparkContext.getOrCreate(conf=conf)

# SQLContext wrapping the SparkContext; used for the SQL / DataFrame operations.
sql_context = SQLContext(sc)

# Presumably the host name of the MinIO service (not used in this section).
minio_ip_address = "minio"

## Moyenne glissante (Batch)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, FloatType

# Kafka configuration
kafka_broker = "kafka1:9092"
kafka_topic = "wind"

# Schema of the JSON payload carried in the value of each Kafka message
schema = StructType([
    StructField("created_at", StringType(), True),
    StructField("entry_id", IntegerType(), True),
    StructField("wind_speed", StringType(), True)  # Wind speed is declared as a string and cast to float below
])

# Batch read of the topic from the earliest available offsets.
# (The variable keeps the name `raw_stream`, but this is a static DataFrame.)
raw_stream = sql_context.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .load()

# Parse Kafka messages: decode the value as a string, parse it as JSON with
# the schema above, then project typed columns.
# dropDuplicates keeps a single row per entry_id.
# withWatermark is a no-op on a batch DataFrame (per the Structured Streaming
# guide); it is kept so the pipeline mirrors the streaming version.
parsed_stream = raw_stream.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema).alias("data")) \
    .select(
        col("data.created_at").cast(TimestampType()).alias("created_at"),  # Convert timestamp string to Spark TimestampType
        col("data.entry_id").alias("entry_id"),
        col("data.wind_speed").cast(FloatType()).alias("wind_speed")  # Convert wind speed to FloatType for aggregation
    ).dropDuplicates(["entry_id"]).withWatermark("created_at", "5 minutes")

# Filter on the projected `wind_speed` column: after the select above the
# nested `data` struct is no longer one of the DataFrame's output columns, so
# `data.wind_speed` is not the right reference here. Filtering the cast value
# also discards rows whose wind speed could not be converted to a float.
parsed_stream = parsed_stream.filter(col("wind_speed").isNotNull())


# Average wind speed per 5-minute window. window() is called with only a
# duration, so the windows are tumbling (non-overlapping).
rolling_average = parsed_stream \
    .groupBy(window(col("created_at"), "5 minutes")) \
    .agg(avg("wind_speed").alias("rolling_avg_wind_speed")) \
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("rolling_avg_wind_speed")
    ) \
    .orderBy("window_start")  # Explicit ordering by window start

# Collect the result as a Pandas DataFrame
pandas_df = rolling_average.toPandas()
pandas_df

import matplotlib.dates as md
import seaborn as sns

# Tick label format: day/month on the first line, hour:minute on the second.
date_formatter = md.DateFormatter('%d/%m\n%H:%M')

# Line plot of the windowed average against the start of each window.
g = sns.lineplot(
    x="window_start",
    y="rolling_avg_wind_speed",
    data=pandas_df,
)
g.xaxis.set_major_formatter(date_formatter)


In [4]:
# Show the dtype of each column of the collected pandas DataFrame.
pandas_df.dtypes

## Moyenne glissante (Streaming)

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, FloatType

# Kafka configuration
kafka_broker = "kafka1:9092"
kafka_topic = "wind"

# How long (in seconds) this cell lets the streaming query run before handing
# control back to the notebook. The query itself keeps running until it is
# stopped explicitly with query.stop().
STREAM_RUN_SECONDS = 60

# Schema of the JSON payload carried in the value of each Kafka message
schema = StructType([
    StructField("created_at", StringType(), True),
    StructField("entry_id", IntegerType(), True),
    StructField("wind_speed", StringType(), True)  # Wind speed is declared as a string and cast to float below
])

# Streaming read of the topic, starting from the earliest available offsets
raw_stream = sql_context.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .load()

# Parse Kafka messages: decode the value as a string, parse it as JSON with
# the schema above, then project typed columns.
# The watermark is declared BEFORE deduplication so that Spark can bound the
# deduplication state: a plain dropDuplicates() on a stream without an
# event-time column in its keys keeps every entry_id ever seen.
# dropDuplicatesWithinWatermark (Spark >= 3.5) drops rows with the same
# entry_id whose event times fall within the watermark delay of each other.
parsed_stream = raw_stream.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema).alias("data")) \
    .select(
        col("data.created_at").cast(TimestampType()).alias("created_at"),  # Convert timestamp string to Spark TimestampType
        col("data.entry_id").alias("entry_id"),
        col("data.wind_speed").cast(FloatType()).alias("wind_speed")  # Convert wind speed to FloatType for aggregation
    ).withWatermark("created_at", "5 minutes").dropDuplicatesWithinWatermark(["entry_id"])

# Filter on the projected `wind_speed` column: after the select above the
# nested `data` struct is no longer one of the DataFrame's output columns, so
# `data.wind_speed` is not the right reference here. Filtering the cast value
# also discards rows whose wind speed could not be converted to a float.
parsed_stream = parsed_stream.filter(col("wind_speed").isNotNull())


# Average wind speed per 5-minute window. window() is called with only a
# duration, so the windows are tumbling (non-overlapping).
# No ordering is applied: sorting a streaming aggregation is not supported in
# append output mode.
rolling_average = parsed_stream \
    .groupBy(window(col("created_at"), "5 minutes")) \
    .agg(avg("wind_speed").alias("rolling_avg_wind_speed")) \
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("rolling_avg_wind_speed")
    )

# Output the windowed average to the console sink. In append mode a window is
# emitted once, after the watermark has passed the end of that window.
query = rolling_average.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

# Wait for a bounded amount of time instead of blocking forever, so that
# "Run All" can reach the following cell that stops the query.
query.awaitTermination(STREAM_RUN_SECONDS)

In [6]:
# Stop the streaming query started in the previous cell.
query.stop()