# **Partie Streaming de données**

In [3]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-running session when one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName("Spark Structured Streaming from Kafka")
    .getOrCreate()
)

In [None]:
# Streaming source: subscribe to the Kafka topic "antennes" on the local
# broker, start from the latest offsets, and keep only the message payload
# (the `value` column) cast to a string.
sdfAntennes = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "antennes")
    .option("startingOffsets", "latest")
    .load()
    .selectExpr("CAST(value AS STRING)")
)

In [None]:
from pyspark.sql.types import *

# Schema of one antenna event. The field order is significant: the parser
# below maps fields to the comma-separated items of a message by position.
schema_ant = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in (
        ("t", IntegerType),
        ("AntennaId", IntegerType),
        ("EventCode", IntegerType),
        ("PhoneId", IntegerType),
        ("x", FloatType),
        ("y", FloatType),
        ("TileId", IntegerType),
    )
])

In [None]:
def parse_data_from_kafka_message(sdf, schema):
    """Split the comma-separated ``value`` column into typed columns.

    Args:
        sdf: Streaming DataFrame exposing a string column named ``value``.
        schema: Iterable of fields (e.g. a ``StructType``). Each field's
            position selects the item of the split message, its ``name``
            becomes the output column name and its ``dataType`` is the
            cast target.

    Returns:
        A DataFrame with exactly one column per schema field, in schema
        order.

    Raises:
        AssertionError: If ``sdf`` is not a streaming DataFrame.
    """
    # Raised explicitly rather than through ``assert`` so the guard is not
    # stripped when Python runs with -O; the exception type and message are
    # the same as before.
    if not sdf.isStreaming:
        raise AssertionError("DataFrame doesn't receive streaming data")

    from pyspark.sql.functions import split

    parts = split(sdf['value'], ',')

    # A single projection instead of one withColumn() call per field: each
    # withColumn() adds a projection to the plan, so looping over it makes
    # the plan grow with the number of fields.
    return sdf.select([
        parts.getItem(idx).cast(field.dataType).alias(field.name)
        for idx, field in enumerate(schema)
    ])

In [None]:
# Replace the raw string stream with its typed, column-per-field version.
sdfAntennes = parse_data_from_kafka_message(
    sdf=sdfAntennes,
    schema=schema_ant,
)

In [None]:
# Start a streaming query that writes the AntennaId column to the console
# sink. To stop the stream, call query.stop().
query = (
    sdfAntennes.select("AntennaId")
    .writeStream
    .format("console")
    .start()
)

# **Partie Retraitement**

In [None]:
def actualisation(base, batch):
    """Merge ``batch`` into ``base``, keeping the newest row per ``PhoneId``.

    Rows of ``batch`` that contain a null in any column are discarded before
    merging. Among all remaining rows sharing a ``PhoneId`` only the one with
    the largest ``t`` is kept (ties are broken arbitrarily).

    Args:
        base: DataFrame holding the current state; must have the same
            columns, in the same order, as ``batch`` (the merge is a
            positional union).
        batch: DataFrame of new records with at least the columns ``t`` and
            ``PhoneId``.

    Returns:
        A DataFrame with the columns of ``base`` and at most one row per
        ``PhoneId``. The row order of the result is unspecified.
    """
    from pyspark.sql import Window
    from pyspark.sql.functions import col, row_number

    merged = base.union(batch.na.drop())

    # orderBy() followed by dropDuplicates() does not guarantee which
    # duplicate survives, so the newest record could be lost. Ranking the
    # rows inside each PhoneId partition makes the choice explicit and
    # removes the need to squeeze everything into one partition.
    newest_first = Window.partitionBy('PhoneId').orderBy(col('t').desc())
    rank_col = '__rank_in_phone'

    return (
        merged
        .withColumn(rank_col, row_number().over(newest_first))
        .filter(col(rank_col) == 1)
        .drop(rank_col)
    )