<a href="https://colab.research.google.com/github/anaferreira744/DE-DP-ADF/blob/main/spark_streaming/examples/example_3_api_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [1]:
# Install PySpark into the current kernel's environment.
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Single local Spark session shared by every cell of this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .getOrCreate()
)
# Handle to the underlying SparkContext of that session.
sc = spark.sparkContext

In [41]:
# Start from a clean slate: delete the landing and bronze folders left by a
# previous run, then recreate the (empty) landing folder.
!rm -rf /content/landing
!rm -rf /content/bronze
!mkdir -p /content/landing

## Simulate producer:
- extract data from API
- store data as json in the lake
- run task async

In [42]:
import requests
from pyspark.sql.types import *
import json
import datetime
import asyncio


async def ingest_from_api(url: str, table: str, schema: StructType = None):
  """Download a JSON payload from ``url`` and store it in the landing folder.

  The payload is written to ``/content/landing/{table}_{YYYYmmddHHMMSS}.json``.
  A response whose status code is not 200 is reported and skipped.

  Args:
    url: Endpoint queried with an HTTP GET.
    table: Prefix used for the output file name.
    schema: Not used by this function; kept so existing callers keep working.

  Raises:
    requests.RequestException: If the request fails or exceeds the timeout.
  """
  import os

  # requests is a blocking library: run the call in a worker thread so the
  # event loop is not frozen while waiting, and bound the wait with a timeout.
  loop = asyncio.get_running_loop()
  response = await loop.run_in_executor(None, lambda: requests.get(url, timeout=10))
  if response.status_code != 200:
    print(f"Skipping {table}: {url} returned HTTP {response.status_code}")
    return

  data = response.json()
  # The timestamp is part of the file name so every call creates a new file.
  timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  # open() is plain Python and does not create missing folders, so make sure
  # the landing folder exists before writing.
  os.makedirs("/content/landing", exist_ok=True)
  with open(f"/content/landing/{table}_{timestamp}.json", "w") as f:
    json.dump(data, f)

#outro método
async def producer(loop: int, interval_time: int):
  """Poll the vehicles and lines endpoints repeatedly.

  Args:
    loop: Number of polling iterations to run.
    interval_time: Seconds to wait between two consecutive iterations.
  """
  endpoints = (
      ("https://api.carrismetropolitana.pt/vehicles", "vehicles"),
      ("https://api.carrismetropolitana.pt/lines", "lines"),
  )
  for i in range(loop):
    for url, table in endpoints:
      try:
        await ingest_from_api(url, table)
      except asyncio.CancelledError:
        # Cancellation must always propagate so the task can be stopped.
        raise
      except Exception as exc:
        # One failed download should not end the remaining iterations;
        # report it and carry on with the next endpoint.
        print(f"Iteration {i + 1}/{loop}: failed to ingest {table}: {exc!r}")
    # Waiting is only useful when another iteration follows.
    if i < loop - 1:
      await asyncio.sleep(interval_time)

async def main():
  asyncio.create_task(producer(10, 30)) #iterar 10 vezes, 30 segundos de intervalo

await main()

- Read from `/content/landing` as a stream
- Store the data in memory (for testing)
- Store the data in the bronze layer

In [64]:
# Count the entries currently in the landing folder (first number printed by wc).
!ls /content/landing | wc

     20      20     550


In [65]:
from pyspark.sql.types import *

# (column name, Spark type) pairs for one vehicle record; every column is nullable.
_vehicle_fields = [
    ('bearing', IntegerType()),
    ('block_id', StringType()),
    ('current_status', StringType()),
    ('id', StringType()),
    ('lat', FloatType()),
    ('line_id', StringType()),
    ('lon', FloatType()),
    ('pattern_id', StringType()),
    ('route_id', StringType()),
    ('schedule_relationship', StringType()),
    ('shift_id', StringType()),
    ('speed', FloatType()),
    ('stop_id', StringType()),
    ('timestamp', TimestampType()),
    ('trip_id', StringType()),
]
vehicle_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _vehicle_fields]
)

# Streaming source: JSON files in the landing folder whose name starts with "vehicles".
stream = (
    spark.readStream
    .schema(vehicle_schema)
    .format("json")
    .load("/content/landing/vehicles*")
)

# Drop rows that are identical in every column.
# NOTE(review): no watermark is set here, so the deduplication state presumably
# grows with every distinct row seen - fine for a short demo, confirm before reuse.
dedup = stream.dropDuplicates()

In [12]:
# Check that the deduplicated stream carries the declared schema.
dedup.printSchema()

root
 |-- bearing: integer (nullable = true)
 |-- block_id: string (nullable = true)
 |-- current_status: string (nullable = true)
 |-- id: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- line_id: string (nullable = true)
 |-- lon: float (nullable = true)
 |-- pattern_id: string (nullable = true)
 |-- route_id: string (nullable = true)
 |-- schedule_relationship: string (nullable = true)
 |-- shift_id: string (nullable = true)
 |-- speed: float (nullable = true)
 |-- stop_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- trip_id: string (nullable = true)



In [66]:
# Stop the query started by a previous run of this cell so it can be re-run.
try:
  if query.isActive:
    query.stop()
except NameError:
  # First run in this kernel: `query` has not been defined yet.
  pass

# Memory sink, for testing only: the rows are exposed as the in-memory table
# "vehicles", which can be queried with spark.sql.
query = (dedup.writeStream.format("memory").option("queryName", "vehicles").start())

In [70]:
# Inspect the current state of the streaming query.
query.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [72]:
# Query the in-memory table fed by the streaming query.
spark.sql("select * from vehicles").show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|    170|           2_2752-21| IN_TRANSIT_TO|  41|814|38.805965|   1205|-9.329573|  1205_0_1|  1205_0|            SCHEDULED|        2752| 9.444445| 170249|2024-11-30 10:35:46|1205_0_1_1000_102...|
|    233|20241130-64020021...| IN_TRANSIT_TO|44|12550| 38.52468|   4562|-8.998977|  4562_0_1|  4562_0|            SCHEDULED|111030000007|      7.5| 160897|2024-11-30 10:35:47|4562_0_1|3000|094...|
|      0|      

In [73]:
# Stop the in-memory test query.
query.stop()

In [74]:
# Remove any previous bronze output so the next query starts from scratch.
!rm -rf /content/bronze

In [75]:
from pyspark.sql.functions import *

# The aggregation below needs a watermark on the event-time column.
transformed = stream.withWatermark("timestamp", "60 seconds")

# Per 5-minute window and vehicle status: earliest timestamp and row count.
agg = (
    transformed
    .groupBy(
        window(transformed["timestamp"], "5 minutes"),
        col("current_status"),
    )
    .agg(
        min(transformed["timestamp"]).alias("init_timestamp"),
        count("*").alias("count"),
    )
)

def insert_vehicles(df, batch_id):
  """Append one micro-batch to the bronze vehicles folder as parquet.

  Args:
    df: DataFrame holding the rows of the current micro-batch.
    batch_id: Identifier of the micro-batch; not used here.
  """
  # Inside foreachBatch the micro-batch is written with the batch writer API.
  batch_writer = df.write.format("parquet").mode("append")
  batch_writer.save("/content/bronze/vehicles")

# Write the aggregated stream to the bronze layer: every 20 seconds a
# micro-batch is handed to insert_vehicles, with a checkpoint under bronze.
query2 = (
    agg.writeStream
    .foreachBatch(insert_vehicles)
    .outputMode("append")
    .trigger(processingTime='20 seconds')
    .option("checkpointLocation", "/content/bronze/checkpoint")
    .start()
)

In [79]:
# Read back what the streaming query has written to bronze so far.
spark.read.parquet("/content/bronze/vehicles/*").show(100, truncate=False)

+------------------------------------------+--------------+-------------------+-----+
|window                                    |current_status|init_timestamp     |count|
+------------------------------------------+--------------+-------------------+-----+
|{2024-11-30 10:30:00, 2024-11-30 10:35:00}|IN_TRANSIT_TO |2024-11-30 10:32:45|1495 |
|{2024-11-30 10:30:00, 2024-11-30 10:35:00}|INCOMING_AT   |2024-11-30 10:32:49|488  |
|{2024-11-30 10:30:00, 2024-11-30 10:35:00}|STOPPED_AT    |2024-11-30 10:32:33|698  |
+------------------------------------------+--------------+-------------------+-----+



## Report
- Show the number of vehicles per status in each 5-minute window
- One row per window

In [80]:
def pivot_data(df: DataFrame):
  """Print one row per window with one column per vehicle status.

  Args:
    df: Aggregated rows with at least the columns ``window``,
      ``current_status`` and ``count``.
  """
  # Sort after the pivot: ordering the input does not determine the row order
  # of a grouped result, so the sort has to be applied to the final rows.
  result = (df
            .groupBy("window")
            .pivot("current_status")
            .sum("count")
            .orderBy("window"))
  result.show(100, False)

# Load the aggregated bronze data and print the pivoted report.
df = spark.read.parquet("/content/bronze/vehicles/*")
pivot_data(df)

+------------------------------------------+-----------+-------------+----------+
|window                                    |INCOMING_AT|IN_TRANSIT_TO|STOPPED_AT|
+------------------------------------------+-----------+-------------+----------+
|{2024-11-30 10:30:00, 2024-11-30 10:35:00}|488        |1495         |698       |
+------------------------------------------+-----------+-------------+----------+



In [81]:
# Stop the bronze streaming query.
query2.stop()