<a href="https://colab.research.google.com/github/carsofferrei/04_data_processing/blob/main/spark_streaming/read_from_api_carris.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
import requests
from pyspark.sql.types import *


spark = SparkSession.builder.master('local').appName('Spark Course').config('spark.ui.port', '4050').getOrCreate()
sc = spark.sparkContext

In [22]:
import requests
import json
import time
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Initialize Spark Session
spark = SparkSession.builder.master("local").appName("API Streaming").getOrCreate()

# Define schema for the JSON data
vehicle_schema = StructType([
    StructField('bearing', IntegerType(), True),
    StructField('block_id', StringType(), True),
    StructField('current_status', StringType(), True),
    StructField('id', StringType(), True),
    StructField('lat', FloatType(), True),
    StructField('line_id', StringType(), True),
    StructField('lon', FloatType(), True),
    StructField('pattern_id', StringType(), True),
    StructField('route_id', StringType(), True),
    StructField('schedule_relationship', StringType(), True),
    StructField('shift_id', StringType(), True),
    StructField('speed', FloatType(), True),
    StructField('stop_id', StringType(), True),
    StructField('timestamp', TimestampType(), True),
    StructField('trip_id', StringType(), True)
])

# API URL
API_URL = "https://api.carrismetropolitana.pt/vehicles"

# Directory to store streaming JSON files
streaming_dir = "/tmp/api_stream"
os.makedirs(streaming_dir, exist_ok=True)

# Function to fetch API data and save it to JSON files
def fetch_and_write_api_data(batch_id):
    response = requests.get(API_URL)
    if response.status_code == 200:
        api_data = response.json()
        with open(f"{streaming_dir}/batch_{batch_id}.json", "w") as f:
            json.dump(api_data, f)
        print(f"Batch {batch_id} written to {streaming_dir}/batch_{batch_id}.json")
    else:
        print(f"Failed to fetch API data. Status: {response.status_code}")

# Periodically fetch API data
def write_streaming_api_data(interval=5, max_batches=10):
    batch_id = 0
    while batch_id < max_batches:
        fetch_and_write_api_data(batch_id)
        time.sleep(interval)
        batch_id += 1

def save_parquet(df, batch_id):
  (df
   .withColumn("batch_id",F.lit(batch_id))
   .withColumn("load_time",F.current_timestamp())
   .write.mode("append")
   .parquet("content/output/api_carris_streaming")
  )


# Start fetching API data in the background
import threading
threading.Thread(target=write_streaming_api_data).start()

# Read the generated files as a streaming source
df = spark.readStream.schema(vehicle_schema).json(streaming_dir)

# Transform the streaming DataFrame
transformed = df.withColumn("minute", F.minute("timestamp"))

# Write the transformed stream to Parquet
query = (transformed.writeStream
            .option('checkpointLocation', 'content/output/checkpoint')
            .trigger(processingTime='20 seconds')
            .outputMode('append')
            .foreachBatch(save_parquet)
            .start()
            )

# Wait for the query to terminate
query.awaitTermination(20)

Batch 0 written to /tmp/api_stream/batch_0.json
Batch 1 written to /tmp/api_stream/batch_1.json
Batch 2 written to /tmp/api_stream/batch_2.json
Batch 3 written to /tmp/api_stream/batch_3.json


False

In [28]:
result = spark.read.format("parquet").load("content/output/api_carris_streaming/")
result.show(100, False)

+-------+------------------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+--------------------+---------+-------+-------------------+-------------------------------------------+------+--------+--------------------------+
|bearing|block_id                      |current_status|id      |lat      |line_id|lon      |pattern_id|route_id|schedule_relationship|shift_id            |speed    |stop_id|timestamp          |trip_id                                    |minute|batch_id|load_time                 |
+-------+------------------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+--------------------+---------+-------+-------------------+-------------------------------------------+------+--------+--------------------------+
|45     |20241123-64020061-121180000007|IN_TRANSIT_TO |44|12527|38.66093 |4600   |-8.987601|4600_0_2  |4600_0  |SCHEDULED            |121180000007        |0.

In [24]:
print(spark.read.parquet("content/output/api_carris_streaming").count())
print(spark.read.parquet("content/output/api_carris_streaming").dropDuplicates().count())

print("Data have dupplicates. The number of records are:")
print((spark.read.parquet("content/output/api_carris_streaming").count())-(spark.read.parquet("content/output/api_carris_streaming").dropDuplicates().count()))

4200
2734
Data have dupplicates. The number of records are:
1466


In [16]:
#!rm -rf "/content/output/api_carris_streaming"