<a href="https://colab.research.google.com/github/anaferreira744/DE-DP-ADF/blob/main/spark/challenges/challenge_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 1
##  Implement INGESTION process
- Set up path in the "lake"
  - !mkdir -p /content/lake/bronze

- Read data from API https://api.carrismetropolitana.pt/
  - Endpoints:
    - vehicles
    - lines
    - municipalities
  - Use StructFields to enforce schema

- Transformations
  - vehicles
    - create "date" extracted from "timestamp" column (format: hh24miss)

- Write data as PARQUET into the BRONZE layer (/content/lake/bronze)
  - Partition "vehicles" by "date" column
  - Paths:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities
  - Make sure only a single Parquet file is created
  - Use overwrite as write mode

# Setting up PySpark

In [4]:
%pip install pyspark



In [9]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, lit
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType, FloatType, TimestampType


In [45]:

# Build (or reuse) a Spark session running in local mode, with the UI port
# pinned to 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Challenge_AF')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# SparkContext handle of the session above.
sc = spark.sparkContext

In [46]:
import requests
from pyspark.sql.types import *

def readFromAPI(url: str, schema: StructType = None):
  """Fetch a JSON payload from ``url`` and load it into a Spark DataFrame.

  Args:
    url: HTTP(S) endpoint that returns a JSON array of objects. A single
      JSON object is also accepted and treated as one record.
    schema: Optional schema to enforce while parsing. When omitted (or
      empty), Spark infers the schema from the data.

  Returns:
    A DataFrame with one row per JSON object in the response.

  Raises:
    requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
    requests.Timeout: if the endpoint does not answer within 30 seconds.
  """
  import json  # local import so the function does not rely on another cell

  response = requests.get(url, timeout=30)
  # Fail loudly on HTTP errors instead of trying to parse an error body.
  response.raise_for_status()

  payload = response.json()
  if isinstance(payload, dict):
    # Parallelizing a dict would distribute its keys, not the record itself.
    payload = [payload]

  # spark.read.json expects an RDD of JSON *strings*. Parallelizing the parsed
  # Python objects makes PySpark fall back to str(obj), and that repr is not
  # valid JSON whenever a record contains None/True/False, so those rows end
  # up as corrupt records. Serialize every record explicitly instead.
  rdd = sc.parallelize([json.dumps(record) for record in payload])

  reader = spark.read.schema(schema) if schema else spark.read
  return reader.json(rdd)



In [47]:

# Explicit schema for the /vehicles endpoint. Every field is declared
# nullable; the schema is built incrementally with StructType.add().
vehicle_schema = (
    StructType()
    .add("bearing", IntegerType(), True)
    .add("block_id", StringType(), True)
    .add("current_status", StringType(), True)
    .add("id", StringType(), True)
    .add("lat", FloatType(), True)
    .add("line_id", StringType(), True)
    .add("lon", FloatType(), True)
    .add("pattern_id", StringType(), True)
    .add("route_id", StringType(), True)
    .add("schedule_relationship", StringType(), True)
    .add("shift_id", StringType(), True)
    .add("speed", FloatType(), True)
    .add("stop_id", StringType(), True)
    .add("timestamp", TimestampType(), True)
    .add("trip_id", StringType(), True)
)

# Load the endpoint with the enforced schema, then print the row count and
# preview the first rows.
vehicles = readFromAPI("https://api.carrismetropolitana.pt/vehicles", vehicle_schema)
print(vehicles.count())
vehicles.show()

324
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|    170|20241122-64010155...| IN_TRANSIT_TO|44|12703|38.767754|   4710| -9.10039|  4710_0_1|  4710_0|            SCHEDULED|113270234560|      0.0| 060009|2024-11-22 21:57:41|4710_0_1|2200|220...|
|     51|20241122-64010054...|   INCOMING_AT|44|12645|38.709724|   4504| -8.92177|  4504_0_1|  4504_0|            SCHEDULED|121890234560|13.611111| 010112|2024-11-22 21:57:51|4504_0_1|2200|213...|
|    265|20

In [52]:
# TODO (original author's note, translated): fix this schema.
# NOTE(review): the rendered output of this cell shows a `_corrupt_record`
# column that is not declared here -- confirm which run produced it.
lines_schema = (
    StructType()
    .add("color", StringType(), True)
    .add("facilities", ArrayType(StringType()), True)      # array of strings
    .add("id", StringType(), True)
    .add("localities", ArrayType(StringType()), True)      # array of strings
    .add("long_name", StringType(), True)
    .add("municipalities", ArrayType(StringType()), True)
    .add("patterns", ArrayType(StringType()), True)
    .add("routes", ArrayType(StringType()), True)
    .add("short_name", StringType(), True)
    .add("text_color", StringType(), True)
)

# Load the endpoint with the enforced schema, then print the row count and
# preview the first rows.
lines = readFromAPI("https://api.carrismetropolitana.pt/lines", lines_schema)
print(lines.count())
lines.show()



723
+--------------------+-------+----------+----+--------------------+--------------------+--------------+--------------------+--------------------+----------+----------+
|     _corrupt_record|  color|facilities|  id|          localities|           long_name|municipalities|            patterns|              routes|short_name|text_color|
+--------------------+-------+----------+----+--------------------+--------------------+--------------+--------------------+--------------------+----------+----------+
|                NULL|#C61D23|        []|1001|[Alfragide, Amado...|Alfragide (Estr S...|        [1115]|[1001_0_1, 1001_0_2]|            [1001_0]|      1001|   #FFFFFF|
|                NULL|#C61D23|        []|1002|[Reboleira, Amado...|Reboleira (Estaçã...|        [1115]|          [1002_0_3]|            [1002_0]|      1002|   #FFFFFF|
|                NULL|#C61D23|        []|1003|[Amadora, Amadora...|Amadora (Estação ...|        [1115]|[1003_0_1, 1003_0_2]|            [1003_0]|      1003|

In [49]:
# Explicit schema for the /municipalities endpoint: every column is a
# nullable string, so the fields are generated from the column names.
municipalities_schema = StructType([
    StructField(column, StringType(), True)
    for column in (
        "district_id",
        "district_name",
        "id",
        "name",
        "prefix",
        "region_id",
        "region_name",
    )
])

# Load the endpoint with the enforced schema, then print the row count and
# preview the first rows.
municipalities = readFromAPI("https://api.carrismetropolitana.pt/municipalities", municipalities_schema)
print(municipalities.count())
municipalities.show()

23
+-----------+-------------+----+--------------------+------+---------+----------------+
|district_id|district_name|  id|                name|prefix|region_id|     region_name|
+-----------+-------------+----+--------------------+------+---------+----------------+
|         07|        Évora|0712|        Vendas Novas|    19|    PT187|Alentejo Central|
|         11|       Lisboa|1101|            Alenquer|    20|    PT16B|           Oeste|
|         11|       Lisboa|1102|   Arruda dos Vinhos|    20|    PT16B|           Oeste|
|         11|       Lisboa|1105|             Cascais|    05|    PT170|             AML|
|         11|       Lisboa|1106|              Lisboa|    06|    PT170|             AML|
|         11|       Lisboa|1107|              Loures|    07|    PT170|             AML|
|         11|       Lisboa|1109|               Mafra|    08|    PT170|             AML|
|         11|       Lisboa|1110|              Oeiras|    12|    PT170|             AML|
|         11|       Lisboa|11

In [50]:
# Transformations
from pyspark.sql.functions import to_date, date_format

# Add a "date" column holding the timestamp rendered as a yyyyMMdd string.
df = (
    vehicles
    .withColumn("date", date_format("timestamp", "yyyyMMdd"))
)

# Preview the result, including the new column.
df.show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|    date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+
|    170|20241122-64010155...| IN_TRANSIT_TO|44|12703|38.767754|   4710| -9.10039|  4710_0_1|  4710_0|            SCHEDULED|113270234560|      0.0| 060009|2024-11-22 21:57:41|4710_0_1|2200|220...|20241122|
|     51|20241122-64010054...|   INCOMING_AT|44|12645|38.709724|   4504| -8.92177|  4504_0_1|  4504_0|            SCHEDULED|121890234560|13.611111| 010112|2024-11-22 21:57:51|4

Write data as PARQUET into the BRONZE layer (/content/lake/bronze)

Partition "vehicles" by "date" column

Paths:
vehicles - path: /content/lake/bronze/vehicles

lines - path: /content/lake/bronze/lines

municipalities - path: /content/lake/bronze/municipalities

Make sure only a single Parquet file is created
Use overwrite as write mode

In [51]:
from pyspark.sql.functions import date_format

# Define paths for bronze layer
vehicle_path = "/content/lake/bronze/vehicles"
lines_path = "/content/lake/bronze/lines"
municipalities_path = "/content/lake/bronze/municipalities"

# Assume `vehicles` DataFrame is preloaded with required data
# Create a partition column from the timestamp
df = vehicles.withColumn("date", date_format("timestamp", "yyyyMMdd"))

# Configure Spark to use dynamic partition overwrite
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Gravar o DataFrame 'vehicles' particionado por 'date'
(vehicles.coalesce(1)  # Garante apenas um arquivo por partição
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .partitionBy("date")  # Particiona os dados pela coluna 'date'
   .format("parquet")  # Formato Parquet
   .save(vehicle_path))  # Caminho para salvar

# Gravar o DataFrame 'lines' sem particionamento
(lines.coalesce(1)  # Consolidar em um único arquivo
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .format("parquet")  # Formato Parquet
   .save(lines_path))  # Caminho para salvar

# Gravar o DataFrame 'municipalities' sem particionamento
(municipalities.coalesce(1)  # Consolidar em um único arquivo
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .format("parquet")  # Formato Parquet
   .save(municipalities_path))  # Caminho para salvar
# List the content of the target directory to verify the files
!ls /content/lake/bronze/vehicles

# Verify the count of records in the written Parquet files
record_count = spark.read.format("parquet").load(vehicle_path).count()
print(f"Total records written: {record_count}")


AnalysisException: Partition column `date` not found in schema struct<bearing:int,block_id:string,current_status:string,id:string,lat:float,line_id:string,lon:float,pattern_id:string,route_id:string,schedule_relationship:string,shift_id:string,speed:float,stop_id:string,timestamp:timestamp,trip_id:string>.