<a href="https://colab.research.google.com/github/anaferreira744/DE-DP-ADF/blob/main/spark/challenges/challenge_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 1
##  Implement INGESTION process
- Set up path in the "lake"
  - !mkdir -p /content/lake/bronze

- Read data from API https://api.carrismetropolitana.pt/
  - Endpoints:
    - vehicles
    - lines
    - municipalities
  - Use StructFields to enforce schema

- Transformations
  - vehicles
    - create "date" extracted from "timestamp" column (format: hh24miss)

- Write data as PARQUET into the BRONZE layer (/content/lake/bronze)
  - Partition "vehicles" by "date" column
  - Paths:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities
  - Make sure there is only 1 single parquet created
  - Use overwrite as write mode

# Setting up PySpark

In [None]:
%pip install pyspark



In [153]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
import requests


In [154]:
spark = SparkSession.builder.master('local').appName('Challenge_AF').config('spark.ui.port', '4050').getOrCreate()
sc = spark.sparkContext

In [155]:
import requests
from pyspark.sql.types import *

def readFromAPI(url: str, schema: StructType = None):
  response = requests.get(url)
  rdd = sc.parallelize(response.json())

  if schema:
    df = spark.read.schema(schema).json(rdd)
  else:
    df = spark.read.json(rdd)
  return df



In [156]:

vehicle_schema = StructType([StructField('bearing', IntegerType(), True),
                             StructField('block_id', StringType(), True),
                             StructField('current_status', StringType(), True),
                             StructField('id', StringType(), True),
                             StructField('lat', FloatType(), True),
                             StructField('line_id', StringType(), True),
                             StructField('lon', FloatType(), True),
                             StructField('pattern_id', StringType(), True),
                             StructField('route_id', StringType(), True),
                             StructField('schedule_relationship', StringType(), True),
                             StructField('shift_id', StringType(), True),
                             StructField('speed', FloatType(), True),
                             StructField('stop_id', StringType(), True),
                             StructField('timestamp', TimestampType(), True),
                             StructField('trip_id', StringType(), True)])

vehicles = readFromAPI("https://api.carrismetropolitana.pt/vehicles", vehicle_schema)
vehicles.count()

227

In [157]:
lines_schema = StructType([
    StructField("color", StringType(), True),
    StructField("facilities", ArrayType(StringType()), True),
    StructField("id", StringType(), True),
    StructField("localities", ArrayType(StringType()), True),
    StructField("long_name", StringType(), True),
    StructField("municipalities", ArrayType(StringType()), True),
    StructField("patterns", ArrayType(StringType()), True),
    StructField("routes", ArrayType(StringType()), True),
    StructField("short_name", StringType(), True),
    StructField("text_color", StringType(), True)
])


lines = readFromAPI("https://api.carrismetropolitana.pt/lines", lines_schema)
lines.show()



+-------+----------+----+--------------------+--------------------+--------------+--------------------+--------------------+----------+----------+
|  color|facilities|  id|          localities|           long_name|municipalities|            patterns|              routes|short_name|text_color|
+-------+----------+----+--------------------+--------------------+--------------+--------------------+--------------------+----------+----------+
|#C61D23|        []|1001|[Alfragide, Amado...|Alfragide (Estr S...|        [1115]|[1001_0_1, 1001_0_2]|            [1001_0]|      1001|   #FFFFFF|
|#C61D23|        []|1002|[Reboleira, Amado...|Reboleira (Estaçã...|        [1115]|          [1002_0_3]|            [1002_0]|      1002|   #FFFFFF|
|#C61D23|        []|1003|[Amadora, Amadora...|Amadora (Estação ...|        [1115]|[1003_0_1, 1003_0_2]|            [1003_0]|      1003|   #FFFFFF|
|#C61D23|        []|1004|[Amadora, Moinhos...|Amadora (Estação ...|        [1115]|          [1004_0_3]|            [10

In [158]:
municipalities_schema = StructType([
    StructField("district_id", StringType(), True),
    StructField("district_name", StringType(), True),
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("prefix", StringType(), True),
    StructField("region_id", StringType(), True),
    StructField("region_name", StringType(), True)

])

municipalities = readFromAPI("https://api.carrismetropolitana.pt/municipalities", municipalities_schema)
municipalities.show()

+-----------+-------------+----+--------------------+------+---------+----------------+
|district_id|district_name|  id|                name|prefix|region_id|     region_name|
+-----------+-------------+----+--------------------+------+---------+----------------+
|         07|        Évora|0712|        Vendas Novas|    19|    PT187|Alentejo Central|
|         11|       Lisboa|1101|            Alenquer|    20|    PT16B|           Oeste|
|         11|       Lisboa|1102|   Arruda dos Vinhos|    20|    PT16B|           Oeste|
|         11|       Lisboa|1105|             Cascais|    05|    PT170|             AML|
|         11|       Lisboa|1106|              Lisboa|    06|    PT170|             AML|
|         11|       Lisboa|1107|              Loures|    07|    PT170|             AML|
|         11|       Lisboa|1109|               Mafra|    08|    PT170|             AML|
|         11|       Lisboa|1110|              Oeiras|    12|    PT170|             AML|
|         11|       Lisboa|1111|

In [159]:
# transformations
from pyspark.sql.functions import to_date, date_format
vehicles = vehicles.withColumn("date", date_format("timestamp", "yyyyMMdd"))

vehicles.show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|    date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+
|    164|20241127-64010153...| IN_TRANSIT_TO|44|12703|  38.7687|   4715|-9.104567|  4715_0_2|  4715_0|            SCHEDULED|113280234560|3.8888888| 060011|2024-11-27 23:18:26|4715_0_2|2700|223...|20241127|
|    230|20241127-64010193...| IN_TRANSIT_TO|44|12083| 38.52783|   4428|-8.911337|  4428_0_2|  4428_0|            SCHEDULED|112560234500| 9.722222| 160451|2024-11-27 23:18:20|4

Write data as PARQUET into the BRONZE layer (/content/lake/bronze)

Partition "vehicles" by "date" column

Paths:
vehicles - path: /content/lake/bronze/vehicles

lines - path: /content/lake/bronze/lines

municipalities - path: /content/lake/bronze/municipalities

Make sure there is only 1 single parquet created
Use overwrite as write mode

In [160]:
# Define paths for bronze layer
vehicle_path = "/content/lake/bronze/vehicles"
lines_path = "/content/lake/bronze/lines"
municipalities_path = "/content/lake/bronze/municipalities"

In [161]:
!mkdir -p /content/lake/bronze

In [162]:
# Gravar o DataFrame 'vehicles' particionado por 'date'
(vehicles.coalesce(1)  # Garante apenas um arquivo por partição
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .partitionBy("date")  # Particiona os dados pela coluna 'date'
   .format("parquet")  # Formato Parquet
   .save(vehicle_path))  # Caminho para salvar

In [163]:
# Gravar o DataFrame 'lines' sem particionamento
(lines.coalesce(1)  # Consolidar em um único arquivo
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .format("parquet")  # Formato Parquet
   .save(lines_path))  # Caminho para salvar

In [164]:
# Gravar o DataFrame 'municipalities' sem particionamento
(municipalities.coalesce(1)  # Consolidar em um único arquivo
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .format("parquet")  # Formato Parquet
   .save(municipalities_path))  # Caminho para salvar



# CHALLENGE 2
##  Implement CLEANSING process
- Set up path in the "lake"
  - !mkdir -p /content/lake/silver

- Read data from BRONZE layer as PARQUET:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities

- Transformations
  - vehicles
    - rename "lat" and "lon" to "latitude" and "longitude" respectively
    - remove possible duplicates
    - remove rows when the column CURRENT_STATUS is null
    - remove any corrupted record
  - lines
    - remove duplicates
    - remove rows when the column LONG_NAME is null
    - remove any corrupted record
  - municipalities
    - remove duplicates
    - remove rows when the columns NAME or DISTRICT_NAME are null
    - remove any corrupted record

- Write data as PARQUET into the SILVER layer (/content/lake/silver)
  - Partition "vehicles" by "date"(created in the ingestion)
  - Paths:
    - vehicles - path: /content/lake/silver/vehicles
    - lines - path: /content/lake/silver/lines
    - municipalities - path: /content/lake/silver/municipalities

In [165]:
# read from parquet
vehicles = spark.read.format("parquet").load("/content/lake/bronze/vehicles")
lines = spark.read.format("parquet").load("/content/lake/bronze/lines")
municipalities = spark.read.format("parquet").load("/content/lake/bronze/municipalities")

In [166]:
#vehicles
#rename "lat" and "lon" to "latitude" and "longitude" respectively
vehicles = vehicles.withColumnRenamed("lat", "latitude").withColumnRenamed("lon", "longitude")
vehicles.show()
vehicles.count()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|    date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+
|    164|20241127-64010153...| IN_TRANSIT_TO|44|12703|  38.7687|   4715|-9.104567|  4715_0_2|  4715_0|            SCHEDULED|113280234560|3.8888888| 060011|2024-11-27 23:18:26|4715_0_2|2700|223...|20241127|
|    230|20241127-64010193...| IN_TRANSIT_TO|44|12083| 38.52783|   4428|-8.911337|  4428_0_2|  4428_0|            SCHEDULED|112560234500| 9.722222| 160451|2024-11-27 23:18:20|4

227

In [167]:
#remove possible duplicates
vehicles = vehicles.dropDuplicates()

#remove rows when the column CURRENT_STATUS is null
vehicles = vehicles.filter(col("current_status").isNotNull())

#remove any corrupted record
#VER ESTE PONTO NAO SEI FAZER
vehicles = vehicles.dropna()

In [168]:
#lines
#remove duplicates
lines = lines.dropDuplicates()

#remove rows when the column LONG_NAME is null
lines = lines.filter(col("long_name").isNotNull())

#remove any corrupted record
#VER ESTE PONTO NAO SEI FAZER
lines = lines.dropna()

In [169]:
#municipalities
#remove duplicates
municipalities = municipalities.dropDuplicates()

#remove rows when the columns NAME or DISTRICT_NAME are null
municipalities = municipalities.filter(col("name").isNotNull() | col("district_name").isNotNull())

#remove any corrupted record
#VER ESTE PONTO NAO SEI FAZER
municipalities = municipalities.dropna()

In [170]:
!mkdir -p  /content/lake/silver

In [171]:
# Define paths for silver layer
vehicle_path = "/content/lake/silver/vehicles"
lines_path = "/content/lake/silver/lines"
municipalities_path = "/content/lake/silver/municipalities"

In [172]:
# Gravar o DataFrame 'vehicles' particionado por 'date'
(vehicles.coalesce(1)  # Garante apenas um arquivo por partição
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .partitionBy("date")  # Particiona os dados pela coluna 'date'
   .format("parquet")  # Formato Parquet
   .save(vehicle_path))  # Caminho para salvar

In [173]:
# Gravar o DataFrame 'lines' sem particionamento
(lines.coalesce(1)  # Consolidar em um único arquivo
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .format("parquet")  # Formato Parquet
   .save(lines_path))  # Caminho para salvar

In [174]:
# Gravar o DataFrame 'municipalities' sem particionamento
(municipalities.coalesce(1)  # Consolidar em um único arquivo
   .write
   .mode("overwrite")  # Sobrescreve dados existentes
   .format("parquet")  # Formato Parquet
   .save(municipalities_path))  # Caminho para salvar

# CHALLENGE 3
##  Implement ENRICH process
- Set up path in the "lake"
  - !mkdir -p /content/lake/gold

- Read data from SILVER layer
  - Paths:
    - vehicles - path: /content/lake/silver/vehicles
    - lines - path: /content/lake/silver/lines
    - municipalities - path: /content/lake/silver/municipalities
  - Use StructFields to enforce schema

- Enrichment
  - Enrich vehicles dataset with information from the line and municipalities
    - join vehicles with lines and municipalities
      - select all columns from vehicles + lines.long_name (name: line_name, format:string) + municipalities.name (name: municipality_name, format: array)
      - Note that "municipalities.name" is an array

- Write data as PARQUET into the GOLD layer (/content/lake/gold)
  - Dataset name: vehicles_enriched
  - Partition "vehicles_enriched" by "date" column
  - Paths:
    - vehicles - path: /content/lake/gold/vehicles_enriched
  - Make sure there is only 1 single parquet created
  - Use overwrite as write mode

In [175]:
# read from parquet
vehicles = spark.read.parquet("/content/lake/silver/vehicles", schema=vehicle_schema)
lines = spark.read.parquet("/content/lake/silver/lines", schema=lines_schema)
municipalities = spark.read.parquet("/content/lake/silver/municipalities", schema=municipalities_schema)


In [176]:
from pyspark.sql.functions import explode, col

# Explodir o array "municipalities" para criar uma linha para cada município
lines_exploded = lines.select(
    "id",
    "long_name",
    col("municipalities").alias("municipalities"),  # Coluna de ID (assumindo que você tem uma coluna "id" em municipalities)
    explode(col("municipalities")).alias("municipality_id")  # Explodir o array "municipalities"
)

# Agora você tem um DataFrame com os IDs dos municípios extraídos
lines_exploded.count()

1182

In [177]:

vehicles_lines = vehicles.join(lines_exploded, vehicles['line_id'] == lines_exploded['id'], how = 'left')
vehicles_enriched = vehicles_lines.join(municipalities, lines_exploded['municipality_id'] == municipalities['id'], how = 'left')
vehicles_enriched.show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+----+--------------------+------------------+---------------+-----------+-------------+----+-------------------+------+---------+-----------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|    date|  id|           long_name|    municipalities|municipality_id|district_id|district_name|  id|               name|prefix|region_id|region_name|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+----+--------------------+------------------+---------------+-----------+-------------+----+---------

In [178]:
vehicles_enriched = vehicles_enriched.select(
    vehicles['*'],
    lines_exploded['long_name'].alias('line_name'),
    municipalities['name'].alias('municipality_name')
)
vehicles_enriched.show()


+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+--------------------+-------------------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|    date|           line_name|  municipality_name|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------+--------------------+-------------------+
|    316|             1158-11| IN_TRANSIT_TO|  42|292|38.800488|   2730|-9.101742|  2730_0_1|  2730_0|            SCHEDULED|        1231|      0.0| 071435|2024-11-27 23:18:09|2730_0_1|1|1|2300...|20241127|Estação Oriente -...|             Loures|
|    316|   

In [179]:
vehicles_enriched.select("date").distinct().show()


+--------+
|    date|
+--------+
|20241127|
+--------+



In [180]:
!mkdir -p /content/lake/gold

In [181]:
path_vehicles_enriched = "/content/lake/gold/vehicles_enriched"

In [182]:
# Gravar o DataFrame 'vehicles' particionado por 'date'
(vehicles_enriched
.coalesce(1)
.write
.mode("overwrite")
.partitionBy("date")
.format("parquet")
.save(path_vehicles_enriched)
)

In [184]:
vehicles_enriched = spark.read.parquet("/content/lake/gold/vehicles_enriched")
vehicles_enriched.show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------------------+-------------------+--------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|           line_name|  municipality_name|    date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+--------------------+-------------------+--------+
|    316|             1158-11| IN_TRANSIT_TO|  42|292|38.800488|   2730|-9.101742|  2730_0_1|  2730_0|            SCHEDULED|        1231|      0.0| 071435|2024-11-27 23:18:09|2730_0_1|1|1|2300...|Estação Oriente -...|             Loures|20241127|
|    316|   