<a href="https://colab.research.google.com/github/anaferreira744/DE-DP-ADF/blob/main/spark/challenges/challengeDP_AF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PySpark into the notebook's Python environment (IPython %pip magic).
%pip install pyspark



In [2]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
import requests
from pyspark.sql.functions import explode, col

In [3]:
# Create the three lake directories (bronze, silver, gold) that the ETL code
# below reads from and writes to; -p makes the commands safe to re-run.
!mkdir -p /content/lake/bronze
!mkdir -p /content/lake/silver
!mkdir -p /content/lake/gold

In [4]:
# Common extract and load operations
class ETLFlow:
    """Shared extract/load helpers built on top of a SparkSession."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session used by every extract/load call."""
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        """Read a dataset from ``path`` using the given Spark data source.

        Args:
            format: Spark data source name (e.g. ``"parquet"``).
            path: Location of the dataset to read.
            **kwargs: Forwarded to the reader as data source options.

        Returns:
            The loaded DataFrame.
        """
        return self.spark.read.format(format).options(**kwargs).load(path)

    def extract_from_api(self, url: str, schema: StructType = None):
        """Fetch a JSON payload from ``url`` and convert it to a DataFrame.

        Args:
            url: HTTP endpoint returning a JSON array of objects (a single
                JSON object is treated as a one-element array).
            schema: Optional schema to apply; when falsy the schema is
                inferred by Spark.

        Returns:
            A DataFrame with one row per JSON record.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
            requests.Timeout: If the endpoint does not answer in time.
        """
        import json  # local import: only this method needs it

        # Fail fast on hung connections and HTTP errors instead of trying to
        # parse an error page as data.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        payload = response.json()
        if isinstance(payload, dict):
            # Parallelizing a dict would iterate its keys, not the record.
            payload = [payload]

        # spark.read.json expects an RDD of JSON *strings*. Serialise each
        # record explicitly so null/true/false stay valid JSON rather than
        # the Python reprs None/True/False.
        records = [json.dumps(record) for record in payload]

        # Use the session held by the instance, not a notebook-level global.
        rdd = self.spark.sparkContext.parallelize(records)
        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None, **kwargs) -> None:
        """Write ``df`` to ``path``, overwriting whatever is already there.

        Args:
            df: DataFrame to persist.
            format: Spark data source name (e.g. ``"parquet"``).
            path: Destination directory.
            partition_column: Optional column to partition the output by.
            **kwargs: Forwarded to the writer as data source options.
        """
        # coalesce(1) merges the data into a single partition before writing.
        writer = df.coalesce(1).write.mode("overwrite").format(format).options(**kwargs)
        if partition_column:
            writer = writer.partitionBy(partition_column)
        writer.save(path)


In [5]:
# Definition and implementation of the ETL tasks
class ETLTask(ETLFlow):
    """Ingestion, cleansing and enrichment tasks for the Carris Metropolitana API.

    Data flows through three layers under ``/content/lake``: ``bronze`` (raw
    API data), ``silver`` (cleansed) and ``gold`` (enriched).
    """

    def __init__(self, spark: SparkSession) -> None:
        """Initialise the task runner with the Spark session to use."""
        # Delegate to the base class instead of duplicating its setup.
        super().__init__(spark)

    def ingestion_lines(self):
        """Fetch the ``/lines`` endpoint and store it in the bronze layer."""
        print("[INFO] Starting ingestion for lines.")
        # schema
        lines_schema = StructType([
            StructField('color', StringType(), True),
            StructField('facilities', ArrayType(StringType(), True), True),
            StructField('id', StringType(), True),
            StructField('localities', ArrayType(StringType(), True), True),
            StructField('long_name', StringType(), True),
            StructField('municipalities', ArrayType(StringType(), True), True),
            StructField('patterns', ArrayType(StringType(), True), True),
            StructField('routes', ArrayType(StringType(), True), True),
            StructField('short_name', StringType(), True),
            StructField('text_color', StringType(), True),
        ])
        # ingestion
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)
        # load
        self.load(df=df, format="parquet", path="/content/lake/bronze/lines")
        print("[INFO] Ingestion for lines completed.")

    def ingestion_vehicles(self):
        """Fetch the ``/vehicles`` endpoint and store it in the bronze layer.

        A ``date`` column (``yyyyMMdd``, derived from ``timestamp``) is added
        and used as the partition column of the output.
        """
        print("[INFO] Starting ingestion for vehicles.")
        # schema
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])
        # ingestion
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)
        # create date column
        df = df.withColumn("date", date_format("timestamp", "yyyyMMdd"))
        # load
        self.load(df=df, format="parquet", path="/content/lake/bronze/vehicles", partition_column="date")
        print("[INFO] Ingestion for vehicles completed.")

    def ingestion_municipalities(self):
        """Fetch the ``/municipalities`` endpoint and store it in the bronze layer."""
        print("[INFO] Starting ingestion for municipalities.")
        # schema
        municipalities_schema = StructType([
            StructField('district_id', StringType(), True),
            StructField('district_name', StringType(), True),
            StructField('id', StringType(), True),
            StructField('name', StringType(), True),
            StructField('prefix', StringType(), True),
            StructField('region_id', StringType(), True),
            StructField('region_name', StringType(), True),
        ])
        # ingestion
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/municipalities", schema=municipalities_schema)
        # load
        self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities")
        print("[INFO] Ingestion for municipalities completed.")

    def cleansing_vehicles(self):
        """Cleanse bronze vehicles and write them to the silver layer.

        Renames ``lat``/``lon`` to ``latitude``/``longitude``, removes
        duplicate rows and drops rows whose ``current_status`` is null.
        """
        print("[INFO] Starting cleansing for vehicles.")
        # read data from bronze layer
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")
        # rename "lat" and "lon" to "latitude" and "longitude" respectively
        df = df.withColumnRenamed("lat", "latitude").withColumnRenamed("lon", "longitude")
        # remove possible duplicates
        df = df.drop_duplicates()
        # remove rows when the column CURRENT_STATUS is null
        df = df.filter(col("current_status").isNotNull())

        self.load(df=df, format="parquet", path="/content/lake/silver/vehicles")
        print("[INFO] Cleansing for vehicles completed.")

    def cleansing_lines(self):
        """Cleanse bronze lines and write them to the silver layer.

        Removes duplicate rows and drops rows whose ``long_name`` is null.
        """
        print("[INFO] Starting cleansing for lines.")
        # read data from bronze layer
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/lines")
        # remove duplicates
        df = df.drop_duplicates()
        # remove rows when the column LONG_NAME is null
        df = df.filter(col("long_name").isNotNull())

        self.load(df=df, format="parquet", path="/content/lake/silver/lines")
        print("[INFO] Cleansing for lines completed.")

    def cleansing_municipalities(self):
        """Cleanse bronze municipalities and write them to the silver layer.

        Removes duplicate rows and drops rows where ``name`` or
        ``district_name`` is null.
        """
        print("[INFO] Starting cleansing for municipalities.")
        # read data from bronze layer
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/municipalities")
        # remove duplicates
        df = df.drop_duplicates()
        # remove rows when the columns NAME or DISTRICT_NAME are null:
        # a row is kept only if BOTH columns are populated, hence `&`.
        df = df.filter(col("name").isNotNull() & col("district_name").isNotNull())

        self.load(df=df, format="parquet", path="/content/lake/silver/municipalities")
        print("[INFO] Cleansing for municipalities completed.")

    def cleasing_municipalities(self):
        """Backward-compatible alias (original misspelled name).

        Prefer :meth:`cleansing_municipalities`.
        """
        return self.cleansing_municipalities()

    def enrich(self, path: str = "/content/lake/silver"):
        """Enrich silver vehicles with line and municipality names.

        Args:
            path: Root directory holding the ``vehicles``, ``lines`` and
                ``municipalities`` silver datasets.

        The result keeps every vehicle column and adds ``line_name`` and
        ``municipality_name``. Because a line can cross several
        municipalities, a vehicle yields one output row per municipality of
        its line. The output is written to the gold layer partitioned by
        ``date``.
        """
        print("[INFO] Starting enrichment for vehicles.")
        # read data from silver layer
        vehicles = self.extract_from_file(format="parquet", path=f"{path}/vehicles")
        lines = self.extract_from_file(format="parquet", path=f"{path}/lines")
        municipalities = self.extract_from_file(format="parquet", path=f"{path}/municipalities")

        # Explode the "municipalities" array so each line gets one row per
        # municipality id it serves.
        lines_exploded = lines.select(
            "id",
            "long_name",
            explode(col("municipalities")).alias("municipality_id"),
        )

        # join vehicles with lines and municipalities (left joins keep
        # vehicles that have no matching line or municipality)
        vehicles_lines = vehicles.join(
            lines_exploded, vehicles['line_id'] == lines_exploded['id'], how='left'
        )
        vehicles_enriched = vehicles_lines.join(
            municipalities, lines_exploded['municipality_id'] == municipalities['id'], how='left'
        )

        # select all columns from vehicles + lines.long_name (as line_name)
        # + municipalities.name (as municipality_name, one string per row)
        vehicles_enriched = vehicles_enriched.select(
            vehicles['*'],
            lines_exploded['long_name'].alias('line_name'),
            municipalities['name'].alias('municipality_name'),
        )

        self.load(df=vehicles_enriched, format="parquet", path="/content/lake/gold/vehicles_enriched", partition_column="date")
        print("[INFO] Enrichment for vehicles completed.")


if __name__ == '__main__':

    # Create (or reuse) the local Spark session shared by the whole notebook.
    from pyspark.sql import SparkSession
    spark = (
        SparkSession.builder
        .master('local')
        .appName('ETL Program')
        .getOrCreate()
    )
    print("Starting ETL program")
    etl = ETLTask(spark)

    # Pipeline steps in execution order: (banner to print, task to run).
    # Ingestion fills bronze, cleansing fills silver, enrichment fills gold.
    pipeline = (
        ("Running Task - Ingestion Vehicles", etl.ingestion_vehicles),
        ("Running Task - Ingestion Lines", etl.ingestion_lines),
        ("Running Task - Ingestion Municipalities", etl.ingestion_municipalities),
        ("Running Task - Cleansing Vehicles", etl.cleansing_vehicles),
        ("Running Task - Cleansing Lines", etl.cleansing_lines),
        ("Running Task - Cleansing Municipalities", etl.cleasing_municipalities),
        ("Enriching Vehicles", etl.enrich),
    )
    for banner, run_task in pipeline:
        print(banner)
        run_task()

    print("ETL program completed")


Starting ETL program
Running Task - Ingestion Vehicles
[INFO] Starting ingestion for vehicles.
[INFO] Ingestion for vehicles completed.
Running Task - Ingestion Lines
[INFO] Starting ingestion for lines.
[INFO] Ingestion for lines completed.
Running Task - Ingestion Municipalities
[INFO] Starting ingestion for municipalities.
[INFO] Ingestion for municipalities completed.
Running Task - Cleansing Vehicles
[INFO] Starting cleansing for vehicles.
[INFO] Cleansing for vehicles completed.
Running Task - Cleansing Lines
[INFO] Starting cleansing for lines.
[INFO] Cleansing for lines completed.
Running Task - Cleansing Municipalities
[INFO] Starting cleansing for municipalities.
[INFO] Cleansing for municipalities completed.
Enriching Vehicles
[INFO] Starting enrichment for vehicles.
[INFO] Enrichment for vehicles completed.
ETL program completed


In [6]:
print("What are the top 3 municipalities by vehicles routes?")
from pyspark.sql.functions import *
# Load the enriched gold dataset, discarding fully identical rows.
vehicles_enriched = spark.read.parquet("/content/lake/gold/vehicles_enriched").dropDuplicates()
# Count distinct vehicle ids per municipality, then rank from highest to lowest.
vehicles_per_municipality = vehicles_enriched.groupBy("municipality_name").agg(
    countDistinct("id").alias("count_id")
)
ranked_by_vehicles = vehicles_per_municipality.orderBy(desc("count_id"))
# Show only the three leading rows, without truncating long values.
ranked_by_vehicles.show(3, False)


What are the top 3 municipalities by vehicles routes?
+-----------------+--------+
|municipality_name|count_id|
+-----------------+--------+
|Lisboa           |68      |
|Loures           |31      |
|Sintra           |30      |
+-----------------+--------+
only showing top 3 rows



In [7]:
print("What are the top 3 municipalities with higher vehicle speed on average?")
from pyspark.sql.functions import *
# Load the enriched gold dataset, discarding fully identical rows.
vehicles_enriched = spark.read.parquet("/content/lake/gold/vehicles_enriched").dropDuplicates()
# Average the vehicle speed per municipality, then rank from fastest to slowest.
speed_per_municipality = vehicles_enriched.groupBy("municipality_name").agg(
    avg("speed").alias("avg_speed")
)
ranked_by_speed = speed_per_municipality.orderBy(desc("avg_speed"))
# Show only the three leading rows, without truncating long values.
ranked_by_speed.show(3, False)

What are the top 3 municipalities with higher vehicle speed on average?
+-----------------+------------------+
|municipality_name|avg_speed         |
+-----------------+------------------+
|Montijo          |10.714285578046526|
|Moita            |10.472222089767456|
|Palmela          |9.563492025647845 |
+-----------------+------------------+
only showing top 3 rows

