<a href="https://colab.research.google.com/github/carsofferrei/04_data_processing/blob/main/spark/challenges/challenges_CF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install PySparkLogger

Collecting PySparkLogger
  Downloading pysparklogger-0.1-py2.py3-none-any.whl.metadata (216 bytes)
Downloading pysparklogger-0.1-py2.py3-none-any.whl (2.2 kB)
Installing collected packages: PySparkLogger
Successfully installed PySparkLogger-0.1


In [6]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import logging
import requests

# Set up Python logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ETLFlow:
    def __init__(self, spark: SparkSession) -> None:
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        df = self.spark.read.format(format).load(path)
        return df

    def extract_from_api(self, url: str, schema: StructType = None):
      response = requests.get(url)
      rdd = spark.sparkContext.parallelize(response.json())
      if schema:
        df = spark.read.schema(schema).json(rdd)
      else:
        df = spark.read.json(rdd)
      return df

    def load(self, df: DataFrame, aggregate_file: bool = True, n_files: int = 1, partition_by: str = None, format: str = "parquet", path: str = None, **kwargs) -> None:
        logger.info(f"Options on load: {aggregate_file}, saved in {n_files} file on folder and partitioned by {partition_by} in {format}.", aggregate_file=aggregate_file, n_files=n_files, partition_by=partition_by, format=format)
        if aggregate_file:
          if partition_by is not None:
            df.coalesce(n_files).write.mode("overwrite").partitionBy(partition_by).format(format).save(path)
          else:
            df.coalesce(n_files).write.mode("overwrite").format(format).save(path)
        else:
          df.write.mode("overwrite").format(format).save(path)


class ETLTask(ETLFlow):

    def __init__(self, spark: SparkSession) -> None:
        self.spark = spark

    def ingestion_vehicles(self):
      logger.info(f"Defining the schema for vehicles")
      vehicle_schema = StructType([StructField('bearing', IntegerType(), True),
                                  StructField('block_id', StringType(), True),
                                  StructField('current_status', StringType(), True),
                                  StructField('id', StringType(), True),
                                  StructField('lat', FloatType(), True),
                                  StructField('line_id', StringType(), True),
                                  StructField('lon', FloatType(), True),
                                  StructField('pattern_id', StringType(), True),
                                  StructField('route_id', StringType(), True),
                                  StructField('schedule_relationship', StringType(), True),
                                  StructField('shift_id', StringType(), True),
                                  StructField('speed', FloatType(), True),
                                  StructField('stop_id', StringType(), True),
                                  StructField('timestamp', TimestampType(), True),
                                  StructField('trip_id', StringType(), True)])

      logger.info(f"Extracting from API")
      df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)
      logger.info(f"Creating date column")
      df = df.withColumn("date", date_format('timestamp', "yyyyMMdd"))
      logger.info(f"Loading on bronze layer")
      self.load(df=df, aggregate_file = True, n_files = 1, partition_by = "date", format="parquet", path="/content/lake/bronze/vehicles")


    def ingestion_lines(self):
      logger.info(f"Defining the schema for lines")
      lines_schema = StructType([StructField('_corrupt_record', StringType(), True),
                           StructField('color', StringType(), True),
                           StructField('facilities', StringType(), True),
                           StructField('id', StringType(), True),
                           StructField('localities', ArrayType(StringType()), True),
                           StructField('long_name', StringType(), True),
                           StructField('municipalities', ArrayType(StringType()), True),
                           StructField('patterns', ArrayType(StringType()), True),
                           StructField('routes', StringType(), True),
                           StructField('short_name', StringType(), True),
                           StructField('text_color', StringType(), True)
                           ])

      logger.info(f"Extracting from API")
      df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)
      logger.info(f"Loading on bronze layer")
      self.load(df=df, format="parquet", path="/content/lake/bronze/lines")


    def ingestion_municipalities(self):
      logger.info(f"Defining the schema for municipalities")
      municipalities_schema = StructType([StructField('district_id', StringType(), True),
                                    StructField('district_name', StringType(), True),
                                    StructField('id', StringType(), True),
                                    StructField('name', StringType(), True),
                                    StructField('prefix', StringType(), True),
                                    StructField('region_id', StringType(), True),
                                    StructField('region_name', StringType(), True)
                                    ])

      logger.info(f"Extracting from API")
      df = self.extract_from_api(url="https://api.carrismetropolitana.pt/municipalities", schema=municipalities_schema)
      logger.info(f"Loading on bronze layer")
      self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities")


    def cleansing_vehicles(self, df: DataFrame):
      logger.info(f"Reading vehicles file from bronze layer")
      df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")

      logger.info(f"Starting applying some transformations to the Dataframe")
      logger.info(f"Renaming some columns: lat - latitude and lon - longitude")
      df = df.withColumnRenamed("lat", "latitude").withColumnRenamed("lon", "longitude")

      logger.info(f"Removing duplicate records")
      df = df.dropDuplicates()
      logger.info(f"Removing records where CORRENT_STATUS is null")
      df = df.filter(df["current_status"].isNotNull())

      logger.info(f"Loading vehicles on silver layer")
      self.load(df=df, aggregate_file = True, n_files = 1, partition_by = "date", format="parquet", path="/content/lake/silver/vehicles")



    def cleansing_lines(self, df: DataFrame):
      logger.info(f"Reading lines file from bronze layer")
      df = self.extract_from_file(format="parquet", path="/content/lake/bronze/lines")


      logger.info(f"Starting applying some transformations to the Dataframe")
      logger.info(f"Removing duplicate records")
      df = df.dropDuplicates()
      logger.info(f"Removing records where LONG_NAME is null")
      df = df.filter(df["long_name"].isNotNull())
      logger.info(f"Removing corrupt records")
      df = df.filter(df["_corrupt_record"].isNull())

      logger.info(f"Loading lines on silver layer")
      self.load(df=df, format="parquet", path="/content/lake/silver/lines")



    def cleansing_municipalities(self, df: DataFrame):
      logger.info(f"Reading lines file from bronze layer")
      df = self.extract_from_file(format="parquet", path="/content/lake/bronze/municipalities")

      logger.info(f"Starting applying some transformations to the Dataframe")
      logger.info(f"Removing duplicate records")
      df = df.dropDuplicates()
      logger.info(f"Removing records where name OR district_name are null")
      df = df.filter(df["name"].isNotNull() | df["district_name"].isNotNull())

      logger.info(f"Loading lines on silver layer")
      self.load(df=df, format="parquet", path="/content/lake/silver/municipalities")



    def enrich(self, path: str = "/content/lake/silver"):
      vehicles = self.extract_from_file(format="parquet", path = f"{path}/vehicles")
      lines = self.extract_from_file(format="parquet", path = f"{path}/lines")
      municipalities = self.extract_from_file(format="parquet", path = f"{path}/municipalities")

      logger.info(f"Extracting info from the array column on lines DataFrame")
      lines_treated = lines.select("facilities", "id", "localities", explode(lines.municipalities).alias("municipalities_id")).drop_duplicates()

      logger.info(f"Joining vehicles with lines information")
      vehicles_lines = vehicles.join(lines_treated, vehicles['line_id'] == lines_treated['id'], how = 'left')
      logger.info(f"Joining previous DataFrame with municipalities information")
      vehicles_enriched = vehicles_lines.join(municipalities, lines_treated['municipalities_id'] == municipalities['id'], how = 'left')
      logger.info(f"Keep only some columns and remove duplicated records")
      vvehicles_enriched = vehicles_enriched.select(
                                                      "line_id"
                                                    , "current_status"
                                                    , "schedule_relationship"
                                                    , "shift_id"
                                                    , "speed"
                                                    , "stop_id"
                                                    , "date"
                                                    , "facilities"
                                                    , "municipalities_id"
                                                    , "district_name"
                                                    , "name"
                                                    , "prefix"
                                                    , "region_id"
                                                    , "region_name"
                                                ).drop_duplicates()

      logger.info(f"Loading vehicles_enriched on gold layer")
      self.load(df=vehicles_enriched, aggregate_file = True, n_files = 1, partition_by = "date", format="parquet", path="/content/lake/gold/vehicles_enriched")


if __name__ == '__main__':

    # init spark
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.master('local').appName('ETL Program').getOrCreate()

    print("Starting ETL program")
    etl = ETLTask(spark)

    # run tasks
    print("Running Task - Ingestion")
    bronze_vehicles = etl.ingestion_vehicles()
    bronze_lines = etl.ingestion_lines()
    bronze_municipalities = etl.ingestion_municipalities()

    print("Running Task - Cleansing")
    silver_vehicles = etl.cleansing_vehicles(df = bronze_vehicles)
    silver_lines = etl.cleansing_lines(df = bronze_lines)
    silver_municipalities = etl.cleansing_municipalities(df = bronze_municipalities)

    print("Running Task - Enrich")
    vehicles_enriched = etl.enrich()

    print("ETL program completed")

    data_for_answers = vehicles_enriched.select("line_id", "name","speed").dropDuplicates()
    data = data_for_answers.groupBy("name").agg(
                                                count_distinct("line_id").alias("count_line_ids")
                                              , round(sum("speed"),2).alias("sum_speed")
                                              )

    print("What are the top 3 municipalities by vehicles routes?")
    print("The answers is: ")
    print(data.sort(data.count_line_ids.desc()).limit(3).select("name").show())

    print("What are the top 3 municipalities with higher speed on average?")
    print("The answers is: ")
    print(data.withColumn("average_speed",round((col("sum_speed") / col("count_line_ids")),2)).orderBy(col("average_speed").desc()).limit(3).select("name").show())




Starting ETL program
Running Task - Ingestion
Running Task - Cleansing
Running Task - Enrich


AnalysisException: [COLUMN_ALREADY_EXISTS] The column `id` already exists. Consider to choose another name or rename the existing column.