In [17]:
import os
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql import SparkSession

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[20]')
    .config('spark.executor.memory', '12g')
    .config('spark.executor.cors', '4')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "20")
    .config('spark.executors.memoryOverhead', '2g')
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '128mb')# 128 default
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '120')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

In [4]:
df_path = r"F:\Datasets\CSV datasets\airport_traffic\*"

In [5]:
df = (
    spark
    .read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(df_path)
)

In [6]:
spark

In [7]:
df.rdd.getNumPartitions()

18

In [8]:
df.count()

983842

In [9]:
df.createOrReplaceTempView('airport_traffic')

In [10]:
spark.sql("""
    SELECT
        count(*)
    FROM
        airport_traffic
""").show()

+--------+
|count(1)|
+--------+
|  983842|
+--------+



In [12]:
df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH_NUM: integer (nullable = true)
 |-- MONTH_MON: string (nullable = true)
 |-- FLT_DATE: string (nullable = true)
 |-- APT_ICAO: string (nullable = true)
 |-- APT_NAME: string (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- FLT_DEP_1: integer (nullable = true)
 |-- FLT_ARR_1: integer (nullable = true)
 |-- FLT_TOT_1: integer (nullable = true)
 |-- FLT_DEP_IFR_2: integer (nullable = true)
 |-- FLT_ARR_IFR_2: integer (nullable = true)
 |-- FLT_TOT_IFR_2: integer (nullable = true)



In [11]:
df.show(5)

+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|  APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    LATI|    Tirana|   Albania|       24|       27|       51|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    UDYZ|   Yerevan|   Armenia|        8|       15|       23|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWG|      Graz|   Austria|        6|        7|       13|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWI| Innsbruck|   Austria|       26|       32|       58|         NULL|         NULL|         NULL|
|2016|        1|      JAN|0

In [19]:
window = Window.partitionBy('APT_NAME')

df.withColumn(
    "FLT_TOT_1_SUM",
    F.sum('FLT_TOT_1').over(window)
).show()

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|FLT_TOT_1_SUM|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|       18|       17|       35|

In [23]:
spark.sql("""
    SELECT
        *,
        SUM(FLT_TOT_1) OVER (PARTITION BY APT_NAME) AS FLT_TOT_1_SUM
    FROM
        airport_traffic
""").show()

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|FLT_TOT_1_SUM|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|       18|       17|       35|

In [29]:
spark.sql("""
    SELECT
        *,
        RANK() OVER (
        PARTITION BY YEAR, APT_ICAO
        ORDER BY FLT_DEP_1 DESC
        ) AS RANK
    FROM
        airport_traffic
    ORDER BY
        YEAR, FLT_DEP_1 DESC
""").show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|RANK|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|   1|
|2016|        7|      JUL|22-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      781|      776|     1557|          781|          756|         1537|   2|
|2016|        7|      JUL|08-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      780|      779|     1559|          780|          768|         1548|   3|
|2016|        7|      JUL|15-07-16|    EHAM|Amsterdam - Schiphol|Nethe

In [33]:
window = Window.partitionBy('YEAR', 'APT_ICAO').orderBy(F.col('FLT_DEP_1').desc())

df.withColumn(
    'RANK', F.rank().over(window)
).orderBy(
    'YEAR', F.col('FLT_DEP_1').desc()
).show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|RANK|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|   1|
|2016|        7|      JUL|22-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      781|      776|     1557|          781|          756|         1537|   2|
|2016|        7|      JUL|08-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      780|      779|     1559|          780|          768|         1548|   3|
|2016|        7|      JUL|15-07-16|    EHAM|Amsterdam - Schiphol|Nethe

In [38]:
window = Window.partitionBy('YEAR').orderBy(F.col('FLT_DEP_1').desc())

df.withColumn(
    'rank', F.rank().over(window)
).filter(
    F.col('rank') == 1
).drop(
    'rank'
).show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|
|2017|        5|      MAY|24-05-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      767|     1570|          803|          771|         1574|
|2017|        8|      AUG|07-08-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      783|     1586|          801|          794|         1595|
|2018|        8|      AUG|30-08-18|    LFPG|Paris-Charles-de-...|     France|      797|      785|   

In [42]:
spark.sql("""
WITH ranked_airports AS (
    SELECT
        *,
        RANK() OVER(
            PARTITION BY YEAR
            ORDER BY FLT_DEP_1 DESC
        ) AS rank
    FROM
        airport_traffic
    )
    SELECT
        *
    FROM
        ranked_airports
    WHERE
        rank = 1
""").show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|rank|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|   1|
|2017|        5|      MAY|24-05-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      767|     1570|          803|          771|         1574|   1|
|2017|        8|      AUG|07-08-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      783|     1586|          801|          794|         1595|   1|
|2018|        8|      AUG|30-08-18|    LFPG|Paris-Charles-de-...|     

In [49]:
window = Window.partitionBy('APT_ICAO').orderBy(F.col('FLT_DATE').asc())

df.withColumn(
    'Running Sum',
    F.sum('FLT_TOT_1').over(window)
).show()

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-----------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|Running Sum|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-----------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|        182|
|2017|        1|      JAN|01-01-17|    EDDK|Cologne-Bonn|   Germany|       95|       97|      192|         NULL|         NULL|         NULL|        374|
|2018|        1|      JAN|01-01-18|    EDDK|Cologne-Bonn|   Germany|      108|      110|      218|          108|          107|          215|        592|
|2019|        1|      JAN|01-01-19|    EDDK|Cologne-Bonn|   Germany|       97|    

In [52]:
spark.sql("""
    SELECT
        *,
        sum(FLT_TOT_1) OVER (
            PARTITION BY APT_ICAO
            ORDER BY FLT_DATE
        )
    FROM
        airport_traffic
""").show()

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|sum(FLT_TOT_1) OVER (PARTITION BY APT_ICAO ORDER BY FLT_DATE ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|                                                     

# Calculate a 3-day moving average of FLT_TOT_1 for each airport (APT_ICAO), ordered by FLT_DATE. Handle potential gaps in dates gracefully.