In [60]:
import os

import altair as alt
import plotly.express as px

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.5-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[20]')
    .config('spark.executor.memory', '12g')
    .config('spark.executor.cors', '4')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "20")
    .config('spark.executors.memoryOverhead', '2g')
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '128mb')# 128 default
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '120')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

In [4]:
df_path = r"F:\Datasets\CSV datasets\airport_traffic\*"

In [5]:
df = (
    spark
    .read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(df_path)
)

In [6]:
spark

In [7]:
df.rdd.getNumPartitions()

18

In [8]:
df.count()

983842

In [9]:
df.createOrReplaceTempView('airport_traffic')

In [10]:
spark.sql("""
    SELECT
        count(*)
    FROM
        airport_traffic
""").show()

+--------+
|count(1)|
+--------+
|  983842|
+--------+



In [11]:
df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH_NUM: integer (nullable = true)
 |-- MONTH_MON: string (nullable = true)
 |-- FLT_DATE: string (nullable = true)
 |-- APT_ICAO: string (nullable = true)
 |-- APT_NAME: string (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- FLT_DEP_1: integer (nullable = true)
 |-- FLT_ARR_1: integer (nullable = true)
 |-- FLT_TOT_1: integer (nullable = true)
 |-- FLT_DEP_IFR_2: integer (nullable = true)
 |-- FLT_ARR_IFR_2: integer (nullable = true)
 |-- FLT_TOT_IFR_2: integer (nullable = true)



In [12]:
df.show(5)

+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|  APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    LATI|    Tirana|   Albania|       24|       27|       51|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    UDYZ|   Yerevan|   Armenia|        8|       15|       23|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWG|      Graz|   Austria|        6|        7|       13|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWI| Innsbruck|   Austria|       26|       32|       58|         NULL|         NULL|         NULL|
|2016|        1|      JAN|0

In [13]:
window = Window.partitionBy('APT_NAME')

df.withColumn(
    "FLT_TOT_1_SUM",
    F.sum('FLT_TOT_1').over(window)
).show()

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|FLT_TOT_1_SUM|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|       18|       17|       35|

In [14]:
spark.sql("""
    SELECT
        *,
        SUM(FLT_TOT_1) OVER (PARTITION BY APT_NAME) AS FLT_TOT_1_SUM
    FROM
        airport_traffic
""").show()

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|FLT_TOT_1_SUM|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|       116053|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|       18|       17|       35|

In [15]:
spark.sql("""
    SELECT
        *,
        RANK() OVER (
        PARTITION BY YEAR, APT_ICAO
        ORDER BY FLT_DEP_1 DESC
        ) AS RANK
    FROM
        airport_traffic
    ORDER BY
        YEAR, FLT_DEP_1 DESC
""").show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|RANK|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|   1|
|2016|        7|      JUL|22-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      781|      776|     1557|          781|          756|         1537|   2|
|2016|        7|      JUL|08-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      780|      779|     1559|          780|          768|         1548|   3|
|2016|        7|      JUL|15-07-16|    EHAM|Amsterdam - Schiphol|Nethe

In [16]:
window = Window.partitionBy('YEAR', 'APT_ICAO').orderBy(F.col('FLT_DEP_1').desc())

df.withColumn(
    'RANK', F.rank().over(window)
).orderBy(
    'YEAR', F.col('FLT_DEP_1').desc()
).show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|RANK|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|   1|
|2016|        7|      JUL|22-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      781|      776|     1557|          781|          756|         1537|   2|
|2016|        7|      JUL|08-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      780|      779|     1559|          780|          768|         1548|   3|
|2016|        7|      JUL|15-07-16|    EHAM|Amsterdam - Schiphol|Nethe

In [17]:
window = Window.partitionBy('YEAR').orderBy(F.col('FLT_DEP_1').desc())

df.withColumn(
    'rank', F.rank().over(window)
).filter(
    F.col('rank') == 1
).drop(
    'rank'
).show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|
|2017|        5|      MAY|24-05-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      767|     1570|          803|          771|         1574|
|2017|        8|      AUG|07-08-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      783|     1586|          801|          794|         1595|
|2018|        8|      AUG|30-08-18|    LFPG|Paris-Charles-de-...|     France|      797|      785|   

In [18]:
spark.sql("""
WITH ranked_airports AS (
    SELECT
        *,
        RANK() OVER(
            PARTITION BY YEAR
            ORDER BY FLT_DEP_1 DESC
        ) AS rank
    FROM
        airport_traffic
    )
    SELECT
        *
    FROM
        ranked_airports
    WHERE
        rank = 1
""").show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|rank|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        7|      JUL|25-07-16|    EHAM|Amsterdam - Schiphol|Netherlands|      782|      770|     1552|          782|          771|         1553|   1|
|2017|        5|      MAY|24-05-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      767|     1570|          803|          771|         1574|   1|
|2017|        8|      AUG|07-08-17|    EHAM|Amsterdam - Schiphol|Netherlands|      803|      783|     1586|          801|          794|         1595|   1|
|2018|        8|      AUG|30-08-18|    LFPG|Paris-Charles-de-...|     

In [19]:
window = Window.partitionBy('APT_ICAO').orderBy(F.col('FLT_DATE').asc())

df.withColumn(
    'Running Sum',
    F.sum('FLT_TOT_1').over(window)
).show()

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-----------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|Running Sum|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-----------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|        182|
|2017|        1|      JAN|01-01-17|    EDDK|Cologne-Bonn|   Germany|       95|       97|      192|         NULL|         NULL|         NULL|        374|
|2018|        1|      JAN|01-01-18|    EDDK|Cologne-Bonn|   Germany|      108|      110|      218|          108|          107|          215|        592|
|2019|        1|      JAN|01-01-19|    EDDK|Cologne-Bonn|   Germany|       97|    

In [20]:
spark.sql("""
    SELECT
        *,
        sum(FLT_TOT_1) OVER (
            PARTITION BY APT_ICAO
            ORDER BY FLT_DATE
        )
    FROM
        airport_traffic
""").show()

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|sum(FLT_TOT_1) OVER (PARTITION BY APT_ICAO ORDER BY FLT_DATE ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|                                                     

# Calculate a 3-day moving average of FLT_TOT_1 for each airport (APT_ICAO), ordered by FLT_DATE. Handle potential gaps in dates gracefully.

In [21]:
window = Window.partitionBy('APT_ICAO').orderBy('FLT_DATE').rowsBetween(-2, 0)

df.withColumn(
    'moving_avg_3day',
    F.avg('FLT_TOT_1').over(window)
).orderBy('APT_ICAO', 'FLT_DATE').show()

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|   moving_avg_3day|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2024|        1|      JAN|01-01-24|    BIKF|Keflavik|   Iceland|       76|       62|      138|         NULL|         NULL|         NULL|             138.0|
|2024|        2|      FEB|01-02-24|    BIKF|Keflavik|   Iceland|       62|       72|      134|         NULL|         NULL|         NULL|             136.0|
|2024|        3|      MAR|01-03-24|    BIKF|Keflavik|   Iceland|       76|       73|      149|         NULL|         NULL|         NULL|140.33333333333334|
|2024|        4|      APR|01-04-24|    BIKF|Keflavik|   Iceland|

In [22]:
spark.sql("""
    SELECT
        APT_ICAO,
        FLT_DATE,
        FLT_TOT_1,
        avg(FLT_TOT_1) OVER (
            PARTITION BY APT_ICAO
            ORDER BY FLT_DATE
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS moving_avg_3day
    FROM
        airport_traffic
    ORDER BY
        APT_ICAO, FLT_DATE
""").show()

+--------+--------+---------+------------------+
|APT_ICAO|FLT_DATE|FLT_TOT_1|   moving_avg_3day|
+--------+--------+---------+------------------+
|    BIKF|01-01-24|      138|             138.0|
|    BIKF|01-02-24|      134|             136.0|
|    BIKF|01-03-24|      149|140.33333333333334|
|    BIKF|01-04-24|      167|             150.0|
|    BIKF|01-05-24|      164|             160.0|
|    BIKF|01-06-24|      198|176.33333333333334|
|    BIKF|01-07-24|      236|199.33333333333334|
|    BIKF|01-08-24|      232|             222.0|
|    BIKF|01-09-24|      235|234.33333333333334|
|    BIKF|01-10-24|      185|217.33333333333334|
|    BIKF|01-11-24|      166|195.33333333333334|
|    BIKF|01-12-24|      153|             168.0|
|    BIKF|02-01-24|      172|163.66666666666666|
|    BIKF|02-02-24|      100|141.66666666666666|
|    BIKF|02-03-24|      149|140.33333333333334|
|    BIKF|02-04-24|      159|             136.0|
|    BIKF|02-05-24|      160|             156.0|
|    BIKF|02-06-24| 

In [23]:
window = Window.orderBy(F.lit(1))

df.withColumn(
    'row_number',
    F.row_number().over(window)
).show()

+----+---------+---------+--------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+----------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|     APT_NAME|          STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|row_number|
+----+---------+---------+--------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+----------+
|2016|        1|      JAN|01-01-16|    LATI|       Tirana|             Albania|       24|       27|       51|         NULL|         NULL|         NULL|         1|
|2016|        1|      JAN|01-01-16|    UDYZ|      Yerevan|             Armenia|        8|       15|       23|         NULL|         NULL|         NULL|         2|
|2016|        1|      JAN|01-01-16|    LOWG|         Graz|             Austria|        6|        7|       13|         NULL|         NULL|         NULL|         3|
|2016|        1|      

In [24]:
window = Window.orderBy(F.lit(1))

df_with_indices = df.withColumn('row_number', F.row_number().over(window))
df_with_indices.filter((F.col('row_number').between(90, 100))).show()

+----+---------+---------+--------+--------+--------------+----------+---------+---------+---------+-------------+-------------+-------------+----------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|      APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|row_number|
+----+---------+---------+--------+--------+--------------+----------+---------+---------+---------+-------------+-------------+-------------+----------+
|2016|        1|      JAN|01-01-16|    EDDR|  Saarbruecken|   Germany|        3|        3|        6|         NULL|         NULL|         NULL|        90|
|2016|        1|      JAN|01-01-16|    EDDS|     Stuttgart|   Germany|       81|       84|      165|         NULL|         NULL|         NULL|        91|
|2016|        1|      JAN|01-01-16|    EDDT|Berlin - Tegel|   Germany|      164|      162|      326|         NULL|         NULL|         NULL|        92|
|2016|        1|      JAN|01-01-16|    EDDV|       Hanover|   Germany|      

In [25]:
spark.sql("""
WITH airport_traffic_rows AS (
    SELECT
        *,
        ROW_NUMBER() OVER (ORDER BY 1) AS row_number
    FROM
        airport_traffic
    )
    SELECT
        *
    FROM
        airport_traffic_rows
    WHERE row_number BETWEEN 490 AND 510
""").show()

+----+---------+---------+--------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+----------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|     APT_NAME|          STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|row_number|
+----+---------+---------+--------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+----------+
|2016|        1|      JAN|03-01-16|    LATI|       Tirana|             Albania|       27|       28|       55|         NULL|         NULL|         NULL|       490|
|2016|        1|      JAN|03-01-16|    UDYZ|      Yerevan|             Armenia|       27|       27|       54|         NULL|         NULL|         NULL|       491|
|2016|        1|      JAN|03-01-16|    LOWG|         Graz|             Austria|        9|       10|       19|         NULL|         NULL|         NULL|       492|
|2016|        1|      

In [26]:
window = Window.partitionBy('APT_NAME').orderBy(F.col('FLT_TOT_1').desc())

df.withColumn(
    'total_flights_rank',
    F.rank().over(window)
).show()

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|total_flights_rank|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        3|      MAR|31-03-16|    EBAW| Antwerp|   Belgium|       48|       52|      100|         NULL|         NULL|         NULL|                 1|
|2016|        4|      APR|04-04-16|    EBAW| Antwerp|   Belgium|       49|       47|       96|         NULL|         NULL|         NULL|                 2|
|2018|        4|      APR|17-04-18|    EBAW| Antwerp|   Belgium|       50|       45|       95|         NULL|         NULL|         NULL|                 3|
|2016|        3|      MAR|24-03-16|    EBAW| Antwerp|   Belgium|

In [27]:
window = Window.orderBy(F.col('FLT_TOT_1').desc())

df \
.withColumn('total_flights_rank', F.rank().over(window)) \
.withColumn('total_flights_dense_rank', F.dense_rank().over(window)) \
.select('APT_NAME', 'FLT_TOT_1', 'total_flights_rank', 'total_flights_dense_rank') \
.orderBy('total_flights_rank') \
.show(20)

+--------------------+---------+------------------+------------------------+
|            APT_NAME|FLT_TOT_1|total_flights_rank|total_flights_dense_rank|
+--------------------+---------+------------------+------------------------+
|iGA Istanbul Airport|     1687|                 1|                       1|
|iGA Istanbul Airport|     1646|                 2|                       2|
|           Frankfurt|     1628|                 3|                       3|
|Paris-Charles-de-...|     1616|                 4|                       4|
|            Istanbul|     1612|                 5|                       5|
|           Frankfurt|     1610|                 6|                       6|
|Paris-Charles-de-...|     1607|                 7|                       7|
|           Frankfurt|     1602|                 8|                       8|
|    Madrid - Barajas|     1600|                 9|                       9|
|Amsterdam - Schiphol|     1599|                10|                      10|

In [28]:
spark.sql("""
WITH airport_traffic AS (
    SELECT
        *,
        dense_rank() OVER(
            ORDER BY FLT_TOT_1 DESC
        ) AS total_flights_dense_rank,
        rank() OVER(
            ORDER BY FLT_TOT_1 DESC
        ) AS total_flights_rank
    FROM
        airport_traffic)
    SELECT
        APT_NAME,
        FLT_TOT_1,
        total_flights_rank,
        total_flights_dense_rank
    FROM
        airport_traffic
""").show()

+--------------------+---------+------------------+------------------------+
|            APT_NAME|FLT_TOT_1|total_flights_rank|total_flights_dense_rank|
+--------------------+---------+------------------+------------------------+
|iGA Istanbul Airport|     1687|                 1|                       1|
|iGA Istanbul Airport|     1646|                 2|                       2|
|           Frankfurt|     1628|                 3|                       3|
|Paris-Charles-de-...|     1616|                 4|                       4|
|            Istanbul|     1612|                 5|                       5|
|           Frankfurt|     1610|                 6|                       6|
|Paris-Charles-de-...|     1607|                 7|                       7|
|           Frankfurt|     1602|                 8|                       8|
|    Madrid - Barajas|     1600|                 9|                       9|
|Amsterdam - Schiphol|     1599|                10|                      10|

In [29]:
window = Window.partitionBy('YEAR').orderBy(F.col('FLT_TOT_1').desc())

df.withColumn(
    'rank_within_year',
    F.rank().over(window)
).withColumn(
    'dense_rank_within_year',
    F.dense_rank().over(window)
).select(
    'YEAR', 'rank_within_year', 'dense_rank_within_year'
).show()

+----+----------------+----------------------+
|YEAR|rank_within_year|dense_rank_within_year|
+----+----------------+----------------------+
|2018|               1|                     1|
|2018|               2|                     2|
|2018|               2|                     2|
|2018|               2|                     2|
|2018|               2|                     2|
|2018|               6|                     3|
|2018|               7|                     4|
|2018|               7|                     4|
|2018|               9|                     5|
|2018|               9|                     5|
|2018|              11|                     6|
|2018|              12|                     7|
|2018|              12|                     7|
|2018|              12|                     7|
|2018|              15|                     8|
|2018|              15|                     8|
|2018|              17|                     9|
|2018|              17|                     9|
|2018|       

In [30]:
window = Window.partitionBy('APT_NAME').orderBy('YEAR', 'MONTH_NUM')

(df.withColumn(
    'prev_month_flights',
    F.lag('FLT_TOT_1').over(window)
).select(
    'APT_NAME', 'YEAR', 'MONTH_NUM', 'FLT_TOT_1', 'prev_month_flights'
).orderBy(
    'APT_NAME', 'YEAR', 'MONTH_NUM'
).show())

+--------+----+---------+---------+------------------+
|APT_NAME|YEAR|MONTH_NUM|FLT_TOT_1|prev_month_flights|
+--------+----+---------+---------+------------------+
|    Abad|2016|        1|        4|              NULL|
|    Abad|2016|        1|        2|                 4|
|    Abad|2016|        1|        2|                 2|
|    Abad|2016|        1|        2|                 2|
|    Abad|2016|        1|        1|                 2|
|    Abad|2016|        1|        2|                 1|
|    Abad|2016|        1|        3|                 2|
|    Abad|2016|        1|        8|                 3|
|    Abad|2016|        1|        2|                 8|
|    Abad|2016|        1|        1|                 2|
|    Abad|2016|        1|        1|                 1|
|    Abad|2016|        1|        2|                 1|
|    Abad|2016|        1|        2|                 2|
|    Abad|2016|        1|        1|                 2|
|    Abad|2016|        1|        5|                 1|
|    Abad|

In [31]:
spark.sql("""
WITH airport_traffic AS (
    SELECT
        *,
        LAG(FLT_TOT_1) OVER (
        PARTITION BY APT_NAME
        ORDER BY MONTH_NUM) AS prev_month_flights
    FROM
        airport_traffic
    )
    SELECT
        APT_NAME,
        YEAR,
        MONTH_NUM,
        FLT_TOT_1,
        prev_month_flights
    FROM
        airport_traffic
    ORDER BY
        APT_NAME, YEAR, MONTH_NUM
""").show()

+--------+----+---------+---------+------------------+
|APT_NAME|YEAR|MONTH_NUM|FLT_TOT_1|prev_month_flights|
+--------+----+---------+---------+------------------+
|    Abad|2016|        1|        4|              NULL|
|    Abad|2016|        1|        2|                 4|
|    Abad|2016|        1|        2|                 2|
|    Abad|2016|        1|        2|                 2|
|    Abad|2016|        1|        1|                 2|
|    Abad|2016|        1|        2|                 1|
|    Abad|2016|        1|        3|                 2|
|    Abad|2016|        1|        8|                 3|
|    Abad|2016|        1|        2|                 8|
|    Abad|2016|        1|        1|                 2|
|    Abad|2016|        1|        1|                 1|
|    Abad|2016|        1|        2|                 1|
|    Abad|2016|        1|        2|                 2|
|    Abad|2016|        1|        1|                 2|
|    Abad|2016|        1|        5|                 1|
|    Abad|

In [32]:
window = Window.partitionBy('APT_NAME').orderBy('YEAR', 'MONTH_NUM')

df.withColumn(
    'next_month_flights',
    F.lead('FLT_TOT_1').over(window)
).show(5)

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|next_month_flights|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|                20|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|                27|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|                35|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|

In [33]:
spark.sql("""
    SELECT
     *,
     lead(FLT_TOT_1) OVER (
     PARTITION BY APT_NAME
     ORDER BY YEAR, MONTH_NUM
     ) AS next_month_flights
    FROM
        airport_traffic
""").show(5)

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|next_month_flights|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|                20|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|                27|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|                35|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|

In [34]:
window = Window.partitionBy('APT_NAME').orderBy('YEAR', 'MONTH_NUM')

df.withColumn(
    'prev_month_flights',
    F.lag('FLT_TOT_1').over(window)
).show(5)

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|prev_month_flights|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|              NULL|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|                 7|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|                20|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|

In [35]:
spark.sql("""
    SELECT
        *,
        lag(FLT_TOT_1) OVER(
            PARTITION BY APT_NAME
            ORDER BY YEAR, MONTH_NUM
        ) AS prev_month_flights
    FROM
        airport_traffic
""").show(5)

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|prev_month_flights|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|              NULL|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|                 7|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|                20|
|2016|        1|      JAN|04-01-16|    EBAW| Antwerp|   Belgium|

In [36]:
window = Window.partitionBy('APT_NAME').orderBy('YEAR', 'MONTH_NUM')

df.withColumn(
    'three_months_ago_flights',
    F.lag('FLT_TOT_1', 3).over(window),
).show(5)

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|three_months_ago_flights|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|                    NULL|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|                    NULL|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|                    NULL|
|2016|        1|      JAN|04

In [37]:
spark.sql("""
    SELECT
        *,
        lag(FLT_TOT_1, 3) OVER(
            PARTITION BY APT_NAME
            ORDER BY YEAR, MONTH_NUM
        ) AS three_months_ago_flights
    FROM
        airport_traffic
""").show(5)

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|three_months_ago_flights|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------------+
|2016|        1|      JAN|01-01-16|    EBAW| Antwerp|   Belgium|        4|        3|        7|         NULL|         NULL|         NULL|                    NULL|
|2016|        1|      JAN|02-01-16|    EBAW| Antwerp|   Belgium|        9|       11|       20|         NULL|         NULL|         NULL|                    NULL|
|2016|        1|      JAN|03-01-16|    EBAW| Antwerp|   Belgium|       10|       17|       27|         NULL|         NULL|         NULL|                    NULL|
|2016|        1|      JAN|04

In [38]:
window = Window.partitionBy('YEAR', 'APT_ICAO').orderBy('MONTH_NUM')

df.withColumn(
    'running_total_flights_year',
    F.sum('FLT_TOT_1').over(window)
).show(5)

+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+--------------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|  APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|running_total_flights_year|
+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+--------------------------+
|2016|        1|      JAN|01-01-16|    EDDL|Dusseldorf|   Germany|      172|      175|      347|         NULL|         NULL|         NULL|                     14406|
|2016|        1|      JAN|02-01-16|    EDDL|Dusseldorf|   Germany|      168|      172|      340|         NULL|         NULL|         NULL|                     14406|
|2016|        1|      JAN|03-01-16|    EDDL|Dusseldorf|   Germany|      208|      206|      414|         NULL|         NULL|         NULL|                     14406|
|201

In [39]:
spark.sql("""
    SELECT
        *,
        sum(FLT_TOT_1) OVER (
            PARTITION BY YEAR, APT_ICAO
            ORDER BY MONTH_NUM
            ) AS running_total_flights_year
    FROM
        airport_traffic
    WHERE YEAR = 2017
""").show(5)

+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+--------------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|running_total_flights_year|
+----+---------+---------+--------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+--------------------------+
|2017|        1|      JAN|01-01-17|    EBAW| Antwerp|   Belgium|        4|        6|       10|         NULL|         NULL|         NULL|                       940|
|2017|        1|      JAN|02-01-17|    EBAW| Antwerp|   Belgium|       15|       18|       33|         NULL|         NULL|         NULL|                       940|
|2017|        1|      JAN|03-01-17|    EBAW| Antwerp|   Belgium|       11|       18|       29|         NULL|         NULL|         NULL|                       940|
|2017|        1|

In [40]:
window = Window.partitionBy('APT_ICAO').orderBy('YEAR', 'MONTH_NUM').rowsBetween(-2, 0)

df.withColumn(
    'rolling_avg_3month',
    F.round(F.avg('FLT_TOT_1').over(window), 2)
).show(8)

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|rolling_avg_3month|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|             182.0|
|2016|        1|      JAN|02-01-16|    EDDK|Cologne-Bonn|   Germany|       95|       99|      194|         NULL|         NULL|         NULL|             188.0|
|2016|        1|      JAN|03-01-16|    EDDK|Cologne-Bonn|   Germany|      126|      130|      256|         NULL|         NULL|         NULL|            210.67|
|2016|        1|      JAN|04-01-16|    E

In [41]:
spark.sql("""
    SELECT
        *,
        ROUND(AVG(FLT_TOT_1) OVER(
            PARTITION BY APT_ICAO
            ORDER BY YEAR, MONTH_NUM
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ), 2) AS rolling_avg_3month
    FROM
        airport_traffic
""").show(5)

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|rolling_avg_3month|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|             182.0|
|2016|        1|      JAN|02-01-16|    EDDK|Cologne-Bonn|   Germany|       95|       99|      194|         NULL|         NULL|         NULL|             188.0|
|2016|        1|      JAN|03-01-16|    EDDK|Cologne-Bonn|   Germany|      126|      130|      256|         NULL|         NULL|         NULL|            210.67|
|2016|        1|      JAN|04-01-16|    E

In [42]:
window = Window.partitionBy('APT_ICAO').orderBy('YEAR', 'MONTH_NUM').rowsBetween(-1, 1)

df.withColumn(
    'rolling_avg_3month',
    F.round(F.avg('FLT_TOT_1').over(window), 2)
).show(8)

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|rolling_avg_3month|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|             188.0|
|2016|        1|      JAN|02-01-16|    EDDK|Cologne-Bonn|   Germany|       95|       99|      194|         NULL|         NULL|         NULL|            210.67|
|2016|        1|      JAN|03-01-16|    EDDK|Cologne-Bonn|   Germany|      126|      130|      256|         NULL|         NULL|         NULL|            254.67|
|2016|        1|      JAN|04-01-16|    E

In [43]:
spark.sql("""
    SELECT
        *,
        AVG(FLT_TOT_1) OVER(
            PARTITION BY APT_ICAO
            ORDER BY YEAR, MONTH_NUM
            ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
        ) AS centered_rolling_avg_3month
    FROM
        airport_traffic
""").show(5)

+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+---------------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|    APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|centered_rolling_avg_3month|
+----+---------+---------+--------+--------+------------+----------+---------+---------+---------+-------------+-------------+-------------+---------------------------+
|2016|        1|      JAN|01-01-16|    EDDK|Cologne-Bonn|   Germany|       92|       90|      182|         NULL|         NULL|         NULL|                      188.0|
|2016|        1|      JAN|02-01-16|    EDDK|Cologne-Bonn|   Germany|       95|       99|      194|         NULL|         NULL|         NULL|         210.66666666666666|
|2016|        1|      JAN|03-01-16|    EDDK|Cologne-Bonn|   Germany|      126|      130|      256|         NULL|         NULL|         NULL|         254.66

In [44]:
window = Window.partitionBy('YEAR').orderBy('FLT_TOT_1')

df.withColumn(
    'percentile_rank_flights',
    F.percent_rank().over(window)
).show(5)

+----+---------+---------+--------+--------+--------------------+----------+---------+---------+---------+-------------+-------------+-------------+-----------------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|percentile_rank_flights|
+----+---------+---------+--------+--------+--------------------+----------+---------+---------+---------+-------------+-------------+-------------+-----------------------+
|2018|        1|      JAN|01-01-18|    LFLI|           Annemasse|    France|        1|        0|        1|         NULL|         NULL|         NULL|                    0.0|
|2018|        1|      JAN|01-01-18|    LFLP|      Annecy-Meythet|    France|        0|        1|        1|         NULL|         NULL|         NULL|                    0.0|
|2018|        1|      JAN|01-01-18|    LFMH|Saint-Etienne-Bou...|    France|        1|        0|        1|         NULL|         NULL| 

In [45]:
window = Window.partitionBy('YEAR').orderBy('FLT_TOT_1')

df.withColumn(
    'flight_quartile',
    F.ntile(4).over(window)
).filter(
    F.col('flight_quartile') == 1
).show()

+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+---------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME| STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|flight_quartile|
+----+---------+---------+--------+--------+--------------------+-----------+---------+---------+---------+-------------+-------------+-------------+---------------+
|2018|        1|      JAN|01-01-18|    LFLI|           Annemasse|     France|        1|        0|        1|         NULL|         NULL|         NULL|              1|
|2018|        1|      JAN|01-01-18|    LFLP|      Annecy-Meythet|     France|        0|        1|        1|         NULL|         NULL|         NULL|              1|
|2018|        1|      JAN|01-01-18|    LFMH|Saint-Etienne-Bou...|     France|        1|        0|        1|         NULL|         NULL|         NULL|              1|
|201

In [46]:
df.show()

+----+---------+---------+--------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|     APT_NAME|          STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+--------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    LATI|       Tirana|             Albania|       24|       27|       51|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    UDYZ|      Yerevan|             Armenia|        8|       15|       23|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWG|         Graz|             Austria|        6|        7|       13|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWI|    Innsbruck|             Austria|       26

In [47]:
df.show(5)

+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|  APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|2016|        1|      JAN|01-01-16|    LATI|    Tirana|   Albania|       24|       27|       51|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    UDYZ|   Yerevan|   Armenia|        8|       15|       23|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWG|      Graz|   Austria|        6|        7|       13|         NULL|         NULL|         NULL|
|2016|        1|      JAN|01-01-16|    LOWI| Innsbruck|   Austria|       26|       32|       58|         NULL|         NULL|         NULL|
|2016|        1|      JAN|0

In [48]:
df = df.withColumn('FLT_DATE', F.to_date('FLT_DATE', 'dd-MM-yy'))

In [49]:
count_flights = df.groupby(
    'STATE_NAME'
).agg(
    F.sum('FLT_TOT_1').alias('total_flights')
).orderBy(
    F.col('total_flights').desc()
).toPandas()

In [50]:
bar = alt.Chart(count_flights).mark_bar().encode(
    x=alt.X('STATE_NAME:N', sort='-y', title='State'),
    y=alt.Y('total_flights:Q', title='Total flights'),
).properties(
    title='Total flights by state',
    width=900,
    height=700
).interactive()

bar

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [51]:
df.show(5)

+----+---------+---------+----------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|  FLT_DATE|APT_ICAO|  APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+----------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|2016|        1|      JAN|2016-01-01|    LATI|    Tirana|   Albania|       24|       27|       51|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    UDYZ|   Yerevan|   Armenia|        8|       15|       23|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    LOWG|      Graz|   Austria|        6|        7|       13|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    LOWI| Innsbruck|   Austria|       26|       32|       58|         NULL|         NULL|         NULL|
|2016|       

In [53]:
df.show(5)

+----+---------+---------+----------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|  FLT_DATE|APT_ICAO|  APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+----------+--------+----------+----------+---------+---------+---------+-------------+-------------+-------------+
|2016|        1|      JAN|2016-01-01|    LATI|    Tirana|   Albania|       24|       27|       51|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    UDYZ|   Yerevan|   Armenia|        8|       15|       23|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    LOWG|      Graz|   Austria|        6|        7|       13|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    LOWI| Innsbruck|   Austria|       26|       32|       58|         NULL|         NULL|         NULL|
|2016|       

In [54]:
df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH_NUM: integer (nullable = true)
 |-- MONTH_MON: string (nullable = true)
 |-- FLT_DATE: date (nullable = true)
 |-- APT_ICAO: string (nullable = true)
 |-- APT_NAME: string (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- FLT_DEP_1: integer (nullable = true)
 |-- FLT_ARR_1: integer (nullable = true)
 |-- FLT_TOT_1: integer (nullable = true)
 |-- FLT_DEP_IFR_2: integer (nullable = true)
 |-- FLT_ARR_IFR_2: integer (nullable = true)
 |-- FLT_TOT_IFR_2: integer (nullable = true)



In [59]:
year_month_flt_count = df.groupBy(
    'YEAR', 'MONTH_NUM'
).agg(
    F.count('FLT_TOT_1').alias('total_flights')
).orderBy(
    F.col('total_flights').desc()
).toPandas()

In [72]:
spark.sql("""
    SELECT
        YEAR,
        MONTH_NUM,
        COUNT(*) AS total_flights
    FROM
        airport_traffic
    GROUP BY
        YEAR, MONTH_NUM
    ORDER BY
        total_flights DESC
""").show()

+----+---------+-------------+
|YEAR|MONTH_NUM|total_flights|
+----+---------+-------------+
|2024|        7|         9814|
|2024|        8|         9806|
|2021|       10|         9803|
|2022|        5|         9794|
|2024|        5|         9786|
|2024|       10|         9786|
|2023|       10|         9783|
|2023|        7|         9773|
|2021|        8|         9772|
|2019|       10|         9772|
|2023|        8|         9753|
|2022|        7|         9750|
|2023|        5|         9749|
|2018|       10|         9749|
|2019|        5|         9748|
|2022|       10|         9740|
|2021|        7|         9732|
|2019|        7|         9724|
|2023|        3|         9704|
|2018|        5|         9702|
+----+---------+-------------+
only showing top 20 rows



In [64]:
fig = px.bar(
    data_frame=year_month_flt_count,
    x='MONTH_NUM',
    y='total_flights',
    color='YEAR',
    barmode='group',
    title='Total flights by month',
    width=900,
    height=700
)

fig.show()

In [79]:
widnow = Window.partitionBy('YEAR', 'MONTH_NUM').orderBy(F.col('FLT_TOT_1').desc())

df.withColumn(
    'rank',
    F.rank().over(window)
).filter(
    F.col('rank') <= 5
).orderBy(
    'YEAR', 'MONTH_NUM', 'rank'
).show()

+----+---------+---------+----------+--------+--------------------+----------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|  FLT_DATE|APT_ICAO|            APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|rank|
+----+---------+---------+----------+--------+--------------------+----------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        1|      JAN|2016-01-01|    LFBA|     Agen-La Garenne|    France|        0|        1|        1|         NULL|         NULL|         NULL|   1|
|2016|        1|      JAN|2016-01-01|    LFRD|Dinard-Pleurtuit-...|    France|        1|        0|        1|         NULL|         NULL|         NULL|   1|
|2016|        1|      JAN|2016-01-01|    EIWF|           Waterford|   Ireland|        0|        1|        1|         NULL|         NULL|         NULL|   1|
|2016|        1|      JAN|2016-01-01|    LJMB|             Marib

In [88]:
spark.sql("""
WITH airport_traffic AS (
    SELECT
        *,
        RANK() OVER (
            PARTITION BY YEAR, MONTH_NUM
            ORDER BY FLT_TOT_1 DESC
        ) AS rank
    FROM
        airport_traffic
    ) SELECT
        *
        FROM
            airport_traffic
        WHERE
            rank <= 5
        ORDER BY
            YEAR, MONTH_NUM, rank
""").show()

+----+---------+---------+--------+--------+--------------------+--------------+---------+---------+---------+-------------+-------------+-------------+----+
|YEAR|MONTH_NUM|MONTH_MON|FLT_DATE|APT_ICAO|            APT_NAME|    STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|rank|
+----+---------+---------+--------+--------+--------------------+--------------+---------+---------+---------+-------------+-------------+-------------+----+
|2016|        1|      JAN|22-01-16|    EGLL|   London - Heathrow|United Kingdom|      637|      641|     1278|          637|          640|         1277|   1|
|2016|        1|      JAN|29-01-16|    EGLL|   London - Heathrow|United Kingdom|      633|      640|     1273|          633|          637|         1270|   2|
|2016|        1|      JAN|03-01-16|    LFPG|Paris-Charles-de-...|        France|      631|      632|     1263|          630|          629|         1259|   3|
|2016|        1|      JAN|03-01-16|    LTBA|    Ista

In [105]:
window = Window.partitionBy('APT_NAME').orderBy('YEAR', 'MONTH_NUM')

df_with_lag = df.withColumn(
    'prev_month_flights',
    F.lag('FLT_TOT_1', 1).over(window),
)

df_with_lag = df_with_lag.withColumn(
    'prev_month_flights',
    F.coalesce('prev_month_flights', F.lit(0.0))
)

df_with_growth = df_with_lag.withColumn(
    'mom_growth',
    F.when(F.col('prev_month_flights') != 0,
           ((F.col('FLT_TOT_1') - F.col('prev_month_flights')) / F.col('prev_month_flights')) * 100
           ).otherwise(None)
)

df_with_growth = df_with_growth.withColumn(
    'mom_growth', F.round(F.col('mom_growth'), 2)
)

df_with_growth.orderBy('APT_NAME', 'YEAR', 'MONTH_NUM').show()

+----+---------+---------+----------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+----------+
|YEAR|MONTH_NUM|MONTH_MON|  FLT_DATE|APT_ICAO|APT_NAME|STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|prev_month_flights|mom_growth|
+----+---------+---------+----------+--------+--------+----------+---------+---------+---------+-------------+-------------+-------------+------------------+----------+
|2016|        1|      JAN|2016-01-08|    LEIZ|    Abad|     Spain|        0|        4|        4|         NULL|         NULL|         NULL|               0.0|      NULL|
|2016|        1|      JAN|2016-01-09|    LEIZ|    Abad|     Spain|        1|        1|        2|         NULL|         NULL|         NULL|               4.0|     -50.0|
|2016|        1|      JAN|2016-01-10|    LEIZ|    Abad|     Spain|        2|        0|        2|         NULL|         NULL|         NULL|               2.

In [112]:
spark.sql("""
SELECT
    APT_NAME,
    YEAR,
    MONTH_NUM,
    FLT_TOT_1,
    COALESCE(LAG(FLT_TOT_1, 1) OVER (
        PARTITION BY APT_NAME
        ORDER BY YEAR, MONTH_NUM
    ), 0.0) as prev_month_flights,
    ROUND(
        CASE
            WHEN COALESCE(LAG(FLT_TOT_1, 1) OVER (
                    PARTITION BY APT_NAME
                    ORDER BY YEAR, MONTH_NUM
                ), 0.0) != 0
            THEN ((FLT_TOT_1 - LAG(FLT_TOT_1, 1) OVER (
                    PARTITION BY APT_NAME
                    ORDER BY YEAR, MONTH_NUM
                )) / LAG(FLT_TOT_1, 1) OVER (
                    PARTITION BY APT_NAME
                    ORDER BY YEAR, MONTH_NUM
                )) * 100
            ELSE NULL
        END, 2
    ) as mom_growth
FROM airport_traffic
""").show()

+------------+----+---------+---------+------------------+----------+
|    APT_NAME|YEAR|MONTH_NUM|FLT_TOT_1|prev_month_flights|mom_growth|
+------------+----+---------+---------+------------------+----------+
|Angers-Marcé|2016|        1|        6|               0.0|      NULL|
|Angers-Marcé|2016|        1|        4|               6.0|    -33.33|
|Angers-Marcé|2016|        1|        7|               4.0|      75.0|
|Angers-Marcé|2016|        1|       11|               7.0|     57.14|
|Angers-Marcé|2016|        1|        6|              11.0|    -45.45|
|Angers-Marcé|2016|        1|       10|               6.0|     66.67|
|Angers-Marcé|2016|        1|        6|              10.0|     -40.0|
|Angers-Marcé|2016|        1|        2|               6.0|    -66.67|
|Angers-Marcé|2016|        1|        1|               2.0|     -50.0|
|Angers-Marcé|2016|        1|        1|               1.0|       0.0|
|Angers-Marcé|2016|        1|        7|               1.0|     600.0|
|Angers-Marcé|2016| 

In [113]:
df.show()

+----+---------+---------+----------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+
|YEAR|MONTH_NUM|MONTH_MON|  FLT_DATE|APT_ICAO|     APT_NAME|          STATE_NAME|FLT_DEP_1|FLT_ARR_1|FLT_TOT_1|FLT_DEP_IFR_2|FLT_ARR_IFR_2|FLT_TOT_IFR_2|
+----+---------+---------+----------+--------+-------------+--------------------+---------+---------+---------+-------------+-------------+-------------+
|2016|        1|      JAN|2016-01-01|    LATI|       Tirana|             Albania|       24|       27|       51|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    UDYZ|      Yerevan|             Armenia|        8|       15|       23|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    LOWG|         Graz|             Austria|        6|        7|       13|         NULL|         NULL|         NULL|
|2016|        1|      JAN|2016-01-01|    LOWI|    Innsbruck|             Aus