In [0]:
!pip install timezonefinder

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StringType
from datetime import datetime, timedelta
from timezonefinder import TimezoneFinder
import pytz
# from timezonefinder import TimezoneFinder

## Variables and Directories

In [0]:
# --- Configuration ---------------------------------------------------------
# Root DBFS folder holding the inputs, interim outputs and checkpoints.
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"
# Which flights extract to read: one of the following values ("", "3m", "6m", "1y").
period = "3m"

# DataFrame.checkpoint() calls further down write below this directory.
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/checkpoints")

# --- Input datasets (Parquet) ----------------------------------------------
flights = spark.read.parquet(
    f"{team_BASE_DIR}/raw/flightdelays/parquet_airlines_data_{period}"
)
airport_codes = spark.read.parquet(
    f"{team_BASE_DIR}/external/airport_codes.parquet/"
)
stations = spark.read.parquet(
    f"{team_BASE_DIR}/raw/stations/stations_with_neighbors.parquet/"
)
timezones = spark.read.parquet(
    f"{team_BASE_DIR}/external/tz_lookup.parquet"
)

In [0]:
# Show what is currently stored in the checkpoint directory.
checkpoint_listing = dbutils.fs.ls(f"{team_BASE_DIR}/checkpoints")
display(checkpoint_listing)

## Step 1: Cleaning and Feature selection

In [0]:
# Columns kept from the raw flights table. The order is significant: it is the
# column order of the DataFrame produced by `flights.select(feat)`.
feat = [
    "FL_DATE", "OP_UNIQUE_CARRIER", "TAIL_NUM", "OP_CARRIER_FL_NUM",
    "ORIGIN", "DEST",
    "CRS_DEP_TIME", "DEP_DELAY", "CRS_ARR_TIME", "ARR_DELAY",
    "CANCELLED", "DIVERTED",
    "CRS_ELAPSED_TIME", "AIR_TIME", "DISTANCE",
    "ORIGIN_AIRPORT_SEQ_ID", "DEST_AIRPORT_SEQ_ID",
    "TAXI_OUT", "TAXI_IN",
    "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "DEP_TIME_BLK", "ARR_TIME_BLK", "YEAR",
    "ORIGIN_CITY_NAME", "DEST_CITY_NAME",
    "ORIGIN_STATE_FIPS", "DEST_STATE_FIPS",
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY", "DIV_ARR_DELAY",
    "CANCELLATION_CODE", "FIRST_DEP_TIME",
    "LONGEST_ADD_GTIME", "TOTAL_ADD_GTIME",
]

In [0]:
# Keep only the selected flight columns and drop exact duplicate rows.
# cache() returns the same DataFrame, so the checkpoint below is computed from
# the cached data; checkpoint() then hands back a DataFrame with truncated
# lineage. (TODO: save to the DBFS interim folder)
flights_clean = (
    flights
    .select(*feat)
    .dropDuplicates()
    .cache()
    .checkpoint()
)

## Step 2: Joining Flight with Airports to retrieve ICAO

### 1. Checking for IATA codes missing from the airport codes table and fixing or adding them

In [0]:
# Every airport code that appears in the flights, as origin or destination.
# union() keeps the first frame's column name, so the column is "ORIGIN".
origin_codes = flights_clean.select("ORIGIN")
dest_codes = flights_clean.select("DEST")
flight_iata = origin_codes.union(dest_codes).distinct()

# Every IATA code known to the airport reference table.
airport_iata = airport_codes.select("iata_code").distinct()

# Anti-join: keep only flight codes that have no match in the reference table.
unmatched_iata = flight_iata.join(
    airport_iata,
    flight_iata["ORIGIN"] == airport_iata["iata_code"],
    "left_anti",
)
missing_iata = unmatched_iata.select(F.col("ORIGIN").alias("missing_code"))

display(missing_iata)

In [0]:
# Patch the one IATA code found missing above: rewrite ISN as XWA in both
# airport columns with a single replace() restricted to those columns.
flights_clean = flights_clean.replace("ISN", "XWA", subset=["ORIGIN", "DEST"])

### 2. Performing the join to get ICAO codes

In [0]:
from pyspark.sql.functions import broadcast

# Expose both frames to Spark SQL under their variable names.
flights_clean.createOrReplaceTempView("flights_clean")
airport_codes.createOrReplaceTempView("airport_codes")

# Broadcast hint for the airport reference table (treated as small here).
airport_codes_broadcast = broadcast(airport_codes)
origin_airports = airport_codes_broadcast.alias("a1")
dest_airports = airport_codes_broadcast.alias("a2")

# Left joins keep every flight even when an airport code has no match.
# NOTE(review): assumes iata_code is unique in airport_codes; duplicated codes
# would multiply flight rows - confirm.
flights_with_origin = flights_clean.join(
    origin_airports, flights_clean.ORIGIN == F.col("a1.iata_code"), "left"
)
flights_with_both = flights_with_origin.join(
    dest_airports, flights_clean.DEST == F.col("a2.iata_code"), "left"
)

# Airport attributes to carry over, origin first and destination second.
airport_attribute_cols = [
    F.col("a1.icao_code").alias("ORIGIN_ICAO"),
    F.col("a1.type").alias("origin_type"),
    F.col("a1.iso_region").alias("origin_region"),
    F.col("a2.icao_code").alias("DEST_ICAO"),
    F.col("a2.type").alias("dest_type"),
    F.col("a2.iso_region").alias("dest_region"),
]
flights_w_icao = flights_with_both.select(flights_clean["*"], *airport_attribute_cols)

# Cache, then checkpoint the joined result (TODO: save to the DBFS interim folder).
flights_w_icao = flights_w_icao.cache().checkpoint()

### 3. Checking if flights still have any missing ICAO code after the join

In [0]:
# Sanity check after the join: list every airport code whose ICAO code is still
# null. The displayed result is expected to be empty.
origins_without_icao = (
    flights_w_icao
    .filter(F.col("ORIGIN_ICAO").isNull())
    .select(F.col("ORIGIN").alias("missing_icao"))
)
dests_without_icao = (
    flights_w_icao
    .filter(F.col("DEST_ICAO").isNull())
    .select(F.col("DEST").alias("missing_icao"))
)
display(origins_without_icao.union(dests_without_icao).distinct())

## Step 3: Joining Flight with Stations to get the closest station ID and coordinates

### 1. Validate Missing ICAO Codes in Stations Data before the join

In [0]:
# All ICAO codes referenced by the flights (origin and destination). union()
# keeps the first frame's column name, so the column is "ORIGIN_ICAO".
flight_icao = (
    flights_w_icao.select("ORIGIN_ICAO")
    .union(flights_w_icao.select("DEST_ICAO"))
    .distinct()
)

# ICAO call signs present in the stations table.
stations_icao = stations.select("neighbor_call").distinct()

# Anti-join to find flight ICAO codes absent from the stations table, then
# bring the single-column result to the driver as a plain Python list.
icao_without_station = flight_icao.join(
    stations_icao,
    flight_icao["ORIGIN_ICAO"] == stations_icao["neighbor_call"],
    "left_anti",
).select(F.col("ORIGIN_ICAO").alias("missing_code"))

missing_icao_lst = [row["missing_code"] for row in icao_without_station.collect()]

### 2. Construct Missing ICAO Data Using Airport Codes

In [0]:
# Build station-style "neighbor_*" rows for the airports missing from the
# stations table, using the airport reference data.
region_parts = F.split(F.col("iso_region"), "-")
coordinate_parts = F.split(F.col("coordinates"), ",")
# NOTE(review): this takes the first coordinate as latitude and the second as
# longitude - confirm the order used by the airport_codes "coordinates" column.

missing_icao = (
    airport_codes
    .filter(F.col("icao_code").isin(missing_icao_lst))
    .select(
        F.col("ident").alias("neighbor_id"),
        F.col("name").alias("neighbor_name"),
        region_parts[1].cast("string").alias("neighbor_state"),
        F.col("icao_code").alias("neighbor_call"),
        coordinate_parts[0].cast("double").alias("neighbor_lat"),
        coordinate_parts[1].cast("double").alias("neighbor_lon"),
    )
)

### 3. Augment Stations with the missing airports and Find Nearest Weather Stations (Optimized with Haversine)

In [0]:
# Haversine great-circle distance, expressed with Spark column functions.
def haversine(lat1, lon1, lat2, lon2):
    """Return a Column with the haversine distance between two points.

    All four arguments are Spark Column expressions holding coordinates in
    degrees. The result is scaled by an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    sin_half_dlat = F.sin(F.radians(lat2 - lat1) / 2)
    sin_half_dlon = F.sin(F.radians(lon2 - lon1) / 2)
    cos_product = F.cos(F.radians(lat1)) * F.cos(F.radians(lat2))
    a = (sin_half_dlat ** 2) + cos_product * (sin_half_dlon ** 2)
    central_angle = 2 * F.atan2(F.sqrt(a), F.sqrt(1 - a))
    return earth_radius_km * central_angle

# One row per weather station with its coordinates.
dist_stations = stations.select("usaf", "wban", "station_id", "lat", "lon").distinct()

# Pair every station with every airport missing from `stations` and compute the
# station-to-airport distance with the haversine helper.
missing_distances = dist_stations.crossJoin(missing_icao).withColumn(
    "distance_to_neighbor",
    haversine(F.col("lat"), F.col("lon"), F.col("neighbor_lat"), F.col("neighbor_lon")),
)

# Append the new station/airport pairs to the existing ones. unionByName matches
# columns by name; plain union() matches by position and would silently mix up
# columns if the two schemas were ever ordered differently.
aug_stations = stations.unionByName(missing_distances)

# Keep the closest station per airport. station_id is a secondary sort key so
# that equal distances always resolve to the same station across runs.
closest_first = Window.partitionBy("neighbor_call").orderBy("distance_to_neighbor", "station_id")
aug_stations = (
    aug_stations.withColumn("rn", F.row_number().over(closest_first))
    .filter(F.col("rn") == 1)
)


# Cache & checkpoint augmented stations (TODO: save to the DBFS interim folder)
aug_stations.cache()
aug_stations = aug_stations.checkpoint()

### 4. Performing the join to get the nearest station's data

In [0]:
# Expose both frames to Spark SQL under their variable names.
flights_w_icao.createOrReplaceTempView("flights_w_icao")
aug_stations.createOrReplaceTempView("aug_stations")

# Broadcast hint for the (one row per airport) augmented stations table.
aug_stations_broadcast = broadcast(aug_stations)

# (column in aug_stations, suffix of the output column). The same set is pulled
# once for the origin airport (alias a1) and once for the destination (alias a2).
station_fields = [
    ("neighbor_name", "airport_name"),
    ("station_id", "station_id"),
    ("lat", "station_lat"),
    ("lon", "station_lon"),
    ("neighbor_lat", "airport_lat"),
    ("neighbor_lon", "airport_lon"),
    ("distance_to_neighbor", "station_dis"),
]
origin_station_cols = [
    F.col(f"a1.{source}").alias(f"origin_{suffix}") for source, suffix in station_fields
]
dest_station_cols = [
    F.col(f"a2.{source}").alias(f"dest_{suffix}") for source, suffix in station_fields
]

# Left joins keep flights whose airport has no station match.
flights_stations = (
    flights_w_icao
    .join(aug_stations_broadcast.alias("a1"), flights_w_icao.ORIGIN_ICAO == F.col("a1.neighbor_call"), "left")
    .join(aug_stations_broadcast.alias("a2"), flights_w_icao.DEST_ICAO == F.col("a2.neighbor_call"), "left")
    .select(flights_w_icao["*"], *origin_station_cols, *dest_station_cols)
)

# Cache, then checkpoint the joined result (TODO: save to the DBFS interim folder).
flights_stations = flights_stations.cache().checkpoint()

### 5. Checking if we have any missing stations after the join

In [0]:
# Sanity check after the join: list every ICAO code whose airport latitude is
# still null, i.e. that found no station row. The result is expected to be empty.
origins_without_station = (
    flights_stations
    .filter(F.col("origin_airport_lat").isNull())
    .select(F.col("ORIGIN_ICAO").alias("missing_station"))
)
dests_without_station = (
    flights_stations
    .filter(F.col("dest_airport_lat").isNull())
    .select(F.col("DEST_ICAO").alias("missing_station"))
)
display(origins_without_station.union(dests_without_station).distinct())

## Step 4: Handling time

### 1. Validate Missing ICAO in Timezones Data before the join

In [0]:
# ICAO call signs that already have an entry in the timezone lookup table.
tz_icao = timezones.select("neighbor_call").distinct()

# Anti-join: flight ICAO codes with no timezone entry, collected to the driver
# as a plain Python list.
icao_without_tz = flight_icao.join(
    tz_icao,
    flight_icao["ORIGIN_ICAO"] == tz_icao["neighbor_call"],
    "left_anti",
).select(F.col("ORIGIN_ICAO").alias("missing_code"))

missing_tz_lst = [row["missing_code"] for row in icao_without_tz.collect()]

### 2. Finding missing timezones using coordinates


In [0]:
# ============================
# UDF: Timezones Lookup
# ============================
# TimezoneFinder instance shared by all calls in the same Python process. It is
# created on first use instead of once per row.
_TZ_FINDER = None


def find_timezone(lat, lng):
    """Return the timezone name for a coordinate, or "Unknown".

    Args:
        lat: Latitude in degrees, or None.
        lng: Longitude in degrees, or None.

    Returns:
        The name reported by TimezoneFinder.timezone_at, or the string
        "Unknown" when either coordinate is None or no timezone is found.
    """
    global _TZ_FINDER
    # Rows without coordinates cannot be looked up; report them like a miss.
    if lat is None or lng is None:
        return "Unknown"
    if _TZ_FINDER is None:
        _TZ_FINDER = TimezoneFinder()
    timezone_str = _TZ_FINDER.timezone_at(lat=lat, lng=lng)
    return timezone_str if timezone_str else "Unknown"


if len(missing_tz_lst) > 0:
    # Collect (ICAO, airport lat, airport lon) for every airport without a
    # timezone entry, whether it appears as an origin or as a destination.
    origin_missing_tz = (
        flights_stations.filter(F.col("ORIGIN_ICAO").isin(missing_tz_lst))
        .select(F.col("ORIGIN_ICAO").alias("neighbor_call"),
                F.col("origin_airport_lat").alias("neighbor_lat"),
                F.col("origin_airport_lon").alias("neighbor_lon"))
    )
    dest_missing_tz = (
        flights_stations.filter(F.col("DEST_ICAO").isin(missing_tz_lst))
        .select(F.col("DEST_ICAO").alias("neighbor_call"),
                F.col("dest_airport_lat").alias("neighbor_lat"),
                F.col("dest_airport_lon").alias("neighbor_lon"))
    )
    missing_tz = origin_missing_tz.union(dest_missing_tz).distinct()

    # Wrap the lookup as a UDF. F.udf is used (as in the other UDF cells)
    # because a bare `udf` name is never imported in this notebook.
    find_timezone_udf = F.udf(find_timezone, StringType())

    # Add the timezone column from the airport coordinates.
    missing_tz = missing_tz.withColumn("timezone", find_timezone_udf(F.col("neighbor_lat"), F.col("neighbor_lon")))

    # Augment the timezone lookup with the missing airports. unionByName matches
    # columns by name; plain union() matches by position and would silently
    # misalign the columns if the two schemas were ordered differently.
    timezones = timezones.unionByName(missing_tz)
    # re-save timezones data as a parquet file (TODO: uncomment later once verified)
    # timezones.write.mode("overwrite").parquet(f"{team_BASE_DIR}/external/tz_lookup.parquet")

### 3. Finding Timezone using Erica's table

In [0]:
# Expose both frames to Spark SQL under their variable names.
flights_stations.createOrReplaceTempView("flights_stations")
timezones.createOrReplaceTempView("timezones")

# Broadcast hint for the timezone lookup table (treated as small here).
tz_broadcast = broadcast(timezones)
origin_tz = tz_broadcast.alias("a1")
dest_tz = tz_broadcast.alias("a2")

# Left joins keep flights whose airport has no timezone entry.
flights_with_origin_tz = flights_stations.join(
    origin_tz, flights_stations.ORIGIN_ICAO == F.col("a1.neighbor_call"), "left"
)
flights_with_both_tz = flights_with_origin_tz.join(
    dest_tz, flights_stations.DEST_ICAO == F.col("a2.neighbor_call"), "left"
)

flights_tz = flights_with_both_tz.select(
    flights_stations["*"],
    F.col("a1.timezone").alias("origin_timezone"),  # origin airport
    F.col("a2.timezone").alias("dest_timezone"),    # destination airport
)

# Cache, then checkpoint so the timezone join is not recomputed
# (TODO: save to the DBFS interim folder).
flights_tz = flights_tz.cache().checkpoint()

### 4. Checking if we have any missing timezones after the join

In [0]:
# Sanity check after the join: list every ICAO code whose timezone is still
# null. The displayed result is expected to be empty.
origins_without_tz = (
    flights_tz
    .filter(F.col("origin_timezone").isNull())
    .select(F.col("ORIGIN_ICAO").alias("missing_timezone"))
)
dests_without_tz = (
    flights_tz
    .filter(F.col("dest_timezone").isNull())
    .select(F.col("DEST_ICAO").alias("missing_timezone"))
)
display(origins_without_tz.union(dests_without_tz).distinct())

### 5. Joining date and time fields

In [0]:
# Join the departure date and the scheduled departure time into one
# "<FL_DATE>T<HH:mm:ss>" string.
# CRS_DEP_TIME is left-padded to 4 characters, given "00" seconds, and parsed
# as HHmmss to format it as HH:mm:ss.
dep_time_hms = F.date_format(
    F.to_timestamp(F.concat(F.lpad("CRS_DEP_TIME", 4, "0"), F.lit("00")), "HHmmss"),
    "HH:mm:ss",
)

# concat_ws() skips null inputs, so without this guard a row whose time (or
# date) is null would yield a partial string such as a bare date. Downstream
# parsing expects the full format, so such rows get a null value instead.
flights_tz = flights_tz.withColumn(
    "sched_depart_date_time",
    F.when(
        F.col("FL_DATE").isNotNull() & dep_time_hms.isNotNull(),
        F.concat_ws("T", F.col("FL_DATE"), dep_time_hms),
    ),
)

### 6. Converting to UTC and adding the times 2 and 4 hours before departure (UTC)

In [0]:
# ============================
# UDF: Convert Departure Time to UTC
# ============================

def to_utc(dt: str, tz: str) -> str:
    """Convert a local datetime string to a UTC datetime string.

    Args:
        dt: Local datetime formatted as "%Y-%m-%dT%H:%M:%S", or None.
        tz: Timezone name understood by pytz, or None/empty.

    Returns:
        The UTC datetime in the same format, or None when `dt` is None or
        malformed, or when `tz` is missing or not a timezone pytz knows
        (for example a placeholder such as "Unknown"). Returning None keeps a
        single bad row from failing the whole Spark job.
    """
    if dt is None or not tz:
        return None
    dt_format = "%Y-%m-%dT%H:%M:%S"
    try:
        naive_local = datetime.strptime(dt, dt_format)
    except ValueError:
        # Not in the expected format (e.g. a date with no time part).
        return None
    try:
        zone = pytz.timezone(tz)
    except pytz.UnknownTimeZoneError:
        return None

    # Attach the local zone to the naive datetime, then convert to UTC.
    utc_dt = zone.localize(naive_local).astimezone(pytz.utc)
    return utc_dt.strftime(dt_format)

# Wrap to_utc as a Spark UDF with an explicit string return type.
utc_udf = F.udf(to_utc, StringType())

# StringType comes from pyspark.sql.types (imported at the top of the
# notebook); it is not part of the pyspark.sql.functions API, so it should not
# be reached through the `F` alias.
flights_utc = flights_tz.withColumn(
    "sched_depart_utc",
    utc_udf(F.col("sched_depart_date_time"), F.col("origin_timezone").cast(StringType())),
)

# Cache the data to avoid recomputing the UTC (TODO: save to the DBFS interim folder)
flights_utc.cache()
flights_utc = flights_utc.checkpoint()

In [0]:
# ============================
# UDF: Calculate Prior Times
# ============================
def calculate_prior_times(utc_datetime_str, hours_prior):
    """Return the datetime `hours_prior` hours before `utc_datetime_str`.

    Args:
        utc_datetime_str: Datetime formatted as "%Y-%m-%dT%H:%M:%S", or None.
        hours_prior: Number of hours to subtract.

    Returns:
        The shifted datetime in the same format, or None when the input is
        None, is not in the expected format, or the arguments have an
        unusable type or range.
    """
    if utc_datetime_str is None:
        return None
    utc_format = "%Y-%m-%dT%H:%M:%S"
    try:
        utc_dt = datetime.strptime(utc_datetime_str, utc_format)
        prior_dt = utc_dt - timedelta(hours=hours_prior)
    except (ValueError, TypeError, OverflowError):
        # Only the failures parsing/subtracting can raise are turned into None;
        # anything else is unexpected and is allowed to surface.
        return None
    return prior_dt.strftime(utc_format)

# Wrap the prior-time helper as a Spark UDF returning strings.
calculate_prior_times_udf = F.udf(calculate_prior_times, StringType())

sched_depart_utc = F.col("sched_depart_utc")

# flights_utc gains the 2-hours-prior column; flights_utc_2_4 additionally
# carries the 4-hours-prior column.
flights_utc = flights_utc.withColumn(
    "two_hours_prior_depart_UTC",
    calculate_prior_times_udf(sched_depart_utc, F.lit(2)),
)
flights_utc_2_4 = flights_utc.withColumn(
    "four_hours_prior_depart_UTC",
    calculate_prior_times_udf(sched_depart_utc, F.lit(4)),
)

# Cache, then checkpoint so the prior-time columns are not recomputed
# (TODO: save to the DBFS interim folder).
flights_utc_2_4 = flights_utc_2_4.cache().checkpoint()

display(flights_utc_2_4)

In [0]:
# Preview flights_utc (UTC departure time plus the two-hours-prior column).
display(flights_utc)

## Step 5: Joining Weather Data

In [0]:
# Load the weather checkpoint for the same extract as the flights. The path is
# built from team_BASE_DIR and period so it cannot drift from the flights
# extract; with period = "3m" it is the same location as before.
# NOTE(review): only the "3m" folder name is confirmed - verify that the other
# periods follow the weather_<period>_checkpoint naming.
weather = spark.read.parquet(f"{team_BASE_DIR}/interim/weather_{period}_checkpoint")

In [0]:
# Preview the weather data loaded above.
display(weather)

In [0]:
# Preview the flights frame with both prior-time columns (this frame is also
# displayed at the end of the cell that creates it).
display(flights_utc_2_4)

### 1. Unique stations from the flights table

# Joining ideas (delete later)
1. Get the unique station IDs from the flights table.
2. Filter the weather data to those unique station IDs.
3. Construct 2 DataFrames with the distinct station IDs and time (could be UTC-2 or UTC-4), cross join each with the weather data, compute the time delta, and select the smallest difference.
4. Use the result as the join basis between flights and weather (similar logic to the stations and neighbor distance).

In [0]:
# Collect every weather-station id referenced by a flight (origin or
# destination) into one distinct column and save it as Parquet.
origin_station_ids = flights_tz.select(F.col("origin_station_id").alias("flights_station"))
dest_station_ids = flights_tz.select(F.col("dest_station_id").alias("flights_station"))

flights_station_ids = origin_station_ids.union(dest_station_ids).distinct()

(
    flights_station_ids.write
    .mode('overwrite')
    .parquet(f"{team_BASE_DIR}/interim/stations/flights_stations_{period}.parquet")
)