In [0]:
import sys

# Append the `flightdelays` folder to the import path so the two imports below can resolve.
# NOTE(review): absolute, user-specific workspace path — confirm it resolves for every collaborator.
sys.path.append('/Workspace/Users/m.bakr@berkeley.edu/261-Final-Project/flightdelays/')
# NOTE(review): neither imported name is referenced in the cells visible here — confirm they are still needed.
from utils import lst_files_r
from dataset import testfunc


In [0]:
import os

# Sanity check: print the entries of the project folder.
project_dir = '/Workspace/Users/m.bakr@berkeley.edu/261-Final-Project/flightdelays'
print(os.listdir(project_dir))

In [0]:
!pip install timezonefinder


In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, TimestampType
import pytz
from datetime import datetime, timedelta
from timezonefinder import TimezoneFinder

In [0]:
%scala
// Inspect the current value of the Databricks IO cache setting.
spark.conf.get("spark.databricks.io.cache.enabled")

In [0]:
%scala
// Enable the Databricks IO cache by setting the flag to "true".
spark.conf.set("spark.databricks.io.cache.enabled", "true")

In [0]:
# Team directory contents: list the root folder and each populated sub-folder.
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"

# "/processed/" is deliberately left out: it is still empty.
# NOTE(review): "iterim" looks like a misspelling of "interim" — confirm against the real folder name.
for sub_dir in ("", "/raw/", "/external/", "/iterim/"):
    display(dbutils.fs.ls(team_BASE_DIR + sub_dir))

In [0]:
# Read the Airport Codes table from the parquet file in the external data folder.
airport_codes = spark.read.parquet(f"{team_BASE_DIR}/external/airport_codes.parquet/")
display(airport_codes)

# Label fixed: this count belongs to airport_codes (it was printed as "Stations Row: ").
print("Airport Codes Rows: ", airport_codes.count())

# One aggregate pass counts the nulls per column; the single-row result is reshaped
# to long form on the driver and only the columns that contain nulls are shown.
print("Airport Codes Null count: ")
display(
        airport_codes
        .agg(*[F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in airport_codes.columns])
        .toPandas()
        .melt(var_name="col_name", value_name="null_count")
        .query("null_count > 0")
        )

# Rows that do carry an ICAO code.
display(airport_codes.filter(F.col("icao_code").isNotNull()))

In [0]:
# Combine ICAO and GPS codes to reduce nulls: take icao_code, falling back to gps_code.
# (It didn't make much difference, so this column is a candidate for removal.)
icao_or_gps = F.coalesce(F.col("icao_code"), F.col("gps_code"))
airport_codes = airport_codes.withColumn("icao_c", icao_or_gps)
display(airport_codes)

In [0]:
# Stations data (each row pairs a station with a neighbor)
stations = spark.read.parquet(f"{team_BASE_DIR}/raw/stations/stations_with_neighbors.parquet/")
display(stations)
print("Stations Row: ", stations.count())

# Per-column count of NON-null values.
stations_non_null_exprs = [
    F.count(F.when(F.col(c).isNotNull(), c)).alias(c)
    for c in stations.columns
]
display(stations.select(stations_non_null_exprs))

# Fully identical rows that occur more than once.
print("\nStations Duplicates:")
stations_dupes = (
    stations
    .groupBy(*stations.columns)
    .agg(F.count("*").alias("count"))
    .filter("count > 1")
)
display(stations_dupes)

In [0]:
# Airline data (flights), Q1 2015
flights_3m = spark.read.parquet(f"{team_BASE_DIR}/raw/flightdelays/parquet_airlines_data_3m")
display(flights_3m)
print("Flights Row: ", flights_3m.count())

# Null count per column, reshaped to long form; only columns with nulls are shown.
print("Flights Null count: ")
flights_null_exprs = [
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in flights_3m.columns
]
flights_nulls_long = (
    flights_3m
    .agg(*flights_null_exprs)
    .toPandas()
    .melt(var_name="col_name", value_name="null_count")
)
display(flights_nulls_long.query("null_count > 0"))

# Fully identical rows that occur more than once.
print("\nFlights Duplicates:")
flights_dupes = (
    flights_3m
    .groupBy(*flights_3m.columns)
    .agg(F.count("*").alias("count"))
    .filter("count > 1")
)
display(flights_dupes)

In [0]:
# Number of distinct origin and destination airport codes in the flights data.
for label, code_col in (
    ("Distinct Origins count: ", "ORIGIN"),
    ("Distinct Destinations count: ", "DEST"),
):
    print(label, flights_3m.select(code_col).distinct().count())

In [0]:
# Weather data, Q1 2015
weather_3m = spark.read.parquet(f"{team_BASE_DIR}/raw/weather/parquet_weather_data_3m")
display(weather_3m)
print("Weather Rows: ", weather_3m.count())

# Null count per column, reshaped to long form; only columns with nulls are shown.
print("Weather Null count: ")
weather_null_exprs = [
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in weather_3m.columns
]
weather_nulls_long = (
    weather_3m
    .agg(*weather_null_exprs)
    .toPandas()
    .melt(var_name="col_name", value_name="null_count")
)
display(weather_nulls_long.query("null_count > 0"))

# Fully identical rows that occur more than once.
print("Weather Duplicates:")
weather_dupes = (
    weather_3m
    .groupBy(*weather_3m.columns)
    .agg(F.count("*").alias("count"))
    .filter("count > 1")
)
display(weather_dupes)

In [0]:
# Inspect the airport_codes row(s) for one known, non-null IATA code (DFW).
is_dfw = F.col("iata_code") == "DFW"
display(airport_codes.where(is_dfw))

In [0]:
# OTPW table for the 3-month 2015 sample.
otpw_3m = spark.read.parquet(f"{team_BASE_DIR}/raw/OTPW/OTPW_3M_2015.parquet")
display(otpw_3m)
print("OTPW Rows: ", otpw_3m.count())

# Null count per column, reshaped to long form; only columns with nulls are shown.
print("OTPW Null count: ")
otpw_null_exprs = [
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in otpw_3m.columns
]
otpw_nulls_long = (
    otpw_3m
    .agg(*otpw_null_exprs)
    .toPandas()
    .melt(var_name="col_name", value_name="null_count")
)
display(otpw_nulls_long.query("null_count > 0"))

# Fully identical rows that occur more than once.
print("OTPW Duplicates:")
otpw_dupes = (
    otpw_3m
    .groupBy(*otpw_3m.columns)
    .agg(F.count("*").alias("count"))
    .filter("count > 1")
)
display(otpw_dupes)

In [0]:
# Remove exact duplicate flight rows, then report how many rows remain.
flights_3m_clean = flights_3m.dropDuplicates()
flights_3m_clean.count()

In [0]:
# Before joining: find IATA codes that appear in flights but not in the airport data.
origin_iata = flights_3m_clean.select("ORIGIN").distinct()
dest_iata = flights_3m_clean.select("DEST").distinct()
# The union keeps the first frame's column name, so the combined column is "ORIGIN".
flight_iata = origin_iata.union(dest_iata).distinct()

airport_iata = airport_codes.select("iata_code").distinct()

# left_anti keeps only the flight codes that have no matching airport code.
same_iata = flight_iata["ORIGIN"] == airport_iata["iata_code"]
flights_only_iata = flight_iata.join(airport_iata, on=same_iata, how="left_anti")
display(flights_only_iata.select(F.col("ORIGIN").alias("missing_code")))

[Sloulin Field International Airport IATA code change from ISN to XWA](https://en.wikipedia.org/wiki/Williston_Basin_International_Airport)

Source excerpt (September 2015, p. 2; retrieved October 15, 2016): the current airport (Sloulin Field International Airport – ISN) and proposed replacement airport (Williston Basin International Airport – XWA) will be owned and operated by the City of Williston.

See also: FAA Airport Form 5010 for XWA (PDF), effective July 13, 2023.

In [0]:
# Replace the IATA code ISN with XWA in both the origin and destination columns.
flights_3m_clean = flights_3m_clean.na.replace('ISN', 'XWA', subset=['ORIGIN', 'DEST'])

In [0]:
# ============================
# Register temporary views for Spark SQL
# ============================
# The names registered here are the table names the SQL text below refers to.
flights_3m_clean.createOrReplaceTempView("flights_3m_clean")
airport_codes.createOrReplaceTempView("airport_codes")

# ============================
# Apply Flights and Airport Join Query to get ICAO Codes
# ============================
# airport_codes is joined twice on iata_code: once for ORIGIN (alias a1) and once for
# DEST (alias a2). LEFT JOINs keep flights whose code has no airport match; their
# looked-up columns are then null.
# NOTE(review): if iata_code is not unique in airport_codes, these joins will duplicate
# flight rows — confirm uniqueness or compare row counts before and after.
query = """
SELECT 
    f.*,
    
    -- Origin Airport Lookup
    a1.icao_code AS ORIGIN_ICAO,
    a1.type AS origin_type,
    a1.iso_region AS origin_region,
    
    -- Destination Airport Lookup
    a2.icao_code AS DEST_ICAO,
    a2.type AS dest_type,
    a2.iso_region AS dest_region
    
FROM 
    flights_3m_clean f

-- Origin Airport Join
LEFT JOIN 
    airport_codes a1 ON f.ORIGIN = a1.iata_code

-- Destination Airport Join
LEFT JOIN 
    airport_codes a2 ON f.DEST = a2.iata_code
"""

# Execute the query
flights_w_icao_3m = spark.sql(query)

# Show the result (TODO: add a checkpoint here)
display(flights_w_icao_3m)

In [0]:
# After the join: list the IATA codes whose ICAO lookup came back null,
# first for origins, then for destinations.
for icao_col, iata_col in (("ORIGIN_ICAO", "ORIGIN"), ("DEST_ICAO", "DEST")):
    unmatched = flights_w_icao_3m.filter(F.col(icao_col).isNull())
    display(unmatched.select(iata_col).distinct())

In [0]:
# Before joining: find ICAO codes used by flights that have no entry in the stations data.
origin_icao = flights_w_icao_3m.select("ORIGIN_ICAO").distinct()
dest_icao = flights_w_icao_3m.select("DEST_ICAO").distinct()
# The union keeps the first frame's column name, so the combined column is "ORIGIN_ICAO".
flight_icao = origin_icao.union(dest_icao).distinct()

stations_icao = stations.select("neighbor_call").distinct()

# left_anti keeps the flight ICAO codes without a matching stations.neighbor_call;
# the result is collected to the driver as a plain Python list.
same_icao = flight_icao["ORIGIN_ICAO"] == stations_icao["neighbor_call"]
flights_only_icao = (
    flight_icao
    .join(stations_icao, on=same_icao, how="left_anti")
    .select(F.col("ORIGIN_ICAO").alias("missing_code"))
)
missing_icao_lst = [row["missing_code"] for row in flights_only_icao.collect()]
missing_icao_lst

In [0]:
# Build station-style "neighbor" rows for the missing ICAO codes from the airport_codes table.
# Fix: the first output column was misspelled "eighbor_id"; it is now "neighbor_id",
# consistent with the other neighbor_* columns produced here.
# NOTE(review): this assumes `coordinates` is stored as "lat, lon"; some airport-code
# extracts store "lon, lat" — confirm against the data before trusting neighbor_lat/lon.
missing_icao = (
                airport_codes
                .filter(F.col("icao_code").isin(missing_icao_lst))
                .select(
                    F.col("ident").alias("neighbor_id"),
                    F.col("name").alias("neighbor_name"),
                    # iso_region is split on "-" and the second piece is kept as the state.
                    F.split(F.col("iso_region"), "-")[1].cast("string").alias("neighbor_state"),
                    F.col("icao_code").alias("neighbor_call"),
                    F.split(F.col("coordinates"), ",")[0].cast("double").alias("neighbor_lat"),
                    F.split(F.col("coordinates"), ",")[1].cast("double").alias("neighbor_lon"),
                )
                )


display(missing_icao)

In [0]:
# Cross join the distinct stations with the missing airports and compute, for every
# combination, the Euclidean distance between station (lat, lon) and airport (lat, lon).
# The distance is computed directly on the raw lat/lon values.
dist_stations = stations.select("usaf", "wban", "station_id", "lat", "lon").distinct()

lat_diff_sq = F.pow(F.col("lat") - F.col("neighbor_lat"), 2)
lon_diff_sq = F.pow(F.col("lon") - F.col("neighbor_lon"), 2)

missing_distances = (
    dist_stations
    .crossJoin(missing_icao)
    .withColumn("distance_to_neighbor", F.pow(lat_diff_sq + lon_diff_sq, 0.5))
)


In [0]:
# Augmenting the stations dataframe with the missing distances (TODO: checkpoint here)
# `union` matches columns by position, not by name, so `missing_distances` must list its
# columns in the same order as `stations` — NOTE(review): confirm the two schemas line up.
aug_stations = stations.union(missing_distances)
display(aug_stations)

In [0]:
# ============================
# Register temporary views for Spark SQL
# ============================
# The augmented stations frame is registered under the view name "stations",
# which is the table the SQL text below reads from.
flights_w_icao_3m.createOrReplaceTempView("flights_w_icao")
aug_stations.createOrReplaceTempView("stations")

# ============================
# Apply updated Flight and stations Join Query to get Station IDs and distances
# ============================
# Each subquery ranks the station rows of one neighbor_call by distance_to_neighbor
# (ascending) and the join keeps rank 1 only, i.e. one closest station per airport.
# The same subquery is used twice: s1 for the origin airport, s2 for the destination.
# NOTE(review): ROW_NUMBER has no tie-breaker, so if two stations share the minimum
# distance the chosen one may vary between runs — confirm whether ties can occur.
query = """
SELECT 
    f.*,
    
    -- Origin Station Lookup
    s1.neighbor_name AS origin_airport_name,
    s1.station_id AS origin_station_id,
    s1.lat AS origin_station_lat,
    s1.lon AS origin_station_lon,
    s1.neighbor_lat AS origin_airport_lat,
    s1.neighbor_lon AS origin_airport_lon,
    s1.distance_to_neighbor AS origin_station_dis,
    
    -- Destination Station Lookup
    s2.neighbor_name AS dest_airport_name,
    s2.station_id AS dest_station_id,
    s2.lat AS dest_station_lat,
    s2.lon AS dest_station_lon,
    s2.neighbor_lat AS dest_airport_lat,
    s2.neighbor_lon AS dest_airport_lon,
    s2.distance_to_neighbor AS dest_station_dis

FROM 
    flights_w_icao f

-- Origin Station Lookup (with closest station)
LEFT JOIN (
    SELECT neighbor_call, 
           neighbor_name, 
           station_id, 
           lat, lon, 
           neighbor_lat, 
           neighbor_lon, 
           distance_to_neighbor,
           ROW_NUMBER() OVER (PARTITION BY neighbor_call ORDER BY distance_to_neighbor) AS rn
    FROM stations
) s1 ON f.ORIGIN_ICAO = s1.neighbor_call AND s1.rn = 1

-- Destination Station Lookup (with closest station)
LEFT JOIN (
    SELECT neighbor_call, 
           neighbor_name, 
           station_id, 
           lat, lon, 
           neighbor_lat, 
           neighbor_lon, 
           distance_to_neighbor,
           ROW_NUMBER() OVER (PARTITION BY neighbor_call ORDER BY distance_to_neighbor) AS rn
    FROM stations
) s2 ON f.DEST_ICAO = s2.neighbor_call AND s2.rn = 1
"""

# Execute the query
flights_stations_3m = spark.sql(query)

# Show the result (TODO: add a checkpoint here)
display(flights_stations_3m)

In [0]:
# After the join: list the ICAO codes whose station lookup left the airport name null,
# first for origins, then for destinations.
for name_col, icao_col in (
    ("origin_airport_name", "ORIGIN_ICAO"),
    ("dest_airport_name", "DEST_ICAO"),
):
    unmatched = flights_stations_3m.filter(F.col(name_col).isNull())
    display(unmatched.select(icao_col).distinct())

In [0]:
# Build the scheduled local departure as a "<FL_DATE>T<HH:mm:ss>" string column.
# CRS_DEP_TIME is left-padded to 4 characters and given "00" seconds, parsed with the
# "HHmmss" pattern, then re-rendered as "HH:mm:ss".
dep_hhmmss = F.concat(F.lpad("CRS_DEP_TIME", 4, "0"), F.lit("00"))
dep_clock = F.date_format(F.to_timestamp(dep_hhmmss, "HHmmss"), "HH:mm:ss")

flights_stations_3m = flights_stations_3m.withColumn(
    "sched_depart_date_time",
    F.concat_ws("T", F.col("FL_DATE"), dep_clock),
)

In [0]:
# ============================
# UDF: Look up time zones for a given lat/long
# ============================
class _LazyTimezoneFinder:
    """Per-process holder that creates one TimezoneFinder on first use.

    The lookup function used to build a new TimezoneFinder for every row. This
    holder builds it once per Python process and reuses it. It always pickles
    as "empty", so a loaded finder is never shipped inside a serialized UDF.
    """

    def __init__(self):
        self._finder = None

    def get(self):
        """Return the shared TimezoneFinder, creating it on first call."""
        if self._finder is None:
            self._finder = TimezoneFinder()
        return self._finder

    def __getstate__(self):
        # Drop the loaded finder when pickled; the receiving process rebuilds it lazily.
        return {"_finder": None}


_TZ_FINDER = _LazyTimezoneFinder()


def find_timezone(lat, lng):
    """Return the time zone name for a coordinate, or "Unknown".

    Args:
        lat: Latitude in degrees, or None.
        lng: Longitude in degrees, or None.

    Returns:
        The name returned by TimezoneFinder.timezone_at, or the string "Unknown"
        when either coordinate is missing, the coordinates are rejected as
        invalid, or no zone is found.
    """
    # Flights without a station match carry null coordinates (LEFT JOIN);
    # answer with the sentinel instead of failing the whole job.
    if lat is None or lng is None:
        return "Unknown"
    try:
        timezone_str = _TZ_FINDER.get().timezone_at(lat=float(lat), lng=float(lng))
    except ValueError:
        # Raised for values that cannot be converted to float; presumably also what
        # timezonefinder raises for out-of-range coordinates — confirm against its docs.
        return "Unknown"
    return timezone_str if timezone_str else "Unknown"

tz_udf = F.udf(find_timezone)

# Add one time zone column per endpoint, looked up from the airport coordinates.
flights_tz_3m = (
    flights_stations_3m
    .withColumn("origin_tz", tz_udf("origin_airport_lat", "origin_airport_lon"))
    .withColumn("dest_tz", tz_udf("dest_airport_lat", "dest_airport_lon"))
)

In [0]:
# Preview the flights frame with the origin_tz / dest_tz columns added.
display(flights_tz_3m)


In [0]:
# ============================
# UDF: Convert Departure Time to UTC
# ============================

def to_utc(dt: str, tz: str) -> str:
    """Convert a local "YYYY-MM-DDTHH:MM:SS" string to the same format in UTC.

    Args:
        dt: Local timestamp string in "%Y-%m-%dT%H:%M:%S" format, or None.
        tz: Time zone name understood by pytz.timezone, or None/empty.

    Returns:
        The UTC timestamp string, or None when `dt` is None, `tz` is empty,
        `dt` does not match the expected format, or `tz` is not a zone pytz
        knows. Returning None keeps one bad row from failing the Spark job.
    """
    if dt is None or not tz:
        return None
    dt_format = "%Y-%m-%dT%H:%M:%S"
    try:
        naive_local = datetime.strptime(dt, dt_format)
        # localize attaches the zone to the naive wall-clock time.
        aware_local = pytz.timezone(tz).localize(naive_local)
    except (ValueError, pytz.UnknownTimeZoneError):
        # ValueError: malformed timestamp; UnknownTimeZoneError: unrecognized zone name.
        return None

    # Convert to UTC
    return aware_local.astimezone(pytz.utc).strftime(dt_format)

utc_udf = F.udf(to_utc)

# Fixes:
#  * start from flights_tz_3m, the frame that carries both `sched_depart_date_time`
#    and `origin_tz`; flights_3m_clean has neither column;
#  * pass `origin_tz` through unchanged; the previous lpad(..., 2, '0') cut every
#    time zone name down to its first two characters.
flights_origin_utc_3m = flights_tz_3m.withColumn(
    "sched_depart_utc",
    utc_udf(F.col("sched_depart_date_time"), F.col("origin_tz")),
)

In [0]:
# NOTE(review): exploratory lookup; its result is not used by any other cell visible here,
# and no visible cell installs the `us` package — confirm whether this cell is still needed.
import us
us.states.lookup("atlanta")

In [0]:
# Testing the to_utc UDF function directly on the driver.
# to_utc hands its second argument to pytz.timezone, so it needs a time zone name;
# the previous argument "75" (a FIPS code) is not one. Guam is used as the example:
# it is UTC+10, so local midnight is 14:00 UTC on the previous day.
guam_midnight_utc = to_utc("2015-01-01T00:00:00", "Pacific/Guam")
assert guam_midnight_utc == "2014-12-31T14:00:00", guam_midnight_utc
guam_midnight_utc

In [0]:
# ============================
# UDF: Calculate Prior Times
# ============================
def calculate_prior_times(utc_datetime_str, hours_prior):
    """Return the timestamp `hours_prior` hours before a UTC timestamp string.

    Args:
        utc_datetime_str: Timestamp in "%Y-%m-%dT%H:%M:%S" format, or None.
        hours_prior: Number of hours to subtract.

    Returns:
        The earlier timestamp in the same string format, or None when the input
        is None, cannot be parsed, or the arithmetic is not possible.
    """
    if utc_datetime_str is None:
        return None

    utc_format = "%Y-%m-%dT%H:%M:%S"
    try:
        # Parse the UTC datetime string, then step back by the requested hours.
        prior_dt = datetime.strptime(utc_datetime_str, utc_format) - timedelta(hours=hours_prior)
    except (ValueError, TypeError, OverflowError):
        # ValueError: bad format; TypeError: wrong argument types (e.g. hours_prior is None);
        # OverflowError: result outside the supported date range.
        return None
    return prior_dt.strftime(utc_format)

# Register UDFs
calculate_prior_times_udf = F.udf(calculate_prior_times)

# Fix: `flights_3m_utc` was read before any visible cell defined it. It is now derived
# from `flights_origin_utc_3m`, the frame that carries `sched_depart_utc`.
flights_3m_utc = flights_origin_utc_3m.withColumn(
    "two_hours_prior_depart_UTC",
    calculate_prior_times_udf(F.col("sched_depart_utc"), F.lit(2)),
)
flights_utc_2_4_3m = flights_3m_utc.withColumn(
    "four_hours_prior_depart_UTC",
    calculate_prior_times_udf(F.col("sched_depart_utc"), F.lit(4)),
)
display(flights_utc_2_4_3m)

In [0]:
# Check for null ORIGIN_ICAO and DEST_ICAO after the join.
# Fix: `flights_stations` is not defined in any visible cell; the joined frame is named
# `flights_stations_3m`.
display(flights_stations_3m.filter(flights_stations_3m.ORIGIN_ICAO.isNull()).select("ORIGIN").distinct())
display(flights_stations_3m.filter(flights_stations_3m.DEST_ICAO.isNull()).select("DEST").distinct())

In [0]:
# Print the row count, then show the rows whose sched_depart_date_time is null.
# NOTE(review): `flights_transformed` is not defined in any cell visible here — confirm which
# DataFrame this was meant to inspect (presumably one carrying `sched_depart_date_time`).
print(flights_transformed.count())
display(flights_transformed.filter(flights_transformed.sched_depart_date_time.isNull()))

In [0]:
# Load the tz_lookup table from the external data folder and preview it.
tz_lookup_path = f"{team_BASE_DIR}/external/tz_lookup.parquet"
tz = spark.read.parquet(tz_lookup_path)
display(tz)

In [0]:
# Number of rows in the tz lookup table.
tz.count()