# Flights data cleaning
Erica Landreth

## Converting flight data to UTC time zones

### Create time zone reference file

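The code below was used to look up the time zone for each airport from its latitude and longitude. The resulting time zone table was saved to parquet, so the code is left commented out: it only needs to be run once. From this point on, just load the time zone parquet file (see the data-loading cell below for the path).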

In [0]:
# !pip install timezonefinder

In [0]:
# # imports
# from pyspark.sql.functions import udf
# from pyspark.sql.types import StringType
# from timezonefinder import TimezoneFinder
# import pytz
# from datetime import datetime
# from pyspark.sql.functions import col

# # load stations data
# df_stations = spark.read.parquet(f"dbfs:/mnt/mids-w261/datasets_final_project_2022/stations_data/stations_with_neighbors.parquet/")

# # get unique airport info from stations table
# df_locs = df_stations.select('neighbor_call','neighbor_lat','neighbor_lon').distinct()
# display(df_locs)

# # define function to look up time zones
# def find_timezone(lat, lng):
#     tf = TimezoneFinder()
#     timezone_str = tf.timezone_at(lat=lat, lng=lng)
#     return timezone_str if timezone_str else "Unknown"

# # define udf for time zone lookup
# find_timezone_udf = udf(find_timezone, StringType())

# # add time zone column
# df_locs = df_locs.withColumn("timezone", find_timezone_udf(col("neighbor_lat"), col("neighbor_lon")))

# # save the time zone info as a parquet file
# folder_path = "dbfs:/student-groups/Group_4_1"
# df_locs.write.parquet(f"{folder_path}/df_time_zones.parquet")

### Apply time zones to flight times for a small sample

In [0]:
# input locations on DBFS
airports_csv_path = "dbfs:/mnt/mids-w261/airport-codes_csv.csv"
flights_parquet_path = "dbfs:/mnt/mids-w261/datasets_final_project_2022/parquet_airlines_data_3m/"
time_zones_parquet_path = "dbfs:/student-groups/Group_4_1/df_time_zones.parquet"

# airport codes come from a CSV with a header row; flights and time zones are parquet
airports_reader = spark.read.option("header", "true")
df_airports = airports_reader.csv(airports_csv_path)
df_flights = spark.read.parquet(flights_parquet_path)
df_tz = spark.read.parquet(time_zones_parquet_path)

In [0]:
# keep the sample tiny (10 rows) so the steps below can be checked quickly
sample_row_count = 10
tmp_flights = df_flights.limit(sample_row_count)

First, we join the flight sample to the airport codes table to get the ICAO code of each origin and destination airport, and then join to the time zone table to add the time zone of both airports.

In [0]:
# register each DataFrame as a temporary view, under the name used in the SQL below
for view_name, frame in (
    ("tmp_flights", tmp_flights),
    ("df_airports", df_airports),
    ("df_tz", df_tz),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
# Build one row per sampled flight with the scheduled times, the airport
# codes and the time zone of both the origin and the destination airport.
# The query is a chain of four CTEs, each adding one column via a LEFT JOIN:
#   origin          - flights + df_airports on ORIGIN = iata_code -> origin_icao
#   origin_dest     - + df_airports on dest_iata = iata_code      -> dest_icao
#   origin_dest_tz1 - + df_tz on origin_icao = neighbor_call      -> origin_tz
#   origin_dest_tz2 - + df_tz on dest_icao = neighbor_call        -> dest_tz
# Because every join is a LEFT JOIN, a flight whose airport has no match keeps
# its row and gets NULL in the added column.
# NOTE(review): if df_airports has more than one row per iata_code, or df_tz
# more than one row per neighbor_call, these joins will duplicate flight
# rows - confirm the keys are unique before running on the full data set.
query = """

WITH origin AS(
SELECT  tf.FL_DATE as date,
        tf.CRS_DEP_TIME as dep_time,
        tf.CRS_ARR_TIME as arr_time,
        tf.ORIGIN as origin_iata,
        tf.DEST as dest_iata,
        a.ident as origin_icao
FROM tmp_flights as tf
LEFT JOIN df_airports as a on tf.ORIGIN = a.iata_code),

origin_dest AS(
SELECT  origin.date,
        origin.dep_time,
        origin.arr_time,
        origin.origin_iata,
        origin.dest_iata,
        origin.origin_icao,
        a.ident as dest_icao
FROM origin
LEFT JOIN df_airports as a on origin.dest_iata = a.iata_code),

origin_dest_tz1 AS(
SELECT  od.date,
        od.dep_time,
        od.arr_time,
        od.origin_iata,
        tz.timezone as origin_tz,
        od.dest_iata,
        od.origin_icao,
        od.dest_icao
FROM origin_dest as od
LEFT JOIN df_tz as tz on od.origin_icao = tz.neighbor_call
),

origin_dest_tz2 AS(
SELECT  od.date,
        od.dep_time,
        od.arr_time,
        od.origin_iata,
        od.origin_tz,
        od.dest_iata,
        tz.timezone as dest_tz,
        od.origin_icao,
        od.dest_icao
FROM origin_dest_tz1 as od
LEFT JOIN df_tz as tz on od.dest_icao = tz.neighbor_call
)

SELECT *
FROM origin_dest_tz2

"""

# run the query and show the joined result
out = spark.sql(query)
display(out)

Now we apply the time zones to the departure and arrival times, and convert to UTC.

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pytz
from datetime import datetime
from pyspark.sql.functions import col

def create_datetime(yyyymmdd, hhmm, tz):
    """
    Create UTC timestamp from flights table columns
    yyyymmdd = FL_DATE, a 'YYYY-MM-DD' string
    hhmm = CRS_DEP_TIME or CRS_ARR_TIME, local clock time as the number HHMM
    tz = time zone from time zone table (e.g. 'America/Chicago')

    Returns UTC time stamp, (cast to string), or None when any input is
    missing or the time zone is the "Unknown" placeholder.

    Raises ValueError if the date or time cannot be turned into a valid
    datetime (e.g. hhmm = 2575).
    """
    # local import so this function does not depend on the cell's import list
    from datetime import timedelta

    # Spark passes SQL NULL as None. The time zone columns come from LEFT
    # JOINs, so they are NULL for airports with no match, and the time zone
    # reference table stores "Unknown" for coordinates it could not resolve.
    # Return None (-> NULL in the output column) instead of failing the job.
    if yyyymmdd is None or hhmm is None or tz is None or tz == "Unknown":
        return None

    # split the date into year / month / day
    yyyy, MM, dd = (int(part) for part in yyyymmdd.split('-'))
    # split HHMM into hour and minute
    hh, mm = divmod(int(hhmm), 100)

    # datetime() only accepts hours 0-23, so 2400 would raise ValueError.
    # NOTE(review): assumes 2400 means midnight at the END of FL_DATE, i.e.
    # 00:00 on the following day - confirm against the data dictionary.
    extra_days = 0
    if hh == 24 and mm == 0:
        hh = 0
        extra_days = 1

    # create naive datetime in local clock time
    dt = datetime(yyyy, MM, dd, hh, mm) + timedelta(days=extra_days)
    # apply local time zone (pytz needs localize(), not tzinfo=, to pick the
    # correct UTC offset for that date)
    dt_local = pytz.timezone(tz).localize(dt)
    # convert to UTC
    dt_utc = dt_local.astimezone(pytz.utc)

    # return UTC datetime, cast to string
    return str(dt_utc)

# create_datetime('2015-02-27',901,'America/Chicago')

# wrap the converter as a Spark UDF that returns a string column
dt_udf = udf(create_datetime, StringType())

# departure uses the origin time zone, arrival uses the destination time zone;
# both are built from the same flight date column
dep_utc = dt_udf(col("date"), col("dep_time"), col("origin_tz"))
arr_utc = dt_udf(col("date"), col("arr_time"), col("dest_tz"))

out = out.withColumn("dep_datetime", dep_utc).withColumn("arr_datetime", arr_utc)
display(out)

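# TODO: handle flights crossing midnight! arr_datetime is currently built from
# FL_DATE, so an arrival that falls on the next calendar day gets the wrong date.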