# Full Flights Data - Processing and Storing to Database

Objective: Pre-Process Full Flights Data For Joining with Weather Data

Steps:  
A) Select only columns that will be used for modeling or pre-processing  
B) Create feature: FIRST_DEP  
C) Create outcome: DELAY  
D) Create column: TimeStamp (for join)   
E) Create column: ICAO codes (for join)  
F) Drop x3 Airports / Replace x1   
G) Create column: Timezone  
H) Use timezone column to do UTC Time Conversion  
I) Create column: 2&3hrs (for weather join)  
J) Create column: Truncated Date (for prior flights join)  
K) Create feature: Prior_Delay  
L) Remove irrelevant columns

In [0]:
#Import packages
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType, FloatType
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import isnan, when, count, col, udf, date_trunc, max as max_
from pyspark.ml.feature import Bucketizer
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import udf, date_trunc, col
from datetime import datetime, timedelta
from pyspark.sql.types import TimestampType

import pandas as pd
#Importing Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pytz import timezone
import pytz

sqlContext = SQLContext(sc)

In [0]:
#Raw Airlines From Database

#Full data
airlines = spark.read.option("header", "true").parquet(f"dbfs:/mnt/mids-w261/datasets_final_project/parquet_airlines_data/201*.parquet")
#3 month data 
airlines_3m = airlines.where("YEAR = '2015' and MONTH < '4' ")

#Print sizes
print("Full data size: %s rows, %s columns" %(airlines.count(), len(airlines.columns)))
print("3-month data size: %s, %s columns" %(airlines_3m.count(),len(airlines_3m.columns)))

# A, B, C) Select Columns, Create FIRST_DEP & DELAY

A. Select Columns (Only Used For Modeling or Pre-Processing)  
B. Create Feature: First_DEP  
C. Create Outcome: DELAY

In [0]:
#Keep only relevant columns
#Create new column "DELAY" that will be our outcome variable. Delay = 1 if a flight is delayed >=15 minutes from departure or is cancelled.
#Convert "FIRST_DEP_TIME" variable to be binary 0 or 1. 1 = flight is the first of the day

#FULL DATA
airlines.registerTempTable('airlines_sql')
airlines_filter = spark.sql("""SELECT YEAR, MONTH, DAY_OF_WEEK, FL_DATE, OP_UNIQUE_CARRIER, TAIL_NUM, ORIGIN, ORIGIN_STATE_ABR, DEST, DEST_STATE_ABR, CRS_DEP_TIME, DEP_TIME, DEP_DEL15, CANCELLED, CRS_ELAPSED_TIME, DISTANCE, 
                              case when FIRST_DEP_TIME is not null then 1 else 0
                              end as FIRST_DEP, 
                              case when DEP_DEL15=1 or CANCELLED=1 then 1 else 0 
                              end as DELAY 
                              FROM airlines_sql""")

#Drop DEP_DEL15 and CANCELLED columns, now that we have DELAY column
airlines_filter = airlines_filter.drop('CANCELLED')

# D) Create timestamp 
(for joins with weather data and creating previously delayed flights feature)

In [0]:
#Concatenate FL_DATE and CRS_DEP_TIME to format for timestamp to match weather data

#Create User-Defined Function (UDF)
def concat_datetime(col1,col2):
  col1, col2 = str(col1), str(col2)
  if len(col2) < 4:
    return col1+' 0'+col2[:-2]+':'+ col2[-2:] +':00'
  else:
    return col1+' '+col2[:-2]+':'+ col2[-2:] +':00'

concat_datetime_udf = udf(concat_datetime)

##################################################################

#Run UDF on CRS_DEP_TIME and DEP_TIME 

#CRS_DEP_TIME
airlines_filter_dt = airlines_filter.withColumn("CRS_DEP_DATETIME", concat_datetime_udf("FL_DATE","CRS_DEP_TIME"))
airlines_filter_dt = airlines_filter_dt.withColumn("CRS_DEP_TIME_DT", (f.col("CRS_DEP_DATETIME").cast("timestamp")))  
airlines_filter_dt = airlines_filter_dt.drop("CRS_DEP_DATETIME")

#DEP_TIME
airlines_filter_dt = airlines_filter_dt.withColumn("DEP_DATETIME", concat_datetime_udf("FL_DATE","DEP_TIME"))
airlines_filter_dt = airlines_filter_dt.withColumn("DEP_TIME_DT", (f.col("DEP_DATETIME").cast("timestamp")))  
airlines_filter_dt = airlines_filter_dt.drop("DEP_DATETIME")

# E) Add ICAO Codes
(for use in joining weather data to this flights data)

There is another dataset containing weather data that we plan to join with this airlines data. After doing some research, we found that the `call_sign` variable in the weather data is equivalent to an airport's ICAO code. We do not have the ICAO codes in the airlines data. The variables `ORIGIN` and `DEST` are an airport's IATA code. We used the source https://ourairports.com/data/ to download a table that has the IATA and ICAO codes for every airport.

In [0]:
#Read in file that has IATA and ICAO codes for each airport in our dataset

#File location and type
file_location = "/FileStore/tables/iata_and_icao_codes.csv"
file_type = "csv"

#CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

#The applied options are for CSV files. For other file types, these will be ignored.
iata_and_icao_codes = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

#Display data
display(iata_and_icao_codes)

IATA_CODE,ICAO_CODE
BGM,KBGM
PSE,TJPS
INL,KINL
DLG,PADL
MSY,KMSY
PPG,NSTU
GEG,KGEG
DRT,KDRT
BUR,KBUR
SNA,KSNA


In [0]:
#Register IATA and ICAO codes table as a temp table
iata_and_icao_codes.registerTempTable('iata_and_icao_codes_tt')

#Register airlines data as a temp table
airlines_filter_dt.registerTempTable('airlines_tt')

#Trim all columns
spark.sql("""select trim(YEAR) as YEAR, trim(MONTH) as MONTH, trim(DAY_OF_WEEK) as DAY_OF_WEEK, trim(FL_DATE) as FL_DATE, trim(OP_UNIQUE_CARRIER) as OP_UNIQUE_CARRIER, trim(TAIL_NUM) as TAIL_NUM,
             trim(ORIGIN) as ORIGIN, trim(ORIGIN_STATE_ABR) as ORIGIN_STATE_ABR, trim(DEST) as DEST, trim(DEST_STATE_ABR) as DEST_STATE_ABR, trim(DEP_TIME) as DEP_TIME, trim(CRS_DEP_TIME) as CRS_DEP_TIME, 
             CRS_ELAPSED_TIME, DISTANCE, trim(FIRST_DEP) as FIRST_DEP, trim(DEP_DEL15) as DEP_DEL15, trim(DELAY) as DELAY, DEP_TIME_DT, CRS_DEP_TIME_DT
             from airlines_tt""").registerTempTable("airlines_trimmed_tt")

In [0]:
#Add origin_icao_code column by joining clean airline data with ICAO/IATA code table on ORIGIN
spark.sql("""select t1.*, t2.icao_code as origin_icao_code from(
          (select * from airlines_trimmed_tt) t1
          left join
          (select IATA_CODE, ICAO_CODE from iata_and_icao_codes_tt) t2
          on trim(t1.origin) = t2.iata_code)""").registerTempTable('add_iata_codes_p1')

#Add dest_icao_code column by joining clean airline data with ICAO/IATA code table on DEST
spark.sql("""select t1.*, t2.icao_code as dest_icao_code from(
          (select * from add_iata_codes_p1) t1
          left join
          (select IATA_CODE, ICAO_CODE from iata_and_icao_codes_tt) t2
          on trim(t1.dest) = t2.iata_code)""").registerTempTable('airlines_with_icao_tt')

# F) Drop airports

After adding the ICAO code for each airport in order to join the airlines data with the weather data, we then needed to check that every airport in the airlines data had weather readings. For details on this work, please see this notebook: https://dbc-c4580dc0-018b.cloud.databricks.com/?o=8229810859276230#notebook/945005255693705/command/945005255693706.
After doing this research we found the following airports 3 airports that did not have any corresponding weather readings. There was a fourth airport that did not have any weather readings but after some research we found that this airport changed ICAO codes. So we will drop the 3 airports without weather readings and replace the ICAO code for the airport whose ICAO code changed. The 3 airports that did not have weather readings were:
* ICAO code = 57A - had 1 flight in dataset
* ICAO code = TJPS - had 7,854 flights in dataset
* ICAO code = KOGS - had 1,270 flights in dataset

In [0]:
#It looks like we can use the weather readings for call_sign KISN for icao_code KXWA, so let's replace all origin and dest icao codes with KXWA to be KISN.
spark.sql("""select year, month, day_of_week, fl_date, op_unique_carrier, tail_num, origin, 
            case when origin_icao_code = 'KXWA' then 'KISN'
              else origin_icao_code
              end as origin_icao_code, 
            origin_state_abr, dest,
            case when dest_icao_code = 'KXWA' then 'KISN'
              else dest_icao_code
              end as dest_icao_code, 
            dest_state_abr, crs_dep_time, crs_elapsed_time, distance, first_dep, dep_del15, delay, dep_time_dt, crs_dep_time_dt
            from airlines_with_icao_tt""").registerTempTable('updated_airlines_with_icao_tt')

#Drop data for the 3 airports that do not have any weather data and save updated data as a dataframe
spark.sql("""select updated_airlines_with_icao_tt.* 
             from updated_airlines_with_icao_tt 
             where origin_icao_code != '57A' and dest_icao_code != '57A' and origin_icao_code != 'TJPS' and dest_icao_code !='TJPS' and origin_icao_code != 'KOGS' and dest_icao_code !=
             'KOGS'""").registerTempTable('updated_dropped_airlines_with_icao_tt')

# G) Time Zones

The weather data that we have has timestamps in UTC for when a reading took place. The airlines data that we have has times in local timezones, so we will need to add data that includes which time zone an airport is located in, in order to be able to convert the time to UTC to match the weather data.

In [0]:
#In order to get the timezone for an airport, we need its latitude and longitude. 
#We will use the weather data to get the latitude and longitude for each weather station/airport.

#Read in weather data
w = spark.read.option("header", "true")\
                    .parquet(f"dbfs:/mnt/mids-w261/datasets_final_project/weather_data/*.parquet")
#Get all call_signs
call_signs = w.select("CALL_SIGN", "LATITUDE", "LONGITUDE").distinct()

In [0]:
#Import timezonefinder package to find timezone of each call_sign/latitude/longitude

# !pip install timezonefinder
from timezonefinder import TimezoneFinder
tf = TimezoneFinder()
cs = call_signs.toPandas()

In [0]:
#Add timezone of each call_sign
def tz(row):
  return tf.timezone_at(lng=float(row["LONGITUDE"]), lat=float(row["LATITUDE"]))

cs["tz"] = cs.apply(lambda row: tz(row), axis=1)
cs = cs[["CALL_SIGN", "tz"]].drop_duplicates()
time_zones = spark.createDataFrame(cs)
time_zones.display()

#Register time_zones table as temp table
time_zones.registerTempTable("time_zones_tt")

CALL_SIGN,tz
KAUN,America/Los_Angeles
KSAV,America/New_York
KAJO,America/Los_Angeles
99999,Europe/Rome
99999,Asia/Almaty
KRYN,America/Phoenix
99999,America/Chicago
99999,Europe/Athens
99999,Europe/Moscow
99999,America/New_York


In [0]:
#Trim call signs to remove extra spaces
spark.sql("select trim(CALL_SIGN) as CALL_SIGN, tz from time_zones_tt").registerTempTable("time_zones_tt_trimmed")

In [0]:
#Add time zone to data

# origin data
spark.sql("""select t1.*, t2.tz as origin_time_zone
          from(
          (select *
          from updated_dropped_airlines_with_icao_tt) t1
          left join
          (select *
          from time_zones_tt_trimmed) t2
          on t1.origin_icao_code = t2.call_sign)""").registerTempTable("airlines_with_icao_tz_tt")

# destination data (turns out we did not use it, so commented out)
# spark.sql("""select t1.*, t2.tz as destination_time_zone
#           from(
#           (select *
#           from airlines_with_icao_tz_tt) t1
#           left join
#           (select *
#           from time_zones_tt_trimmed) t2
#           on t1.dest_icao_code = t2.call_sign)""").registerTempTable("airlines_with_icao_tz_tt")

#Save as dataframe
airlines_with_icao_tz = spark.sql("select * from airlines_with_icao_tz_tt")

# H) Convert time to UTC

In [0]:
#Function with datetime and time zone inputs and datetime in UTC output
def convert_to_utc(fl_date, tz_str):
  if fl_date:
    tz = timezone(tz_str)
    fl_date_local = tz.localize(fl_date)
    utc = pytz.utc
    fl_date_utc = fl_date_local.astimezone(utc)
  else:
    fl_date_utc = fl_date
  return fl_date_utc
  
convert_to_utc_udf = udf(convert_to_utc, TimestampType())

########################################

airlines_with_icao_tz_utc = airlines_with_icao_tz.withColumn("crs_dep_time_utc", convert_to_utc_udf("CRS_DEP_TIME_DT", "origin_time_zone"))
airlines_with_icao_tz_utc = airlines_with_icao_tz_utc.withColumn("dep_time_utc", convert_to_utc_udf("DEP_TIME_DT", "origin_time_zone"))
# airlines_with_icao_tz_utc.display()

# I) Create 2 Hour and 3 Hour Time Periods 
(for joining with weather data)

In [0]:
#Create User-Defined Functions (UDF)
def subtract2(row):
  return row - timedelta(hours=2)
def subtract3(row):
  return row - timedelta(hours=3)

subtract2_udf = udf(subtract2, TimestampType())
subtract3_udf = udf(subtract3, TimestampType())

In [0]:
# #Create Column of CRS_ELAPSED_TIME but 2 and 3 hours prior

#Subtract 2 & 3 hours with UDF
airlines_with_icao_tz_utc_23 = airlines_with_icao_tz_utc.withColumn("CRS_DEP_TIME_2HR", subtract2_udf("crs_dep_time_utc"))
airlines_with_icao_tz_utc_23 = airlines_with_icao_tz_utc_23.withColumn("CRS_DEP_TIME_3HR", subtract3_udf("crs_dep_time_utc"))

#Also Create Truncated versions
airlines_with_icao_tz_utc_23 = airlines_with_icao_tz_utc_23.withColumn("CRS_DEP_TIME_2HR_HR", date_trunc(timestamp='CRS_DEP_TIME_2HR',format='hour'))
airlines_with_icao_tz_utc_23 = airlines_with_icao_tz_utc_23.withColumn("CRS_DEP_TIME_3HR_HR", date_trunc(timestamp='CRS_DEP_TIME_3HR',format='hour'))

#airlines_with_icao_tz_utc_23.display()

# J) Create truncated date column 
(for joining with previously delayed flights data)

In [0]:
#Also Create Truncated versions
airlines_with_icao_tz_utc_23 = airlines_with_icao_tz_utc_23.withColumn("CRS_DEP_TIME_DATE_TODAY", date_trunc(timestamp='crs_dep_time_utc',format='day')) #For Prior Flight Delay
airlines_with_icao_tz_utc_23 = airlines_with_icao_tz_utc_23.withColumn("DEP_TIME_DATE_TODAY", date_trunc(timestamp='dep_time_utc',format='day')) #For Prior Flight Delay

#Store Full Data
file_to_store = airlines_with_icao_tz_utc_23                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "airlines_with_icao_tz_utc_23"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

# K) Previously Delayed Flights Feature

Steps:    
1) Create prior_flights table (Helper1) with features:   
['prior_tail_num', 'prior_dest', 'prior_delay', 'prior_crs_dep_time_utc', 'prior_dep_time_utc', 'prior_year', 'prior_month', 'PRIOR_CRS_DEP_TIME_DATE_TODAY', 'PRIOR_DEP_TIME_DATE_TODAY', 'Helper_ID']     

2) Add unique ID for each row of data. Used for error checking and joining in Step 6.  

3) INNER Join flight data with prior_flights data on same day departure, tail number, origin=prior_dest airport.  There could be many prior_flight delay info for some flight and nulls for other flights.  

4) Filter to only prior_flights that are prior to each flight's CRS departure time (2 hrs prior).  

5) Select maximum (latest) time prior_flight_datetime to keep only. Result = 1 prior_flight per flight. Those with 0 prior_flights = dropped.  Store this table as Helper2  

5a) Step 5 only works for train data.  For test data (where we may not know the 'delay_dep15' column 2 hrs prior) check if dep_time minus CRS_dep_time > 15, if so, mark previous_flight as delay.     

6) Left join Helper2 back to original flights data on Flight ID (step 2). Then replace prior_delay nulls with 0 (not delay).  

7) For all edge cases, drop duplicate row showing no delay. Keep yes delay.  

**NOTE**  
Although we are using test (2019) data for this process, this is VALID because in real life, we ASSUME that we would be provided with data 2 hours prior to scheduled departure time.  Step 5a is specifically done to use the 2 hours prior data to determine if previous flight was delay.

In [0]:
#Read Full Data from before
filename = "airlines_with_icao_tz_utc_23"                      #CHANGE THIS: file name in database (to open)
airlines_prior_delays = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")

Step 1: Create Prior Flights Helper Table

In [0]:
# 1) Create prior_flights table (Helper1) with features: ['tail_num', 'delay', 'crs_dep_time_utc', 'crs_dep_time_utc_date_trunc']   

#Register Temp Table
airlines_prior_delays.registerTempTable("airlines_prior_delays_tt")

helper1 = spark.sql("""SELECT tail_num as prior_tail_num, dest as prior_dest, dep_del15 as prior_delay, crs_dep_time_utc as prior_crs_dep_time_utc, dep_time_utc as prior_dep_time_utc,
                          CRS_DEP_TIME_DATE_TODAY as PRIOR_CRS_DEP_TIME_DATE_TODAY, DEP_TIME_DATE_TODAY as PRIOR_DEP_TIME_DATE_TODAY
                          FROM airlines_prior_delays_tt""").distinct()
helper1 = helper1.where("dep_del15 != 'null'")

Step 2: Add ID

In [0]:
# 2) Add unique ID for each row of data. Used for error checking and joining in Step 6.  

# add Flight ID column to FULL Data
airlines_prior_delays = airlines_prior_delays.coalesce(1)                                   
airlines_prior_delays = airlines_prior_delays.withColumn("Flight_ID",f.monotonically_increasing_id())   #adds unique ID column

#Store Data
file_to_store = airlines_prior_delays                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "airlines_prior_delays"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

# add Flight ID column to FULL Helper
helper1 = helper1.coalesce(1)                                 
helper1 = helper1.withColumn("Helper_ID",f.monotonically_increasing_id())   #adds unique ID column

#Store Data
file_to_store = helper1                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "helper1"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#READ FULL Data

filename = "airlines_prior_delays"                      #CHANGE THIS: file name in database (to open)
airlines_prior_delays = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
filename = "helper1"                      #CHANGE THIS: file name in database (to open)
helper1 = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")

# airlines_prior_delays.display()
# print(f"Flights: {airlines_prior_delays.count()}")
# helper1.display()
# print(f"Helper1: {helper1.count()}")

Step 3: Join flight data and prior_flights data

In [0]:
# 3) INNER Join flight data with prior_flights data on same day departure.  There could be many prior_flight delay info for some flight and nulls for other flights.  

# register temp tables of airline and weather data
airlines_prior_delays.registerTempTable("f_tt")
helper1.registerTempTable("helper1")

# join prior flights table (helper 1) with current day data
prior_flight_joined =spark.sql("""SELECT * 
                                    FROM f_tt INNER JOIN helper1 
                                    ON (f_tt.tail_num = helper1.prior_tail_num AND f_tt.origin = helper1.prior_dest AND f_tt.CRS_DEP_TIME_DATE_TODAY = helper1.PRIOR_CRS_DEP_TIME_DATE_TODAY)""")

#Store Data
file_to_store = prior_flight_joined                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "prior_flight_joined"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read FULL Data
filename = "prior_flight_joined"                      #CHANGE THIS: file name in database (to open)
prior_flight_joined = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# prior_flight_joined.display()
# print(prior_flight_joined.count())
# print(prior_flight_joined.columns)

Step 4: Keep only data prior to departure time (2 hrs prior)

In [0]:
# 4) Filter to only prior_flights that are prior to each flight's departure time (2 hrs prior).      
#(in other words, get rid of data that occur after 2 hours prior to departure time)

prior_flight_joined.registerTempTable('pj')
prior_flight_joined_filtered = spark.sql("SELECT * from pj WHERE CAST(CRS_DEP_TIME_2HR AS timestamp) > CAST(prior_crs_dep_time_utc AS timestamp)")

# prior_flight_joined_filtered.display()
# print(prior_flight_joined_filtered.count())
# print(prior_flight_joined_filtered.columns)

Step 5: Keep Latest data only.

In [0]:
# 5) Select maximum (latest) time prior_flight_datetime to keep only. Result = 1 prior_flight per flight. Those with 0 prior_flights = dropped.  Store this table as Helper2  

#Find the latest weather record for each flight. This dataframe has two columns: Flight ID and Helper1 departure time
pID_maxdate = prior_flight_joined_filtered.groupBy("Flight_ID").agg(max_('prior_crs_dep_time_utc'))
pID_maxdate_rename = pID_maxdate.withColumnRenamed("Flight_ID", "ID") #Rename Columns
pID_maxdate_rename = pID_maxdate_rename.withColumnRenamed("max(prior_crs_dep_time_utc)", "prior_crs_datetime") #Rename Columns

#Store Data
file_to_store = pID_maxdate_rename                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "pID_maxdate_rename"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read FULL Data
filename = "pID_maxdate_rename"                      #CHANGE THIS: file name in database (to open)
pID_maxdate_rename = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# pID_maxdate_rename.display()
# print(pID_maxdate_rename.count())
# print(pID_maxdate_rename.columns)

In [0]:
#Use previous table and join with joined table in step 4. Inner Join gets rid of unnecessary (not most recent) prior_flights data.

#Register Temp Tables for Joining
pID_maxdate_rename.registerTempTable("pID_maxdate_sql")
prior_flight_joined_filtered.registerTempTable("prior_flight_joined_filtered")

#inner join
helper2 = spark.sql("""SELECT * 
                    FROM pID_maxdate_sql t1
                    INNER JOIN prior_flight_joined_filtered t2
                    ON (t1.ID = t2.Flight_ID AND t1.prior_crs_datetime = t2.prior_crs_dep_time_utc)
                    """)

#Store Data
file_to_store = helper2                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "helper2"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read FULL Data
filename = "helper2"                      #CHANGE THIS: file name in database (to open)
helper2 = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# helper2.display()
# print(helper2.count())
# print(helper2.columns) 

Step 5a: For Test Data

In [0]:
#Replace prior_delay features.
#For train data, keep the same as prior_delay
#For validation/test data, use actual departure time compared to CRS departure time to determine if there was a prior_delay. If difference is greater than 900 seconds, mark as a prior delayed flight

helper2.registerTempTable("helper2_tt")
helper2_test=spark.sql("""SELECT *,
              CASE
                WHEN prior_dep_time_utc < CRS_DEP_TIME_2HR AND year = '2018' or year ='2019' THEN 
                     CASE
                         WHEN unix_timestamp(prior_dep_time_utc) - unix_timestamp(prior_crs_dep_time_utc)> 900 THEN 1 ELSE 0
                     END
                WHEN prior_dep_time_utc >= CRS_DEP_TIME_2HR AND year = '2018' or year ='2019' THEN 0 
                WHEN year = '2015' or year = '2016' or year = '2017' THEN prior_delay
              END
                AS prior_delay2
                
              FROM helper2_tt 
               """)      
 
#Store Data
file_to_store = helper2_test                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "helper2_test"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read FULL Data
filename = "helper2_test"                      #CHANGE THIS: file name in database (to open)
helper2_test = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# helper2_test.display()
# print(helper2_test.count())
# print(helper2_test.columns) 

Step 6: Join back with original flights data and clean up

In [0]:
# 6) Left join Helper2 back to original flights data on Flight ID (step 2). Then replace prior_delay nulls with 0 (not delay). 

#Drop unncessary columns in Helper2
helper2_test=helper2_test.select("ID", "prior_delay2")

#register temp table
airlines_prior_delays.registerTempTable("airlines_prior_delays_tt")
helper2_test.registerTempTable("helper2_test_tt")

#LEFT join Helper 2 with airlines data (with ID) from step 2
airlines_with_priordelays = spark.sql("""SELECT * 
                                FROM airlines_prior_delays_tt t1
                                LEFT JOIN helper2_test_tt t2
                                ON (t1.Flight_ID = t2.ID)
                                """)

#Store Data
file_to_store = airlines_with_priordelays                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "airlines_with_priordelays"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read FULL Data
filename = "airlines_with_priordelays"                      #CHANGE THIS: file name in database (to open)
airlines_with_priordelays = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# airlines_with_priordelays.display()
# print(airlines_with_priordelays.count())
# print(airlines_with_priordelays.columns)

In [0]:
#Replace null prior_delays with 0.  

#register temp table
airlines_with_priordelays.registerTempTable("airlines_with_priordelays_tt")

#Replace null prior_delays with 0.  
airlines_with_priordelays_nonull= spark.sql("""SELECT *,
                                                CASE
                                                  WHEN prior_delay2 IS NULL THEN 0 ELSE prior_delay2 
                                                END
                                                  AS previous_delay
                                                FROM airlines_with_priordelays_tt""")

#Store Data
file_to_store = airlines_with_priordelays_nonull                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "airlines_with_priordelays_nonull"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read Data
filename = "airlines_with_priordelays_nonull"                      #CHANGE THIS: file name in database (to open)
airlines_with_priordelays_nonull = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# airlines_with_priordelays_nonull.display()

Step 7: Drop Duplicate Rows

In [0]:
#7) For all edge cases, drop duplicate rows.  

#See which rows of data are duplicates (for these few edge cases, just mark as delay and call it good. it may not be 100% accurate but it is a small amount of data)
get_duplicates = airlines_with_priordelays_nonull.groupBy("Flight_ID").count().filter("count>1")
get_duplicates.display()
get_duplicates.registerTempTable("get_duplicates_tt")
airlines_with_priordelays_nonull.registerTempTable("airlines_with_priordelays_nonull_tt")

Flight_ID,count
886051,2
9541437,2
23751728,2
15174753,2
4421095,2
16435637,2
10010757,2
8642813,2
9517746,2
14238622,2


In [0]:
#join the flight IDs that are duplicate to the main flight data table (with prior_delay as 1 AND 0)
join_duplicates = spark.sql("""SELECT t1.*, t2.Flight_ID as duplicate_ID
                                FROM(
                                (select * from airlines_with_priordelays_nonull_tt) t1
                                left join
                                (select * from get_duplicates_tt) t2
                                on t1.Flight_ID = t2.Flight_ID)
                                """)
                                            
#Store Data
file_to_store = join_duplicates                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "join_duplicates"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read Data
filename = "join_duplicates"                      #CHANGE THIS: file name in database (to open)
join_duplicates = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
join_duplicates.display()
join_duplicates.registerTempTable("join_duplicates_tt")  

year,month,day_of_week,fl_date,op_unique_carrier,tail_num,origin,origin_icao_code,origin_state_abr,dest,dest_icao_code,dest_state_abr,crs_dep_time,crs_elapsed_time,distance,first_dep,dep_del15,delay,dep_time_dt,crs_dep_time_dt,origin_time_zone,crs_dep_time_utc,dep_time_utc,CRS_DEP_TIME_2HR,CRS_DEP_TIME_3HR,CRS_DEP_TIME_2HR_HR,CRS_DEP_TIME_3HR_HR,CRS_DEP_TIME_DATE_TODAY,DEP_TIME_DATE_TODAY,Flight_ID,ID,prior_delay2,previous_delay,duplicate_ID
2016,4,5,2016-04-15,OO,N863AS,BFL,KBFL,CA,PHX,KPHX,AZ,940,98.0,425.0,0,0.0,0,2016-04-15T09:31:00.000+0000,2016-04-15T09:40:00.000+0000,America/Los_Angeles,2016-04-15T16:40:00.000+0000,2016-04-15T16:31:00.000+0000,2016-04-15T14:40:00.000+0000,2016-04-15T13:40:00.000+0000,2016-04-15T14:00:00.000+0000,2016-04-15T13:00:00.000+0000,2016-04-15T00:00:00.000+0000,2016-04-15T00:00:00.000+0000,17,17.0,0.0,0.0,
2016,4,3,2016-04-13,OO,N863AS,DRO,KDRO,CO,PHX,KPHX,AZ,1835,82.0,353.0,0,0.0,0,2016-04-13T18:17:00.000+0000,2016-04-13T18:35:00.000+0000,America/Denver,2016-04-14T00:35:00.000+0000,2016-04-14T00:17:00.000+0000,2016-04-13T22:35:00.000+0000,2016-04-13T21:35:00.000+0000,2016-04-13T22:00:00.000+0000,2016-04-13T21:00:00.000+0000,2016-04-14T00:00:00.000+0000,2016-04-14T00:00:00.000+0000,79,,,0.0,
2016,4,5,2016-04-01,OO,N864AS,PSP,KPSP,CA,PHX,KPHX,AZ,1350,70.0,261.0,0,0.0,0,2016-04-01T13:49:00.000+0000,2016-04-01T13:50:00.000+0000,America/Los_Angeles,2016-04-01T20:50:00.000+0000,2016-04-01T20:49:00.000+0000,2016-04-01T18:50:00.000+0000,2016-04-01T17:50:00.000+0000,2016-04-01T18:00:00.000+0000,2016-04-01T17:00:00.000+0000,2016-04-01T00:00:00.000+0000,2016-04-01T00:00:00.000+0000,170,,,0.0,
2016,4,3,2016-04-20,OO,N760SK,LAX,KLAX,CA,ABQ,KABQ,NM,1340,118.0,677.0,0,0.0,0,2016-04-20T13:37:00.000+0000,2016-04-20T13:40:00.000+0000,America/Los_Angeles,2016-04-20T20:40:00.000+0000,2016-04-20T20:37:00.000+0000,2016-04-20T18:40:00.000+0000,2016-04-20T17:40:00.000+0000,2016-04-20T18:00:00.000+0000,2016-04-20T17:00:00.000+0000,2016-04-20T00:00:00.000+0000,2016-04-20T00:00:00.000+0000,209,209.0,0.0,0.0,
2016,4,6,2016-04-23,OO,N631SK,GEG,KGEG,WA,SLC,KSLC,UT,1315,98.0,546.0,0,0.0,0,2016-04-23T13:05:00.000+0000,2016-04-23T13:15:00.000+0000,America/Los_Angeles,2016-04-23T20:15:00.000+0000,2016-04-23T20:05:00.000+0000,2016-04-23T18:15:00.000+0000,2016-04-23T17:15:00.000+0000,2016-04-23T18:00:00.000+0000,2016-04-23T17:00:00.000+0000,2016-04-23T00:00:00.000+0000,2016-04-23T00:00:00.000+0000,661,661.0,0.0,0.0,
2016,4,1,2016-04-04,OO,N807SK,DTW,KDTW,MI,IAD,KIAD,VA,2058,90.0,383.0,0,0.0,0,2016-04-04T20:53:00.000+0000,2016-04-04T20:58:00.000+0000,America/Detroit,2016-04-05T00:58:00.000+0000,2016-04-05T00:53:00.000+0000,2016-04-04T22:58:00.000+0000,2016-04-04T21:58:00.000+0000,2016-04-04T22:00:00.000+0000,2016-04-04T21:00:00.000+0000,2016-04-05T00:00:00.000+0000,2016-04-05T00:00:00.000+0000,1060,,,0.0,
2016,4,1,2016-04-18,OO,N810SK,MSP,KMSP,MN,GRR,KGRR,MI,1508,84.0,408.0,0,0.0,0,2016-04-18T15:04:00.000+0000,2016-04-18T15:08:00.000+0000,America/Chicago,2016-04-18T20:08:00.000+0000,2016-04-18T20:04:00.000+0000,2016-04-18T18:08:00.000+0000,2016-04-18T17:08:00.000+0000,2016-04-18T18:00:00.000+0000,2016-04-18T17:00:00.000+0000,2016-04-18T00:00:00.000+0000,2016-04-18T00:00:00.000+0000,1342,1342.0,0.0,0.0,
2016,4,2,2016-04-12,OO,N549CA,MSP,KMSP,MN,GRR,KGRR,MI,1508,84.0,408.0,0,0.0,0,2016-04-12T15:04:00.000+0000,2016-04-12T15:08:00.000+0000,America/Chicago,2016-04-12T20:08:00.000+0000,2016-04-12T20:04:00.000+0000,2016-04-12T18:08:00.000+0000,2016-04-12T17:08:00.000+0000,2016-04-12T18:00:00.000+0000,2016-04-12T17:00:00.000+0000,2016-04-12T00:00:00.000+0000,2016-04-12T00:00:00.000+0000,1346,1346.0,0.0,0.0,
2016,4,4,2016-04-07,OO,N452SW,LAN,KLAN,MI,DTW,KDTW,MI,557,53.0,74.0,0,0.0,0,2016-04-07T05:51:00.000+0000,2016-04-07T05:57:00.000+0000,America/Detroit,2016-04-07T09:57:00.000+0000,2016-04-07T09:51:00.000+0000,2016-04-07T07:57:00.000+0000,2016-04-07T06:57:00.000+0000,2016-04-07T07:00:00.000+0000,2016-04-07T06:00:00.000+0000,2016-04-07T00:00:00.000+0000,2016-04-07T00:00:00.000+0000,1453,1453.0,0.0,0.0,
2016,4,1,2016-04-11,OO,N496CA,BUR,KBUR,CA,SLC,KSLC,UT,1632,113.0,574.0,0,0.0,0,2016-04-11T16:26:00.000+0000,2016-04-11T16:32:00.000+0000,America/Los_Angeles,2016-04-11T23:32:00.000+0000,2016-04-11T23:26:00.000+0000,2016-04-11T21:32:00.000+0000,2016-04-11T20:32:00.000+0000,2016-04-11T21:00:00.000+0000,2016-04-11T20:00:00.000+0000,2016-04-11T00:00:00.000+0000,2016-04-11T00:00:00.000+0000,2107,2107.0,0.0,0.0,


In [0]:
# You can see in previous table that duplicate_ID is 'null' when there isn't a duplicate ID.  
# Replace all duplicate ID's with 0 and 1 to 1 signifying that the previous flight was delayed. Do a distinct to only keep 1 row. 
# Rows with null duplicate_ID (no duplicate ID's) will just keep its previously delayed flights from previous steps

drop_duplicates= spark.sql("""SELECT *,
                          CASE
                            WHEN duplicate_ID IS NOT NULL
                            THEN 1
                            ElSE previous_delay
                          END
                            AS prior_delayed
                          FROM join_duplicates_tt""").drop("prior_delay2", "previous_delay").distinct()
#Store Data
file_to_store = drop_duplicates                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "drop_duplicates"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read Data
filename = "drop_duplicates"                      #CHANGE THIS: file name in database (to open)
drop_duplicates = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# drop_duplicates.display()

In [0]:
#ERROR CHECKING FULL DATA

#Read Data
filename = "airlines_with_priordelays_nonull"                      #CHANGE THIS: file name in database (to open)
airlines_with_priordelays_nonull = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")

#Check that joined_filtered_agg does not have any duplicate Flight ID (these two counts should match)
print("drop_duplicates TOTAL ID Count:",drop_duplicates.select("Flight_ID").count())
print("drop_duplicates DISTINCT ID Count:",drop_duplicates.select("Flight_ID").distinct().count())
print("-------------------------------------------")

#################################
#Check Count of Each Table (FULL DATA)

#Raw Data
print("airlines_prior_delays count:", airlines_prior_delays.count())

#Step 1 (create helper)
print("helper1 count:", helper1.count())

#Step 2 (add ID)

#Step 3 (join helper with flight data)
print("prior_flight_joined count:", prior_flight_joined.count())

#Step 4 (get rid of flights after departure time (2hrs prior))
print("prior_flight_joined_filtered count:", prior_flight_joined_filtered.count())

#Step 5 (keep only the latest prior flight data)
print("pID_maxdate_rename count:", pID_maxdate_rename.count())
print("helper2 count:", helper2.count())

#Step 6 (final join back with flights data, replace null prior flights with 0 for no delay)
print("airlines_with_priordelays count:", airlines_with_priordelays.count())
print("airlines_with_priordelays_nonull count:", airlines_with_priordelays_nonull.count())

#Step 7 (drop duplicates from edge cases)
print("drop_duplicates count:", drop_duplicates.count())

In [0]:
#Some of the rows show strings of '0.0' instead of 0 as we needed.
#fix previous delay to '0' and '1' (from '0.0' and '1.0' in string values)

schema_fix = drop_duplicates
prior_flight_split = f.split(schema_fix['prior_delayed'],'')
schema_fix = schema_fix.withColumn('PREVIOUS_DELAY', prior_flight_split.getItem(0))

#Store Data
file_to_store = schema_fix                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "schema_fix"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read Data
filename = "schema_fix"                      #CHANGE THIS: file name in database (to open)
schema_fix = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
# schema_fix.display()
schema_fix.select("PREVIOUS_DELAY").groupby("PREVIOUS_DELAY").count().display()  #shows how many flights were previously delayed or not

PREVIOUS_DELAY,count
0,28576030
1,3161402


# L) Remove Irrelevant Columns

In [0]:
# Drop Unnecessary Columns

#FULL DATA
schema_fix.registerTempTable("schema_fix")

airlines_preprocessed = spark.sql("""SELECT YEAR, MONTH, DAY_OF_WEEK, FL_DATE, OP_UNIQUE_CARRIER, ORIGIN, ORIGIN_ICAO_CODE, ORIGIN_STATE_ABR, DEST, DEST_ICAO_CODE, DEST_STATE_ABR, CRS_DEP_TIME, CRS_DEP_TIME_UTC, CRS_DEP_TIME_2HR, CRS_DEP_TIME_3HR, CRS_DEP_TIME_2HR_HR, CRS_DEP_TIME_3HR_HR, CRS_ELAPSED_TIME, DISTANCE, FIRST_DEP, PREVIOUS_DELAY, DELAY
                                      FROM schema_fix""")

# airlines_preprocessed.display()

# FINAL STORED DATA

In [0]:
#Store Data

# FULL
file_to_store = airlines_preprocessed                          #CHANGE THIS: name of Spark Dataframe (to save in database)
filename = "airlines_preprocessed"                      #CHANGE THIS: new file name in database
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/" + filename)

In [0]:
#Read Data
filename = "airlines_preprocessed"                      #CHANGE THIS: file name in database (to open)
airlines_preprocessed = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")
airlines_preprocessed.display()
print(airlines_preprocessed.count(), len(airlines_preprocessed.columns))

YEAR,MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,ORIGIN,ORIGIN_ICAO_CODE,ORIGIN_STATE_ABR,DEST,DEST_ICAO_CODE,DEST_STATE_ABR,CRS_DEP_TIME,CRS_DEP_TIME_UTC,CRS_DEP_TIME_2HR,CRS_DEP_TIME_3HR,CRS_DEP_TIME_2HR_HR,CRS_DEP_TIME_3HR_HR,CRS_ELAPSED_TIME,DISTANCE,FIRST_DEP,PREVIOUS_DELAY,DELAY
2016,4,6,2016-04-30,UA,DFW,KDFW,TX,ORD,KORD,IL,900,2016-04-30T14:00:00.000+0000,2016-04-30T12:00:00.000+0000,2016-04-30T11:00:00.000+0000,2016-04-30T12:00:00.000+0000,2016-04-30T11:00:00.000+0000,142.0,802.0,0,1,1
2016,4,2,2016-04-05,UA,EWR,KEWR,NJ,RSW,KRSW,FL,1521,2016-04-05T19:21:00.000+0000,2016-04-05T17:21:00.000+0000,2016-04-05T16:21:00.000+0000,2016-04-05T17:00:00.000+0000,2016-04-05T16:00:00.000+0000,192.0,1068.0,0,0,0
2016,4,7,2016-04-03,WN,DAY,KDAY,OH,MCO,KMCO,FL,1255,2016-04-03T16:55:00.000+0000,2016-04-03T14:55:00.000+0000,2016-04-03T13:55:00.000+0000,2016-04-03T14:00:00.000+0000,2016-04-03T13:00:00.000+0000,130.0,808.0,0,0,0
2016,4,6,2016-04-02,WN,SNA,KSNA,CA,LAS,KLAS,NV,1555,2016-04-02T22:55:00.000+0000,2016-04-02T20:55:00.000+0000,2016-04-02T19:55:00.000+0000,2016-04-02T20:00:00.000+0000,2016-04-02T19:00:00.000+0000,60.0,226.0,0,0,0
2016,4,4,2016-04-07,WN,LAX,KLAX,CA,OAK,KOAK,CA,1545,2016-04-07T22:45:00.000+0000,2016-04-07T20:45:00.000+0000,2016-04-07T19:45:00.000+0000,2016-04-07T20:00:00.000+0000,2016-04-07T19:00:00.000+0000,75.0,337.0,0,0,0
2016,4,1,2016-04-18,EV,AGS,KAGS,GA,ATL,KATL,GA,1035,2016-04-18T14:35:00.000+0000,2016-04-18T12:35:00.000+0000,2016-04-18T11:35:00.000+0000,2016-04-18T12:00:00.000+0000,2016-04-18T11:00:00.000+0000,60.0,143.0,0,0,0
2016,4,2,2016-04-05,OO,IDA,KIDA,ID,SLC,KSLC,UT,749,2016-04-05T13:49:00.000+0000,2016-04-05T11:49:00.000+0000,2016-04-05T10:49:00.000+0000,2016-04-05T11:00:00.000+0000,2016-04-05T10:00:00.000+0000,63.0,188.0,0,0,0
2015,9,5,2015-09-04,EV,IAH,KIAH,TX,STL,KSTL,MO,1254,2015-09-04T17:54:00.000+0000,2015-09-04T15:54:00.000+0000,2015-09-04T14:54:00.000+0000,2015-09-04T15:00:00.000+0000,2015-09-04T14:00:00.000+0000,126.0,667.0,0,0,0
2015,9,2,2015-09-08,WN,PBI,KPBI,FL,BWI,KBWI,MD,1355,2015-09-08T17:55:00.000+0000,2015-09-08T15:55:00.000+0000,2015-09-08T14:55:00.000+0000,2015-09-08T15:00:00.000+0000,2015-09-08T14:00:00.000+0000,140.0,883.0,0,0,0
2015,9,2,2015-09-08,WN,SAT,KSAT,TX,LAS,KLAS,NV,550,2015-09-08T10:50:00.000+0000,2015-09-08T08:50:00.000+0000,2015-09-08T07:50:00.000+0000,2015-09-08T08:00:00.000+0000,2015-09-08T07:00:00.000+0000,170.0,1069.0,0,0,0
