# Joining Flights and Weather Data 

Objective: To Join the Flights Dataset with Weather Dataset before splitting to train/validation/test datasets  

Steps:   
A) Add Unique ID for Flights & Weather Dataset   

For Origin Airport Weather  
B) Join Airline & Weather on ORIGIN & Time Bucket 2 Hours Prior to Departure   
C) Join Airline & Weather on ORIGIN & Time Bucket 3 Hours Prior to Departure   
D) Union Step B and C Tables   
E) Filter Weather Readings later than 2 hours prior   
F) Keep latest remaining Weather Readings   

For Destination Airport Weather  
G) Join Airline & Weather on DEST & Time Bucket 2 Hours Prior to Departure   
H) Join Airline & Weather on DEST & Time Bucket 3 Hours Prior to Departure   
I) Union Step G and H Tables   
J) Filter Weather Readings later than 2 hours prior   
K) Keep latest remaining Weather Readings   

L) Join ORIGIN and DEST tables on flights  (result = 1 table of flights, origin weather, and destination weather)     
M) Add Busyness of Day (Create helper table, groupby flight date to get count per date)   
N) Add Busyness of Day (Join helper table to final table)   
O) Remove irrelevant columns

In [0]:
#Import packages
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType, FloatType
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col, udf, date_trunc, max as max_
from pyspark.ml.feature import Bucketizer
from pyspark.sql.types import TimestampType
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pytz import timezone
import pytz

sqlContext = SQLContext(sc)

## Get Cleaned Data from Shared Folder

#### User changes this cell for 3 month or full flights data preprocessing

In [0]:
#READING PARQUET File from Shared Directory

#####################################
#Cleaned Airport Data

#CHANGE THESE FOUR COMMANDS TO JOIN 3 MONTH or FULL DATASET
# filename = "airlines_3m_preprocessed"              # 3 Month Data  #COMMENT/UNCOMMENT THESE CELLS TO JOIN 3 MONTH or FULL DATASET
# final_filename = "flight_weather_data_3m"                     

filename = "airlines_preprocessed"               # Full data     #COMMENT/UNCOMMENT THESE CELLS TO JOIN 3 MONTH or FULL DATASET
final_filename = "flight_weather_data"                     

air_full = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/data_processing_folder/"+filename+"/part-00*.parquet")

#####################################
#Cleaned Weather Data

filename = "weather_joined_split"                      #CHANGE THIS: file name in database (to open)
w = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/weather_processing_folder/"+filename+"/part-00*.parquet")

#####################################
#Display data and columns
air_full.display()
w.display()
print("Flights Shape:", air_full.count(), len(air_full.columns))
print("Weather Shape:", w.count(), len(w.columns))

YEAR,MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,ORIGIN,ORIGIN_ICAO_CODE,ORIGIN_STATE_ABR,DEST,DEST_ICAO_CODE,DEST_STATE_ABR,CRS_DEP_TIME,CRS_DEP_TIME_UTC,CRS_DEP_TIME_2HR,CRS_DEP_TIME_3HR,CRS_DEP_TIME_2HR_HR,CRS_DEP_TIME_3HR_HR,CRS_ELAPSED_TIME,DISTANCE,FIRST_DEP,PREVIOUS_DELAY,DELAY
2016,4,6,2016-04-30,UA,DFW,KDFW,TX,ORD,KORD,IL,900,2016-04-30T14:00:00.000+0000,2016-04-30T12:00:00.000+0000,2016-04-30T11:00:00.000+0000,2016-04-30T12:00:00.000+0000,2016-04-30T11:00:00.000+0000,142.0,802.0,0,1,1
2016,4,2,2016-04-05,UA,EWR,KEWR,NJ,RSW,KRSW,FL,1521,2016-04-05T19:21:00.000+0000,2016-04-05T17:21:00.000+0000,2016-04-05T16:21:00.000+0000,2016-04-05T17:00:00.000+0000,2016-04-05T16:00:00.000+0000,192.0,1068.0,0,0,0
2016,4,7,2016-04-03,WN,DAY,KDAY,OH,MCO,KMCO,FL,1255,2016-04-03T16:55:00.000+0000,2016-04-03T14:55:00.000+0000,2016-04-03T13:55:00.000+0000,2016-04-03T14:00:00.000+0000,2016-04-03T13:00:00.000+0000,130.0,808.0,0,0,0
2016,4,6,2016-04-02,WN,SNA,KSNA,CA,LAS,KLAS,NV,1555,2016-04-02T22:55:00.000+0000,2016-04-02T20:55:00.000+0000,2016-04-02T19:55:00.000+0000,2016-04-02T20:00:00.000+0000,2016-04-02T19:00:00.000+0000,60.0,226.0,0,0,0
2016,4,4,2016-04-07,WN,LAX,KLAX,CA,OAK,KOAK,CA,1545,2016-04-07T22:45:00.000+0000,2016-04-07T20:45:00.000+0000,2016-04-07T19:45:00.000+0000,2016-04-07T20:00:00.000+0000,2016-04-07T19:00:00.000+0000,75.0,337.0,0,0,0
2016,4,1,2016-04-18,EV,AGS,KAGS,GA,ATL,KATL,GA,1035,2016-04-18T14:35:00.000+0000,2016-04-18T12:35:00.000+0000,2016-04-18T11:35:00.000+0000,2016-04-18T12:00:00.000+0000,2016-04-18T11:00:00.000+0000,60.0,143.0,0,0,0
2016,4,2,2016-04-05,OO,IDA,KIDA,ID,SLC,KSLC,UT,749,2016-04-05T13:49:00.000+0000,2016-04-05T11:49:00.000+0000,2016-04-05T10:49:00.000+0000,2016-04-05T11:00:00.000+0000,2016-04-05T10:00:00.000+0000,63.0,188.0,0,0,0
2015,9,5,2015-09-04,EV,IAH,KIAH,TX,STL,KSTL,MO,1254,2015-09-04T17:54:00.000+0000,2015-09-04T15:54:00.000+0000,2015-09-04T14:54:00.000+0000,2015-09-04T15:00:00.000+0000,2015-09-04T14:00:00.000+0000,126.0,667.0,0,0,0
2015,9,2,2015-09-08,WN,PBI,KPBI,FL,BWI,KBWI,MD,1355,2015-09-08T17:55:00.000+0000,2015-09-08T15:55:00.000+0000,2015-09-08T14:55:00.000+0000,2015-09-08T15:00:00.000+0000,2015-09-08T14:00:00.000+0000,140.0,883.0,0,0,0
2015,9,2,2015-09-08,WN,SAT,KSAT,TX,LAS,KLAS,NV,550,2015-09-08T10:50:00.000+0000,2015-09-08T08:50:00.000+0000,2015-09-08T07:50:00.000+0000,2015-09-08T08:00:00.000+0000,2015-09-08T07:00:00.000+0000,170.0,1069.0,0,0,0


DATE,DATE_HR,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,CALL_SIGN,WND_SPEED,WND_SPEED_QUAL,CIG_HEIGHT,CIG_QUAL,VIS_DIST,VIS_DIST_QUAL,VIS_VAR,VIS_VAR_QUAL,TEMP,TEMP_QUAL,DEW_TEMP,DEW_TEMP_QUAL,SLPRESS,SLPRESS_QUAL
2017-01-01T00:51:00.000+0000,2017-01-01T00:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,26,5,22000,5,16093,5,N,5,78,5,50,5,10131,5
2017-01-01T01:51:00.000+0000,2017-01-01T01:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,0,5,22000,5,16093,5,N,5,78,5,50,5,10136,5
2017-01-01T02:51:00.000+0000,2017-01-01T02:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,15,5,22000,5,14484,5,N,5,56,5,44,5,10138,5
2017-01-01T03:51:00.000+0000,2017-01-01T03:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,15,5,22000,5,9656,5,N,5,50,5,44,5,10138,5
2017-01-01T04:51:00.000+0000,2017-01-01T04:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,0,5,22000,5,9656,5,N,5,39,5,39,5,10137,5
2017-01-01T05:51:00.000+0000,2017-01-01T05:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,0,5,22000,5,9656,5,N,5,33,5,28,5,10145,5
2017-01-01T06:51:00.000+0000,2017-01-01T06:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,46,5,22000,5,16093,5,N,5,61,5,39,5,10155,5
2017-01-01T07:51:00.000+0000,2017-01-01T07:00:00.000+0000,34.8994,-120.4486,73.8,FM-15,KSMX,15,5,22000,5,16093,5,N,5,67,5,28,5,10151,5
2017-01-01T07:59:00.000+0000,2017-01-01T07:00:00.000+0000,34.8994,-120.4486,73.8,SOD,KSMX,9999,9,99999,9,999999,9,9,9,9999,9,9999,9,99999,9
2017-01-01T07:59:00.000+0000,2017-01-01T07:00:00.000+0000,34.8994,-120.4486,73.8,SOM,KSMX,9999,9,99999,9,999999,9,9,9,9999,9,9999,9,99999,9


# A) Add Unique ID for Flights & Weather Dataset

In [0]:
# add Flight ID column
air = air_full.coalesce(1)                                   
air = air.withColumn("ID",f.monotonically_increasing_id())   #adds unique ID column

# add Weather READING_ID column, rename CALL_SIGN to AIRPORT
weather = w.coalesce(1)                                                     #Not 100% what this does yet
weather = weather.withColumn("READING_ID",f.monotonically_increasing_id())    #adds unique ID column
weather = weather.withColumn('AIRPORT', w.CALL_SIGN)

#Store Data
file_to_store = air                          
filename = "air"                      
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

file_to_store = weather                          
filename = "weather"                      
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#Read Data
filename = "air"                      
air = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# air.display()
# print("air Shape:", air.count(), len(air.columns))

filename = "weather"                     
weather = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# weather.display()
# print("weather Shape:", weather.count(), len(weather.columns))

# B, C, D) Join Airline & Weather on ORIGIN & Time Bucket 2 & 3 hours Prior to Departure and Union
B) Join Airline & Weather on ORIGIN & Time Bucket 2 Hours Prior to Departure   
C) Join Airline & Weather on ORIGIN & Time Bucket 3 Hours Prior to Departure   
D) Union Step B and C Tables

In [0]:
# register temp tables of airline and weather data
air.registerTempTable("a_tt")
weather.registerTempTable("w_tt")

# join 2 hr data (for origin weather)
join1a = spark.sql("SELECT * FROM a_tt INNER JOIN w_tt ON (a_tt.CRS_DEP_TIME_2HR_HR = w_tt.DATE_HR AND a_tt.origin_icao_code = w_tt.AIRPORT)")
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/join1a", True)      
join1a.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join1a")
join1a = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join1a/part-00*.parquet")
join1a.registerTempTable("join1a")

# join 3 hr data (for origin weather)
join2a=spark.sql("SELECT * FROM a_tt INNER JOIN w_tt ON (a_tt.CRS_DEP_TIME_3HR_HR = w_tt.DATE_HR AND a_tt.origin_icao_code = w_tt.AIRPORT)")
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/join2a", True)      
join2a.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join2a")
join2a = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join2a/part-00*.parquet")
join2a.registerTempTable("join2a")

# union the 2hr and 3 hr data joins (for origin weather)
joined_origin = spark.sql("SELECT * from join1a UNION SELECT * FROM join2a")

#Store Data
file_to_store = joined_origin                          
filename = "joined_origin"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#Read Data
filename = "joined_origin"                      
joined_origin = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_origin.display()
# print("joined_origin Shape:", joined_origin.count(), len(joined_origin.columns))

# E) Filter Weather Readings later than 2 hours prior   
Filter Data to Only Weather Readings that Occur Prior to the "2 hours prior departure time"
(in other words, get rid of weather readings that occur after 2 hours prior to departure time)

In [0]:
# get rid of weather readings that occur after 2 hours prior to departure

#Origin Data
joined_origin.registerTempTable('j_o')
joined_filtered_origin = spark.sql("SELECT * from j_o WHERE CAST(CRS_DEP_TIME_2HR AS timestamp) > CAST(DATE AS timestamp)")

#Store Data
file_to_store = joined_filtered_origin                          
filename = "joined_filtered_origin"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#Read Data
filename = "joined_filtered_origin"                      
joined_filtered_origin = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_filtered_origin.display()
# print("joined_filtered_origin Shape:", joined_filtered_origin.count(), len(joined_filtered_origin.columns))

# F) Keep latest remaining Weather Readings   

Select maximum (latest) time weather data to keep only. Final Result = 1 weather data for each flight data.

In [0]:
#Find the latest weather record for each flight. This dataframe = Flight ID and Weather Time

# ORIGIN WEATHER
ID_maxdate_o = joined_filtered_origin.groupBy("ID").agg(max_('DATE'))
ID_maxdate_rename_o = ID_maxdate_o.withColumnRenamed("ID", "flight_ID") #Rename Columns
ID_maxdate_rename_o = ID_maxdate_rename_o.withColumnRenamed("max(DATE)", "weather_datetime") #Rename Columns

#SAVING Spark Dataframe to Shared Directory
file_to_store = ID_maxdate_rename_o                       
filename = "ID_maxdate_rename_o"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#READING PARQUET File from Shared Directory
filename = "ID_maxdate_rename_o"                      
ID_maxdate_rename_o = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# ID_maxdate_rename_o.display()

In [0]:
#Use previous table to join with big Airlines/Weather Table. Inner Join gets rid of unnecessary weather data.

#ORIGIN WEATHER
#Register Temp Tables for Joining
ID_maxdate_rename_o.registerTempTable("ID_maxdate_sql_o")
joined_filtered_origin.registerTempTable("joined_filtered_sql_o")

joined_filtered_agg_o = spark.sql("""SELECT * 
                                FROM ID_maxdate_sql_o t1
                                INNER JOIN joined_filtered_sql_o t2
                                ON (t1.flight_ID = t2.ID AND t1.weather_datetime = t2.DATE)
                                """)

#SAVING Spark Dataframe to Shared Directory
file_to_store = joined_filtered_agg_o                       
filename = "joined_filtered_agg_o"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#READING PARQUET File from Shared Directory
filename = "joined_filtered_agg_o"                      
joined_filtered_agg_o = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_filtered_agg_o.display()
# print(joined_filtered_agg_o.count(), len(joined_filtered_agg_o.columns))

# G, H, I) Join Airline & Weather on DEST & Time Bucket 2 & 3 Hours Prior to Departure and Union
G) Join Airline & Weather on DEST & Time Bucket 2 Hours Prior to Departure   
H) Join Airline & Weather on DEST & Time Bucket 3 Hours Prior to Departure   
I) Union Step G and H Tables

In [0]:
# register temp tables of airline and weather data
air.registerTempTable("a_tt")
weather.registerTempTable("w_tt")

# join 2 hr data (for dest weather)
join1b=spark.sql("""SELECT a_tt.*, w_tt.DATE as DEST_DATE, w_tt.DATE_HR as DEST_DATE_HR, w_tt.LATITUDE as DEST_LATITUDE, w_tt.LONGITUDE as DEST_LONGITUDE, w_tt.ELEVATION as DEST_ELEVATION, 
                    w_tt.REPORT_TYPE as DEST_REPORT_TYPE, w_tt.CALL_SIGN as DEST_CALL_SIGN, w_tt.WND_SPEED as DEST_WND_SPEED, w_tt.WND_SPEED_QUAL as DEST_WND_SPEED_QUAL, 
                    w_tt.CIG_HEIGHT as DEST_CIG_HEIGHT, w_tt.CIG_QUAL as DEST_CIG_QUAL, w_tt.VIS_DIST as DEST_VIS_DIST, w_tt.VIS_DIST_QUAL as DEST_VIS_DIST_QUAL, w_tt.VIS_VAR as DEST_VIS_VAR, 
                    w_tt.VIS_VAR_QUAL as DEST_VIS_VAR_QUAL, w_tt.TEMP as DEST_TEMP, w_tt.TEMP_QUAL as DEST_TEMP_QUAL, w_tt.DEW_TEMP as DEST_DEW_TEMP, w_tt.DEW_TEMP_QUAL as DEST_DEW_TEMP_QUAL, 
                    w_tt.SLPRESS as DEST_SLPRESS, w_tt.SLPRESS_QUAL as DEST_SLPRESS_QUAL, w_tt.READING_ID as DEST_READING_ID, w_tt.AIRPORT as DEST_AIRPORT
                    FROM a_tt INNER JOIN w_tt 
                    ON (a_tt.CRS_DEP_TIME_2HR_HR = w_tt.DATE_HR AND a_tt.dest_icao_code = w_tt.AIRPORT)
                    """)
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/join1b", True)      
join1b.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join1b")
join1b = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join1b/part-00*.parquet")
join1b.registerTempTable("join1b")

# join 3 hr data (for dest weather)
join2b=spark.sql("""SELECT a_tt.*, w_tt.DATE as DEST_DATE, w_tt.DATE_HR as DEST_DATE_HR, w_tt.LATITUDE as DEST_LATITUDE, w_tt.LONGITUDE as DEST_LONGITUDE, w_tt.ELEVATION as DEST_ELEVATION, 
                    w_tt.REPORT_TYPE as DEST_REPORT_TYPE, w_tt.CALL_SIGN as DEST_CALL_SIGN, w_tt.WND_SPEED as DEST_WND_SPEED, w_tt.WND_SPEED_QUAL as DEST_WND_SPEED_QUAL, 
                    w_tt.CIG_HEIGHT as DEST_CIG_HEIGHT, w_tt.CIG_QUAL as DEST_CIG_QUAL, w_tt.VIS_DIST as DEST_VIS_DIST, w_tt.VIS_DIST_QUAL as DEST_VIS_DIST_QUAL, w_tt.VIS_VAR as DEST_VIS_VAR, 
                    w_tt.VIS_VAR_QUAL as DEST_VIS_VAR_QUAL, w_tt.TEMP as DEST_TEMP, w_tt.TEMP_QUAL as DEST_TEMP_QUAL, w_tt.DEW_TEMP as DEST_DEW_TEMP, w_tt.DEW_TEMP_QUAL as DEST_DEW_TEMP_QUAL, 
                    w_tt.SLPRESS as DEST_SLPRESS, w_tt.SLPRESS_QUAL as DEST_SLPRESS_QUAL, w_tt.READING_ID as DEST_READING_ID, w_tt.AIRPORT as DEST_AIRPORT 
                    FROM a_tt INNER JOIN w_tt 
                    ON (a_tt.CRS_DEP_TIME_3HR_HR = w_tt.DATE_HR AND a_tt.dest_icao_code = w_tt.AIRPORT)
          """)
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/join2b", True)      
join2b.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join2b")
join2b = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/join2b/part-00*.parquet")
join2b.registerTempTable("join2b")

# union the 2hr and 3 hr data joins (for dest weather)
joined_dest   = spark.sql("SELECT * from join1b UNION SELECT * FROM join2b")

#Store Data
file_to_store = joined_dest                          
filename = "joined_dest"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#Read Data
filename = "joined_dest"                      
joined_dest = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_dest.display()
# print("joined_dest Shape:", joined_dest.count(), len(joined_dest.columns))

# J) Filter Weather Readings later than 2 hours prior   
Filter Data to Only Weather Readings that Occur Prior to the "2 hours prior departure time"
(in other words, get rid of weather readings that occur after 2 hours prior to departure time)

In [0]:
# get rid of weather readings that occur after 2 hours prior to departure

#Destination Data
joined_dest.registerTempTable('j_d')
joined_filtered_dest = spark.sql("SELECT * from j_d WHERE CAST(CRS_DEP_TIME_2HR AS timestamp) > CAST(DEST_DATE AS timestamp)")

#Store Data
file_to_store = joined_filtered_dest                          
filename = "joined_filtered_dest"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#Read Data
filename = "joined_filtered_dest"                      
joined_filtered_dest = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_filtered_dest.display()
# print("joined_filtered_dest Shape:", joined_filtered_dest.count(), len(joined_filtered_dest.columns))

# K) Keep latest remaining Weather Readings   

Select maximum (latest) time weather data to keep only. Final Result = 1 weather data for each flight data.

In [0]:
#Find the latest weather record for each flight. This dataframe = Flight ID and Weather Time

# DEST WEATHER
ID_maxdate_d = joined_filtered_dest.groupBy("ID").agg(max_('DEST_DATE'))
ID_maxdate_rename_d = ID_maxdate_d.withColumnRenamed("ID", "flight_ID") #Rename Columns
ID_maxdate_rename_d = ID_maxdate_rename_d.withColumnRenamed("max(DEST_DATE)", "weather_datetime") #Rename Columns

#SAVING Spark Dataframe to Shared Directory
file_to_store = ID_maxdate_rename_d                       
filename = "ID_maxdate_rename_d"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#READING PARQUET File from Shared Directory
filename = "ID_maxdate_rename_d"                      
ID_maxdate_rename_d = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# ID_maxdate_rename_d.display()

In [0]:
#Use previous table to join with big Airlines/Weather Table. Inner Join gets rid of unnecessary weather data.

#DEST WEATHER
#Register Temp Tables for Joining
ID_maxdate_rename_d.registerTempTable("ID_maxdate_sql_d")
joined_filtered_dest.registerTempTable("joined_filtered_sql_d")

joined_filtered_agg_d = spark.sql("""SELECT * 
                                FROM ID_maxdate_sql_d t1
                                INNER JOIN joined_filtered_sql_d t2
                                ON (t1.flight_ID = t2.ID AND t1.weather_datetime = t2.DEST_DATE)
                                """)

#SAVING Spark Dataframe to Shared Directory
file_to_store = joined_filtered_agg_d                       
filename = "joined_filtered_agg_d"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#READING PARQUET File from Shared Directory
filename = "joined_filtered_agg_d"                      
joined_filtered_agg_d = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_filtered_agg_d.display()
# print(joined_filtered_agg_d.count(), len(joined_filtered_agg_d.columns))


# L) Join ORIGIN and DEST tables on flights  (result = 1 table of flights, origin weather, and destination weather)     
Do an inner join using keys for all flights data table columns.

In [0]:
#Register Temp Tables
joined_filtered_agg_o.registerTempTable("joined_filtered_agg_o_tt")
joined_filtered_agg_d.registerTempTable("joined_filtered_agg_d_tt")

#Join Origin and Dest Flight/Weather tables on Flight data
joined_filtered_agg = spark.sql("""SELECT t1.*, t2.DEST_DATE, t2.DEST_DATE_HR, t2.DEST_LATITUDE, t2.DEST_LONGITUDE, t2.DEST_ELEVATION, 
                     t2.DEST_REPORT_TYPE, t2.DEST_CALL_SIGN, t2.DEST_WND_SPEED, t2.DEST_WND_SPEED_QUAL, 
                     t2.DEST_CIG_HEIGHT, t2.DEST_CIG_QUAL, t2.DEST_VIS_DIST, t2.DEST_VIS_DIST_QUAL, t2.DEST_VIS_VAR, 
                     t2.DEST_VIS_VAR_QUAL, t2.DEST_TEMP, t2.DEST_TEMP_QUAL, t2.DEST_DEW_TEMP, t2.DEST_DEW_TEMP_QUAL, 
                     t2.DEST_SLPRESS, t2.DEST_SLPRESS_QUAL, t2.DEST_READING_ID, t2.DEST_AIRPORT 
                     
                                FROM joined_filtered_agg_o_tt t1
                                INNER JOIN joined_filtered_agg_d_tt t2
                                ON (t1.YEAR = t2.YEAR AND
                                    t1.MONTH = t2.MONTH AND
                                    t1.FL_DATE = t2.FL_DATE AND
                                    t1.OP_UNIQUE_CARRIER = t2.OP_UNIQUE_CARRIER AND
                                    t1.ORIGIN = t2.ORIGIN AND
                                    t1.DEST = t2.DEST AND
                                    t1.CRS_DEP_TIME = t2.CRS_DEP_TIME)
                                """)

#SAVING Spark Dataframe to Shared Directory
file_to_store = joined_filtered_agg                       
filename = "joined_filtered_agg"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#READING PARQUET File from Shared Directory
filename = "joined_filtered_agg"                      
joined_filtered_agg = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_filtered_agg.display()
# print(joined_filtered_agg.count(), len(joined_filtered_agg.columns))

## Error Checking

In [0]:
#Check that joined_filtered_agg does not have any duplicate Flight ID (these two counts should match)
print("joined_filtered_agg TOTAL ID Count:",joined_filtered_agg.select("ID").count())
print("joined_filtered_agg DISTINCT ID Count:",joined_filtered_agg.select("ID").distinct().count())

#joined_filtered_agg_o.where("flight_ID = '1222981'").display()
#There are duplicates because some weather report type is SOD and SOM but everything else is the same. Will be accounted for in Train/Test Split notebooks

#Check Count of Each Table

#Raw Data
print("air_full count:", air_full.count())
print("w count:", w.count())

#Join 1, 2 and Union Dataset Count
print("Join 1a Count:", join1a.count() )
print("Join 1b Count:", join1b.count() )
print("Join 2a Count:", join2a.count() )
print("Join 2b Count:", join2b.count() )
print("Unioned Count:", joined_origin.count())
print("Unioned Count:", joined_dest.count())

#Filtered to Weather Data Prior to Dept. Time
print("Filtered to Prior Weather Data Count:", joined_filtered_origin.count())
print("Filtered to Prior Weather Data Count:", joined_filtered_dest.count())

#Filtered to only most recent flights
print("Filtered to only most recent flight:", joined_filtered_agg_o.count())
print("Filtered to only most recent flight:", joined_filtered_agg_d.count())

#Selecting Max of Prior Weather Data  
print("Final Join Count:", joined_filtered_agg.count())

#2290 data missing from 3month flight

# M, N) Add Busyness of Day  

Steps:   
1) Create helper table grouped by FL_Date and count.  
2) Join helper table to big table on FL_Date  

**NOTE**  
Although we are using test (2019) data for this process, this is VALID because in real life, we are assuming that the full year's worth of flights would be scheduled. Therefore, we are able to do a groupby and count how many scheduled flights there are for each day of the year.

In [0]:
# First step to group by fl_date and count (store as helper table)
busyness = joined_filtered_agg.groupBy("FL_DATE").count()
busyness = busyness.withColumnRenamed("FL_DATE", "flight_date") 
busyness = busyness.withColumnRenamed("count", "flights_per_day") 
# print(busyness.columns)

In [0]:
# Second step to join helper table with flight table.
busyness.registerTempTable("busyness_tt")
joined_filtered_agg.registerTempTable("joined_filtered_agg_tt")

joined_filtered_agg_busy = spark.sql("""SELECT t1.*, t2.flights_per_day
                                FROM joined_filtered_agg_tt t1
                                INNER JOIN busyness_tt t2
                                ON (t1.FL_DATE = t2.flight_date)
                                """)
#SAVING Spark Dataframe to Shared Directory
file_to_store = joined_filtered_agg_busy                       
filename = "joined_filtered_agg_busy"                     
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + filename)

In [0]:
#READING PARQUET File from Shared Directory
filename = "joined_filtered_agg_busy"                      
joined_filtered_agg_busy = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
# joined_filtered_agg_busy.display()
# print(joined_filtered_agg_busy.columns)
# print(joined_filtered_agg_busy.count(), len(joined_filtered_agg_busy.columns) )

# O) Remove irrelevant columns

In [0]:
#Drop Non-Feature Columns

#Saved Features: ['flight_ID', 'weather_datetime', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'FL_DATE', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'ORIGIN_ICAO_CODE', 'ORIGIN_STATE_ABR', 'DEST', 'DEST_ICAO_CODE', 'DEST_STATE_ABR', 'CRS_DEP_TIME', 'CRS_DEP_TIME_UTC', 'CRS_DEP_TIME_2HR', 'CRS_DEP_TIME_3HR', 'CRS_DEP_TIME_2HR_HR', 'CRS_DEP_TIME_3HR_HR', 'CRS_ELAPSED_TIME', 'DISTANCE', 'FIRST_DEP', 'PREVIOUS_DELAY', 'DELAY', 'ID', 'DATE', 'DATE_HR', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'REPORT_TYPE', 'CALL_SIGN', 'WND_SPEED', 'WND_SPEED_QUAL', 'CIG_HEIGHT', 'CIG_QUAL', 'VIS_DIST', 'VIS_DIST_QUAL', 'VIS_VAR', 'VIS_VAR_QUAL', 'TEMP', 'TEMP_QUAL', 'DEW_TEMP', 'DEW_TEMP_QUAL', 'SLPRESS', 'SLPRESS_QUAL', 'READING_ID', 'AIRPORT', 'DEST_DATE', 'DEST_DATE_HR', 'DEST_LATITUDE', 'DEST_LONGITUDE', 'DEST_ELEVATION', 'DEST_REPORT_TYPE', 'DEST_CALL_SIGN', 'DEST_WND_SPEED', 'DEST_WND_SPEED_QUAL', 'DEST_CIG_HEIGHT', 'DEST_CIG_QUAL', 'DEST_VIS_DIST', 'DEST_VIS_DIST_QUAL', 'DEST_VIS_VAR', 'DEST_VIS_VAR_QUAL', 'DEST_TEMP', 'DEST_TEMP_QUAL', 'DEST_DEW_TEMP', 'DEST_DEW_TEMP_QUAL', 'DEST_SLPRESS', 'DEST_SLPRESS_QUAL', 'DEST_READING_ID', 'DEST_AIRPORT', 'flights_per_day']

#Features to Keep: ['YEAR', 'MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'ORIGIN_STATE_ABR', 'DEST', 'DEST_STATE_ABR', 'CRS_DEP_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE', 'FIRST_DEP', 'PREVIOUS_DELAY', 'DELAY', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'REPORT_TYPE', 'WND_SPEED', 'WND_SPEED_QUAL', 'CIG_HEIGHT', 'CIG_QUAL', 'VIS_DIST', 'VIS_DIST_QUAL', 'VIS_VAR', 'VIS_VAR_QUAL', 'TEMP', 'TEMP_QUAL', 'DEW_TEMP', 'DEW_TEMP_QUAL', 'SLPRESS', 'SLPRESS_QUAL', 'DEST_LATITUDE', 'DEST_LONGITUDE', 'DEST_ELEVATION', 'DEST_REPORT_TYPE', 'DEST_WND_SPEED', 'DEST_WND_SPEED_QUAL', 'DEST_CIG_HEIGHT', 'DEST_CIG_QUAL', 'DEST_VIS_DIST', 'DEST_VIS_DIST_QUAL', 'DEST_VIS_VAR', 'DEST_VIS_VAR_QUAL', 'DEST_TEMP', 'DEST_TEMP_QUAL', 'DEST_DEW_TEMP', 'DEST_DEW_TEMP_QUAL', 'DEST_SLPRESS', 'DEST_SLPRESS_QUAL', 'flights_per_day']

flight_weather_joined = joined_filtered_agg_busy.select('YEAR', 'MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'ORIGIN_STATE_ABR', 'DEST', 'DEST_STATE_ABR', 'CRS_DEP_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE', 'FIRST_DEP', 'PREVIOUS_DELAY', 'DELAY', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'REPORT_TYPE', 'WND_SPEED', 'WND_SPEED_QUAL', 'CIG_HEIGHT', 'CIG_QUAL', 'VIS_DIST', 'VIS_DIST_QUAL', 'VIS_VAR', 'VIS_VAR_QUAL', 'TEMP', 'TEMP_QUAL', 'DEW_TEMP', 'DEW_TEMP_QUAL', 'SLPRESS', 'SLPRESS_QUAL', 'DEST_LATITUDE', 'DEST_LONGITUDE', 'DEST_ELEVATION', 'DEST_REPORT_TYPE', 'DEST_WND_SPEED', 'DEST_WND_SPEED_QUAL', 'DEST_CIG_HEIGHT', 'DEST_CIG_QUAL', 'DEST_VIS_DIST', 'DEST_VIS_DIST_QUAL', 'DEST_VIS_VAR', 'DEST_VIS_VAR_QUAL', 'DEST_TEMP', 'DEST_TEMP_QUAL', 'DEST_DEW_TEMP', 'DEST_DEW_TEMP_QUAL', 'DEST_SLPRESS', 'DEST_SLPRESS_QUAL', 'FLIGHTS_PER_DAY')   #.distinct()

# flight_weather_joined.display()
# print("flight_weather_joined shape:", flight_weather_joined.count(), len(flight_weather_joined.columns))

# FINAL WRITE TO DBFS

In [0]:
#SAVING Spark Dataframe to Shared Directory

#final_filename, see command at top of notebook with markdown "USER CHANGES THIS..." !!!!!!!!!!!!!!!
file_to_store = flight_weather_joined
dbutils.fs.rm("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+final_filename, True)      #remove file if there already is an existing one, be careful with this!!!
file_to_store.write.parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/" + final_filename) 

In [0]:
#READING PARQUET File from Shared Directory
filename = "flight_weather_data"                      
flight_weather_joined = spark.read.option("header", "true").parquet("dbfs:/mnt/mids-w261/team_25/join_data_folder/"+filename+"/part-00*.parquet")
flight_weather_joined.display()
print(flight_weather_joined.count(), len(flight_weather_joined.columns) )

flight_ID,weather_datetime,YEAR,MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,ORIGIN,ORIGIN_ICAO_CODE,ORIGIN_STATE_ABR,DEST,DEST_ICAO_CODE,DEST_STATE_ABR,CRS_DEP_TIME,CRS_DEP_TIME_UTC,CRS_DEP_TIME_2HR,CRS_DEP_TIME_3HR,CRS_DEP_TIME_2HR_HR,CRS_DEP_TIME_3HR_HR,CRS_ELAPSED_TIME,DISTANCE,FIRST_DEP,PREVIOUS_DELAY,DELAY,ID,DATE,DATE_HR,LATITUDE,LONGITUDE,ELEVATION,REPORT_TYPE,CALL_SIGN,WND_SPEED,WND_SPEED_QUAL,CIG_HEIGHT,CIG_QUAL,VIS_DIST,VIS_DIST_QUAL,VIS_VAR,VIS_VAR_QUAL,TEMP,TEMP_QUAL,DEW_TEMP,DEW_TEMP_QUAL,SLPRESS,SLPRESS_QUAL,READING_ID,AIRPORT,DEST_DATE,DEST_DATE_HR,DEST_LATITUDE,DEST_LONGITUDE,DEST_ELEVATION,DEST_REPORT_TYPE,DEST_CALL_SIGN,DEST_WND_SPEED,DEST_WND_SPEED_QUAL,DEST_CIG_HEIGHT,DEST_CIG_QUAL,DEST_VIS_DIST,DEST_VIS_DIST_QUAL,DEST_VIS_VAR,DEST_VIS_VAR_QUAL,DEST_TEMP,DEST_TEMP_QUAL,DEST_DEW_TEMP,DEST_DEW_TEMP_QUAL,DEST_SLPRESS,DEST_SLPRESS_QUAL,DEST_READING_ID,DEST_AIRPORT,flights_per_day
29637399,2015-05-01T12:52:00.000+0000,2015,5,5,2015-05-01,AA,DCA,KDCA,VA,ORD,KORD,IL,1055,2015-05-01T14:55:00.000+0000,2015-05-01T12:55:00.000+0000,2015-05-01T11:55:00.000+0000,2015-05-01T12:00:00.000+0000,2015-05-01T11:00:00.000+0000,127.0,612.0,0,0,0,29637399,2015-05-01T12:52:00.000+0000,2015-05-01T12:00:00.000+0000,38.8472,-77.03454,3.0,FM-15,KDCA,51,5,1280,5,16093,5,N,5,139,5,83,5,10135,5,15585294,KDCA,2015-05-01T12:51:00.000+0000,2015-05-01T12:00:00.000+0000,41.995,-87.9336,201.8,FM-15,KORD,21,5,22000,5,16093,5,N,5,94,5,33,5,10218,5,19777769,KORD,16927
25215947,2015-05-01T08:53:00.000+0000,2015,5,5,2015-05-01,AA,DEN,KDEN,CO,DFW,KDFW,TX,500,2015-05-01T11:00:00.000+0000,2015-05-01T09:00:00.000+0000,2015-05-01T08:00:00.000+0000,2015-05-01T09:00:00.000+0000,2015-05-01T08:00:00.000+0000,119.0,641.0,0,0,0,25215947,2015-05-01T08:53:00.000+0000,2015-05-01T08:00:00.000+0000,39.8328,-104.6575,1650.2,FM-15,KDEN,15,5,22000,5,16093,5,N,5,100,5,89,5,10146,5,17265157,KDEN,2015-05-01T08:53:00.000+0000,2015-05-01T08:00:00.000+0000,32.8978,-97.0189,170.7,FM-15,KDFW,41,5,22000,5,16093,5,N,5,156,5,106,5,10147,5,15017303,KDFW,16927
16680883,2015-05-01T22:53:00.000+0000,2015,5,5,2015-05-01,AA,DFW,KDFW,TX,ATL,KATL,GA,2020,2015-05-02T01:20:00.000+0000,2015-05-01T23:20:00.000+0000,2015-05-01T22:20:00.000+0000,2015-05-01T23:00:00.000+0000,2015-05-01T22:00:00.000+0000,137.0,731.0,1,0,1,16680883,2015-05-01T22:53:00.000+0000,2015-05-01T22:00:00.000+0000,32.8978,-97.0189,170.7,FM-15,KDFW,26,5,22000,5,16093,5,N,5,244,5,122,5,10152,5,15017317,KDFW,2015-05-01T22:52:00.000+0000,2015-05-01T22:00:00.000+0000,33.6301,-84.4418,307.8,FM-15,KATL,57,5,22000,5,16093,5,N,5,206,5,50,5,10150,5,10877104,KATL,16927
18950630,2015-05-01T19:53:00.000+0000,2015,5,5,2015-05-01,AA,DFW,KDFW,TX,LGA,KLGA,NY,1655,2015-05-01T21:55:00.000+0000,2015-05-01T19:55:00.000+0000,2015-05-01T18:55:00.000+0000,2015-05-01T19:00:00.000+0000,2015-05-01T18:00:00.000+0000,211.0,1389.0,0,0,0,18950630,2015-05-01T19:53:00.000+0000,2015-05-01T19:00:00.000+0000,32.8978,-97.0189,170.7,FM-15,KDFW,21,5,22000,5,16093,5,N,5,261,5,106,5,10168,5,15017314,KDFW,2015-05-01T19:51:00.000+0000,2015-05-01T19:00:00.000+0000,40.7792,-73.88,3.4,FM-15,KLGA,51,5,7620,5,16093,5,N,5,139,5,50,5,10137,5,15666888,KLGA,16927
22532875,2015-05-01T17:53:00.000+0000,2015,5,5,2015-05-01,AA,DFW,KDFW,TX,ORD,KORD,IL,1505,2015-05-01T20:05:00.000+0000,2015-05-01T18:05:00.000+0000,2015-05-01T17:05:00.000+0000,2015-05-01T18:00:00.000+0000,2015-05-01T17:00:00.000+0000,148.0,802.0,0,0,0,22532875,2015-05-01T17:53:00.000+0000,2015-05-01T17:00:00.000+0000,32.8978,-97.0189,170.7,FM-15,KDFW,15,5,22000,5,16093,5,N,5,256,5,106,5,10181,5,15017312,KDFW,2015-05-01T17:51:00.000+0000,2015-05-01T17:00:00.000+0000,41.995,-87.9336,201.8,FM-15,KORD,15,5,22000,5,16093,5,N,5,172,5,33,5,10211,5,19777774,KORD,16927
14644437,2015-05-01T20:53:00.000+0000,2015,5,5,2015-05-01,AA,DFW,KDFW,TX,RDU,KRDU,NC,1820,2015-05-01T23:20:00.000+0000,2015-05-01T21:20:00.000+0000,2015-05-01T20:20:00.000+0000,2015-05-01T21:00:00.000+0000,2015-05-01T20:00:00.000+0000,168.0,1061.0,0,0,1,14644437,2015-05-01T20:53:00.000+0000,2015-05-01T20:00:00.000+0000,32.8978,-97.0189,170.7,FM-15,KDFW,0,5,2286,5,16093,5,N,5,261,5,100,5,10162,5,15017315,KDFW,2015-05-01T20:51:00.000+0000,2015-05-01T20:00:00.000+0000,35.8923,-78.7819,126.8,FM-15,KRDU,46,5,1829,5,16093,5,N,5,144,5,111,5,10134,5,9007827,KRDU,16927
3291836,2015-05-01T23:53:00.000+0000,2015,5,5,2015-05-01,AA,DFW,KDFW,TX,TUS,KTUS,AZ,2145,2015-05-02T02:45:00.000+0000,2015-05-02T00:45:00.000+0000,2015-05-01T23:45:00.000+0000,2015-05-02T00:00:00.000+0000,2015-05-01T23:00:00.000+0000,144.0,813.0,0,0,0,3291836,2015-05-01T23:53:00.000+0000,2015-05-01T23:00:00.000+0000,32.8978,-97.0189,170.7,FM-15,KDFW,31,5,22000,5,16093,5,N,5,244,5,122,5,10154,5,15017318,KDFW,2015-05-01T23:53:00.000+0000,2015-05-01T23:00:00.000+0000,32.1313,-110.9552,776.9,FM-15,KTUS,21,5,22000,5,16093,5,N,5,356,5,-67,5,10058,5,21393616,KTUS,16927
28719602,2015-05-01T06:51:00.000+0000,2015,5,5,2015-05-01,AA,EWR,KEWR,NJ,MIA,KMIA,FL,550,2015-05-01T09:50:00.000+0000,2015-05-01T07:50:00.000+0000,2015-05-01T06:50:00.000+0000,2015-05-01T07:00:00.000+0000,2015-05-01T06:00:00.000+0000,182.0,1085.0,0,0,0,28719602,2015-05-01T06:51:00.000+0000,2015-05-01T06:00:00.000+0000,40.6825,-74.1694,2.1,FM-15,KEWR,26,5,1829,5,16093,5,N,5,106,5,33,5,10132,5,15656169,KEWR,2015-05-01T06:53:00.000+0000,2015-05-01T06:00:00.000+0000,25.7881,-80.3169,8.8,FM-15,KMIA,36,5,5791,5,16093,5,N,5,211,5,144,5,10119,5,10964598,KMIA,16927
7809889,2015-05-01T18:51:00.000+0000,2015,5,5,2015-05-01,AA,LGA,KLGA,NY,ORD,KORD,IL,1710,2015-05-01T21:10:00.000+0000,2015-05-01T19:10:00.000+0000,2015-05-01T18:10:00.000+0000,2015-05-01T19:00:00.000+0000,2015-05-01T18:00:00.000+0000,168.0,733.0,0,0,0,7809889,2015-05-01T18:51:00.000+0000,2015-05-01T18:00:00.000+0000,40.7792,-73.88,3.4,FM-15,KLGA,31,5,22000,5,16093,5,N,5,150,5,50,5,10142,5,15666887,KLGA,2015-05-01T18:51:00.000+0000,2015-05-01T18:00:00.000+0000,41.995,-87.9336,201.8,FM-15,KORD,21,5,7620,5,16093,5,N,5,183,5,33,5,10206,5,19777775,KORD,16927
22029173,2015-05-01T16:53:00.000+0000,2015,5,5,2015-05-01,AA,MCI,KMCI,MO,DFW,KDFW,TX,1435,2015-05-01T19:35:00.000+0000,2015-05-01T17:35:00.000+0000,2015-05-01T16:35:00.000+0000,2015-05-01T17:00:00.000+0000,2015-05-01T16:00:00.000+0000,99.0,460.0,0,0,0,22029173,2015-05-01T16:53:00.000+0000,2015-05-01T16:00:00.000+0000,39.2972,-94.7306,306.3,FM-15,KMCI,0,5,22000,5,16093,5,N,5,217,5,50,5,10201,5,4463779,KMCI,2015-05-01T16:53:00.000+0000,2015-05-01T16:00:00.000+0000,32.8978,-97.0189,170.7,FM-15,KDFW,21,5,22000,5,16093,5,N,5,244,5,133,5,10185,5,15017311,KDFW,16927
