In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [127]:
from datetime import datetime

def to_minutes(x):
    h,m = x.split(':')
    return int(h)*60 + int(m)

def findDayOfWeek(dateStr):
    date_object = datetime.strptime(dateStr, '%m/%d/%Y')
    dayOfWeek = date_object.strftime('%A')
    return dayOfWeek 

# findDayOfWeek('01/01/2006')

def flightNoStr(x):
    return str(int(x))

def classifyDelay(delay):
    if delay < -5:
        return -1 #flight is early
    elif delay > 5:
        return 1 #flight is delayed
    else:
        return 0 #flight is ontime
# def calculateDepTime()


In [95]:
jetblue_B6_116_dep_time_range = (735, 885) #13:25= 13*60+25 - 70 min buffer or 13:35 = 13*60+35 + 70 minute buffer

In [258]:
import os
# combinned_df = pd.DataFrame()
dataDir = 'data/arrivals/'
fileNames = os.listdir(dataDir)
dfList = []
for file in fileNames:
    filePath = dataDir + file
    print('Processing file {}'.format(filePath))
    raw_df = pd.read_csv(filePath)
    # raw_df.head()

    airline_df = pd.DataFrame()
    #dropping flights other than MCO, ORD, and JFK 
    # to keep additional filters (jetblue['Origin Airport'] == 'MCO') | (jetblue['Origin Airport'] == 'ORD') |
    filtered_data  = raw_df[(raw_df['Origin Airport'] == 'MCO') | (raw_df['Origin Airport'] == 'JFK') | (raw_df['Origin Airport'] == 'ORD')].copy()
    # filtered_data['Origin Airport'].unique()
    airline_df['DATE'] = filtered_data['Date (MM/DD/YYYY)']
    airline_df['DAY'] = [findDayOfWeek(date) for date in filtered_data['Date (MM/DD/YYYY)']]
    
    filtered_data['Flight Number'] = [flightNoStr(flightNo) for flightNo in filtered_data['Flight Number']]
    filtered_data['Flight No'] = filtered_data['Carrier Code'].str.cat(filtered_data['Flight Number'], sep=' ')
    airline_df['FLIGHT NUMBER'] = filtered_data['Flight No']

    airline_df['ORIGIN'] = filtered_data['Origin Airport']
    
    #converting time data to minutes
    filtered_data['Scheduled Arrival Time'] = [to_minutes(s) for s in filtered_data['Scheduled Arrival Time']] #to_minutes(step_1['Scheduled Arrival Time'])
    filtered_data['Actual Arrival Time'] = [to_minutes(s) for s in filtered_data['Actual Arrival Time']]
    filtered_data['Wheels-on Time'] = [to_minutes(s) for s in filtered_data['Wheels-on Time']]

    filtered_data['Scheduled Departure Time'] = filtered_data['Scheduled Arrival Time'] - filtered_data['Scheduled Elapsed Time (Minutes)']
    airline_df['DEPARTURE TIME'] = filtered_data['Scheduled Departure Time']
    airline_df['ARRIVAL TIME'] = filtered_data['Scheduled Arrival Time']
    # jetblue_df
    airline_df = airline_df[airline_df['ARRIVAL TIME']>=0] ##dropping data where flight departed one earlier date
    # filtered_data['Arrival Delay (Minutes)']
    airline_df['ARRIVAL STATUS'] = [classifyDelay(delay) for delay in filtered_data['Arrival Delay (Minutes)']]

    # airline_df
    dfList.append(airline_df)
    
    # break
    # raw_data = pd.read_csv(file)

combined_df=pd.concat(dfList, ignore_index=True)
combined_df.to_csv('data/combined_arrival_data.csv')


Processing file data/arrivals/JetBlue_Airlines.csv
Processing file data/arrivals/Endeavor_Air.csv
Processing file data/arrivals/Southwest_Airlines.csv
Processing file data/arrivals/United_Airlines.csv
Processing file data/arrivals/Skywest_Airline.csv
Processing file data/arrivals/Republic_Airline.csv
Processing file data/arrivals/American_Airlines.csv
Processing file data/arrivals/Delta_Airlines.csv


In [271]:
dates = combined_df['DATE'].unique()
# finalData =  pd.DataFrame()
# MCO_flightList = []
# JFK_flightList = []
# ORD_flightList = []
totalFlightList = []
totalFlights = 0
for date in specific_dates:
    flightList_MCO = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'MCO')]
    sortedList_MCO = flightList_MCO.sort_values(by='DEPARTURE TIME', ascending=True)
    mco_f = sortedList_MCO.shape[0]
    # print('{} flights on date {} from MCO'.format(mco_f, date))
    # sortedList_MCO
    sortedList_MCO['previous_flight_status'] = sortedList_MCO['ARRIVAL STATUS'].shift(periods=1)
    # sortedList_MCO
    totalFlightList.append(sortedList_MCO)

    flightList_JFK = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'JFK')]
    sortedList_JFK = flightList_JFK.sort_values(by='DEPARTURE TIME', ascending=True)
    jfk_f = sortedList_JFK.shape[0]
    # print('{} flights on date {} from JFK'.format(jfk_f, date))
    # sortedList_JFK
    sortedList_JFK['previous_flight_status'] = sortedList_JFK['ARRIVAL STATUS'].shift(periods=1)
    # sortedList_JFK
    totalFlightList.append(sortedList_JFK)

    flightList_ORD = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'ORD')]
    sortedList_ORD = flightList_ORD.sort_values(by='DEPARTURE TIME', ascending=True)
    ord_f = sortedList_ORD.shape[0]
    # print('{} flights on date {} from ORD'.format(ord_f, date))
    # sortedList_ORD
    sortedList_ORD['previous_flight_status'] = sortedList_ORD['ARRIVAL STATUS'].shift(periods=1)
    # sortedList_ORD
    totalFlightList.append(sortedList_ORD)
    flightsOnDay = mco_f + jfk_f + ord_f
    print('{} flights on date {}'.format(flightsOnDay, date))
    totalFlights = totalFlights + flightsOnDay
    # if i==5:
    #     break
    # i+=1
print('{} flights in total {}'.format(totalFlights))
finalData = pd.concat (totalFlightList, ignore_index=True) 
finalData.to_csv('data/combined_data_with_prev_flight_status.csv')
    # finalData = pd.concat([finalData, sortedList], ignore_index=True)
# finalData

5 flights on date 03/01/2006
9 flights on date 03/01/2007
7 flights on date 03/01/2009
7 flights on date 03/01/2010
5 flights on date 03/01/2011
5 flights on date 03/01/2012
6 flights on date 03/01/2013
5 flights on date 03/01/2014
4 flights on date 03/01/2015
5 flights on date 03/01/2016
6 flights on date 03/01/2017
11 flights on date 03/01/2018
9 flights on date 03/01/2019
7 flights on date 03/01/2022
9 flights on date 03/01/2023
5 flights on date 03/02/2006
9 flights on date 03/02/2007
7 flights on date 03/02/2009
7 flights on date 03/02/2010
5 flights on date 03/02/2011
5 flights on date 03/02/2012
5 flights on date 03/02/2013
4 flights on date 03/02/2014
4 flights on date 03/02/2015
4 flights on date 03/02/2016
6 flights on date 03/02/2017
11 flights on date 03/02/2018
9 flights on date 03/02/2019
6 flights on date 03/02/2022
10 flights on date 03/02/2023
5 flights on date 03/03/2006
8 flights on date 03/03/2007
7 flights on date 03/03/2009
7 flights on date 03/03/2010
5 flights o

IndexError: Replacement index 1 out of range for positional args tuple

In [None]:
#Processing data for Delta airlines


In [16]:
#Data processing pipeline
dest = 'SYR'
filters = {}
filters['Destination Airport'] = 'SYR'

dep_JFK_jb = pd.read_csv('data/departures/JFK_jetblue.csv')
step_1  = dep_JFK_jb[dep_JFK_jb['Destination Airport'] == dest]
step_2 = step_1[ (step_1['Flight Number'] == 2516) | (step_1['Flight Number'] == 116)]
step_2
dep_JFK_del = pd.read_csv('data/departures/JFK_delta.csv')




Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
926,B6,03/01/2014,116.00,N354JB,SYR,09:18,09:15,71.00,95.00,-3.00,10:06,51.00,0.00,0.00,21.00,0.00,0.00
1032,B6,03/01/2015,116.00,N353JB,SYR,09:37,09:36,80.00,68.00,-1.00,09:51,15.00,0.00,0.00,0.00,0.00,0.00
1153,B6,03/01/2016,116.00,N328JB,SYR,09:15,09:11,74.00,90.00,-4.00,09:46,35.00,0.00,0.00,0.00,0.00,0.00
1272,B6,03/01/2017,116.00,N266JB,SYR,12:59,00:00,75.00,0.00,0.00,00:00,0.00,0.00,0.00,0.00,0.00,0.00
1386,B6,03/01/2018,116.00,N281JB,SYR,07:18,07:40,71.00,74.00,22.00,08:03,23.00,22.00,0.00,3.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172051,B6,05/31/2019,116.00,N239JB,SYR,10:35,10:27,79.00,72.00,-8.00,10:49,22.00,0.00,0.00,0.00,0.00,0.00
172151,B6,05/31/2019,2516.00,N306JB,SYR,17:00,16:54,97.00,67.00,-6.00,17:13,19.00,0.00,0.00,0.00,0.00,0.00
172280,B6,05/31/2022,2516.00,N355JB,SYR,21:45,22:20,82.00,79.00,35.00,22:53,33.00,32.00,0.00,0.00,0.00,0.00
172298,B6,05/31/2023,116.00,N329JB,SYR,13:30,13:28,72.00,71.00,-2.00,13:54,26.00,0.00,0.00,0.00,0.00,0.00
