In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [55]:
from datetime import datetime

def to_minutes(x):
    h,m = x.split(':')
    return int(h)*60 + int(m)

def findDayOfWeek(dateStr):
    date_object = datetime.strptime(dateStr, '%m/%d/%Y')
    dayOfWeek = date_object.strftime('%A')
    return dayOfWeek 

# findDayOfWeek('01/01/2006')

def flightNoStr(x):
    return str(int(x))

def classifyDelay(delay):
    if delay < -5:
        return 0 #flight is early
    elif delay > 5:
        return 1 #flight is delayed
    else:
        return 2 #flight is ontime
# def calculateDepTime()


In [56]:
# jetblue_B6_116_dep_time_range = (735, 885) #13:25= 13*60+25 - 70 min buffer or 13:35 = 13*60+35 + 70 minute buffer
RawDataProcessing = True

In [121]:
import os
# combinned_df = pd.DataFrame()
if RawDataProcessing:
    dataDir = 'data/arrivals/'
    fileNames = os.listdir(dataDir)
    dfList = []
    for file in fileNames:
        filePath = dataDir + file
        print('Processing file {}'.format(filePath))
        raw_df = pd.read_csv(filePath)
        # raw_df.head()
    
        airline_df = pd.DataFrame()
        #dropping flights other than MCO, ORD, and JFK 
        # to keep additional filters (jetblue['Origin Airport'] == 'MCO') | (jetblue['Origin Airport'] == 'ORD') |
        filtered_data  = raw_df[(raw_df['Origin Airport'] == 'MCO') | (raw_df['Origin Airport'] == 'JFK') | (raw_df['Origin Airport'] == 'ORD')].copy()
        # filtered_data['Origin Airport'].unique()
        airline_df['DATE'] = filtered_data['Date (MM/DD/YYYY)']
        airline_df['DAY'] = [findDayOfWeek(date) for date in filtered_data['Date (MM/DD/YYYY)']]
        
        filtered_data['Flight Number'] = [flightNoStr(flightNo) for flightNo in filtered_data['Flight Number']]
        filtered_data['Flight No'] = filtered_data['Carrier Code'].str.cat(filtered_data['Flight Number'], sep=' ')
        airline_df['FLIGHT NUMBER'] = filtered_data['Flight No']
    
        airline_df['ORIGIN'] = filtered_data['Origin Airport']
        
        #converting time data to minutes
        filtered_data['Scheduled Arrival Time'] = [to_minutes(s) for s in filtered_data['Scheduled Arrival Time']] #to_minutes(step_1['Scheduled Arrival Time'])
        filtered_data['Actual Arrival Time'] = [to_minutes(s) for s in filtered_data['Actual Arrival Time']]
        filtered_data['Wheels-on Time'] = [to_minutes(s) for s in filtered_data['Wheels-on Time']]
    
        filtered_data['Scheduled Departure Time'] = filtered_data['Scheduled Arrival Time'] - filtered_data['Scheduled Elapsed Time (Minutes)']
        airline_df['DEPARTURE TIME'] = filtered_data['Scheduled Departure Time']
        airline_df['ARRIVAL TIME'] = filtered_data['Scheduled Arrival Time']
        # jetblue_df
        airline_df = airline_df[airline_df['ARRIVAL TIME']>=0] ##dropping data where flight departed one earlier date
        # filtered_data['Arrival Delay (Minutes)']
        airline_df['ARRIVAL STATUS'] = [classifyDelay(delay) for delay in filtered_data['Arrival Delay (Minutes)']]
    
        # airline_df
        dfList.append(airline_df)
        
        # break
        # raw_data = pd.read_csv(file)
    combined_df=pd.concat(dfList, ignore_index=True)
    combined_df['DEPARTURE TIME'] = combined_df['DEPARTURE TIME'].astype(int)
    combined_df.to_csv('data/combined_arrival_data.csv', index=False)


Processing file data/arrivals/JetBlue_Airlines.csv
Processing file data/arrivals/Endeavor_Air.csv
Processing file data/arrivals/Southwest_Airlines.csv
Processing file data/arrivals/United_Airlines.csv
Processing file data/arrivals/Skywest_Airline.csv
Processing file data/arrivals/Republic_Airline.csv
Processing file data/arrivals/American_Airlines.csv
Processing file data/arrivals/Delta_Airlines.csv


In [122]:
if RawDataProcessing:
    dates = combined_df['DATE'].unique()
    # finalData =  pd.DataFrame()
    # MCO_flightList = []
    # JFK_flightList = []
    # ORD_flightList = []
    totalFlightList = []
    totalFlights = 0
    for date in dates:
        flightList_MCO = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'MCO')]
        sortedList_MCO = flightList_MCO.sort_values(by='DEPARTURE TIME', ascending=True)
        mco_f = sortedList_MCO.shape[0]
        # print('{} flights on date {} from MCO'.format(mco_f, date))
        # sortedList_MCO
        sortedList_MCO['previous_flight_status'] = sortedList_MCO['ARRIVAL STATUS'].shift(periods=1)
        # sortedList_MCO
        totalFlightList.append(sortedList_MCO)
    
        flightList_JFK = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'JFK')]
        sortedList_JFK = flightList_JFK.sort_values(by='DEPARTURE TIME', ascending=True)
        jfk_f = sortedList_JFK.shape[0]
        # print('{} flights on date {} from JFK'.format(jfk_f, date))
        # sortedList_JFK
        sortedList_JFK['previous_flight_status'] = sortedList_JFK['ARRIVAL STATUS'].shift(periods=1)
        # sortedList_JFK
        totalFlightList.append(sortedList_JFK)
    
        flightList_ORD = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'ORD')]
        sortedList_ORD = flightList_ORD.sort_values(by='DEPARTURE TIME', ascending=True)
        ord_f = sortedList_ORD.shape[0]
        # print('{} flights on date {} from ORD'.format(ord_f, date))
        # sortedList_ORD
        sortedList_ORD['previous_flight_status'] = sortedList_ORD['ARRIVAL STATUS'].shift(periods=1)
        # sortedList_ORD
        totalFlightList.append(sortedList_ORD)
        flightsOnDay = mco_f + jfk_f + ord_f
        # print('{} flights on date {}'.format(flightsOnDay, date))
        totalFlights = totalFlights + flightsOnDay
        # if i==5:
        #     break
        # i+=1
    print('{} flights in total'.format(totalFlights))
    finalData = pd.concat (totalFlightList, ignore_index=True) 
    finalData.fillna(0, inplace=True)
    finalData['DEPARTURE TIME'] = finalData['DEPARTURE TIME'].astype(int)
    finalData['previous_flight_status'] = finalData['previous_flight_status'].astype(int)
    finalData.to_csv('data/combined_data_with_prev_flight_status.csv', index=False)
        # finalData = pd.concat([finalData, sortedList], ignore_index=True)
    # finalData

9336 flights in total


In [299]:
data_1 = pd.read_csv('data/combined_arrival_data.csv')

# flight_data.columns
# flight_data['FLIGHT NUMBER'].unique().shape
data_1.isna().sum()
data_1 = data_1.drop(columns = ['DATE', 'FLIGHT NUMBER'])
data_1.head()

DATE              0
DAY               0
FLIGHT NUMBER     0
ORIGIN            0
DEPARTURE TIME    0
ARRIVAL TIME      0
ARRIVAL STATUS    0
dtype: int64

Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS
0,Wednesday,JFK,1375,1439,2
1,Wednesday,JFK,590,655,2
2,Wednesday,JFK,920,990,1
3,Thursday,JFK,1360,1427,1
4,Thursday,JFK,575,646,1


In [313]:
data_2 = pd.read_csv('data/combined_data_with_prev_flight_status.csv')
# data_2.head()
# flight_data.columns
# flight_data['FLIGHT NUMBER'].unique().shape
data_2.isna().sum()
data_2 = data_2.drop(columns = ['DATE', 'FLIGHT NUMBER'])
data_2.head()

DATE                      0
DAY                       0
FLIGHT NUMBER             0
ORIGIN                    0
DEPARTURE TIME            0
ARRIVAL TIME              0
ARRIVAL STATUS            0
previous_flight_status    0
dtype: int64

Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,previous_flight_status
0,Wednesday,JFK,590,655,2,0
1,Wednesday,JFK,920,990,1,2
2,Wednesday,JFK,1375,1439,2,1
3,Wednesday,ORD,600,706,0,0
4,Wednesday,ORD,1330,1432,0,0


In [301]:
#Building XGBoost model using the finalData
encoded_data_1 = pd.get_dummies(data_1.drop(columns=['ARRIVAL STATUS']), drop_first=True)
encoded_data_1.head()

Unnamed: 0,DEPARTURE TIME,ARRIVAL TIME,DAY_Monday,DAY_Saturday,DAY_Sunday,DAY_Thursday,DAY_Tuesday,DAY_Wednesday,ORIGIN_MCO,ORIGIN_ORD
0,1375,1439,False,False,False,False,False,True,False,False
1,590,655,False,False,False,False,False,True,False,False
2,920,990,False,False,False,False,False,True,False,False
3,1360,1427,False,False,False,True,False,False,False,False
4,575,646,False,False,False,True,False,False,False,False


In [302]:
#Building XGBoost model using the finalData
encoded_data_2 = pd.get_dummies(data_2.drop(columns=['ARRIVAL STATUS']), drop_first=True)
encoded_data_2.head()

Unnamed: 0,DEPARTURE TIME,ARRIVAL TIME,previous_flight_status,DAY_Monday,DAY_Saturday,DAY_Sunday,DAY_Thursday,DAY_Tuesday,DAY_Wednesday,ORIGIN_MCO,ORIGIN_ORD
0,590,655,0,False,False,False,False,False,True,False,False
1,920,990,2,False,False,False,False,False,True,False,False
2,1375,1439,1,False,False,False,False,False,True,False,False
3,600,706,0,False,False,False,False,False,True,False,True
4,1330,1432,0,False,False,False,False,False,True,False,True


In [305]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(encoded_data_1, data_1['ARRIVAL STATUS'], stratify = data_1['ARRIVAL STATUS'], test_size=0.2, random_state=42)
X_train_1.shape
X_test_1.shape

(7468, 10)

(1868, 10)

In [306]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(encoded_data_2, data_2['ARRIVAL STATUS'], stratify = data_2['ARRIVAL STATUS'], test_size=0.2, random_state=42)
X_train.shape
X_test.shape

(7468, 11)

(1868, 11)

In [308]:
from sklearn.preprocessing import StandardScaler
scaler_1 = StandardScaler()
X_train_scaled_1 = pd.DataFrame(scaler_1.fit_transform(X_train_1), columns=X_train_1.columns, index=X_train_1.index)
X_test_scaled_1 = pd.DataFrame(scaler_1.transform(X_test_1), columns=X_test_1.columns, index=X_test_1.index)

scaler_total_1 = StandardScaler()
total_Data_scaled_1 = pd.DataFrame(scaler_total_1.fit_transform(encoded_data_1), columns=encoded_data_1.columns, index=encoded_data_1.index)
total_y_1 = data_1['ARRIVAL STATUS']
# X_test_scaled

In [309]:
# from sklearn.preprocessing import StandardScaler
scaler_2 = StandardScaler()
X_train_scaled_2 = pd.DataFrame(scaler_2.fit_transform(X_train_2), columns=X_train_2.columns, index=X_train_2.index)
X_test_scaled_2 = pd.DataFrame(scaler_2.transform(X_test_2), columns=X_test_2.columns, index=X_test_2.index)

scaler_total_2 = StandardScaler()
total_Data_scaled_2 = pd.DataFrame(scaler_total_2.fit_transform(encoded_data_2), columns=encoded_data_2.columns, index=encoded_data_2.index)
total_y_2 = data_2['ARRIVAL STATUS']
# X_test_scaled

In [116]:
# X_train_scaled.rename({'DEPARTURE TIME':'DEPARTURE_TIME', 'ARRIVAL TIME':'ARRIVAL_TIME'})
# X_test_scaled.rename({'DEPARTURE TIME':'DEPARTURE_TIME', 'ARRIVAL TIME':'ARRIVAL_TIME'})

# dtrain_class = xgb.DMatrix(X_train_scaled, y_train, enable_categorical=True)
# dtest_class = xgb.DMatrix(X_test_scaled, y_test, enable_categorical=True)



In [316]:
def accuracy_score(test_output):
    total_count = len(test_output)
    wrong_count = len(test_output[test_output['Predicted Arrival Status'] != test_output['ARRIVAL STATUS']])
    accuracy = wrong_count/total_count
    return accuracy*100

In [317]:
tree_methods = ['exact', 'hist', 'approx']
tree_depth = [4, 6, 7, 8, 10, 15]
learning_rates = [0.05, 0.1, 0.2, 0.3, 0.4]

best_accuracy_1 = 0
best_params_1 = {}

In [312]:
for method in tree_methods:
    for depth in tree_depth:
        for lr in learning_rates:
            params = {
                "objective": "multi:softmax",
                "tree_method": method,
                "max_depth" : depth,
                "learning_rate" : lr,
                "n_estimators": 200
            }
            xgbModel = xgb.XGBClassifier(**params)
            # xgbModel = GradientBoostingClassifier()
            # xgbModel = GradientBoostingClassifier()
            xgbModel.fit(X_train_scaled_1, y_train_1)
            y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled_1), columns=['Predicted Arrival Status'], index=X_test_scaled_1.index)
            
            test_output = y_pred.merge(y_test_1, left_index=True, right_index=True)
            # test_output.head()
            accuracy = accuracy_score(test_output)
            print('Accuracy of the xgb model is {}'.format(accuracy))
            if accuracy > best_accuracy_1:
                best_accuracy_1 = accuracy
                best_params_1 = params

Accuracy of the xgb model is 0.5


Accuracy of the xgb model is 0.5037473233404711


Accuracy of the xgb model is 0.5112419700214133


Accuracy of the xgb model is 0.5085653104925053


Accuracy of the xgb model is 0.5224839400428265


Accuracy of the xgb model is 0.515524625267666


Accuracy of the xgb model is 0.5230192719486081


Accuracy of the xgb model is 0.5176659528907923


Accuracy of the xgb model is 0.5273019271948608


Accuracy of the xgb model is 0.5374732334047109


Accuracy of the xgb model is 0.5160599571734475


Accuracy of the xgb model is 0.5214132762312634


Accuracy of the xgb model is 0.5278372591006424


Accuracy of the xgb model is 0.5390792291220556


Accuracy of the xgb model is 0.5369379014989293


Accuracy of the xgb model is 0.521948608137045


Accuracy of the xgb model is 0.5230192719486081


Accuracy of the xgb model is 0.5315845824411135


Accuracy of the xgb model is 0.538543897216274


Accuracy of the xgb model is 0.541220556745182


Accuracy of the xgb model is 0.521948608137045


Accuracy of the xgb model is 0.5374732334047109


Accuracy of the xgb model is 0.538543897216274


Accuracy of the xgb model is 0.5433618843683083


Accuracy of the xgb model is 0.5455032119914347


Accuracy of the xgb model is 0.5433618843683083


Accuracy of the xgb model is 0.547644539614561


Accuracy of the xgb model is 0.5524625267665952


Accuracy of the xgb model is 0.5513918629550322


Accuracy of the xgb model is 0.5471092077087795


Accuracy of the xgb model is 0.4994646680942184


Accuracy of the xgb model is 0.5069593147751607


Accuracy of the xgb model is 0.5101713062098501


Accuracy of the xgb model is 0.5176659528907923


Accuracy of the xgb model is 0.5198072805139187


Accuracy of the xgb model is 0.5160599571734475


Accuracy of the xgb model is 0.5198072805139187


Accuracy of the xgb model is 0.5139186295503212


Accuracy of the xgb model is 0.5342612419700214


Accuracy of the xgb model is 0.5321199143468951


Accuracy of the xgb model is 0.5192719486081371


Accuracy of the xgb model is 0.5230192719486081


Accuracy of the xgb model is 0.5294432548179872


Accuracy of the xgb model is 0.5294432548179872


Accuracy of the xgb model is 0.5358672376873662


Accuracy of the xgb model is 0.5273019271948608


Accuracy of the xgb model is 0.5278372591006424


Accuracy of the xgb model is 0.5337259100642399


Accuracy of the xgb model is 0.5337259100642399


Accuracy of the xgb model is 0.5390792291220556


Accuracy of the xgb model is 0.5262312633832976


Accuracy of the xgb model is 0.5374732334047109


Accuracy of the xgb model is 0.538543897216274


Accuracy of the xgb model is 0.5406852248394004


Accuracy of the xgb model is 0.5422912205567452


Accuracy of the xgb model is 0.5438972162740899


Accuracy of the xgb model is 0.5455032119914347


Accuracy of the xgb model is 0.5417558886509636


Accuracy of the xgb model is 0.5497858672376874


Accuracy of the xgb model is 0.5433618843683083


Accuracy of the xgb model is 0.5042826552462527


Accuracy of the xgb model is 0.5021413276231264


Accuracy of the xgb model is 0.5107066381156317


Accuracy of the xgb model is 0.5160599571734475


Accuracy of the xgb model is 0.5235546038543897


Accuracy of the xgb model is 0.5165952890792291


Accuracy of the xgb model is 0.5144539614561028


Accuracy of the xgb model is 0.5235546038543897


Accuracy of the xgb model is 0.5337259100642399


Accuracy of the xgb model is 0.5289079229122056


Accuracy of the xgb model is 0.5171306209850107


Accuracy of the xgb model is 0.5192719486081371


Accuracy of the xgb model is 0.5337259100642399


Accuracy of the xgb model is 0.538543897216274


Accuracy of the xgb model is 0.5358672376873662


Accuracy of the xgb model is 0.5230192719486081


Accuracy of the xgb model is 0.5273019271948608


Accuracy of the xgb model is 0.5374732334047109


Accuracy of the xgb model is 0.5369379014989293


Accuracy of the xgb model is 0.5353319057815846


Accuracy of the xgb model is 0.525695931477516


Accuracy of the xgb model is 0.5305139186295503


Accuracy of the xgb model is 0.538543897216274


Accuracy of the xgb model is 0.5406852248394004


Accuracy of the xgb model is 0.5455032119914347


Accuracy of the xgb model is 0.5374732334047109


Accuracy of the xgb model is 0.5487152034261242


Accuracy of the xgb model is 0.5487152034261242


Accuracy of the xgb model is 0.5481798715203426


Accuracy of the xgb model is 0.5433618843683083


In [314]:
print('Best Accuracy using xgb model is {}'.format(best_accuracy_1))
print('Best Parameter using xgb model is {}'.format(best_params_1))

Best Accuracy using xgb model is 0.5524625267665952
Best Parameter using xgb model is {'objective': 'multi:softmax', 'tree_method': 'exact', 'max_depth': 15, 'learning_rate': 0.2, 'n_estimators': 200}


In [318]:
best_accuracy_2 = 0
best_params_2 = {}

In [319]:
for method in tree_methods:
    for depth in tree_depth:
        for lr in learning_rates:
            params = {
                "objective": "multi:softmax",
                "tree_method": method,
                "max_depth" : depth,
                "learning_rate" : lr,
                "n_estimators": 200
            }
            xgbModel = xgb.XGBClassifier(**params)
            # xgbModel = GradientBoostingClassifier()
            # xgbModel = GradientBoostingClassifier()
            xgbModel.fit(X_train_scaled_2, y_train_2)
            y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled_2), columns=['Predicted Arrival Status'], index=X_test_scaled_2.index)
            
            test_output = y_pred.merge(y_test_2, left_index=True, right_index=True)
            # test_output.head()
            accuracy = accuracy_score(test_output)
            print('Accuracy of the xgb model is {}'.format(accuracy))
            if accuracy > best_accuracy_2:
                best_accuracy_2 = accuracy
                best_params_2 = params

Accuracy of the xgb model is 50.48179871520343


Accuracy of the xgb model is 50.48179871520343


Accuracy of the xgb model is 50.69593147751606


Accuracy of the xgb model is 50.85653104925053


Accuracy of the xgb model is 51.445396145610275


Accuracy of the xgb model is 50.53533190578159


Accuracy of the xgb model is 51.5524625267666


Accuracy of the xgb model is 52.35546038543897


Accuracy of the xgb model is 53.4796573875803


Accuracy of the xgb model is 54.01498929336188


Accuracy of the xgb model is 51.65952890792291


Accuracy of the xgb model is 52.408993576017124


Accuracy of the xgb model is 54.28265524625267


Accuracy of the xgb model is 54.17558886509636


Accuracy of the xgb model is 55.83511777301927


Accuracy of the xgb model is 51.2847965738758


Accuracy of the xgb model is 52.890792291220556


Accuracy of the xgb model is 54.603854389721626


Accuracy of the xgb model is 55.406852248394


Accuracy of the xgb model is 55.6745182012848


Accuracy of the xgb model is 52.944325481798714


Accuracy of the xgb model is 54.33618843683084


Accuracy of the xgb model is 55.139186295503215


Accuracy of the xgb model is 56.04925053533191


Accuracy of the xgb model is 56.3169164882227


Accuracy of the xgb model is 56.20985010706639


Accuracy of the xgb model is 56.37044967880086


Accuracy of the xgb model is 56.477516059957175


Accuracy of the xgb model is 56.74518201284796


Accuracy of the xgb model is 57.22698072805139


Accuracy of the xgb model is 50.37473233404711


Accuracy of the xgb model is 50.74946466809421


Accuracy of the xgb model is 51.070663811563165


Accuracy of the xgb model is 51.49892933618844


Accuracy of the xgb model is 52.30192719486081


Accuracy of the xgb model is 50.58886509635975


Accuracy of the xgb model is 51.01713062098501


Accuracy of the xgb model is 53.265524625267666


Accuracy of the xgb model is 54.17558886509636


Accuracy of the xgb model is 54.603854389721626


Accuracy of the xgb model is 51.12419700214132


Accuracy of the xgb model is 52.62312633832976


Accuracy of the xgb model is 53.64025695931478


Accuracy of the xgb model is 54.65738758029979


Accuracy of the xgb model is 55.78158458244111


Accuracy of the xgb model is 51.766595289079234


Accuracy of the xgb model is 53.426124197002146


Accuracy of the xgb model is 55.51391862955032


Accuracy of the xgb model is 56.263383297644545


Accuracy of the xgb model is 56.79871520342612


Accuracy of the xgb model is 53.426124197002146


Accuracy of the xgb model is 54.603854389721626


Accuracy of the xgb model is 55.6745182012848


Accuracy of the xgb model is 56.905781584582435


Accuracy of the xgb model is 56.79871520342612


Accuracy of the xgb model is 57.17344753747323


Accuracy of the xgb model is 57.066381156316915


Accuracy of the xgb model is 56.79871520342612


Accuracy of the xgb model is 56.63811563169164


Accuracy of the xgb model is 56.905781584582435


Accuracy of the xgb model is 50.32119914346895


Accuracy of the xgb model is 50.58886509635975


Accuracy of the xgb model is 50.53533190578159


Accuracy of the xgb model is 51.5524625267666


Accuracy of the xgb model is 51.5524625267666


Accuracy of the xgb model is 50.58886509635975


Accuracy of the xgb model is 51.65952890792291


Accuracy of the xgb model is 52.569593147751604


Accuracy of the xgb model is 53.104925053533194


Accuracy of the xgb model is 54.28265524625267


Accuracy of the xgb model is 50.48179871520343


Accuracy of the xgb model is 52.087794432548186


Accuracy of the xgb model is 54.1220556745182


Accuracy of the xgb model is 55.24625267665952


Accuracy of the xgb model is 55.35331905781584


Accuracy of the xgb model is 51.605995717344754


Accuracy of the xgb model is 52.8372591006424


Accuracy of the xgb model is 54.55032119914347


Accuracy of the xgb model is 55.728051391862955


Accuracy of the xgb model is 55.62098501070663


Accuracy of the xgb model is 52.73019271948608


Accuracy of the xgb model is 54.65738758029979


Accuracy of the xgb model is 55.728051391862955


Accuracy of the xgb model is 55.94218415417559


Accuracy of the xgb model is 56.20985010706639


Accuracy of the xgb model is 56.5845824411135


Accuracy of the xgb model is 57.28051391862955


Accuracy of the xgb model is 56.63811563169164


Accuracy of the xgb model is 56.85224839400428


Accuracy of the xgb model is 57.066381156316915


In [320]:
print('Best Accuracy using xgb model is {}'.format(best_accuracy_2))
print('Best Parameter using xgb model is {}'.format(best_params_2))

Best Accuracy using xgb model is 57.28051391862955
Best Parameter using xgb model is {'objective': 'multi:softmax', 'tree_method': 'approx', 'max_depth': 15, 'learning_rate': 0.1, 'n_estimators': 200}


In [321]:
model_1 = xgb.XGBClassifier(**best_params_1)

model_1.fit(total_Data_scaled_1, total_y_1)


model_2 = xgb.XGBClassifier(**best_params_2)

model_2.fit(total_Data_scaled_2, total_y_2)
# y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled), columns=['Predicted Arrival Status'], index=X_test.index)

# test_output = y_pred.merge(y_test, left_index=True, right_index=True)
# # test_output.head()
# accuracy = accuracy_score(test_output)
# print('Accuracy of the xgb model is {}'.format(accuracy))
# if accuracy > best_accuracy:
#     best_accuracy = accuracy
#     best_params = params

In [384]:
test_data = pd.read_csv('CIS_662 _INITIAL_Predictions.csv', na_values=[''], keep_default_na=False)
# test_data.drop(columns=['DATE','FLIGHT NUMBER', 'ARRIVAL STATUS_Prev_flight_early', 'ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late'],inplace=True)
internal_test_data = pd.DataFrame(test_data.drop(columns=['DATE','FLIGHT NUMBER']), index=test_data.index)
test_data
internal_test_data

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/10/24,WEDNESDAY,UA 1400,ORD,6:52 PM,9:47 PM,,,,
1,4/10/24,WEDNESDAY,AA 3402,ORD,7:59 PM,10:52 PM,,,,
2,4/10/24,WEDNESDAY,B6 116,JFK,1:33 PM,2:50 PM,,,,
3,4/10/24,WEDNESDAY,DL 5182,JFK,2:55 PM,4:21 PM,,,,
4,4/10/24,WEDNESDAY,WN 5285,MCO,11:05 AM,1:45 PM,,,,
5,4/10/24,WEDNESDAY,B6 656,MCO,1:35 PM,4:25 PM,,,,
6,4/11/24,THURSDAY,UA 1400,ORD,6:52 PM,9:47 PM,,,,
7,4/11/24,THURSDAY,AA 3402,ORD,7:59 PM,10:52 PM,,,,
8,4/11/24,THURSDAY,B6 116,JFK,1:33 PM,2:50 PM,,,,
9,4/11/24,THURSDAY,DL 5182,JFK,2:55 PM,4:21 PM,,,,


Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,WEDNESDAY,ORD,6:52 PM,9:47 PM,,,,
1,WEDNESDAY,ORD,7:59 PM,10:52 PM,,,,
2,WEDNESDAY,JFK,1:33 PM,2:50 PM,,,,
3,WEDNESDAY,JFK,2:55 PM,4:21 PM,,,,
4,WEDNESDAY,MCO,11:05 AM,1:45 PM,,,,
5,WEDNESDAY,MCO,1:35 PM,4:25 PM,,,,
6,THURSDAY,ORD,6:52 PM,9:47 PM,,,,
7,THURSDAY,ORD,7:59 PM,10:52 PM,,,,
8,THURSDAY,JFK,1:33 PM,2:50 PM,,,,
9,THURSDAY,JFK,2:55 PM,4:21 PM,,,,


In [385]:
def convert_to_24Hr(timeStr):
    if ('AM' not in  timeStr) and ('PM' not in timeStr):
        return timeStr
    time = timeStr.split(sep=' ')
    timeValue = time[0]
    hh_mm = timeValue.split(sep=':')
    hr = hh_mm[0]
    min = hh_mm[1]
    am_pm = time[1]
    if am_pm == 'PM':
        hr_int = int(hr) + 12
        hr = str(hr_int)
    return hr +':'+min

In [386]:
internal_test_data['DEPARTURE TIME'] = [convert_to_24Hr(x) for x in internal_test_data['DEPARTURE TIME']]
internal_test_data['DEPARTURE TIME'] = [to_minutes(x) for x in internal_test_data['DEPARTURE TIME']]
internal_test_data['ARRIVAL TIME'] = [convert_to_24Hr(x) for x in internal_test_data['ARRIVAL TIME']]
internal_test_data['ARRIVAL TIME'] = [to_minutes(x) for x in internal_test_data['ARRIVAL TIME']]

In [387]:
internal_test_data.head()
# internal_test_data = len(test_data)

Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,WEDNESDAY,ORD,1132,1307,,,,
1,WEDNESDAY,ORD,1199,1372,,,,
2,WEDNESDAY,JFK,813,890,,,,
3,WEDNESDAY,JFK,895,981,,,,
4,WEDNESDAY,MCO,665,825,,,,


In [389]:
#dynamic training loop
for id in internal_test_data.index:
    day_of_flight = internal_test_data.loc[id,'DAY']
    origin = internal_test_data.loc[id,'ORIGIN']
    dep_time = internal_test_data.loc[id,'DEPARTURE TIME']
    arr_time = internal_test_data.loc[id,'ARRIVAL TIME']
    
    # test_row = pd.DataFrame(test_data.loc[i].copy())
    # test_row[]
    # day_of_flight = test_row.loc[0,'DAY']
    final_data_for_this_row = {}

    
    q1 = internal_test_data.loc[id, 'ARRIVAL STATUS']
    # print(status_qwery)
    if q1 != 'NA':
        columns=X_test_1.columns
        # new_test_row
        data_dict = {}
        data_dict['DEPARTURE TIME'] = [dep_time]
        data_dict['ARRIVAL TIME'] = [arr_time]
        data_dict['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict['ORIGIN_ORD'] = True if origin == 'ORD' else False
    
        test_df_1 = pd.DataFrame(data_dict) #use this dataframe to get prediction from model -1
        scaled_test_df_1 = scaler_total_1.transform(test_df_1)

        value = model_1.predict(scaled_test_df_1)
        if value == 0:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS'] = 'ON-TIME'
        else:
            test_data.loc[id,'ARRIVAL STATUS'] = 'DELAYED'

        # test_data.loc[id]

    q2 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early']
    if q2 != 'NA':
        data_dict_2 = {}
        data_dict_2['DEPARTURE TIME'] = [dep_time]
        data_dict_2['ARRIVAL TIME'] = [arr_time]
        data_dict_2['previous_flight_status'] = [0]
        data_dict_2['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict_2['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict_2['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict_2['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict_2['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict_2['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict_2['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict_2['ORIGIN_ORD'] = True if origin == 'ORD' else False

        test_df_2 = pd.DataFrame(data_dict_2) #use this dataframe to get prediction from model-2
        scaled_test_df_2 = scaler_total_2.transform(test_df_2)

        value = model_2.predict(scaled_test_df_2)
        if value == 0:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early'] = 'ON-TIME'
        else:
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early'] = 'DELAYED'

    q3 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime']
    if q3 != 'NA':
        data_dict_3 = {}
        data_dict_3['DEPARTURE TIME'] = [dep_time]
        data_dict_3['ARRIVAL TIME'] = [arr_time]
        data_dict_3['previous_flight_status'] = [1]
        data_dict_3['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict_3['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict_3['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict_3['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict_3['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict_3['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict_3['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict_3['ORIGIN_ORD'] = True if origin == 'ORD' else False

        test_df_3 = pd.DataFrame(data_dict_3) #use this dataframe to get prediction from model-2
        scaled_test_df_3 = scaler_total_2.transform(test_df_3)

        value = model_2.predict(scaled_test_df_3)
        if value == 0:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime'] = 'ON-TIME'
        else:
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime'] = 'DELAYED'

        # test_data.loc[id]

    q4 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late']
    if q4 != 'NA':
        data_dict_4 = {}
        data_dict_4['DEPARTURE TIME'] = [dep_time]
        data_dict_4['ARRIVAL TIME'] = [arr_time]
        data_dict_4['previous_flight_status'] = [2]
        data_dict_4['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict_4['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict_4['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict_4['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict_4['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict_4['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict_4['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict_4['ORIGIN_ORD'] = True if origin == 'ORD' else False

        test_df_4 = pd.DataFrame(data_dict_4) #use this dataframe to get prediction from model-2
        scaled_test_df_4 = scaler_total_2.transform(test_df_4)

        value = model_2.predict(scaled_test_df_4)
        if value == 0:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late'] = 'ON-TIME'
        else:
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late'] = 'DELAYED'

    # break
test_data.to_csv('CIS_662 _INITIAL_Predictions_Filled.csv', index=False)
# internal_test_data