In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from datetime import datetime

def to_minutes(x):
    h,m = x.split(':')
    return int(h)*60 + int(m)

def findDayOfWeek(dateStr):
    date_object = datetime.strptime(dateStr, '%m/%d/%Y')
    dayOfWeek = date_object.strftime('%A')
    return dayOfWeek 

# findDayOfWeek('01/01/2006')

def flightNoStr(x):
    return str(int(x))

def classifyDelay(delay):
    if delay < -5:
        return 0 #flight is early
    elif delay > 5:
        return 1 #flight is delayed
    else:
        return 2 #flight is ontime
# def calculateDepTime()


In [3]:
# jetblue_B6_116_dep_time_range = (735, 885) #13:25= 13*60+25 - 70 min buffer or 13:35 = 13*60+35 + 70 minute buffer
RawDataProcessing = True

In [4]:
import os
# combinned_df = pd.DataFrame()
if RawDataProcessing:
    dataDir = 'data/arrivals/'
    fileNames = os.listdir(dataDir)
    dfList = []
    for file in fileNames:
        filePath = dataDir + file
        print('Processing file {}'.format(filePath))
        raw_df = pd.read_csv(filePath)
        # raw_df.head()
    
        airline_df = pd.DataFrame()
        #dropping flights other than MCO, ORD, and JFK 
        # to keep additional filters (jetblue['Origin Airport'] == 'MCO') | (jetblue['Origin Airport'] == 'ORD') |
        filtered_data  = raw_df[(raw_df['Origin Airport'] == 'MCO') | (raw_df['Origin Airport'] == 'JFK') | (raw_df['Origin Airport'] == 'ORD')].copy()
        # filtered_data['Origin Airport'].unique()
        airline_df['DATE'] = filtered_data['Date (MM/DD/YYYY)']
        airline_df['DAY'] = [findDayOfWeek(date) for date in filtered_data['Date (MM/DD/YYYY)']]
        
        filtered_data['Flight Number'] = [flightNoStr(flightNo) for flightNo in filtered_data['Flight Number']]
        filtered_data['Flight No'] = filtered_data['Carrier Code'].str.cat(filtered_data['Flight Number'], sep=' ')
        airline_df['FLIGHT NUMBER'] = filtered_data['Flight No']
    
        airline_df['ORIGIN'] = filtered_data['Origin Airport']
        
        #converting time data to minutes
        filtered_data['Scheduled Arrival Time'] = [to_minutes(s) for s in filtered_data['Scheduled Arrival Time']] #to_minutes(step_1['Scheduled Arrival Time'])
        filtered_data['Actual Arrival Time'] = [to_minutes(s) for s in filtered_data['Actual Arrival Time']]
        filtered_data['Wheels-on Time'] = [to_minutes(s) for s in filtered_data['Wheels-on Time']]
    
        filtered_data['Scheduled Departure Time'] = filtered_data['Scheduled Arrival Time'] - filtered_data['Scheduled Elapsed Time (Minutes)']
        airline_df['DEPARTURE TIME'] = filtered_data['Scheduled Departure Time']
        airline_df['ARRIVAL TIME'] = filtered_data['Scheduled Arrival Time']
        # jetblue_df
        airline_df = airline_df[airline_df['ARRIVAL TIME']>=0] ##dropping data where flight departed one earlier date
        # filtered_data['Arrival Delay (Minutes)']
        airline_df['ARRIVAL STATUS'] = [classifyDelay(delay) for delay in filtered_data['Arrival Delay (Minutes)']]
    
        # airline_df
        dfList.append(airline_df)
        
        # break
        # raw_data = pd.read_csv(file)
    combined_df=pd.concat(dfList, ignore_index=True)
    combined_df['DEPARTURE TIME'] = combined_df['DEPARTURE TIME'].astype(int)
    combined_df.to_csv('data/combined_arrival_data.csv', index=False)


Processing file data/arrivals/Mesa_Airlines.csv
Processing file data/arrivals/JetBlue_Airlines.csv
Processing file data/arrivals/Endeavor_Air.csv
Processing file data/arrivals/Southwest_Airlines.csv
Processing file data/arrivals/United_Airlines.csv
Processing file data/arrivals/PSA_Airlines.csv
Processing file data/arrivals/Envoy_Airlines.csv
Processing file data/arrivals/Skywest_Airline.csv
Processing file data/arrivals/Republic_Airline.csv
Processing file data/arrivals/American_Airlines.csv
Processing file data/arrivals/Delta_Airlines.csv


In [5]:
if RawDataProcessing:
    dates = combined_df['DATE'].unique()
    # finalData =  pd.DataFrame()
    # MCO_flightList = []
    # JFK_flightList = []
    # ORD_flightList = []
    totalFlightList = []
    totalFlights = 0
    for date in dates:
        flightList_MCO = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'MCO')]
        sortedList_MCO = flightList_MCO.sort_values(by='DEPARTURE TIME', ascending=True)
        mco_f = sortedList_MCO.shape[0]
        # print('{} flights on date {} from MCO'.format(mco_f, date))
        # sortedList_MCO
        sortedList_MCO['previous_flight_status'] = sortedList_MCO['ARRIVAL STATUS'].shift(periods=1)
        # sortedList_MCO
        totalFlightList.append(sortedList_MCO)
    
        flightList_JFK = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'JFK')]
        sortedList_JFK = flightList_JFK.sort_values(by='DEPARTURE TIME', ascending=True)
        jfk_f = sortedList_JFK.shape[0]
        # print('{} flights on date {} from JFK'.format(jfk_f, date))
        # sortedList_JFK
        sortedList_JFK['previous_flight_status'] = sortedList_JFK['ARRIVAL STATUS'].shift(periods=1)
        # sortedList_JFK
        totalFlightList.append(sortedList_JFK)
    
        flightList_ORD = combined_df[(combined_df['DATE'] == date) & (combined_df['ORIGIN'] == 'ORD')]
        sortedList_ORD = flightList_ORD.sort_values(by='DEPARTURE TIME', ascending=True)
        ord_f = sortedList_ORD.shape[0]
        # print('{} flights on date {} from ORD'.format(ord_f, date))
        # sortedList_ORD
        sortedList_ORD['previous_flight_status'] = sortedList_ORD['ARRIVAL STATUS'].shift(periods=1)
        # sortedList_ORD
        totalFlightList.append(sortedList_ORD)
        flightsOnDay = mco_f + jfk_f + ord_f
        # print('{} flights on date {}'.format(flightsOnDay, date))
        totalFlights = totalFlights + flightsOnDay
        # if i==5:
        #     break
        # i+=1
    print('{} flights in total'.format(totalFlights))
    finalData = pd.concat (totalFlightList, ignore_index=True) 
    finalData.fillna(0, inplace=True)
    finalData['DEPARTURE TIME'] = finalData['DEPARTURE TIME'].astype(int)
    finalData['previous_flight_status'] = finalData['previous_flight_status'].astype(int)
    finalData.to_csv('data/combined_data_with_prev_flight_status.csv', index=False)
        # finalData = pd.concat([finalData, sortedList], ignore_index=True)
    # finalData

14087 flights in total


In [6]:
data_1 = pd.read_csv('data/combined_arrival_data.csv')

# flight_data.columns
# flight_data['FLIGHT NUMBER'].unique().shape
data_1.isna().sum()
data_1 = data_1.drop(columns = ['DATE', 'FLIGHT NUMBER'])
data_1.head()

DATE              0
DAY               0
FLIGHT NUMBER     0
ORIGIN            0
DEPARTURE TIME    0
ARRIVAL TIME      0
ARRIVAL STATUS    0
dtype: int64

Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS
0,Sunday,ORD,420,526,0
1,Monday,ORD,1067,1167,0
2,Tuesday,ORD,871,970,2
3,Thursday,ORD,1256,1359,2
4,Friday,ORD,895,997,1


In [7]:
data_2 = pd.read_csv('data/combined_data_with_prev_flight_status.csv')
# data_2.head()
# flight_data.columns
# flight_data['FLIGHT NUMBER'].unique().shape
data_2.isna().sum()
data_2 = data_2.drop(columns = ['DATE', 'FLIGHT NUMBER'])
data_2.head()

DATE                      0
DAY                       0
FLIGHT NUMBER             0
ORIGIN                    0
DEPARTURE TIME            0
ARRIVAL TIME              0
ARRIVAL STATUS            0
previous_flight_status    0
dtype: int64

Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,previous_flight_status
0,Sunday,MCO,776,935,2,0
1,Sunday,MCO,1095,1253,0,2
2,Sunday,JFK,405,475,1,0
3,Sunday,JFK,670,745,2,1
4,Sunday,JFK,1020,1112,2,2


In [8]:
#Building XGBoost model using the finalData
encoded_data_1 = pd.get_dummies(data_1.drop(columns=['ARRIVAL STATUS']), drop_first=True)
encoded_data_1.head()

Unnamed: 0,DEPARTURE TIME,ARRIVAL TIME,DAY_Monday,DAY_Saturday,DAY_Sunday,DAY_Thursday,DAY_Tuesday,DAY_Wednesday,ORIGIN_MCO,ORIGIN_ORD
0,420,526,False,False,True,False,False,False,False,True
1,1067,1167,True,False,False,False,False,False,False,True
2,871,970,False,False,False,False,True,False,False,True
3,1256,1359,False,False,False,True,False,False,False,True
4,895,997,False,False,False,False,False,False,False,True


In [9]:
#Building XGBoost model using the finalData
encoded_data_2 = pd.get_dummies(data_2.drop(columns=['ARRIVAL STATUS']), drop_first=True)
encoded_data_2.head()

Unnamed: 0,DEPARTURE TIME,ARRIVAL TIME,previous_flight_status,DAY_Monday,DAY_Saturday,DAY_Sunday,DAY_Thursday,DAY_Tuesday,DAY_Wednesday,ORIGIN_MCO,ORIGIN_ORD
0,776,935,0,False,False,True,False,False,False,True,False
1,1095,1253,2,False,False,True,False,False,False,True,False
2,405,475,0,False,False,True,False,False,False,False,False
3,670,745,1,False,False,True,False,False,False,False,False
4,1020,1112,2,False,False,True,False,False,False,False,False


In [10]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(encoded_data_1, data_1['ARRIVAL STATUS'], stratify = data_1['ARRIVAL STATUS'], test_size=0.2, random_state=42)
X_train_1.shape
X_test_1.shape

(11269, 10)

(2818, 10)

In [11]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(encoded_data_2, data_2['ARRIVAL STATUS'], stratify = data_2['ARRIVAL STATUS'], test_size=0.2, random_state=42)
X_train_2.shape
X_test_2.shape

(11269, 11)

(2818, 11)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler_1 = StandardScaler()
X_train_scaled_1 = pd.DataFrame(scaler_1.fit_transform(X_train_1), columns=X_train_1.columns, index=X_train_1.index)
X_test_scaled_1 = pd.DataFrame(scaler_1.transform(X_test_1), columns=X_test_1.columns, index=X_test_1.index)

scaler_total_1 = StandardScaler()
total_Data_scaled_1 = pd.DataFrame(scaler_total_1.fit_transform(encoded_data_1), columns=encoded_data_1.columns, index=encoded_data_1.index)
total_y_1 = data_1['ARRIVAL STATUS']
# X_test_scaled

In [13]:
# from sklearn.preprocessing import StandardScaler
scaler_2 = StandardScaler()
X_train_scaled_2 = pd.DataFrame(scaler_2.fit_transform(X_train_2), columns=X_train_2.columns, index=X_train_2.index)
X_test_scaled_2 = pd.DataFrame(scaler_2.transform(X_test_2), columns=X_test_2.columns, index=X_test_2.index)

scaler_total_2 = StandardScaler()
total_Data_scaled_2 = pd.DataFrame(scaler_total_2.fit_transform(encoded_data_2), columns=encoded_data_2.columns, index=encoded_data_2.index)
total_y_2 = data_2['ARRIVAL STATUS']
# X_test_scaled

In [14]:
# X_train_scaled.rename({'DEPARTURE TIME':'DEPARTURE_TIME', 'ARRIVAL TIME':'ARRIVAL_TIME'})
# X_test_scaled.rename({'DEPARTURE TIME':'DEPARTURE_TIME', 'ARRIVAL TIME':'ARRIVAL_TIME'})

# dtrain_class = xgb.DMatrix(X_train_scaled, y_train, enable_categorical=True)
# dtest_class = xgb.DMatrix(X_test_scaled, y_test, enable_categorical=True)



In [15]:
def accuracy_score(test_output):
    total_count = len(test_output)
    wrong_count = len(test_output[test_output['Predicted Arrival Status'] != test_output['ARRIVAL STATUS']])
    accuracy = wrong_count/total_count
    return accuracy*100

In [16]:
tree_methods = ['exact', 'hist', 'approx']
tree_depth = [4, 6, 7, 8, 10, 15]
learning_rates = [0.05, 0.1, 0.2, 0.3, 0.4]

best_accuracy_1 = 0
best_params_1 = {}

In [17]:
for method in tree_methods:
    for depth in tree_depth:
        for lr in learning_rates:
            params = {
                "objective": "multi:softmax",
                "tree_method": method,
                "max_depth" : depth,
                "learning_rate" : lr,
                "n_estimators": 200
            }
            xgbModel = xgb.XGBClassifier(**params)
            # xgbModel = GradientBoostingClassifier()
            # xgbModel = GradientBoostingClassifier()
            xgbModel.fit(X_train_scaled_1, y_train_1)
            y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled_1), columns=['Predicted Arrival Status'], index=X_test_scaled_1.index)
            
            test_output = y_pred.merge(y_test_1, left_index=True, right_index=True)
            # test_output.head()
            accuracy = accuracy_score(test_output)
            print('Accuracy of the xgb model is {}'.format(accuracy))
            if accuracy > best_accuracy_1:
                best_accuracy_1 = accuracy
                best_params_1 = params

Accuracy of the xgb model is 51.20652945351313


Accuracy of the xgb model is 50.99361249112846


Accuracy of the xgb model is 50.638750887154


Accuracy of the xgb model is 50.7097232079489


Accuracy of the xgb model is 51.13555713271823


Accuracy of the xgb model is 51.029098651525906


Accuracy of the xgb model is 51.66784953867991


Accuracy of the xgb model is 51.8097941802697


Accuracy of the xgb model is 52.909865152590484


Accuracy of the xgb model is 52.98083747338538


Accuracy of the xgb model is 51.1000709723208


Accuracy of the xgb model is 51.95173882185947


Accuracy of the xgb model is 52.30660042583393


Accuracy of the xgb model is 52.838892831795604


Accuracy of the xgb model is 53.47764371894961


Accuracy of the xgb model is 51.49041873669269


Accuracy of the xgb model is 52.80340667139816


Accuracy of the xgb model is 52.76792051100071


Accuracy of the xgb model is 53.690560681334276


Accuracy of the xgb model is 53.903477643718944


Accuracy of the xgb model is 52.37757274662882


Accuracy of the xgb model is 53.58410220014195


Accuracy of the xgb model is 54.40028388928317


Accuracy of the xgb model is 54.47125621007807


Accuracy of the xgb model is 54.47125621007807


Accuracy of the xgb model is 54.64868701206529


Accuracy of the xgb model is 54.613200851667855


Accuracy of the xgb model is 54.613200851667855


Accuracy of the xgb model is 55.003548616039744


Accuracy of the xgb model is 55.53584102200142


Accuracy of the xgb model is 51.59687721788503


Accuracy of the xgb model is 51.56139105748758


Accuracy of the xgb model is 51.13555713271823


Accuracy of the xgb model is 51.77430801987225


Accuracy of the xgb model is 51.20652945351313


Accuracy of the xgb model is 51.88076650106459


Accuracy of the xgb model is 51.916252661462025


Accuracy of the xgb model is 52.342086586231375


Accuracy of the xgb model is 53.335699077359834


Accuracy of the xgb model is 53.1582682753726


Accuracy of the xgb model is 52.235628105039034


Accuracy of the xgb model is 52.30660042583393


Accuracy of the xgb model is 52.235628105039034


Accuracy of the xgb model is 52.98083747338538


Accuracy of the xgb model is 53.76153300212917


Accuracy of the xgb model is 52.16465578424414


Accuracy of the xgb model is 52.696948190205816


Accuracy of the xgb model is 52.37757274662882


Accuracy of the xgb model is 53.37118523775728


Accuracy of the xgb model is 54.22285308729595


Accuracy of the xgb model is 53.122782114975166


Accuracy of the xgb model is 53.22924059616749


Accuracy of the xgb model is 54.04542228530873


Accuracy of the xgb model is 54.43577004968062


Accuracy of the xgb model is 54.86160397444997


Accuracy of the xgb model is 54.613200851667855


Accuracy of the xgb model is 54.64868701206529


Accuracy of the xgb model is 54.790631653655076


Accuracy of the xgb model is 55.07452093683464


Accuracy of the xgb model is 54.54222853087296


Accuracy of the xgb model is 51.63236337828248


Accuracy of the xgb model is 50.248403122782115


Accuracy of the xgb model is 50.425833924769336


Accuracy of the xgb model is 50.745209368346345


Accuracy of the xgb model is 51.348474095102915


Accuracy of the xgb model is 50.95812633073101


Accuracy of the xgb model is 51.242015613910574


Accuracy of the xgb model is 52.05819730305181


Accuracy of the xgb model is 52.342086586231375


Accuracy of the xgb model is 53.05180979418027


Accuracy of the xgb model is 51.242015613910574


Accuracy of the xgb model is 52.16465578424414


Accuracy of the xgb model is 52.4485450674237


Accuracy of the xgb model is 53.26472675656494


Accuracy of the xgb model is 53.65507452093683


Accuracy of the xgb model is 50.99361249112846


Accuracy of the xgb model is 51.8097941802697


Accuracy of the xgb model is 53.335699077359834


Accuracy of the xgb model is 53.58410220014195


Accuracy of the xgb model is 54.36479772888574


Accuracy of the xgb model is 52.62597586941094


Accuracy of the xgb model is 53.513129879347055


Accuracy of the xgb model is 53.832505322924064


Accuracy of the xgb model is 54.36479772888574


Accuracy of the xgb model is 54.57771469127041


Accuracy of the xgb model is 54.40028388928317


Accuracy of the xgb model is 54.93257629524485


Accuracy of the xgb model is 54.684173172462735


Accuracy of the xgb model is 55.25195173882186


Accuracy of the xgb model is 54.86160397444997


In [18]:
print('Best Accuracy using xgb model is {}'.format(best_accuracy_1))
print('Best Parameter using xgb model is {}'.format(best_params_1))

Best Accuracy using xgb model is 55.53584102200142
Best Parameter using xgb model is {'objective': 'multi:softmax', 'tree_method': 'exact', 'max_depth': 15, 'learning_rate': 0.4, 'n_estimators': 200}


In [19]:
best_accuracy_2 = 0
best_params_2 = {}

In [20]:
for method in tree_methods:
    for depth in tree_depth:
        for lr in learning_rates:
            params = {
                "objective": "multi:softmax",
                "tree_method": method,
                "max_depth" : depth,
                "learning_rate" : lr,
                "n_estimators": 200
            }
            xgbModel = xgb.XGBClassifier(**params)
            # xgbModel = GradientBoostingClassifier()
            # xgbModel = GradientBoostingClassifier()
            xgbModel.fit(X_train_scaled_2, y_train_2)
            y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled_2), columns=['Predicted Arrival Status'], index=X_test_scaled_2.index)
            
            test_output = y_pred.merge(y_test_2, left_index=True, right_index=True)
            # test_output.head()
            accuracy = accuracy_score(test_output)
            print('Accuracy of the xgb model is {}'.format(accuracy))
            if accuracy > best_accuracy_2:
                best_accuracy_2 = accuracy
                best_params_2 = params

Accuracy of the xgb model is 51.17104329311568


Accuracy of the xgb model is 50.67423704755145


Accuracy of the xgb model is 50.851667849538686


Accuracy of the xgb model is 51.95173882185947


Accuracy of the xgb model is 52.12916962384671


Accuracy of the xgb model is 51.06458481192335


Accuracy of the xgb model is 51.70333569907736


Accuracy of the xgb model is 52.342086586231375


Accuracy of the xgb model is 52.59048970901349


Accuracy of the xgb model is 52.37757274662882


Accuracy of the xgb model is 51.8097941802697


Accuracy of the xgb model is 52.12916962384671


Accuracy of the xgb model is 52.12916962384671


Accuracy of the xgb model is 51.98722498225692


Accuracy of the xgb model is 52.909865152590484


Accuracy of the xgb model is 51.845280340667145


Accuracy of the xgb model is 52.413058907026254


Accuracy of the xgb model is 52.94535131298793


Accuracy of the xgb model is 52.76792051100071


Accuracy of the xgb model is 53.619588360539396


Accuracy of the xgb model is 52.342086586231375


Accuracy of the xgb model is 53.37118523775728


Accuracy of the xgb model is 52.87437899219305


Accuracy of the xgb model is 53.86799148332151


Accuracy of the xgb model is 53.619588360539396


Accuracy of the xgb model is 53.44215755855216


Accuracy of the xgb model is 53.44215755855216


Accuracy of the xgb model is 53.690560681334276


Accuracy of the xgb model is 53.619588360539396


Accuracy of the xgb model is 54.009936124911285


Accuracy of the xgb model is 51.06458481192335


Accuracy of the xgb model is 51.242015613910574


Accuracy of the xgb model is 51.66784953867991


Accuracy of the xgb model is 52.022711142654366


Accuracy of the xgb model is 51.525904897090136


Accuracy of the xgb model is 51.419446415897795


Accuracy of the xgb model is 51.98722498225692


Accuracy of the xgb model is 52.022711142654366


Accuracy of the xgb model is 51.95173882185947


Accuracy of the xgb model is 52.59048970901349


Accuracy of the xgb model is 51.59687721788503


Accuracy of the xgb model is 52.76792051100071


Accuracy of the xgb model is 52.76792051100071


Accuracy of the xgb model is 52.519517388218595


Accuracy of the xgb model is 53.05180979418027


Accuracy of the xgb model is 52.16465578424414


Accuracy of the xgb model is 52.30660042583393


Accuracy of the xgb model is 52.696948190205816


Accuracy of the xgb model is 52.909865152590484


Accuracy of the xgb model is 53.016323633782825


Accuracy of the xgb model is 52.235628105039034


Accuracy of the xgb model is 52.80340667139816


Accuracy of the xgb model is 52.98083747338538


Accuracy of the xgb model is 52.98083747338538


Accuracy of the xgb model is 54.04542228530873


Accuracy of the xgb model is 53.86799148332151


Accuracy of the xgb model is 53.76153300212917


Accuracy of the xgb model is 53.93896380411639


Accuracy of the xgb model is 53.513129879347055


Accuracy of the xgb model is 54.009936124911285


Accuracy of the xgb model is 50.99361249112846


Accuracy of the xgb model is 50.81618168914124


Accuracy of the xgb model is 51.419446415897795


Accuracy of the xgb model is 51.06458481192335


Accuracy of the xgb model is 51.56139105748758


Accuracy of the xgb model is 51.31298793470547


Accuracy of the xgb model is 51.56139105748758


Accuracy of the xgb model is 51.56139105748758


Accuracy of the xgb model is 52.20014194464159


Accuracy of the xgb model is 52.696948190205816


Accuracy of the xgb model is 51.17104329311568


Accuracy of the xgb model is 51.63236337828248


Accuracy of the xgb model is 52.37757274662882


Accuracy of the xgb model is 53.08729595457772


Accuracy of the xgb model is 53.193754435770046


Accuracy of the xgb model is 51.916252661462025


Accuracy of the xgb model is 51.98722498225692


Accuracy of the xgb model is 52.4485450674237


Accuracy of the xgb model is 53.122782114975166


Accuracy of the xgb model is 53.47764371894961


Accuracy of the xgb model is 52.59048970901349


Accuracy of the xgb model is 52.59048970901349


Accuracy of the xgb model is 53.1582682753726


Accuracy of the xgb model is 53.30021291696239


Accuracy of the xgb model is 53.193754435770046


Accuracy of the xgb model is 53.619588360539396


Accuracy of the xgb model is 53.903477643718944


Accuracy of the xgb model is 53.37118523775728


Accuracy of the xgb model is 54.009936124911285


Accuracy of the xgb model is 53.93896380411639


In [21]:
print('Best Accuracy using xgb model is {}'.format(best_accuracy_2))
print('Best Parameter using xgb model is {}'.format(best_params_2))

Best Accuracy using xgb model is 54.04542228530873
Best Parameter using xgb model is {'objective': 'multi:softmax', 'tree_method': 'hist', 'max_depth': 10, 'learning_rate': 0.4, 'n_estimators': 200}


In [22]:
model_1 = xgb.XGBClassifier(**best_params_1)

model_1.fit(total_Data_scaled_1, total_y_1)


model_2 = xgb.XGBClassifier(**best_params_2)

model_2.fit(total_Data_scaled_2, total_y_2)
# y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled), columns=['Predicted Arrival Status'], index=X_test.index)

# test_output = y_pred.merge(y_test, left_index=True, right_index=True)
# # test_output.head()
# accuracy = accuracy_score(test_output)
# print('Accuracy of the xgb model is {}'.format(accuracy))
# if accuracy > best_accuracy:
#     best_accuracy = accuracy
#     best_params = params

In [23]:
test_data = pd.read_csv('CIS_662 _INITIAL_Predictions.csv', na_values=[''], keep_default_na=False)
# test_data.drop(columns=['DATE','FLIGHT NUMBER', 'ARRIVAL STATUS_Prev_flight_early', 'ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late'],inplace=True)
internal_test_data = pd.DataFrame(test_data.drop(columns=['DATE','FLIGHT NUMBER']), index=test_data.index)
test_data
internal_test_data

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/10/24,WEDNESDAY,UA 1400,ORD,6:52 PM,9:47 PM,,,,
1,4/10/24,WEDNESDAY,AA 3402,ORD,7:59 PM,10:52 PM,,,,
2,4/10/24,WEDNESDAY,B6 116,JFK,1:33 PM,2:50 PM,,,,
3,4/10/24,WEDNESDAY,DL 5182,JFK,2:55 PM,4:21 PM,,,,
4,4/10/24,WEDNESDAY,WN 5285,MCO,11:05 AM,1:45 PM,,,,
5,4/10/24,WEDNESDAY,B6 656,MCO,1:35 PM,4:25 PM,,,,
6,4/11/24,THURSDAY,UA 1400,ORD,6:52 PM,9:47 PM,,,,
7,4/11/24,THURSDAY,AA 3402,ORD,7:59 PM,10:52 PM,,,,
8,4/11/24,THURSDAY,B6 116,JFK,1:33 PM,2:50 PM,,,,
9,4/11/24,THURSDAY,DL 5182,JFK,2:55 PM,4:21 PM,,,,


Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,WEDNESDAY,ORD,6:52 PM,9:47 PM,,,,
1,WEDNESDAY,ORD,7:59 PM,10:52 PM,,,,
2,WEDNESDAY,JFK,1:33 PM,2:50 PM,,,,
3,WEDNESDAY,JFK,2:55 PM,4:21 PM,,,,
4,WEDNESDAY,MCO,11:05 AM,1:45 PM,,,,
5,WEDNESDAY,MCO,1:35 PM,4:25 PM,,,,
6,THURSDAY,ORD,6:52 PM,9:47 PM,,,,
7,THURSDAY,ORD,7:59 PM,10:52 PM,,,,
8,THURSDAY,JFK,1:33 PM,2:50 PM,,,,
9,THURSDAY,JFK,2:55 PM,4:21 PM,,,,


In [24]:
def convert_to_24Hr(timeStr):
    if ('AM' not in  timeStr) and ('PM' not in timeStr):
        return timeStr
    time = timeStr.split(sep=' ')
    timeValue = time[0]
    hh_mm = timeValue.split(sep=':')
    hr = hh_mm[0]
    min = hh_mm[1]
    am_pm = time[1]
    if am_pm == 'PM':
        hr_int = int(hr) + 12
        hr = str(hr_int)
    return hr +':'+min

In [25]:
internal_test_data['DEPARTURE TIME'] = [convert_to_24Hr(x) for x in internal_test_data['DEPARTURE TIME']]
internal_test_data['DEPARTURE TIME'] = [to_minutes(x) for x in internal_test_data['DEPARTURE TIME']]
internal_test_data['ARRIVAL TIME'] = [convert_to_24Hr(x) for x in internal_test_data['ARRIVAL TIME']]
internal_test_data['ARRIVAL TIME'] = [to_minutes(x) for x in internal_test_data['ARRIVAL TIME']]

In [26]:
internal_test_data.head()
# internal_test_data = len(test_data)

Unnamed: 0,DAY,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,WEDNESDAY,ORD,1132,1307,,,,
1,WEDNESDAY,ORD,1199,1372,,,,
2,WEDNESDAY,JFK,813,890,,,,
3,WEDNESDAY,JFK,895,981,,,,
4,WEDNESDAY,MCO,665,825,,,,


In [27]:
#dynamic training loop
for id in internal_test_data.index:
    day_of_flight = internal_test_data.loc[id,'DAY']
    origin = internal_test_data.loc[id,'ORIGIN']
    dep_time = internal_test_data.loc[id,'DEPARTURE TIME']
    arr_time = internal_test_data.loc[id,'ARRIVAL TIME']
    
    # test_row = pd.DataFrame(test_data.loc[i].copy())
    # test_row[]
    # day_of_flight = test_row.loc[0,'DAY']
    final_data_for_this_row = {}

    
    q1 = internal_test_data.loc[id, 'ARRIVAL STATUS']
    # print(status_qwery)
    if q1 != 'NA':
        columns=X_test_1.columns
        # new_test_row
        data_dict = {}
        data_dict['DEPARTURE TIME'] = [dep_time]
        data_dict['ARRIVAL TIME'] = [arr_time]
        data_dict['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict['ORIGIN_ORD'] = True if origin == 'ORD' else False
    
        test_df_1 = pd.DataFrame(data_dict) #use this dataframe to get prediction from model -1
        scaled_test_df_1 = scaler_total_1.transform(test_df_1)

        value = model_1.predict(scaled_test_df_1)
        if value == 0:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS'] = 'ON-TIME'
        else:
            test_data.loc[id,'ARRIVAL STATUS'] = 'LATE'

        # test_data.loc[id]

    q2 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early']
    if q2 != 'NA':
        data_dict_2 = {}
        data_dict_2['DEPARTURE TIME'] = [dep_time]
        data_dict_2['ARRIVAL TIME'] = [arr_time]
        data_dict_2['previous_flight_status'] = [0]
        data_dict_2['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict_2['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict_2['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict_2['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict_2['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict_2['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict_2['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict_2['ORIGIN_ORD'] = True if origin == 'ORD' else False

        test_df_2 = pd.DataFrame(data_dict_2) #use this dataframe to get prediction from model-2
        scaled_test_df_2 = scaler_total_2.transform(test_df_2)

        value = model_2.predict(scaled_test_df_2)
        if value == 0:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early'] = 'ON-TIME'
        else:
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early'] = 'LATE'

    q3 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime']
    if q3 != 'NA':
        data_dict_3 = {}
        data_dict_3['DEPARTURE TIME'] = [dep_time]
        data_dict_3['ARRIVAL TIME'] = [arr_time]
        data_dict_3['previous_flight_status'] = [1]
        data_dict_3['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict_3['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict_3['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict_3['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict_3['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict_3['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict_3['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict_3['ORIGIN_ORD'] = True if origin == 'ORD' else False

        test_df_3 = pd.DataFrame(data_dict_3) #use this dataframe to get prediction from model-2
        scaled_test_df_3 = scaler_total_2.transform(test_df_3)

        value = model_2.predict(scaled_test_df_3)
        if value == 0:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime'] = 'ON-TIME'
        else:
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime'] = 'LATE'

        # test_data.loc[id]

    q4 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late']
    if q4 != 'NA':
        data_dict_4 = {}
        data_dict_4['DEPARTURE TIME'] = [dep_time]
        data_dict_4['ARRIVAL TIME'] = [arr_time]
        data_dict_4['previous_flight_status'] = [2]
        data_dict_4['DAY_Monday'] = True if day_of_flight=='MONDAY' else False
        data_dict_4['DAY_Saturday'] = True if day_of_flight=='SATURDAY' else False
        data_dict_4['DAY_Sunday'] = True if day_of_flight=='SUNDAY' else False
        data_dict_4['DAY_Thursday'] = True if day_of_flight=='THURSDAY' else False
        data_dict_4['DAY_Tuesday'] = True if day_of_flight=='TUESDAY' else False
        data_dict_4['DAY_Wednesday'] = True if day_of_flight=='WEDNESDAY' else False
        data_dict_4['ORIGIN_MCO'] = True if origin == 'MCO' else False
        data_dict_4['ORIGIN_ORD'] = True if origin == 'ORD' else False

        test_df_4 = pd.DataFrame(data_dict_4) #use this dataframe to get prediction from model-2
        scaled_test_df_4 = scaler_total_2.transform(test_df_4)

        value = model_2.predict(scaled_test_df_4)
        if value == 0:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late'] = 'ON-TIME'
        else:
            test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late'] = 'LATE'

    # break
test_data
test_data.to_csv('CIS_662 _INITIAL_Predictions_Filled.csv', index=False)
# internal_test_data

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/10/24,WEDNESDAY,UA 1400,ORD,6:52 PM,9:47 PM,EARLY,,,
1,4/10/24,WEDNESDAY,AA 3402,ORD,7:59 PM,10:52 PM,,EARLY,LATE,LATE
2,4/10/24,WEDNESDAY,B6 116,JFK,1:33 PM,2:50 PM,ON-TIME,,,
3,4/10/24,WEDNESDAY,DL 5182,JFK,2:55 PM,4:21 PM,,EARLY,EARLY,ON-TIME
4,4/10/24,WEDNESDAY,WN 5285,MCO,11:05 AM,1:45 PM,EARLY,,,
5,4/10/24,WEDNESDAY,B6 656,MCO,1:35 PM,4:25 PM,,ON-TIME,EARLY,ON-TIME
6,4/11/24,THURSDAY,UA 1400,ORD,6:52 PM,9:47 PM,EARLY,,,
7,4/11/24,THURSDAY,AA 3402,ORD,7:59 PM,10:52 PM,,EARLY,LATE,LATE
8,4/11/24,THURSDAY,B6 116,JFK,1:33 PM,2:50 PM,ON-TIME,,,
9,4/11/24,THURSDAY,DL 5182,JFK,2:55 PM,4:21 PM,,ON-TIME,EARLY,EARLY
