In [1]:
# Cell to load dataset, only run once since dataset is very very large
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

print("starting")
# NOTE(review): the recorded output shows a warning raised on this read_csv
# line. If it is pandas' mixed-dtype warning, pass an explicit `dtype=` mapping
# for the affected columns -- TODO confirm which columns are involved.
df_total = pd.read_csv('Data/Flights.csv')

# Keep only flights that departed from BOS and were neither cancelled nor
# diverted. A single combined mask selects exactly the same rows as three
# chained filters, without holding two extra intermediate frames in kernel
# memory next to the (very large) full dataset.
completed_from_bos = (
    (df_total['ORIGIN_AIRPORT'] == 'BOS')
    & (df_total['CANCELLED'] == 0)
    & (df_total['DIVERTED'] == 0)
)
df = df_total[completed_from_bos]
print("DONE")

# The prediction task is estimating delay, so the cancellation/diversion
# columns carry no information after the filter above. ORIGIN_AIRPORT is
# constant after filtering, and YEAR is the same for every row according to
# the original author's note, so both are removed as well.
df = df.drop(
    columns=['CANCELLED', 'DIVERTED', 'CANCELLATION_REASON', 'YEAR', 'ORIGIN_AIRPORT']
)
print(df.shape)

starting


  df_total = pd.read_csv('Data/Flights.csv')


DONE
(104804, 26)


In [47]:
# Show the column index first, then the dtype of every column.
for summary in (df.columns, df.dtypes):
    print(summary)

Index(['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
       'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON',
       'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')
MONTH                    int64
DAY                      int64
DAY_OF_WEEK              int64
AIRLINE                 object
FLIGHT_NUMBER            int64
TAIL_NUMBER             object
DESTINATION_AIRPORT     object
SCHEDULED_DEPARTURE      int64
DEPARTURE_TIME         float64
DEPARTURE_DELAY        float64
TAXI_OUT               float64
WHEELS_OFF             float64
SCHEDULED_TIME         float64
ELAPSED_TIME           float64
AIR_TIME               float64
DISTANCE                 int64
WHEELS_ON         

In [50]:
# Split the data into roughly 60% train / 20% validation / 20% test.
# Both splitting steps are grouped by TAIL_NUMBER, so rows sharing a tail
# number never end up on both sides of a split.
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold
groups = df['TAIL_NUMBER']

# Target is the arrival delay; the grouping key and the flight number are
# excluded from the feature matrix.
y = df['ARRIVAL_DELAY']
X = df.drop(columns=['ARRIVAL_DELAY', 'TAIL_NUMBER', 'FLIGHT_NUMBER'])
print(X.columns)

# Step 1: hold out the test set. n_splits=1 means the splitter yields exactly
# one (other, test) pair of positional index arrays.
gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=0)
i_other, i_test = next(gss.split(X, y, groups))
X_other, y_other, groups_other = X.iloc[i_other], y.iloc[i_other], groups.iloc[i_other]
X_test, y_test, groups_test = X.iloc[i_test], y.iloc[i_test], groups.iloc[i_test]

# Step 2: split the remainder into train/validation. Only the LAST of the four
# folds is kept (same fold the original loop ended on), giving a 3:1 split of
# the remaining data.
groups_kfold = GroupKFold(n_splits=4)
*_, (train_index, val_index) = groups_kfold.split(X_other, y_other, groups_other)
X_train, y_train = X_other.iloc[train_index], y_other.iloc[train_index]
# val_index is positional within X_other / y_other, so the labels must be taken
# from y_other. Indexing the full `y` here would pair X_val with the labels of
# unrelated rows.
X_val, y_val = X_other.iloc[val_index], y_other.iloc[val_index]

# Sanity checks: every feature row must be paired with its own label.
assert X_train.index.equals(y_train.index)
assert X_val.index.equals(y_val.index)
assert X_test.index.equals(y_test.index)

print(X_test.shape, X_train.shape, X_val.shape)


Index(['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')
(22254, 23) (61913, 23) (20637, 23)


In [48]:
# Cell to encode data use onehot, minmax, and standard
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler


# Feature groups, one per preprocessing strategy.
onehot_ftrs = ['DESTINATION_AIRPORT', 'AIRLINE']
minmax_ftrs = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME', 'ARRIVAL_TIME', 'SCHEDULED_TIME', 'DISTANCE']
std_ftrs = ['DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
# NOTE(review): several of these features (e.g. ARRIVAL_TIME together with
# SCHEDULED_ARRIVAL, and the *_DELAY breakdown columns) look like they are only
# known after landing and may leak the ARRIVAL_DELAY target -- confirm before
# modelling.
# NOTE(review): the *_DELAY breakdown columns contain many missing values (see
# the missing-value cell); StandardScaler passes NaN through, so downstream
# estimators may need an imputation step -- confirm.

# collect all the encoders
# Dense output is requested through ColumnTransformer(sparse_threshold=0)
# instead of OneHotEncoder(sparse=False): the `sparse` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas
# `sparse_threshold=0` always returns a dense array on both old and new
# versions.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_ftrs),
        ('minmax', MinMaxScaler(), minmax_ftrs),
        ('std', StandardScaler(), std_ftrs)],
    sparse_threshold=0)

clf = Pipeline(steps=[('preprocessor', preprocessor)]) # for now we only preprocess
                                                       # later on we will add other steps here

# Final prepped data to be used in ml algorithm.
# The preprocessor is fitted on the training split only and then applied
# unchanged to the validation and test splits.
X_train_prep = clf.fit_transform(X_train)
X_val_prep = clf.transform(X_val)
X_test_prep = clf.transform(X_test)



In [72]:
# Report the number of missing entries in each column, one column per line.
missing_per_column = df.isnull().sum()
for column_name, n_missing in missing_per_column.items():
    print(column_name, n_missing)



MONTH 0
DAY 0
DAY_OF_WEEK 0
AIRLINE 0
FLIGHT_NUMBER 0
TAIL_NUMBER 0
DESTINATION_AIRPORT 0
SCHEDULED_DEPARTURE 0
DEPARTURE_TIME 0
DEPARTURE_DELAY 0
TAXI_OUT 0
WHEELS_OFF 0
SCHEDULED_TIME 0
ELAPSED_TIME 0
AIR_TIME 0
DISTANCE 0
WHEELS_ON 0
TAXI_IN 0
SCHEDULED_ARRIVAL 0
ARRIVAL_TIME 0
ARRIVAL_DELAY 0
AIR_SYSTEM_DELAY 83441
SECURITY_DELAY 83441
AIRLINE_DELAY 83441
LATE_AIRCRAFT_DELAY 83441
WEATHER_DELAY 83441
