# Imports

In [116]:
import os
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

# File I/O

In [117]:
# Master switch: set to False for a dry run that writes nothing to disk
WRITE_FILES = True

# Report the active mode so the notebook output records it
status_message = (
    "Outputs will be written to CSV files"
    if WRITE_FILES
    else "WARNING: NOT WRITING OUTPUT FILES!"
)
print(status_message)

Outputs will be written to CSV files


In [118]:
def write_csv(df, output_path, filename_without_extension, zip=False):
    """Write ``df`` to a CSV file, optionally wrapped in a zip archive.

    Nothing is written when the notebook-level ``WRITE_FILES`` flag is falsy;
    a warning is printed instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write. The index is not written.
    output_path : str
        Directory to write into (with or without a trailing separator).
    filename_without_extension : str
        Base file name; ``.csv`` or ``.zip`` is appended.
    zip : bool, default False
        When True, write ``<name>.zip`` containing a single ``<name>.csv``.
        The name shadows the builtin ``zip``; it is kept so existing callers
        using ``zip=True`` keep working.
    """
    if not WRITE_FILES:
        print("WARNING: NOT WRITING OUTPUT FILES!")
        return

    if zip:
        compression = {'method': 'zip',
                       'archive_name': filename_without_extension + '.csv'}
        write_extension = '.zip'
        print(f'Zipping "{filename_without_extension}.csv" into "{filename_without_extension}.zip"')
    else:
        compression = None
        write_extension = '.csv'

    # Build the destination once so the logged path is exactly the path written
    # (plain concatenation drops the separator when output_path lacks one).
    destination = os.path.join(output_path, filename_without_extension + write_extension)
    print(f'Writing to "{destination}"')
    df.to_csv(destination, index=False, compression=compression)

In [136]:
def unzip_to_df(zip_filepath, file_inside_zip, dtype=None, parse_dates=None, **read_csv_kwargs):
    """Read one CSV member of a zip archive into a DataFrame.

    Sample usage:
        unzip_to_df('../input_path/data.zip', 'data.csv',
                    dtype={'feature1': object}, parse_dates=['datetime'])
    or simply:
        unzip_to_df('../input_path/data.zip', 'data.csv')

    Parameters
    ----------
    zip_filepath : str
        Path to the zip archive itself (not just its folder).
    file_inside_zip : str
        Name of the CSV member to read from the archive.
    dtype : dict, optional
        Forwarded to ``pd.read_csv``; an empty dict is used when omitted.
    parse_dates : list, optional
        Forwarded to ``pd.read_csv``; an empty list is used when omitted.
    **read_csv_kwargs
        Any further keyword arguments for ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or None if anything went wrong (the error is
        printed, not raised), so callers should check for None.
    """
    # None sentinels avoid sharing one mutable default object across calls;
    # the values handed to read_csv are the same as with the old defaults.
    dtype = {} if dtype is None else dtype
    parse_dates = [] if parse_dates is None else parse_dates

    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype=dtype, parse_dates=parse_dates, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [120]:
# Base folder that the paths below are built from (relative path)
root_folder = '../../../'

# Input: the zip archive and the CSV member inside it
data_folder = f'{root_folder}Preprocessing/Output/Delays/'
modeling_zip, modeling_file = 'modeling_data.zip', 'modeling_data.csv'

# Output: base file names only; write_csv appends the extension
output_path = f'{root_folder}Preprocessing/Output/Delays/'
training_output_file, testing_output_file = 'modeling_data_training', 'modeling_data_testing'


# Read modeling data

In [121]:
# Load the modeling dataset from its zip archive
modeling_df = unzip_to_df(data_folder + modeling_zip, modeling_file)

# unzip_to_df reports failures by printing and returning None; stop here with a
# clear message instead of failing below with an AttributeError on None.
if modeling_df is None:
    raise RuntimeError(f'Could not load "{modeling_file}" from "{data_folder + modeling_zip}"')

modeling_df.head()

Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,Carrier Code,Destination Airport,Manufacturer,Model,Aircraft Age,Aircraft Age Missing,Type of Engine,Number of Seats,Builder Type Certificated,Precipitation Accumulation One Hour,Precipitation Accumulation Six Hours,Air Temperature,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction Interpolation,Wind Gust,Visibility,Ceiling,Ceiling Missing,Sea Level Pressure,Sea Level Pressure Missing
0,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.065403,0.997859,307,UA,EWR,BOEING,757-224,23,False,Turbo-Fan,178,True,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
1,11-30 min,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,278,AA,CLT,AIRBUS,A321-211,7,False,Turbo-Fan,199,True,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
2,31-60 min,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,295,AA,MIA,AIRBUS,A321-231,3,False,Turbo-Fan,379,True,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
3,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,320,B6,JFK,AIRBUS,A321-231,3,False,Turbo-Fan,379,True,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
4,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,209,DL,MSP,BOEING,757-251,24,False,Turbo-Fan,178,True,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0


# Separate modeling data

In [122]:
# Features: every column except the target
X = modeling_df.drop(columns='Delay Bin')

# Target: the delay category label
y = modeling_df['Delay Bin']

## Split into numeric and categorical datasets for scaling and OneHotEncoding, respectively.

In [123]:
# Columns to one-hot encode, including the "... Missing" indicator flags.
# The order here determines the order of the encoded output columns.
categorical_features = [
    'Carrier Code',
    'Destination Airport',
    'Manufacturer',
    'Model',
    'Type of Engine',
    'Builder Type Certificated',
    'Aircraft Age Missing',
    'Ceiling Missing',
    'Sea Level Pressure Missing',
]

# Every remaining column is treated as numeric
numeric_features = X.drop(columns=categorical_features).columns

In [124]:
# One-hot encode the categorical columns. drop='first' omits the first category
# of each feature; sparse_output=False returns a dense array.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_values = encoder.fit_transform(X[categorical_features])

# Names of the generated indicator columns, e.g. "Carrier Code_UA"
encoded_columns = encoder.get_feature_names_out(categorical_features)

# Carry X's index over to the encoded frame. Without it the frame gets a fresh
# 0..n-1 index, and the later index-based merge with the numeric columns would
# only line rows up correctly if X happened to have that same index.
X_categorical = pd.DataFrame(encoded_values, index=X.index, columns=encoded_columns)
X_categorical.head()

Unnamed: 0,Carrier Code_AS,Carrier Code_B6,Carrier Code_DL,Carrier Code_F9,Carrier Code_HA,Carrier Code_MQ,Carrier Code_NK,Carrier Code_OO,Carrier Code_QX,Carrier Code_UA,Carrier Code_WN,Destination Airport_ACV,Destination Airport_ANC,Destination Airport_ASE,Destination Airport_ATL,Destination Airport_AUS,Destination Airport_BDL,Destination Airport_BHM,Destination Airport_BIH,Destination Airport_BNA,Destination Airport_BOI,Destination Airport_BOS,Destination Airport_BTR,Destination Airport_BUF,Destination Airport_BUR,Destination Airport_BWI,Destination Airport_BZN,Destination Airport_CHS,Destination Airport_CID,Destination Airport_CLE,Destination Airport_CLT,Destination Airport_CMH,Destination Airport_COS,Destination Airport_CVG,Destination Airport_DAL,Destination Airport_DCA,Destination Airport_DEN,Destination Airport_DFW,Destination Airport_DRO,Destination Airport_DSM,Destination Airport_DTW,Destination Airport_EGE,Destination Airport_ELP,Destination Airport_EUG,Destination Airport_EWR,Destination Airport_FAT,Destination Airport_FCA,Destination Airport_FLG,Destination Airport_FLL,Destination Airport_GEG,Destination Airport_GJT,Destination Airport_HDN,Destination Airport_HNL,Destination Airport_HOU,Destination Airport_IAD,Destination Airport_IAH,Destination Airport_IND,Destination Airport_ITO,Destination Airport_JAC,Destination Airport_JAX,Destination Airport_JFK,Destination Airport_KOA,Destination Airport_LAS,Destination Airport_LGA,Destination Airport_LIH,Destination Airport_MCI,Destination Airport_MCO,Destination Airport_MDW,Destination Airport_MEM,Destination Airport_MFR,Destination Airport_MIA,Destination Airport_MKE,Destination Airport_MMH,Destination Airport_MRY,Destination Airport_MSN,Destination Airport_MSO,Destination Airport_MSP,Destination Airport_MSY,Destination Airport_MTJ,Destination Airport_OAK,Destination Airport_OGG,Destination Airport_OKC,Destination Airport_OMA,Destination Airport_ORD,Destination Airport_PAE,Destination Airport_PBI,Destination 
Airport_PDX,Destination Airport_PHL,Destination Airport_PHX,Destination Airport_PIT,Destination Airport_PRC,Destination Airport_PSC,Destination Airport_PSP,Destination Airport_RDD,Destination Airport_RDM,Destination Airport_RDU,Destination Airport_RIC,Destination Airport_RNO,Destination Airport_RSW,Destination Airport_SAF,Destination Airport_SAN,Destination Airport_SAT,Destination Airport_SBA,Destination Airport_SBN,Destination Airport_SBP,Destination Airport_SCK,Destination Airport_SDF,Destination Airport_SEA,Destination Airport_SFO,Destination Airport_SGU,Destination Airport_SJC,Destination Airport_SLC,Destination Airport_SMF,Destination Airport_STL,Destination Airport_STS,Destination Airport_SUN,Destination Airport_TPA,Destination Airport_TUL,Destination Airport_TUS,Destination Airport_XNA,Manufacturer_AIRBUS CANADA LP,Manufacturer_BOEING,Manufacturer_BOMBARDIER INC,Manufacturer_EMBRAER,Manufacturer_OTHER,Manufacturer_YABORA INDUSTRIA AERONAUTICA S,Model_737-76N,Model_737-79P,Model_737-7BD,Model_737-7CT,Model_737-7H4,Model_737-7Q8,Model_737-8,Model_737-800,Model_737-823,Model_737-824,Model_737-832,Model_737-890,Model_737-8EH,Model_737-8H4,Model_737-9,Model_737-900ER,Model_737-924ER,Model_737-932ER,Model_737-990,Model_737-990ER,Model_757-224,Model_757-231,Model_757-232,Model_757-251,Model_757-26D,Model_757-2Q8,Model_757-324,Model_757-33N,Model_757-351,Model_767-322,Model_767-332,Model_767-432ER,Model_777-222,Model_777-223,Model_777-323ER,Model_787-10,Model_787-8,Model_787-9,Model_A319-112,Model_A319-114,Model_A319-131,Model_A319-132,Model_A320-211,Model_A320-212,Model_A320-214,Model_A320-232,Model_A320-251N,Model_A320-271N,Model_A321-211,Model_A321-213,Model_A321-231,Model_A321-253NX,Model_A321-271N,Model_A321-271NX,Model_A330-243,Model_A350-941,Model_BD-500-1A10,Model_BD-500-1A11,Model_CL-600-2B19,Model_CL-600-2C10,Model_CL-600-2C11,Model_ERJ 170-200 LL,Model_ERJ 170-200 LR,Model_OTHER,Type of Engine_Turbo-Fan,Type of Engine_Turbo-Jet,Builder Type 
Certificated_True,Aircraft Age Missing_True,Ceiling Missing_1,Sea Level Pressure Missing_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [125]:
# Numeric features only: remove the columns that were one-hot encoded
X_numeric = X.drop(columns=categorical_features)
X_numeric.head()

Unnamed: 0,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,Aircraft Age,Number of Seats,Precipitation Accumulation One Hour,Precipitation Accumulation Six Hours,Air Temperature,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction Interpolation,Wind Gust,Visibility,Ceiling,Sea Level Pressure
0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.065403,0.997859,307,23,178,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5
1,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,278,7,199,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5
2,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,295,3,379,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5
3,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,320,3,379,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5
4,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,209,24,178,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5


In [126]:
# Join the numeric and one-hot encoded columns row by row on their index.
# NOTE: this rebinds X to the merged frame, which no longer contains the raw
# categorical columns; to re-run the encoding cells, start again from the
# "Separate modeling data" cell.
X = pd.merge(X_numeric, X_categorical, left_index=True, right_index=True)
X.head()

Unnamed: 0,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,Aircraft Age,Number of Seats,Precipitation Accumulation One Hour,Precipitation Accumulation Six Hours,Air Temperature,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction Interpolation,Wind Gust,Visibility,Ceiling,Sea Level Pressure,Carrier Code_AS,Carrier Code_B6,Carrier Code_DL,Carrier Code_F9,Carrier Code_HA,Carrier Code_MQ,Carrier Code_NK,Carrier Code_OO,Carrier Code_QX,Carrier Code_UA,Carrier Code_WN,Destination Airport_ACV,Destination Airport_ANC,Destination Airport_ASE,Destination Airport_ATL,Destination Airport_AUS,Destination Airport_BDL,Destination Airport_BHM,Destination Airport_BIH,Destination Airport_BNA,Destination Airport_BOI,Destination Airport_BOS,Destination Airport_BTR,Destination Airport_BUF,Destination Airport_BUR,Destination Airport_BWI,Destination Airport_BZN,Destination Airport_CHS,Destination Airport_CID,Destination Airport_CLE,Destination Airport_CLT,Destination Airport_CMH,Destination Airport_COS,Destination Airport_CVG,Destination Airport_DAL,Destination Airport_DCA,Destination Airport_DEN,Destination Airport_DFW,Destination Airport_DRO,Destination Airport_DSM,Destination Airport_DTW,Destination Airport_EGE,Destination Airport_ELP,Destination Airport_EUG,Destination Airport_EWR,Destination Airport_FAT,Destination Airport_FCA,Destination Airport_FLG,Destination Airport_FLL,Destination Airport_GEG,Destination Airport_GJT,Destination Airport_HDN,Destination Airport_HNL,Destination Airport_HOU,Destination Airport_IAD,Destination Airport_IAH,Destination Airport_IND,Destination Airport_ITO,Destination Airport_JAC,Destination Airport_JAX,Destination Airport_JFK,Destination Airport_KOA,Destination Airport_LAS,Destination Airport_LGA,Destination Airport_LIH,Destination Airport_MCI,Destination Airport_MCO,Destination Airport_MDW,Destination 
Airport_MEM,Destination Airport_MFR,Destination Airport_MIA,Destination Airport_MKE,Destination Airport_MMH,Destination Airport_MRY,Destination Airport_MSN,Destination Airport_MSO,Destination Airport_MSP,Destination Airport_MSY,Destination Airport_MTJ,Destination Airport_OAK,Destination Airport_OGG,Destination Airport_OKC,Destination Airport_OMA,Destination Airport_ORD,Destination Airport_PAE,Destination Airport_PBI,Destination Airport_PDX,Destination Airport_PHL,Destination Airport_PHX,Destination Airport_PIT,Destination Airport_PRC,Destination Airport_PSC,Destination Airport_PSP,Destination Airport_RDD,Destination Airport_RDM,Destination Airport_RDU,Destination Airport_RIC,Destination Airport_RNO,Destination Airport_RSW,Destination Airport_SAF,Destination Airport_SAN,Destination Airport_SAT,Destination Airport_SBA,Destination Airport_SBN,Destination Airport_SBP,Destination Airport_SCK,Destination Airport_SDF,Destination Airport_SEA,Destination Airport_SFO,Destination Airport_SGU,Destination Airport_SJC,Destination Airport_SLC,Destination Airport_SMF,Destination Airport_STL,Destination Airport_STS,Destination Airport_SUN,Destination Airport_TPA,Destination Airport_TUL,Destination Airport_TUS,Destination Airport_XNA,Manufacturer_AIRBUS CANADA LP,Manufacturer_BOEING,Manufacturer_BOMBARDIER INC,Manufacturer_EMBRAER,Manufacturer_OTHER,Manufacturer_YABORA INDUSTRIA AERONAUTICA 
S,Model_737-76N,Model_737-79P,Model_737-7BD,Model_737-7CT,Model_737-7H4,Model_737-7Q8,Model_737-8,Model_737-800,Model_737-823,Model_737-824,Model_737-832,Model_737-890,Model_737-8EH,Model_737-8H4,Model_737-9,Model_737-900ER,Model_737-924ER,Model_737-932ER,Model_737-990,Model_737-990ER,Model_757-224,Model_757-231,Model_757-232,Model_757-251,Model_757-26D,Model_757-2Q8,Model_757-324,Model_757-33N,Model_757-351,Model_767-322,Model_767-332,Model_767-432ER,Model_777-222,Model_777-223,Model_777-323ER,Model_787-10,Model_787-8,Model_787-9,Model_A319-112,Model_A319-114,Model_A319-131,Model_A319-132,Model_A320-211,Model_A320-212,Model_A320-214,Model_A320-232,Model_A320-251N,Model_A320-271N,Model_A321-211,Model_A321-213,Model_A321-231,Model_A321-253NX,Model_A321-271N,Model_A321-271NX,Model_A330-243,Model_A350-941,Model_BD-500-1A10,Model_BD-500-1A11,Model_CL-600-2B19,Model_CL-600-2C10,Model_CL-600-2C11,Model_ERJ 170-200 LL,Model_ERJ 170-200 LR,Model_OTHER,Type of Engine_Turbo-Fan,Type of Engine_Turbo-Jet,Builder Type Certificated_True,Aircraft Age Missing_True,Ceiling Missing_1,Sea Level Pressure Missing_1
0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.065403,0.997859,307,23,178,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,278,7,199,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,295,3,379,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,320,3,379,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,209,24,178,0.0,0.0,57.02,33.08,40.23,5.75,80.0,0.0,10.0,35000,1017.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


## Split into training and testing sets

In [127]:
# Stratify on the target so both partitions keep the same class proportions;
# the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, stratify=y, random_state=69)
X_train, X_test, y_train, y_test = split_parts

In [128]:
# Pull the numeric and the one-hot encoded columns out of each partition.
# The scaler returns plain arrays, so each frame's index is kept to rebuild
# DataFrames afterwards.

# Training partition
X_train_numeric = X_train[numeric_features].copy()
X_train_encoded = X_train[encoded_columns].copy()
X_train_numeric_index, X_train_encoded_index = X_train_numeric.index, X_train_encoded.index

# Testing partition
X_test_numeric = X_test[numeric_features].copy()
X_test_encoded = X_test[encoded_columns].copy()
X_test_numeric_index, X_test_encoded_index = X_test_numeric.index, X_test_encoded.index

In [129]:
# Fit the scaler on the training partition only, so the scaling parameters
# are not influenced by the test data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_numeric)

# Apply the training-fitted scaling to both partitions (array output)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train_numeric, X_test_numeric)
)

# Recombine scaled and encoded features

In [130]:
# Wrap the scaled arrays back into DataFrames, restoring the saved row indices
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numeric_features, index=X_train_numeric_index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numeric_features, index=X_test_numeric_index)

# Encoded columns were never scaled; they already carry the right index
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns)

# Scaled numeric + encoded categorical, matched on the row index
X_train_full_df = pd.merge(X_train_scaled_df, X_train_encoded_df, left_index=True, right_index=True)
X_test_full_df = pd.merge(X_test_scaled_df, X_test_encoded_df, left_index=True, right_index=True)
print(X_train_full_df.shape, X_test_full_df.shape)

(640962, 218) (213655, 218)


# Add target column back

In [131]:
# Start from the FULL feature frames (scaled numeric + one-hot encoded).
# Starting from the *_scaled_df frames would leave every encoded categorical
# column out of the written files.
training_data_output_df = X_train_full_df.copy()
# Assignment aligns on the row index, so each row gets its own label
training_data_output_df['Delay Bin'] = y_train

testing_data_output_df = X_test_full_df.copy()
testing_data_output_df['Delay Bin'] = y_test

In [132]:
# Row counts of the two output frames, one per line (training, then testing)
print(len(training_data_output_df), len(testing_data_output_df), sep='\n')

640962
213655


# Write data to zipped CSVs

In [133]:
# Training partition -> "<training_output_file>.zip" in output_path
write_csv(
    df=training_data_output_df,
    output_path=output_path,
    filename_without_extension=training_output_file,
    zip=True,
)

Zipping "modeling_data_training.csv" into "modeling_data_training.zip"
Writing to "../../../Preprocessing/Output/Delays/modeling_data_training.zip"


In [134]:
# Testing partition -> "<testing_output_file>.zip" in output_path
write_csv(
    df=testing_data_output_df,
    output_path=output_path,
    filename_without_extension=testing_output_file,
    zip=True,
)

Zipping "modeling_data_testing.csv" into "modeling_data_testing.zip"
Writing to "../../../Preprocessing/Output/Delays/modeling_data_testing.zip"


# Save scaler for future use...

In [135]:
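# TODO: persist the fitted StandardScaler (`X_scaler`) to disk so the same scaling can be re-applied to new data later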