## Setup

In [1]:
#imports
import zipfile
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype={'Manufacturer Code': str}, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [3]:
# get df
zip_path = 'Output/Delays/modeling_data.zip'
file_name = 'modeling_data.csv'

delays_df = unzip_to_df(zip_path, file_name)

## Check Data
---

In [4]:
# display
delays_df.head()

Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,...,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction Interpolation,Wind Gust,Visibility,Ceiling,Ceiling Missing,Sea Level Pressure,Sea Level Pressure Missing
0,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.065403,0.997859,307,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
1,11-30 min,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,278,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
2,31-60 min,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,295,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
3,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,320,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
4,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,209,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0


In [5]:
# convert dtype to int if bool
bool_cols = [col for col in list(delays_df.columns) if delays_df[col].dtype == 'bool']

# convert bool types to int64
for bool in bool_cols:
    delays_df[bool] = delays_df[bool].astype(np.int64)
    print(f'{bool}: {delays_df[bool].dtype}')

Aircraft Age Missing: int64
Builder Type Certificated: int64


In [6]:
# data type
delays_df.dtypes

Delay Bin                                   object
Month (sin)                                float64
Month (cos)                                float64
Day (sin)                                  float64
Day (cos)                                  float64
Day of Week (sin)                          float64
Day of Week (cos)                          float64
Scheduled Departure Total Minutes (sin)    float64
Scheduled Departure Total Minutes (cos)    float64
Scheduled Elapsed Time                       int64
Carrier Code                                object
Destination Airport                         object
Manufacturer                                object
Model                                       object
Aircraft Age                                 int64
Aircraft Age Missing                         int64
Type of Engine                              object
Number of Seats                              int64
Builder Type Certificated                    int64
Precipitation Accumulation One 

## Categorize Features
---

### separate y and x's

In [7]:
# separate 
t_name = 'Delay Bin'
y_var = delays_df[t_name]
x_vars = delays_df.drop(columns=t_name).copy()

### identify categorical and numerical columns

In [8]:
# get x cols list
cols_list = list(x_vars.columns)

# get numeric cols
num_names = [
    col for col in cols_list
    if (x_vars[col].dtypes == 'int64' or x_vars[col].dtypes == 'float64')
        and col.endswith('Missing') == False
]

# get categorical cols--> leftover
cat_names = list(set(cols_list) - set(num_names))

In [9]:
# display cols names
display(num_names)
cat_names

['Month (sin)',
 'Month (cos)',
 'Day (sin)',
 'Day (cos)',
 'Day of Week (sin)',
 'Day of Week (cos)',
 'Scheduled Departure Total Minutes (sin)',
 'Scheduled Departure Total Minutes (cos)',
 'Scheduled Elapsed Time',
 'Aircraft Age',
 'Number of Seats',
 'Builder Type Certificated',
 'Precipitation Accumulation One Hour',
 'Precipitation Accumulation Six Hours',
 'Air Temperature',
 'Dew Point Temperature',
 'Relative Humidity',
 'Wind Speed',
 'Wind Direction Interpolation',
 'Wind Gust',
 'Visibility',
 'Ceiling',
 'Sea Level Pressure']

['Carrier Code',
 'Sea Level Pressure Missing',
 'Manufacturer',
 'Ceiling Missing',
 'Aircraft Age Missing',
 'Type of Engine',
 'Model',
 'Destination Airport']

## Train/Test Datasets Split
---

### Split data

In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    x_vars, y_var, 
    random_state=1, 
    stratify=y_var  #b/c classification
)

### Encode categorical variables and scale numeric

In [11]:
# create transformers for numeric and categorical columns separately
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(
    drop='first',  #drop first createdcol
    handle_unknown='ignore',  #prevents errors if test/new data has unforseen categories
    sparse_output=False
)

In [12]:
# combine transformers with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_names),  #apply onehotencoder to category cols
        ('num', num_transformer, num_names)  #apply scaling to numeric cols
    ],
    #remainder='passthrough'  #keep rest of the cols untransformed
    remainder='drop'  #drop rest of the columns
)

In [16]:
# fit/transform on x train and transform x test --> avoid data leakage
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)



### Inspecting unknown category

##### unknown category is a destination airport that only appeared once in the main df

In [26]:
# get unknown category col and inspect
unknown_col = cat_names[7]
unknown_vals = set(x_test[unknown_col].unique()) - set(x_train[unknown_col].unique())

# display
print(f'Affected column: {unknown_col}')
print(f'Unknown values: {unknown_vals}')
delays_df[delays_df['Destination Airport'] == 'BUR']

Affected column: Destination Airport
Unknown values: {'BUR'}


Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,...,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction Interpolation,Wind Gust,Visibility,Ceiling,Ceiling Missing,Sea Level Pressure,Sea Level Pressure Missing
835507,31-60 min,-0.5,0.866025,-0.848644,0.528964,0.866025,0.5,-0.833886,0.551937,30,...,59.0,93.2,4.6,260.0,0.0,9.0,700,0,1019.0,0


## Export
---

### Review processed data

In [27]:
# look into col names
encoded_feature_names = preprocessor.get_feature_names_out()
encoded_feature_names

array(['cat__Carrier Code_AS', 'cat__Carrier Code_B6',
       'cat__Carrier Code_DL', 'cat__Carrier Code_F9',
       'cat__Carrier Code_HA', 'cat__Carrier Code_MQ',
       'cat__Carrier Code_NK', 'cat__Carrier Code_OO',
       'cat__Carrier Code_QX', 'cat__Carrier Code_UA',
       'cat__Carrier Code_WN', 'cat__Sea Level Pressure Missing_1',
       'cat__Manufacturer_AIRBUS CANADA LP', 'cat__Manufacturer_BOEING',
       'cat__Manufacturer_BOMBARDIER INC', 'cat__Manufacturer_EMBRAER',
       'cat__Manufacturer_OTHER',
       'cat__Manufacturer_YABORA INDUSTRIA AERONAUTICA S',
       'cat__Ceiling Missing_1', 'cat__Aircraft Age Missing_1',
       'cat__Type of Engine_Turbo-Fan', 'cat__Type of Engine_Turbo-Jet',
       'cat__Model_737-76N', 'cat__Model_737-79P', 'cat__Model_737-7BD',
       'cat__Model_737-7CT', 'cat__Model_737-7H4', 'cat__Model_737-7Q8',
       'cat__Model_737-8', 'cat__Model_737-800', 'cat__Model_737-823',
       'cat__Model_737-824', 'cat__Model_737-832', 'cat__Model_73

### Convert to dfs

In [28]:
# get df versions
x_train_processed_df = pd.DataFrame(x_train_processed, columns=encoded_feature_names)
x_test_processed_df = pd.DataFrame(x_train_processed, columns=encoded_feature_names)

In [29]:
# display training
x_train_processed_df

Unnamed: 0,cat__Carrier Code_AS,cat__Carrier Code_B6,cat__Carrier Code_DL,cat__Carrier Code_F9,cat__Carrier Code_HA,cat__Carrier Code_MQ,cat__Carrier Code_NK,cat__Carrier Code_OO,cat__Carrier Code_QX,cat__Carrier Code_UA,...,num__Precipitation Accumulation Six Hours,num__Air Temperature,num__Dew Point Temperature,num__Relative Humidity,num__Wind Speed,num__Wind Direction Interpolation,num__Wind Gust,num__Visibility,num__Ceiling,num__Sea Level Pressure
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.002117,-0.807790,0.033613,0.647823,-0.674606,-0.998722,-0.182304,0.511330,-1.404302,0.638906
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,-1.356855,-0.436958,0.462968,-1.615149,-1.899008,-0.182304,0.511330,-0.958320,0.276965
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,2.386769,1.529357,-0.180072,-1.615149,-1.899008,-0.182304,-0.395617,0.764792,-0.421065
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,2.536514,0.403348,-1.439820,1.208525,0.601786,-0.182304,0.511330,0.764792,-2.075653
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,1.413427,-2.773007,-2.852824,-0.909742,0.001595,-0.182304,0.511330,0.764792,1.724730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,-2.454984,-1.092396,0.568298,-1.615149,-1.899008,-0.182304,0.511330,0.764792,0.432083
640958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,1.014107,0.134450,-0.803626,0.973389,0.601786,-0.182304,0.511330,0.764792,0.276965
640959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.036958,1.138894,-2.386467,-2.651116,-0.439470,-1.598913,-0.182304,0.511330,0.764792,1.207671
640960,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,0.165552,0.588215,0.513527,-0.202290,0.401723,-0.182304,0.511330,0.764792,-0.059123


In [30]:
# display processed
x_test_processed_df

Unnamed: 0,cat__Carrier Code_AS,cat__Carrier Code_B6,cat__Carrier Code_DL,cat__Carrier Code_F9,cat__Carrier Code_HA,cat__Carrier Code_MQ,cat__Carrier Code_NK,cat__Carrier Code_OO,cat__Carrier Code_QX,cat__Carrier Code_UA,...,num__Precipitation Accumulation Six Hours,num__Air Temperature,num__Dew Point Temperature,num__Relative Humidity,num__Wind Speed,num__Wind Direction Interpolation,num__Wind Gust,num__Visibility,num__Ceiling,num__Sea Level Pressure
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.002117,-0.807790,0.033613,0.647823,-0.674606,-0.998722,-0.182304,0.511330,-1.404302,0.638906
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,-1.356855,-0.436958,0.462968,-1.615149,-1.899008,-0.182304,0.511330,-0.958320,0.276965
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,2.386769,1.529357,-0.180072,-1.615149,-1.899008,-0.182304,-0.395617,0.764792,-0.421065
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,2.536514,0.403348,-1.439820,1.208525,0.601786,-0.182304,0.511330,0.764792,-2.075653
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,1.413427,-2.773007,-2.852824,-0.909742,0.001595,-0.182304,0.511330,0.764792,1.724730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,-2.454984,-1.092396,0.568298,-1.615149,-1.899008,-0.182304,0.511330,0.764792,0.432083
640958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,1.014107,0.134450,-0.803626,0.973389,0.601786,-0.182304,0.511330,0.764792,0.276965
640959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.036958,1.138894,-2.386467,-2.651116,-0.439470,-1.598913,-0.182304,0.511330,0.764792,1.207671
640960,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036958,0.165552,0.588215,0.513527,-0.202290,0.401723,-0.182304,0.511330,0.764792,-0.059123


### Export as zipped csvs