## Setup

In [1]:
#imports
import zipfile
import joblib
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype={'Manufacturer Code': str}, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [3]:
# get df
zip_path = 'Output/Delays/modeling_data_dest.zip'
file_name = 'modeling_data_dest.csv'

delays_df = unzip_to_df(zip_path, file_name)

## Check Data
---

In [4]:
# display
delays_df.head()

Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,...,Destination Relative Humidity,Destination Wind Speed,Destination Wind Direction (sin),Destination Wind Direction (cos),Destination Wind Gust,Destination Visibility,Destination Ceiling,Destination Ceiling Missing,Destination Sea Level Pressure,Destination Sea Level Pressure Missing
0,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.254602,0.967046,147,...,64.32,5.75,-0.766044,-0.642788,0.0,10.0,4700.0,0.0,1017.7,0.0
1,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.246153,0.969231,183,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
2,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,209,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
3,11-30 min,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.382683,0.92388,179,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
4,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.207912,0.978148,234,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0


In [5]:
# convert dtype to int if bool
bool_cols = [col for col in list(delays_df.columns) if delays_df[col].dtype == 'bool']

# convert bool types to int64
for bool in bool_cols:
    delays_df[bool] = delays_df[bool].astype(np.int64)
    print(f'{bool}: {delays_df[bool].dtype}')

Aircraft Age Missing: int64
Builder Type Certificated: int64


In [6]:
# data type
delays_df.dtypes

Delay Bin                                           object
Month (sin)                                        float64
Month (cos)                                        float64
Day (sin)                                          float64
Day (cos)                                          float64
Day of Week (sin)                                  float64
Day of Week (cos)                                  float64
Scheduled Departure Total Minutes (sin)            float64
Scheduled Departure Total Minutes (cos)            float64
Scheduled Elapsed Time                               int64
Carrier Code                                        object
Destination Airport                                 object
Manufacturer                                        object
Model                                               object
Aircraft Age                                         int64
Aircraft Age Missing                                 int64
Type of Engine                                      obje

## Categorize Features
---

### separate y and x's

In [7]:
# separate 
t_name = 'Delay Bin'
y_var = delays_df[t_name]
x_vars = delays_df.drop(columns=t_name).copy()

### identify categorical and numerical columns

In [8]:
# get x cols list
cols_list = list(x_vars.columns)

# get numeric cols
num_names = [
    col for col in cols_list
    if (x_vars[col].dtypes == 'int64' or x_vars[col].dtypes == 'float64')
        and col.endswith('Missing') == False and col.endswith('Certificated') == False
]

# get categorical cols--> leftover
cat_names = list(set(cols_list) - set(num_names))

In [9]:
# display cols names
display(num_names)
cat_names

['Month (sin)',
 'Month (cos)',
 'Day (sin)',
 'Day (cos)',
 'Day of Week (sin)',
 'Day of Week (cos)',
 'Scheduled Departure Total Minutes (sin)',
 'Scheduled Departure Total Minutes (cos)',
 'Scheduled Elapsed Time',
 'Aircraft Age',
 'Number of Seats',
 'Precipitation Accumulation One Hour',
 'Precipitation Accumulation Six Hours',
 'Air Temperature',
 'Dew Point Temperature',
 'Relative Humidity',
 'Wind Speed',
 'Wind Direction (sin)',
 'Wind Direction (cos)',
 'Wind Gust',
 'Visibility',
 'Ceiling',
 'Sea Level Pressure',
 'Destination Precipication Accumulation One Hour',
 'Destination Precipitation Six Hours',
 'Destination Air Temperature',
 'Destination Dew Point Temperature',
 'Destination Relative Humidity',
 'Destination Wind Speed',
 'Destination Wind Direction (sin)',
 'Destination Wind Direction (cos)',
 'Destination Wind Gust',
 'Destination Visibility',
 'Destination Ceiling',
 'Destination Sea Level Pressure']

['Model',
 'Destination Sea Level Pressure Missing',
 'Ceiling Missing',
 'Aircraft Age Missing',
 'Sea Level Pressure Missing',
 'Type of Engine',
 'Carrier Code',
 'Builder Type Certificated',
 'Destination Ceiling Missing',
 'Manufacturer',
 'Destination Airport']

## Train/Test Datasets Split
---

### Split data

In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    x_vars, y_var, 
    random_state=1, 
    stratify=y_var  #b/c classification
)

### Encode categorical variables and scale numeric

In [11]:
# create transformers for numeric and categorical columns separately
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(
    drop='first',  #drop first createdcol
    handle_unknown='ignore',  #prevents errors if test/new data has unforseen categories
    sparse_output=False
)

In [12]:
# combine transformers with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_names),  #apply onehotencoder to category cols
        ('num', num_transformer, num_names)  #apply scaling to numeric cols
    ],
    #remainder='passthrough'  #keep rest of the cols untransformed
    remainder='drop'  #drop rest of the columns
)

In [13]:
# fit/transform on x train and transform x test --> avoid data leakage
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)

### Inspecting unknown category

##### unknown category is a destination airport that only appeared once in the main df

In [14]:
# get unknown category col and inspect
unknown_col = cat_names[4]
unknown_vals = set(x_test[unknown_col].unique()) - set(x_train[unknown_col].unique())

# display
print(f'Affected column: {unknown_col}')
print(f'Unknown values: {unknown_vals}')
delays_df[delays_df['Destination Airport'] == 'BUR']

Affected column: Sea Level Pressure Missing
Unknown values: set()


Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,...,Destination Relative Humidity,Destination Wind Speed,Destination Wind Direction (sin),Destination Wind Direction (cos),Destination Wind Gust,Destination Visibility,Destination Ceiling,Destination Ceiling Missing,Destination Sea Level Pressure,Destination Sea Level Pressure Missing


## Export
---

### Train/test data

##### Review processed data

In [15]:
# look into col names
encoded_feature_names = preprocessor.get_feature_names_out()
encoded_feature_names

array(['cat__Model_737-76N', 'cat__Model_737-79P', 'cat__Model_737-7BD',
       'cat__Model_737-7CT', 'cat__Model_737-7H4', 'cat__Model_737-7Q8',
       'cat__Model_737-8', 'cat__Model_737-800', 'cat__Model_737-823',
       'cat__Model_737-824', 'cat__Model_737-832', 'cat__Model_737-890',
       'cat__Model_737-8EH', 'cat__Model_737-8H4', 'cat__Model_737-9',
       'cat__Model_737-900ER', 'cat__Model_737-924ER',
       'cat__Model_737-932ER', 'cat__Model_737-990',
       'cat__Model_737-990ER', 'cat__Model_757-224', 'cat__Model_757-231',
       'cat__Model_757-232', 'cat__Model_757-251', 'cat__Model_757-26D',
       'cat__Model_757-2Q8', 'cat__Model_757-324', 'cat__Model_757-33N',
       'cat__Model_757-351', 'cat__Model_767-322', 'cat__Model_767-332',
       'cat__Model_767-432ER', 'cat__Model_777-222', 'cat__Model_777-223',
       'cat__Model_777-323ER', 'cat__Model_787-10', 'cat__Model_787-8',
       'cat__Model_787-9', 'cat__Model_A319-112', 'cat__Model_A319-114',
       'cat__Mode

##### Convert to dfs

In [16]:
# get df versions
x_train_processed_df = pd.DataFrame(x_train_processed, columns=encoded_feature_names)
x_test_processed_df = pd.DataFrame(x_test_processed, columns=encoded_feature_names)

In [17]:
# display training
x_train_processed_df

Unnamed: 0,cat__Model_737-76N,cat__Model_737-79P,cat__Model_737-7BD,cat__Model_737-7CT,cat__Model_737-7H4,cat__Model_737-7Q8,cat__Model_737-8,cat__Model_737-800,cat__Model_737-823,cat__Model_737-824,...,num__Destination Air Temperature,num__Destination Dew Point Temperature,num__Destination Relative Humidity,num__Destination Wind Speed,num__Destination Wind Direction (sin),num__Destination Wind Direction (cos),num__Destination Wind Gust,num__Destination Visibility,num__Destination Ceiling,num__Destination Sea Level Pressure
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.561290,-1.270340,-1.459132,0.288275,1.196912,-0.622234,-0.198453,0.319289,-0.704622,1.268863
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.997298,0.431024,1.462126,-0.330024,-1.297269,-0.856675,-0.198453,0.319289,-0.485966,0.878214
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-2.201661,-1.820782,0.983547,-0.639173,-1.399024,-0.622234,-0.198453,0.319289,1.358937,-1.231286
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.950937,-0.169458,-1.139544,0.290963,0.371366,1.279823,-0.198453,0.319289,-0.697789,-0.606249
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.773825,0.130783,-0.822365,0.600112,0.138799,1.346774,-0.198453,0.319289,1.358937,0.956344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.950937,0.681224,-0.487520,1.218410,1.095158,-0.856675,-0.198453,0.319289,1.358937,0.721955
622425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.171643,0.431024,0.215091,-0.639173,-1.482324,-0.114677,-0.198453,0.319289,-0.690956,0.018788
622426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.340584,0.431024,-1.022308,0.600112,0.957058,0.839217,-0.198453,0.319289,1.358937,-1.270351
622427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.430539,-1.270340,-0.550153,-0.948322,0.957058,0.839217,-0.198453,0.319289,-0.554296,-0.489055


In [18]:
# display processed
x_test_processed_df

Unnamed: 0,cat__Model_737-76N,cat__Model_737-79P,cat__Model_737-7BD,cat__Model_737-7CT,cat__Model_737-7H4,cat__Model_737-7Q8,cat__Model_737-8,cat__Model_737-800,cat__Model_737-823,cat__Model_737-824,...,num__Destination Air Temperature,num__Destination Dew Point Temperature,num__Destination Relative Humidity,num__Destination Wind Speed,num__Destination Wind Direction (sin),num__Destination Wind Direction (cos),num__Destination Wind Gust,num__Destination Visibility,num__Destination Ceiling,num__Destination Sea Level Pressure
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.891031,0.431024,1.462126,-0.639173,-1.297269,-0.856675,-0.198453,0.319289,-0.848114,0.609318
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.950937,-0.419658,-1.139544,0.290963,0.957058,-1.068572,-0.198453,0.319289,1.358937,-0.254666
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.880092,0.681224,-0.487520,0.290963,-0.791690,1.170501,-0.198453,0.319289,-0.827615,0.738232
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.234317,0.981465,-0.472264,0.909261,0.786807,1.022130,-0.198453,0.319289,1.358937,0.531188
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.384178,-0.169458,-0.839228,0.290963,0.786807,1.022130,-0.198453,0.319289,1.358937,-1.114092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-2.768421,-1.820782,1.443658,-1.875769,-0.101056,1.369319,-0.198453,0.319289,1.358937,-2.286037
207473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.950937,0.431024,-0.487520,-0.948322,-0.988918,-1.251484,-0.198453,0.319289,-0.827615,-0.332795
207474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.430539,-0.719899,-0.189613,-0.330024,-1.159169,-1.068572,-0.198453,0.319289,-0.820782,1.581381
207475,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.040892,-1.270340,-0.873756,0.600112,0.589579,1.170501,-0.198453,0.319289,-0.622626,-0.332795


##### Export as zipped csvs

In [19]:
# function to export df as zipped csv
def export_zipped_file(df, name):
    
    # set up output path
    output_path = f'Output/Split_Train_Test/dest/{name}.zip'

    # export
    df.to_csv(
        output_path,
        index=False,
        compression={
            'method': 'zip',
            'archive_name': f'{name}.csv'
        }
    )

In [20]:
# export to zipped files
export_zipped_file(y_train, 'y_train')
export_zipped_file(x_train_processed_df, 'x_train')
export_zipped_file(y_test, 'y_test')
export_zipped_file(x_test_processed_df, 'x_test')

### Export preprocessor

In [21]:
# save the fitted preprocessor
joblib.dump(preprocessor, 'Output/Preprocessor/dest/preprocessor.joblib')

['Output/Preprocessor/dest/preprocessor.joblib']