## Setup

In [1]:
#imports
import zipfile
import joblib
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype={'Manufacturer Code': str}, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [3]:
# get df
zip_path = 'Output/Delays/modeling_data_dest_narrow.zip'
file_name = 'modeling_data_dest_narrow.csv'

delays_df = unzip_to_df(zip_path, file_name)

## Check Data
---

In [4]:
# display
delays_df.head()

Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,...,Destination Relative Humidity,Destination Wind Speed,Destination Wind Direction (sin),Destination Wind Direction (cos),Destination Wind Gust,Destination Visibility,Destination Ceiling,Destination Ceiling Missing,Destination Sea Level Pressure,Destination Sea Level Pressure Missing
0,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.254602,0.967046,147,...,64.32,5.75,-0.766044,-0.642788,0.0,10.0,4700.0,0.0,1017.7,0.0
1,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.246153,0.969231,183,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
2,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,209,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
3,2,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.382683,0.92388,179,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
4,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.207912,0.978148,234,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0


In [5]:
# see if there's imbalance
delays_df['Delay Bin'].value_counts()

Delay Bin
0    501226
1    195705
2     89656
3     43319
Name: count, dtype: int64

In [6]:
# convert dtype to int if bool
bool_cols = [col for col in list(delays_df.columns) if delays_df[col].dtype == 'bool']

# convert bool types to int64
for bool in bool_cols:
    delays_df[bool] = delays_df[bool].astype(np.int64)
    print(f'{bool}: {delays_df[bool].dtype}')

Aircraft Age Missing: int64
Builder Type Certificated: int64


In [7]:
# data type
delays_df.dtypes

Delay Bin                                            int64
Month (sin)                                        float64
Month (cos)                                        float64
Day (sin)                                          float64
Day (cos)                                          float64
Day of Week (sin)                                  float64
Day of Week (cos)                                  float64
Scheduled Departure Total Minutes (sin)            float64
Scheduled Departure Total Minutes (cos)            float64
Scheduled Elapsed Time                               int64
Carrier Code                                        object
Destination Airport                                 object
Manufacturer                                        object
Model                                               object
Aircraft Age                                         int64
Aircraft Age Missing                                 int64
Type of Engine                                      obje

## Categorize Features
---

### separate y and x's

In [8]:
# separate 
t_name = 'Delay Bin'
y_var = delays_df[t_name]
x_vars = delays_df.drop(columns=t_name).copy()

### identify categorical and numerical columns

In [9]:
# get x cols list
cols_list = list(x_vars.columns)

# get numeric cols
num_names = [
    col for col in cols_list
    if (x_vars[col].dtypes == 'int64' or x_vars[col].dtypes == 'float64')
        and col.endswith('Missing') == False and col.endswith('Certificated') == False
]

# get categorical cols--> leftover
cat_names = list(set(cols_list) - set(num_names))

In [10]:
# display cols names
display(num_names)
cat_names

['Month (sin)',
 'Month (cos)',
 'Day (sin)',
 'Day (cos)',
 'Day of Week (sin)',
 'Day of Week (cos)',
 'Scheduled Departure Total Minutes (sin)',
 'Scheduled Departure Total Minutes (cos)',
 'Scheduled Elapsed Time',
 'Aircraft Age',
 'Number of Seats',
 'Precipitation Accumulation One Hour',
 'Precipitation Accumulation Six Hours',
 'Air Temperature',
 'Dew Point Temperature',
 'Relative Humidity',
 'Wind Speed',
 'Wind Direction (sin)',
 'Wind Direction (cos)',
 'Wind Gust',
 'Visibility',
 'Ceiling',
 'Sea Level Pressure',
 'Destination Precipication Accumulation One Hour',
 'Destination Precipitation Six Hours',
 'Destination Air Temperature',
 'Destination Dew Point Temperature',
 'Destination Relative Humidity',
 'Destination Wind Speed',
 'Destination Wind Direction (sin)',
 'Destination Wind Direction (cos)',
 'Destination Wind Gust',
 'Destination Visibility',
 'Destination Ceiling',
 'Destination Sea Level Pressure']

['Ceiling Missing',
 'Manufacturer',
 'Destination Sea Level Pressure Missing',
 'Builder Type Certificated',
 'Aircraft Age Missing',
 'Destination Ceiling Missing',
 'Type of Engine',
 'Carrier Code',
 'Sea Level Pressure Missing',
 'Model',
 'Destination Airport']

## Train/Test Datasets Split
---

### Split data

In [11]:
x_train, x_test, y_train, y_test = train_test_split(
    x_vars, y_var, 
    random_state=1,
    train_size=0.8,
    test_size=0.2,
    stratify=y_var  #b/c classification
)

### Encode categorical variables and scale numeric

In [12]:
# create transformers for numeric and categorical columns separately
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(
    drop='first',  #drop first createdcol
    handle_unknown='ignore',  #prevents errors if test/new data has unforseen categories
    sparse_output=False
)

In [13]:
# combine transformers with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_names),  #apply onehotencoder to category cols
        ('num', num_transformer, num_names)  #apply scaling to numeric cols
    ],
    #remainder='passthrough'  #keep rest of the cols untransformed
    remainder='drop'  #drop rest of the columns
)

In [14]:
# fit/transform on x train and transform x test --> avoid data leakage
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)

### Rebalance training data

In [15]:
# apply rebalancing on training set
sm = SMOTE(random_state=1)
x_train_bal, y_train_bal = sm.fit_resample(x_train_processed, y_train)

### Inspecting unknown category

##### unknown category is a destination airport that only appeared once in the main df

In [16]:
# get unknown category col and inspect
unknown_col = cat_names[4]
unknown_vals = set(x_test[unknown_col].unique()) - set(x_train[unknown_col].unique())

# display
print(f'Affected column: {unknown_col}')
print(f'Unknown values: {unknown_vals}')
delays_df[delays_df['Destination Airport'] == 'BUR']

Affected column: Aircraft Age Missing
Unknown values: set()


Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,...,Destination Relative Humidity,Destination Wind Speed,Destination Wind Direction (sin),Destination Wind Direction (cos),Destination Wind Gust,Destination Visibility,Destination Ceiling,Destination Ceiling Missing,Destination Sea Level Pressure,Destination Sea Level Pressure Missing


## Export
---

### Train/test data

##### Review processed data

In [17]:
# look into col names
encoded_feature_names = preprocessor.get_feature_names_out()
encoded_feature_names

array(['cat__Ceiling Missing_1', 'cat__Manufacturer_AIRBUS CANADA LP',
       'cat__Manufacturer_BOEING', 'cat__Manufacturer_BOMBARDIER INC',
       'cat__Manufacturer_EMBRAER', 'cat__Manufacturer_OTHER',
       'cat__Manufacturer_YABORA INDUSTRIA AERONAUTICA S',
       'cat__Destination Sea Level Pressure Missing_1.0',
       'cat__Builder Type Certificated_1', 'cat__Aircraft Age Missing_1',
       'cat__Destination Ceiling Missing_1.0',
       'cat__Type of Engine_Turbo-Fan', 'cat__Type of Engine_Turbo-Jet',
       'cat__Carrier Code_AS', 'cat__Carrier Code_B6',
       'cat__Carrier Code_DL', 'cat__Carrier Code_F9',
       'cat__Carrier Code_HA', 'cat__Carrier Code_MQ',
       'cat__Carrier Code_NK', 'cat__Carrier Code_OO',
       'cat__Carrier Code_QX', 'cat__Carrier Code_UA',
       'cat__Carrier Code_WN', 'cat__Sea Level Pressure Missing_1',
       'cat__Model_737-76N', 'cat__Model_737-79P', 'cat__Model_737-7BD',
       'cat__Model_737-7CT', 'cat__Model_737-7H4', 'cat__Model_737-7

##### Convert to dfs

In [18]:
# get df versions
x_train_processed_df = pd.DataFrame(x_train_bal, columns=encoded_feature_names)
x_test_processed_df = pd.DataFrame(x_test_processed, columns=encoded_feature_names)

In [19]:
# display training
x_train_processed_df

Unnamed: 0,cat__Ceiling Missing_1,cat__Manufacturer_AIRBUS CANADA LP,cat__Manufacturer_BOEING,cat__Manufacturer_BOMBARDIER INC,cat__Manufacturer_EMBRAER,cat__Manufacturer_OTHER,cat__Manufacturer_YABORA INDUSTRIA AERONAUTICA S,cat__Destination Sea Level Pressure Missing_1.0,cat__Builder Type Certificated_1,cat__Aircraft Age Missing_1,...,num__Destination Air Temperature,num__Destination Dew Point Temperature,num__Destination Relative Humidity,num__Destination Wind Speed,num__Destination Wind Direction (sin),num__Destination Wind Direction (cos),num__Destination Wind Gust,num__Destination Visibility,num__Destination Ceiling,num__Destination Sea Level Pressure
0,1.000000,0.0,1.000000,0.0,0.000000,0.0,0.000000,0.0,1.000000,0.0,...,-1.812830,-1.269386,0.992511,-0.330626,-1.160738,-1.068467,-0.197859,0.318287,1.358243,1.855326
1,1.000000,0.0,1.000000,0.0,0.000000,0.0,0.000000,0.0,1.000000,0.0,...,-0.608721,-0.169340,0.192444,-0.330626,-0.793188,-1.399702,-0.197859,0.318287,-0.691325,0.956913
2,1.000000,0.0,0.000000,0.0,1.000000,0.0,0.000000,0.0,1.000000,0.0,...,-1.387851,-0.169340,1.006167,-1.876703,-0.102420,1.369064,-0.197859,0.318287,1.358243,-1.464895
3,1.000000,0.0,0.000000,1.0,0.000000,0.0,0.000000,0.0,1.000000,0.0,...,-0.219157,-1.519396,-0.872705,-0.330626,1.195799,0.392769,-0.197859,0.318287,1.358243,-0.605544
4,0.000000,0.0,1.000000,0.0,0.000000,0.0,0.000000,0.0,1.000000,0.0,...,-1.812830,-0.169340,1.940382,-0.639841,-1.160738,-1.068467,-0.197859,-3.054405,-0.882618,0.527237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1603915,0.000000,0.0,0.096703,0.0,0.903297,0.0,0.000000,0.0,0.903297,0.0,...,0.559973,-0.719363,-1.157066,1.527355,1.195799,-0.622195,-0.197859,0.318287,1.358243,-1.816448
1603916,0.000000,0.0,0.967857,0.0,0.000000,0.0,0.000000,0.0,1.000000,0.0,...,1.326580,0.980708,-0.460195,1.178384,1.099300,0.611611,-0.197859,0.318287,1.358243,-0.568993
1603917,0.482697,0.0,0.000000,0.0,0.517303,0.0,0.482697,0.0,1.000000,0.0,...,0.973985,-0.848694,-1.574926,-0.499986,-0.330501,1.325871,-0.197859,0.318287,1.358243,-0.689143
1603918,0.000000,0.0,0.000000,0.0,1.000000,0.0,0.000000,0.0,1.000000,0.0,...,0.969705,-1.714977,-1.984404,-1.446162,-0.018912,1.361218,-0.197859,0.318287,1.358243,0.046633


In [20]:
# display processed
x_test_processed_df

Unnamed: 0,cat__Ceiling Missing_1,cat__Manufacturer_AIRBUS CANADA LP,cat__Manufacturer_BOEING,cat__Manufacturer_BOMBARDIER INC,cat__Manufacturer_EMBRAER,cat__Manufacturer_OTHER,cat__Manufacturer_YABORA INDUSTRIA AERONAUTICA S,cat__Destination Sea Level Pressure Missing_1.0,cat__Builder Type Certificated_1,cat__Aircraft Age Missing_1,...,num__Destination Air Temperature,num__Destination Dew Point Temperature,num__Destination Relative Humidity,num__Destination Wind Speed,num__Destination Wind Direction (sin),num__Destination Wind Direction (cos),num__Destination Wind Gust,num__Destination Visibility,num__Destination Ceiling,num__Destination Sea Level Pressure
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.042082,0.430685,0.604527,0.599709,-0.342321,1.346522,-0.197859,0.318287,-0.814299,0.019439
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.998286,-0.419350,0.585248,-1.876703,-0.102420,1.369064,-0.197859,-3.054405,-0.909945,1.425650
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.382898,0.680695,-0.146540,-0.639841,-0.342321,-1.575949,-0.197859,0.318287,-0.780139,-1.386772
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.559973,2.080754,1.039102,-0.639841,1.258128,-0.372368,-0.197859,-3.616520,-0.964600,0.722545
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.431647,-0.419350,0.192444,-0.330626,0.955898,0.839041,-0.197859,0.318287,-0.691325,2.011571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165977,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.551592,0.430685,-1.101640,0.599709,0.137482,1.346522,-0.197859,0.318287,-0.554687,-0.410237
165978,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.431647,0.430685,1.019823,-0.330626,-1.400639,-0.622195,-0.197859,0.318287,-0.814299,0.488176
165979,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.559973,0.680695,-0.146540,1.527355,0.785614,-1.251353,-0.197859,0.318287,-0.732316,1.464712
165980,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.210776,-0.719363,0.575609,-0.639841,-1.298865,-0.856602,-0.197859,0.318287,-0.848458,0.019439


##### Export as zipped csvs

In [21]:
# function to export df as zipped csv
def export_zipped_file(df, name):
    
    # set up output path
    output_path = f'Output/Split_Train_Test/balanced/{name}.zip'

    # export
    df.to_csv(
        output_path,
        index=False,
        compression={
            'method': 'zip',
            'archive_name': f'{name}.csv'
        }
    )

In [22]:
# export to zipped files
export_zipped_file(y_train_bal, 'y_train')
export_zipped_file(x_train_processed_df, 'x_train')
export_zipped_file(y_test, 'y_test')
export_zipped_file(x_test_processed_df, 'x_test')

### Export preprocessor

In [23]:
# save the fitted preprocessor
joblib.dump(preprocessor, 'Output/Preprocessor/balanced/preprocessor.joblib')

['Output/Preprocessor/balanced/preprocessor.joblib']