In [2]:
#imports
from sqlalchemy import create_engine
import zipfile
import sqlite3
import joblib
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE


In [3]:
root_path = 'Output/'
split_path = f'{root_path}Split_Train_Test/'
preprocessor_path = f'{root_path}Preprocessor/'
REBALANCE = False

In [4]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype={'Manufacturer Code': str}, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [5]:
# get df
zip_path = 'Output/Delays/modeling_data.zip'
file_name = 'modeling_data.csv'

delays_df = unzip_to_df(zip_path, file_name)

In [6]:
# Create a connection engine (SQLite version)
engine = create_engine("sqlite:///../Database/delays_database.sqlite")
connection = engine.connect()

In [7]:

# Save the DataFrame to a table
delays_df.to_sql("delays", con=connection, if_exists="replace", index=False)

829906

In [8]:
# Read all rows from the table
df_from_db = pd.read_sql("SELECT * FROM delays", con=connection)

In [9]:
connection.close()
engine.dispose()

## Check Data
---

In [10]:
# display
delays_df.head()

Unnamed: 0,Is Delayed,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),...,Destination Relative Humidity,Destination Wind Speed,Destination Wind Direction (sin),Destination Wind Direction (cos),Destination Wind Gust,Destination Visibility,Destination Ceiling,Destination Ceiling Missing,Destination Sea Level Pressure,Destination Sea Level Pressure Missing
0,0,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.254602,0.967046,...,64.32,5.75,-0.766044,-0.642788,0.0,10.0,4700.0,0.0,1017.7,0.0
1,0,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.246153,0.969231,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
2,0,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
3,1,2,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.382683,0.92388,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0
4,0,0,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.207912,0.978148,...,72.97,5.75,-0.866025,-0.5,0.0,5.0,3600.0,0.0,1017.5,0.0


In [11]:
# see if there's imbalance
delays_df['Delay Bin'].value_counts()

Delay Bin
0    501226
1    195705
2     89656
3     43319
Name: count, dtype: int64

In [12]:
# convert dtype to int if bool
bool_cols = [col for col in list(delays_df.columns) if delays_df[col].dtype == 'bool']

# convert bool types to int64
for bool in bool_cols:
    delays_df[bool] = delays_df[bool].astype(np.int64)
    print(f'{bool}: {delays_df[bool].dtype}')

In [13]:
# data type
delays_df.dtypes

Is Delayed                                           int64
Delay Bin                                            int64
Month (sin)                                        float64
Month (cos)                                        float64
Day (sin)                                          float64
Day (cos)                                          float64
Day of Week (sin)                                  float64
Day of Week (cos)                                  float64
Scheduled Departure Total Minutes (sin)            float64
Scheduled Departure Total Minutes (cos)            float64
Scheduled Elapsed Time                               int64
Carrier Code                                        object
Destination Airport                                 object
Manufacturer                                        object
Model                                               object
Aircraft Age                                         int64
Aircraft Age Missing                                 int

## Categorize Features
---

### Separate features and target

In [14]:
# separate 
t_name = 'Delay Bin'
y_var = delays_df[t_name]
x_vars = delays_df.drop(columns=t_name).copy()

### Identify categorical and numerical columns

In [15]:
# identify categorical and numerical columns
# get x cols list
cols_list = list(x_vars.columns)

# get numeric cols
num_names = [
    col for col in cols_list
    if (x_vars[col].dtypes == 'int64' or x_vars[col].dtypes == 'float64')
        and col.endswith('Missing') == False and col.endswith('Certificated') == False
]

# get categorical cols--> leftover
cat_names = list(set(cols_list) - set(num_names))

In [16]:
# display cols names
display(num_names)
cat_names

['Is Delayed',
 'Month (sin)',
 'Month (cos)',
 'Day (sin)',
 'Day (cos)',
 'Day of Week (sin)',
 'Day of Week (cos)',
 'Scheduled Departure Total Minutes (sin)',
 'Scheduled Departure Total Minutes (cos)',
 'Scheduled Elapsed Time',
 'Aircraft Age',
 'Number of Seats',
 'Precipitation Accumulation One Hour',
 'Precipitation Accumulation Six Hours',
 'Air Temperature',
 'Dew Point Temperature',
 'Relative Humidity',
 'Wind Speed',
 'Wind Direction (sin)',
 'Wind Direction (cos)',
 'Wind Gust',
 'Visibility',
 'Ceiling',
 'Sea Level Pressure',
 'Destination Precipication Accumulation One Hour',
 'Destination Precipitation Six Hours',
 'Destination Air Temperature',
 'Destination Dew Point Temperature',
 'Destination Relative Humidity',
 'Destination Wind Speed',
 'Destination Wind Direction (sin)',
 'Destination Wind Direction (cos)',
 'Destination Wind Gust',
 'Destination Visibility',
 'Destination Ceiling',
 'Destination Sea Level Pressure']

['Aircraft Age Missing',
 'Sea Level Pressure Missing',
 'Manufacturer',
 'Type of Engine',
 'Model',
 'Builder Type Certificated',
 'Destination Sea Level Pressure Missing',
 'Carrier Code',
 'Ceiling Missing',
 'Destination Ceiling Missing',
 'Destination Airport']

## Train/Test Datasets Split
---

### Split data

In [17]:
# Split into train and test sets

x_train, x_test, y_train, y_test = train_test_split(
    x_vars, y_var, 
    random_state=1,
    train_size=0.8,
    test_size=0.2,
    stratify=y_var  #b/c classification
)

### Encode categorical variables and scale numeric

In [18]:
# create transformers for numeric and categorical columns separately
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(
    drop='first',  #drop first createdcol
    handle_unknown='ignore',  #prevents errors if test/new data has unforseen categories
    sparse_output=False
)

In [19]:
# combine transformers with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_names),  #apply onehotencoder to category cols
        ('num', num_transformer, num_names)  #apply scaling to numeric cols
    ],
    #remainder='passthrough'  #keep rest of the cols untransformed
    remainder='drop'  #drop rest of the columns
)

In [20]:
# fit/transform on x train and transform x test --> avoid data leakage
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)

### Rebalance training data (optional)

In [21]:
# apply rebalancing on training set
if REBALANCE:
    sm = SMOTE(random_state=1)
    x_train_bal, y_train_bal = sm.fit_resample(x_train_processed, y_train)
else:
    x_train_bal = x_train_processed
    y_train_bal = y_train

### Inspecting unknown category

##### unknown category is a destination airport that only appeared once in the main df

In [22]:
# get unknown category col and inspect
unknown_col = cat_names[4]
unknown_vals = set(x_test[unknown_col].unique()) - set(x_train[unknown_col].unique())

# display
print(f'Affected column: {unknown_col}')
print(f'Unknown values: {unknown_vals}')
delays_df[delays_df['Destination Airport'] == 'BUR']

Affected column: Model
Unknown values: set()


Unnamed: 0,Is Delayed,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),...,Destination Relative Humidity,Destination Wind Speed,Destination Wind Direction (sin),Destination Wind Direction (cos),Destination Wind Gust,Destination Visibility,Destination Ceiling,Destination Ceiling Missing,Destination Sea Level Pressure,Destination Sea Level Pressure Missing


## Export
---

### Train/test data

##### Review processed data

In [23]:
# look into col names
encoded_feature_names = preprocessor.get_feature_names_out()
encoded_feature_names

array(['cat__Aircraft Age Missing_1', 'cat__Sea Level Pressure Missing_1',
       'cat__Manufacturer_AIRBUS CANADA LP', 'cat__Manufacturer_BOEING',
       'cat__Manufacturer_BOMBARDIER INC', 'cat__Manufacturer_EMBRAER',
       'cat__Manufacturer_OTHER',
       'cat__Manufacturer_YABORA INDUSTRIA AERONAUTICA S',
       'cat__Type of Engine_Turbo-Fan', 'cat__Type of Engine_Turbo-Jet',
       'cat__Model_737-76N', 'cat__Model_737-79P', 'cat__Model_737-7BD',
       'cat__Model_737-7CT', 'cat__Model_737-7H4', 'cat__Model_737-7Q8',
       'cat__Model_737-8', 'cat__Model_737-800', 'cat__Model_737-823',
       'cat__Model_737-824', 'cat__Model_737-832', 'cat__Model_737-890',
       'cat__Model_737-8EH', 'cat__Model_737-8H4', 'cat__Model_737-9',
       'cat__Model_737-900ER', 'cat__Model_737-924ER',
       'cat__Model_737-932ER', 'cat__Model_737-990',
       'cat__Model_737-990ER', 'cat__Model_757-224', 'cat__Model_757-231',
       'cat__Model_757-232', 'cat__Model_757-251', 'cat__Model_757-26D

##### Convert to dfs

In [24]:
# get df versions
x_train_processed_df = pd.DataFrame(x_train_bal, columns=encoded_feature_names)
x_test_processed_df = pd.DataFrame(x_test_processed, columns=encoded_feature_names)

In [25]:
# display training
x_train_processed_df

Unnamed: 0,cat__Aircraft Age Missing_1,cat__Sea Level Pressure Missing_1,cat__Manufacturer_AIRBUS CANADA LP,cat__Manufacturer_BOEING,cat__Manufacturer_BOMBARDIER INC,cat__Manufacturer_EMBRAER,cat__Manufacturer_OTHER,cat__Manufacturer_YABORA INDUSTRIA AERONAUTICA S,cat__Type of Engine_Turbo-Fan,cat__Type of Engine_Turbo-Jet,...,num__Destination Air Temperature,num__Destination Dew Point Temperature,num__Destination Relative Humidity,num__Destination Wind Speed,num__Destination Wind Direction (sin),num__Destination Wind Direction (cos),num__Destination Wind Gust,num__Destination Visibility,num__Destination Ceiling,num__Destination Sea Level Pressure
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.812830,-1.269386,0.992511,-0.330626,-1.160738,-1.068467,-0.197859,0.318287,1.358243,1.855326
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.608721,-0.169340,0.192444,-0.330626,-0.793188,-1.399702,-0.197859,0.318287,-0.691325,0.956913
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,-1.387851,-0.169340,1.006167,-1.876703,-0.102420,1.369064,-0.197859,0.318287,1.358243,-1.464895
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-0.219157,-1.519396,-0.872705,-0.330626,1.195799,0.392769,-0.197859,0.318287,1.358243,-0.605544
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.812830,-0.169340,1.940382,-0.639841,-1.160738,-1.068467,-0.197859,-3.054405,-0.882618,0.527237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663919,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.998286,-0.169340,0.880052,0.908925,0.137482,1.346522,-0.197859,0.318287,-0.848458,0.136623
663920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.559973,1.230718,0.227789,-0.018722,0.370093,1.279582,-0.197859,-2.492289,-0.875786,-0.449298
663921,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.210776,-0.719363,0.575609,-0.639841,-1.160738,-1.068467,-0.197859,0.318287,1.358243,-1.425834
663922,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.431647,0.430685,1.019823,-0.949057,-1.462967,0.142942,-0.197859,0.318287,-0.759644,1.074097


##### Export as zipped csvs

In [26]:
# function to export df as zipped csv
def export_zipped_file(df, name):
    
    # set up output path
    output_path = f'{split_path}{name}.zip'

    # export
    df.to_csv(
        output_path,
        index=False,
        compression={
            'method': 'zip',
            'archive_name': f'{name}.csv'
        }
    )

In [27]:
# export to zipped files
export_zipped_file(y_train_bal, 'y_train')
export_zipped_file(x_train_processed_df, 'x_train')
export_zipped_file(y_test, 'y_test')
export_zipped_file(x_test_processed_df, 'x_test')

### Export preprocessor

In [28]:
# save the fitted preprocessor
joblib.dump(preprocessor, f'{preprocessor_path}preprocessor.joblib')

['Output/Preprocessor/preprocessor.joblib']