## Setup

In [1]:
#imports
import zipfile
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    """Read one CSV member of a zip archive into a DataFrame.

    Parameters
    ----------
    zip_filepath : str or path-like
        Location of the zip archive.
    file_inside_zip : str
        Name of the CSV member inside the archive.
    **read_csv_kwargs
        Forwarded to ``pd.read_csv``. A ``dtype`` dict passed here is merged
        with the default ``{'Manufacturer Code': str}`` (caller entries win);
        any other ``dtype`` value is passed through unchanged.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` if opening or parsing failed (the error
        is printed rather than raised).
    """
    # Resolve dtype up front: passing both the hard-coded dtype and a caller
    # dtype to read_csv would raise "multiple values for keyword argument".
    default_dtype = {'Manufacturer Code': str}
    caller_dtype = read_csv_kwargs.pop('dtype', None)
    if caller_dtype is None:
        dtype = default_dtype
    elif isinstance(caller_dtype, dict):
        dtype = {**default_dtype, **caller_dtype}
    else:
        # a single dtype applied to every column (e.g. dtype=str)
        dtype = caller_dtype

    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype=dtype, **read_csv_kwargs)
        return df
    except Exception as e:
        # best-effort contract: report the problem and let the caller check for None
        print(f'Error occured: {e}')
        return None

In [8]:
# get df
zip_path = 'Output/Delays/modeling_data.zip'
file_name = 'modeling_data.csv'

delays_df = unzip_to_df(zip_path, file_name)

# unzip_to_df reports failures by printing and returning None; stop here so the
# problem surfaces at the load step instead of as an AttributeError further down
if delays_df is None:
    raise RuntimeError(f'Could not load {file_name!r} from {zip_path!r}')

## Check Data
---

In [9]:
# preview the first rows of the loaded frame as a sanity check
delays_df.head()

Unnamed: 0,Delay Bin,Month (sin),Month (cos),Day (sin),Day (cos),Day of Week (sin),Day of Week (cos),Scheduled Departure Total Minutes (sin),Scheduled Departure Total Minutes (cos),Scheduled Elapsed Time,...,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction Interpolation,Wind Gust,Visibility,Ceiling,Ceiling Missing,Sea Level Pressure,Sea Level Pressure Missing
0,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.065403,0.997859,307,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
1,11-30 min,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,278,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
2,31-60 min,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,295,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
3,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.130526,0.991445,320,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0
4,Early,0.5,0.866025,0.201299,0.97953,0.866025,-0.5,0.173648,0.984808,209,...,33.08,40.23,5.75,80.0,0.0,10.0,35000,1,1017.5,0


In [None]:
# find the columns stored as booleans
bool_cols = [col for col in delays_df.columns if delays_df[col].dtype == 'bool']

# convert each boolean column to int64 and report the resulting dtype
# (the loop variable is `col`, not `bool`, so the builtin `bool` is not
# shadowed for the rest of the kernel session)
for col in bool_cols:
    delays_df[col] = delays_df[col].astype(np.int64)
    print(f'{col}: {delays_df[col].dtype}')

Aircraft Age Missing: int64
Builder Type Certificated: int64


In [11]:
# show the dtype of every column in the frame
delays_df.dtypes

Delay Bin                                   object
Month (sin)                                float64
Month (cos)                                float64
Day (sin)                                  float64
Day (cos)                                  float64
Day of Week (sin)                          float64
Day of Week (cos)                          float64
Scheduled Departure Total Minutes (sin)    float64
Scheduled Departure Total Minutes (cos)    float64
Scheduled Elapsed Time                       int64
Carrier Code                                object
Destination Airport                         object
Manufacturer                                object
Model                                       object
Aircraft Age                                 int64
Aircraft Age Missing                         int64
Type of Engine                              object
Number of Seats                              int64
Builder Type Certificated                    int64
Precipitation Accumulation One 

## Categorize Features
---

### Separate target (y) and features (X)

In [13]:
# split the frame into the target series and an independent feature frame
t_name = 'Delay Bin'
x_vars = delays_df.drop(t_name, axis='columns').copy()
y_var = delays_df[t_name]

### Identify categorical and numerical columns

In [30]:
# feature column names, in frame order
cols_list = x_vars.columns.tolist()

# numeric features: int64/float64 columns that are not '... Missing' indicator flags
num_cols = [
    col for col in cols_list
    if x_vars[col].dtypes in ('int64', 'float64')
    and not col.endswith('Missing')
]

# whatever is left over is treated as categorical
# NOTE(review): this is a set, so it carries no column order -- convert to a
# sorted list before using it anywhere order matters
cat_cols = set(cols_list).difference(num_cols)

In [31]:
# show both groups: numeric columns via display(), then the categorical set
# as the cell's final expression
display(num_cols)
cat_cols

['Month (sin)',
 'Month (cos)',
 'Day (sin)',
 'Day (cos)',
 'Day of Week (sin)',
 'Day of Week (cos)',
 'Scheduled Departure Total Minutes (sin)',
 'Scheduled Departure Total Minutes (cos)',
 'Scheduled Elapsed Time',
 'Aircraft Age',
 'Number of Seats',
 'Builder Type Certificated',
 'Precipitation Accumulation One Hour',
 'Precipitation Accumulation Six Hours',
 'Air Temperature',
 'Dew Point Temperature',
 'Relative Humidity',
 'Wind Speed',
 'Wind Direction Interpolation',
 'Wind Gust',
 'Visibility',
 'Ceiling',
 'Sea Level Pressure']

{'Aircraft Age Missing',
 'Carrier Code',
 'Ceiling Missing',
 'Destination Airport',
 'Manufacturer',
 'Model',
 'Sea Level Pressure Missing',
 'Type of Engine'}