# Create Staging Tables
The "raw" data was pre-processed and converted to parquet to enable direct access to the data while reducing the overhead of reading tar files. We want to further refine the raw data by creating new features and perform general data cleanup. Feature engineering is highly iterative so having quick access to the underlying data allows us to iterate faster.  

Staging tables are created to simulate different databases. Data within each of these databases can be updated independently. Common operations performed downstream on the data can be moved upstream to reduce the computation of repetitive operations that can be performed once and used many times. Sometimes, it becomes necessary to go back to the raw data to perform quality assurance. Having clear data lineage of the intermediate operations that was applied to get data to the current state is of great importance when diagnosing and debugging complicated data pipelines.  

Data within the staging tables should be readily consumed by a variety of users. In the ML and analytics workflow demo, these tables are sorted and physically partitioned to facilitate data merging operations. Typical merging involves an airport name and a datetime attribute. 

In [1]:
import os
import sys
import glob
import io
import tarfile
import shutil
from functools import reduce
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pandas.api.types import CategoricalDtype
import numpy as np
from time import time
import datetime

import dask
import dask.dataframe as dd
from dask.distributed import Client, wait, progress, get_worker


# tar_data_dir = './data/raw/bts_carrier_ontime_perf_data/tar'
# tar_files = glob.glob(tar_data_dir+'/*.tar.gz')
# tar_files = sorted(tar_files)

# Original raw data converted to parquet:
parquet_output_dir = './data/converted/flight'

# # Encoded data except for TargetEncoder cols, which need to be performed after test/train split:
# partial_enc_output_dir = './data/encoded/NAS' 

print('pandas version', pd.__version__)
print('dask version', dask.__version__)

# Set dask shuffle method globally:
# https://docs.dask.org/en/latest/dataframe-groupby.html#selecting-methods
# dask.config.set(shuffle='tasks') # Default is 'disk'.

# Use mem intensive CPU instances to run (e.g., AWS M-series)
# client = Client(n_workers=4, threads_per_worker=1)
# client = Client(n_workers=6, threads_per_worker=4) # Good for aws xx.8xlarge instances.
client = Client(n_workers=6, threads_per_worker=6) # Good for DSWS.
client

pandas version 1.1.4
dask version 2021.11.2


Perhaps you already have a cluster running?
Hosting the HTTP server on port 37075 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:37075/status,

0,1
Dashboard: http://127.0.0.1:37075/status,Workers: 6
Total threads: 36,Total memory: 251.65 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:32999,Workers: 6
Dashboard: http://127.0.0.1:37075/status,Total threads: 36
Started: Just now,Total memory: 251.65 GiB

0,1
Comm: tcp://127.0.0.1:40699,Total threads: 6
Dashboard: http://127.0.0.1:38183/status,Memory: 41.94 GiB
Nanny: tcp://127.0.0.1:34285,
Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-5vq5j6y5,Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-5vq5j6y5
GPU: Quadro RTX 8000,GPU memory: 48.00 GiB

0,1
Comm: tcp://127.0.0.1:36091,Total threads: 6
Dashboard: http://127.0.0.1:41749/status,Memory: 41.94 GiB
Nanny: tcp://127.0.0.1:38911,
Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-rf6yb7tw,Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-rf6yb7tw
GPU: Quadro RTX 8000,GPU memory: 48.00 GiB

0,1
Comm: tcp://127.0.0.1:41813,Total threads: 6
Dashboard: http://127.0.0.1:38339/status,Memory: 41.94 GiB
Nanny: tcp://127.0.0.1:37187,
Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-6ydt6bl2,Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-6ydt6bl2
GPU: Quadro RTX 8000,GPU memory: 48.00 GiB

0,1
Comm: tcp://127.0.0.1:38975,Total threads: 6
Dashboard: http://127.0.0.1:39797/status,Memory: 41.94 GiB
Nanny: tcp://127.0.0.1:38611,
Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-lcu2zl3c,Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-lcu2zl3c
GPU: Quadro RTX 8000,GPU memory: 48.00 GiB

0,1
Comm: tcp://127.0.0.1:36507,Total threads: 6
Dashboard: http://127.0.0.1:43189/status,Memory: 41.94 GiB
Nanny: tcp://127.0.0.1:35647,
Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-6ystkdo3,Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-6ystkdo3
GPU: Quadro RTX 8000,GPU memory: 48.00 GiB

0,1
Comm: tcp://127.0.0.1:34401,Total threads: 6
Dashboard: http://127.0.0.1:45711/status,Memory: 41.94 GiB
Nanny: tcp://127.0.0.1:37929,
Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-acwd8cn3,Local directory: /home/btong/github/flight-delay-causes/dask-worker-space/worker-acwd8cn3
GPU: Quadro RTX 8000,GPU memory: 48.00 GiB


In [2]:
# Some useful EDA available at: https://www.kaggle.com/robikscube/tutorial-time-series-forecasting-with-xgboost

# CRS = Computerized Reservation System
# Identify columns to use in ML from original flight data:
ignore_cols = [ 'FL_DATE', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
               'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_WAC', 'ORIGIN_STATE_FIPS', 'ORIGIN_STATE_NM', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR',
               'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_WAC', 'DEST_STATE_FIPS', 'DEST_STATE_NM', 'DEST_CITY_NAME', 'DEST_STATE_ABR',
               'DEP_DELAY_NEW', 'ARR_DELAY_NEW', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED',
               'FIRST_DEP_TIME', 'TOTAL_ADD_GTIME', 'LONGEST_ADD_GTIME', 'FLIGHTS',
               'ARR_DELAY', 'ARR_DELAY_GROUP',
              ]


delay_causes_cols = ['LATE_AIRCRAFT_DELAY', 'CARRIER_DELAY', 'NAS_DELAY', 'WEATHER_DELAY', 'SECURITY_DELAY']

feature_cols = [
 'YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_YEAR', 'DAY_OF_WEEK',
 'TAIL_NUM', 'OD_PAIR',
 'OP_UNIQUE_CARRIER',
#  'ORIGIN_CITY_MARKET_ID', # Better accounting of ORIGIN, especially over long study period. Airport name can change over time.
 'ORIGIN',
#  'DEST_CITY_MARKET_ID', # REMOVE: study single arrival airport
 'DEST', # REMOVE: study single arrival airport
#  'CRS_DEP_TIME',
    'CRS_DEP_TIME_HR', 'CRS_DEP_TIME_QTHR',
#  'DEP_TIME', # REMOVE: in 24hr format, so can be misleading if red-eye flight. DEP_DELAY should account for actual time delta.
 'DEP_DELAY', 'DEP_DEL15',
 'ARR_DELAY', 'ARR_DEL15', 
#  'DEP_DELAY_GROUP', # DEP_DELAY binned to 15 min increments.
#  'DEP_TIME_BLK', # Similar to hour bins, but combines multiple hours during early morning and late evening hours. Cyclical.
 'TAXI_OUT',
#  'WHEELS_OFF', # REMOVE: avoid using 24hr time format. Possibly accounted for in DEP_TIME + TAXI_OUT.
#  'WHEELS_ON', # REMOVE: leaks actual arrival time
 'TAXI_IN', # REMOVE: leaks actual arrival time
#  'CRS_ARR_TIME', # CRS estimated arrival time. Use HOUR and QTHR
    'CRS_ARR_TIME_HR', 'CRS_ARR_TIME_QTHR',
#  'ARR_TIME', # REMOVE: leaks actual arrival time
#     'ARR_TIME_HR', 'ARR_TIME_QTHR',
#  'ARR_TIME_BLK', # REMOVE: leaks actual arrival time
 'CRS_ELAPSED_TIME',
 'ACTUAL_ELAPSED_TIME', # REMOVE: leaks actual arrival time
 'AIR_TIME', # REMOVE: leaks actual arrival time
 'DISTANCE',
 'DISTANCE_GROUP'
]

# Elected to use CRS_ARR_TIME and CRS_DEP_TIME with minute offset indicated by ARR_DELAY and DEP_DELAY.
# Actual times for arrival/departure were used in merging attributes like arrival/departure rates and weather.

In [3]:
# Check for missing files in parquet output dir:
pq_files = sorted(glob.glob(parquet_output_dir+'/*.parquet'))
pq_files_date = [fn.split('/')[-1][:6] for fn in pq_files]

data_range_yyyymm = pd.period_range(start='1987-10-01', end='2021-04-01', freq='M').astype(str).str.replace('-', '')
print(list(set(data_range_yyyymm) - set(pq_files_date)))

['199006', '200606', '199603', '200408', '200106', '201301', '199004', '199403', '200301', '200508', '201107', '201410', '201409', '200811', '199910', '199404', '199810', '199005', '198905', '201512', '200407', '199901', '200104', '201004', '201810', '201809', '200011', '201003', '201009', '199711', '201102', '199912', '201610', '201504', '199612', '200302', '199904', '201704', '202004', '199602', '201403', '199909', '199708', '201901', '200204', '200612', '200203', '199712', '201601', '199205', '199104', '199207', '199802', '200706', '200605', '199411', '201402', '199409', '201902', '200503', '199506', '201109', '201101', '200112', '201212', '200809', '200907', '199610', '201105', '199410', '199103', '200601', '199902', '198710', '199907', '200808', '200509', '200807', '198712', '198910', '199302', '199905', '199203', '198809', '199906', '200709', '200702', '199803', '199705', '200812', '200912', '199202', '198806', '201201', '200404', '202012', '199607', '198904', '198907', '201802',

In [4]:
%%time

# Load data. Using persist spilled to disk and may stall calculations. Disabling .persist() allowed calculations to proceed.
# flt_tbl_clean = dd.read_parquet(pq_files[:36])
flt_tbl_clean = dd.read_parquet(pq_files)

# Get time range to limit weather data:
start_date = flt_tbl_clean['DEP_TIME_DT_LOCAL'].min().compute() - pd.Timedelta('1d')
end_date = flt_tbl_clean['ARR_TIME_DT_LOCAL'].max().compute() + pd.Timedelta('1d')

# Not all delay attribute columns are contained in original data. Must be ran against merged data.
# Encode delay cause column into single column:
flt_tbl_clean[delay_causes_cols] = flt_tbl_clean[delay_causes_cols].fillna(0).astype(bool) # Eventually move into original zip csv processing

flt_tbl_clean['OTHER_DELAY'] = flt_tbl_clean['WEATHER_DELAY'] | flt_tbl_clean['SECURITY_DELAY']
flt_tbl_clean = flt_tbl_clean.drop(columns=['WEATHER_DELAY', 'SECURITY_DELAY'])

# Update delay causes:
# delay_causes_cols_mg = delay_causes_cols # May choose to merge columns like SECURITY_DELAY and WEATHER_DELAY into OTHER_DELAY
delay_causes_cols_mg = ['LATE_AIRCRAFT_DELAY', 'CARRIER_DELAY', 'NAS_DELAY', 'OTHER_DELAY']

delay_causes_combo = 1*flt_tbl_clean[delay_causes_cols_mg[-1]]
for ii in np.arange(1, len(delay_causes_cols_mg), 1):
    
    # Reverse order so that binary digits correspond to order in original list.
    slice_col = delay_causes_cols_mg[::-1][ii]
    delay_causes_combo = delay_causes_combo + (10**ii)*flt_tbl_clean[slice_col]

# Merge encoded delay causes back in:    
flt_tbl_clean['DELAY_CAUSES_ENC'] = delay_causes_combo.values
flt_tbl_clean['DELAY_CAUSES_ENC'] = flt_tbl_clean['DELAY_CAUSES_ENC'].astype(str).str.zfill(len(delay_causes_cols_mg)).astype('category')
flt_tbl_clean['DELAY_CAUSES_ENC'] = flt_tbl_clean['DELAY_CAUSES_ENC'].cat.as_known()

# Convert bitstring to bitletter code for readability:
bitstr_cats = flt_tbl_clean['DELAY_CAUSES_ENC'].cat.categories
delay_causes_cols_code = [cc[0] for cc in delay_causes_cols_mg]
bitletter = []
for bitstr in bitstr_cats:
    bitmsk = [delay_causes_cols_code[ii] if bitstr[ii]=='1' else '-' for ii in range(len(bitstr))]
    bitletter.append(''.join(bitmsk))
    
delay_causes_bitremap = dict(zip(bitstr_cats, bitletter))
flt_tbl_clean['DELAY_CAUSES_ENC'] = flt_tbl_clean['DELAY_CAUSES_ENC'].cat.rename_categories(delay_causes_bitremap)

flt_tbl_clean = flt_tbl_clean[feature_cols+['UID', 'DELAY_CAUSES_ENC', 'ARR_TIME_DT_LOCAL', 'DEP_TIME_DT_LOCAL', 'CRS_DEP_TIME_DT_LOCAL', 'CRS_ARR_TIME_DT_LOCAL']].persist()
flt_tbl_clean

CPU times: user 558 ms, sys: 69.5 ms, total: 628 ms
Wall time: 4.66 s


Unnamed: 0_level_0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_YEAR,DAY_OF_WEEK,TAIL_NUM,OD_PAIR,OP_UNIQUE_CARRIER,ORIGIN,DEST,CRS_DEP_TIME_HR,CRS_DEP_TIME_QTHR,DEP_DELAY,DEP_DEL15,ARR_DELAY,ARR_DEL15,TAXI_OUT,TAXI_IN,CRS_ARR_TIME_HR,CRS_ARR_TIME_QTHR,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,DISTANCE_GROUP,UID,DELAY_CAUSES_ENC,ARR_TIME_DT_LOCAL,DEP_TIME_DT_LOCAL,CRS_DEP_TIME_DT_LOCAL,CRS_ARR_TIME_DT_LOCAL
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
,float32,int64,int64,int64,float32,int64,object,object,object,object,object,int8,int8,float32,bool,float64,bool,float32,float64,int8,int8,float32,float64,float64,float32,float32,int64,category[known],datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [5]:
%%time

def create_arpt_demand_offset_features(df, t_offset):
    """
    Create offset features by time shifting by t_offset minutes. Negative t_offset for lag feature. Positive t_offset for lead feature.
    """
    df = df.copy()
    if t_offset > 0:
        suffix = '_LEAD'+str(np.abs(t_offset))
        
        # Only compute lead features for CRS fields:
        crs_cols = [cc for cc in df.columns if cc[:3]=='CRS']
        df = df[crs_cols]
        
    elif t_offset < 0:
        suffix = '_LAG'+str(np.abs(t_offset))
    else:
        raise ValueError('t_offset of 0 not valid. Choose positive or negative increments only.')
    
    # Use negative sign convention so that negative t_offset corresponds to lag features.
    df.index = df.index - np.sign(t_offset)*pd.Timedelta(str(np.abs(t_offset))+'min')
    df.columns = df.columns + suffix
    return(df)

def qthr_ops_count(ops_df, op_type):
    """
    Count number of operations per quarter hour. Assumed that ops_df has been filtered for either arrivals or departures.
    
    ops_df: dataframe
        Dataframe containing purely arrivals or departures to study airport. Datetime indexed.
    op_type: str
        Select 'ARR' or 'DEP'.
    """
    def cnt_(t_series, prefix):
        qtr_cnt = t_series.dt.round('15min').value_counts()
        qtr_cnt.name = prefix + '_PER_QTHR'
        return(qtr_cnt)
    
    # Gather data into mem:
    ops_df = ops_df[[op_type+'_TIME_DT_LOCAL', 'CRS_'+op_type+'_TIME_DT_LOCAL', op_type + '_DEL15']]
    
    # Compute "scheduled" demand using the CRS times.
    actual_cnt = cnt_(ops_df[op_type+'_TIME_DT_LOCAL'], op_type)
    crs_cnt = cnt_(ops_df['CRS_'+op_type+'_TIME_DT_LOCAL'], 'CRS_'+op_type)

    # Filter for delayed flights, then compute number of actual delayed flights within quarter hour:
    ops_df_delay = ops_df[ops_df[op_type + '_DEL15'] > 0]
    delayed_cnt = cnt_(ops_df_delay[op_type+'_TIME_DT_LOCAL'], 'DEL_'+op_type)

    out = pd.concat([actual_cnt, crs_cnt, delayed_cnt], axis=1).fillna(0).astype('int8')
    
    # Take difference between between actual and scheduled/expected counts within quarter hour bin.
    out[op_type+'_DIFF0_PER_QTHR'] = out['CRS_'+op_type+'_PER_QTHR'] - out[op_type+'_PER_QTHR']
    return(out)

def compute_arpt_demand(arpt_ops, op_type):
    """
    Compute arrival or departure demand at each airport.
    
    arpt_ops: dataframe
        Dataframe of either ARR or DEP ops.
    op_type: str
        Select 'ARR' or 'DEP' as operation type.
    """
    arpt_demand = qthr_ops_count(arpt_ops, op_type)

    # Future is known based on CRS (schedule). We can use these to create lead features.
    # Chain merge based on time index. 
    arpt_demand_lead_lag_features = pd.concat([create_arpt_demand_offset_features(arpt_demand, td) for td in [-30, -15, 15, 30]], axis=1)

    # Assemble airport demand data:
    arpt_demand = pd.concat([arpt_demand, arpt_demand_lead_lag_features], axis=1).fillna(0).astype('int8')
    arpt_demand = arpt_demand.reset_index().rename(columns={'index': 'DT_LOCAL_QTHR'})

    # Create forward looking features:
    arpt_demand[op_type+'_DIFF1_PER_QTHR'] = arpt_demand['CRS_'+op_type+'_PER_QTHR_LEAD15'] - arpt_demand[op_type+'_PER_QTHR']
    arpt_demand[op_type+'_DIFF2_PER_QTHR'] = arpt_demand['CRS_'+op_type+'_PER_QTHR_LEAD30'] - arpt_demand[op_type+'_PER_QTHR']
    return(arpt_demand)


# Select subset of columns as needed for demand computations:
subset_cols = ['ORIGIN', 'DEST'] + reduce(lambda x, y: x+y, [[ot+'_TIME_DT_LOCAL', 'CRS_'+ot+'_TIME_DT_LOCAL', ot + '_DEL15'] for ot in ['ARR', 'DEP']])

# Compute arrival demand metrics:
arr_arpt_grp = flt_tbl_clean[subset_cols].groupby('DEST')
arr_demand = arr_arpt_grp.apply(compute_arpt_demand, 'ARR').reset_index().drop(columns='level_1').rename(columns={'ORIGIN': 'ARPT_NAME', 'DEST': 'ARPT_NAME'})

# Compute departure demand metrics:
dep_arpt_grp = flt_tbl_clean[subset_cols].groupby('ORIGIN')
dep_demand = dep_arpt_grp.apply(compute_arpt_demand, 'DEP').reset_index().drop(columns='level_1').rename(columns={'ORIGIN': 'ARPT_NAME', 'DEST': 'ARPT_NAME'})

# Merge arrival and departure demand for all airports at each QTHR:
arpt_demand = arr_demand.merge(dep_demand, on=['ARPT_NAME', 'DT_LOCAL_QTHR'], how='outer').fillna(0).compute()



# TODO: optimize join by setting index? Setting index to ARPT_NAME or DT_LOCAL_QTHR may be needed. Original data is organized by month, so temporal index may be faster.
# dask does not accept multi-index. Only use one index at a time.

numeric_cols = arpt_demand.select_dtypes('number').columns
arpt_demand[numeric_cols] = arpt_demand[numeric_cols].astype('int8')

# Extract year:
arpt_demand['YEAR'] = arpt_demand['DT_LOCAL_QTHR'].dt.year
arpt_demand['YEAR'] = arpt_demand['YEAR'].astype('int16')

run_diagnostics = False
if run_diagnostics == True:
    # Check if any nan's remain:
    nan_cnt = arpt_demand.isna().sum()
    if np.sum(nan_cnt) > 0:
        raise ValueError('NaN values detected. Fill or impute values to keep data.')

    # Verify that 24 hour demand makes sense:
    arpt_demand['DT_LOCAL_QTHR'].dt.hour.hist(bins=24)
    arpt_demand[(arpt_demand['ARR_PER_QTHR'] > 0) & (arpt_demand['DEP_PER_QTHR'] > 0)]['DT_LOCAL_QTHR'].dt.hour.hist(bins=24).legend(['All QTHR Count', 'QTHR w/ ARR and DEP'])

# Airport demand represented as pandas.DataFrame:
# arpt_demand = arpt_demand.set_index('ARPT_NAME')

# TODO: parallelize airport demand table merge. Convert arpt_demand to dask.dataframe?
# arpt_demand = dd.from_pandas(arpt_demand, npartitions=32)

arpt_demand


# TODO: need capacity model. Find info about airport efficiency rates under various operating conditions? Should be in ASPM summary info. 
# For now, can use CRS and actuals with lag features to estimate.

# TODO: create late arrival at ORIGIN airport feature to model delay chain effects? Currently use DEP_DELAY.

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


CPU times: user 1.25 s, sys: 1.39 s, total: 2.64 s
Wall time: 9.37 s


Unnamed: 0,ARPT_NAME,DT_LOCAL_QTHR,ARR_PER_QTHR,CRS_ARR_PER_QTHR,DEL_ARR_PER_QTHR,ARR_DIFF0_PER_QTHR,ARR_PER_QTHR_LAG30,CRS_ARR_PER_QTHR_LAG30,DEL_ARR_PER_QTHR_LAG30,ARR_DIFF0_PER_QTHR_LAG30,...,DEP_DIFF0_PER_QTHR_LAG30,DEP_PER_QTHR_LAG15,CRS_DEP_PER_QTHR_LAG15,DEL_DEP_PER_QTHR_LAG15,DEP_DIFF0_PER_QTHR_LAG15,CRS_DEP_PER_QTHR_LEAD15,CRS_DEP_PER_QTHR_LEAD30,DEP_DIFF1_PER_QTHR,DEP_DIFF2_PER_QTHR,YEAR
0,ABQ_NM,2021-01-01 11:45:00,2,1,0,-1,1,0,0,-1,...,0,0,0,0,0,0,2,0,2,2021
1,ABQ_NM,2021-01-01 12:30:00,1,1,0,0,0,3,0,3,...,0,3,2,0,-1,1,1,0,0,2021
2,ABQ_NM,2021-01-01 13:15:00,2,1,0,-1,1,0,0,-1,...,1,1,1,0,0,2,0,0,-2,2021
3,ABQ_NM,2021-01-01 13:45:00,2,0,0,-2,2,1,0,-1,...,-2,0,2,0,2,1,1,1,1,2021
4,ABQ_NM,2021-01-01 14:00:00,2,2,0,0,0,0,0,0,...,2,0,0,0,0,1,0,-1,-2,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498012,YKM_WA,2021-04-29 05:45:00,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,2021
498013,YKM_WA,2021-04-29 14:15:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,-1,2021
498014,YKM_WA,2021-04-29 14:45:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2021
498015,YKM_WA,2021-04-30 06:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2021


# Load External Data for Merging

In [6]:
# Load airport database:
arpt_db = pd.read_csv('./data/suplemental/airports.csv')

# Filter for US airports with IATA codes. BTS data looks like it's only for domestic flights at ~270 major airports. 
# Known that only small/medium/large airports are in BTS data. Pre-filter and make categorical encoding known.
arpt_db = arpt_db[(arpt_db['iso_country'] == 'US') & 
                  (~arpt_db['iata_code'].isna()) &
                  (arpt_db['type'].isin(['small_airport', 'medium_airport', 'large_airport']))
                 ]
arpt_db['type'] = arpt_db['type'].astype('category')
arpt_db = arpt_db[['iata_code', 'type', 'iso_region', 'latitude_deg', 'longitude_deg', 'elevation_ft']]
arpt_db['elevation_ft'] = arpt_db['elevation_ft'].fillna(0).astype('int16') # Highest US airport is ~2.2k ft
arpt_db['IATA_ST'] = arpt_db['iata_code'] + '_' + arpt_db['iso_region'].str.split('-').str[1]
# arpt_db = arpt_db[arpt_db['IATA_ST'].isin(bts_unique_arpt['ARPT_ST'])]
arpt_db = arpt_db.rename(columns={'iata_code': 'IATA_CODE', 'type': 'ARPT_TYPE', 
                                  'latitude_deg': 'ARPT_LAT', 'longitude_deg': 'ARPT_LON', 'elevation_ft': 'ARPT_ELV_FT'})

arpt_db = arpt_db.drop(columns=['iso_region', 'IATA_CODE']).dropna()
arpt_db = arpt_db.set_index('IATA_ST')
arpt_db

# TODO: geocode to get timezone per airport.

Unnamed: 0_level_0,ARPT_TYPE,ARPT_LAT,ARPT_LON,ARPT_ELV_FT
IATA_ST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OCA_FL,small_airport,25.325399,-80.274803,8
PQS_AK,small_airport,61.934601,-162.899994,305
CSE_CO,small_airport,38.851918,-106.928341,8980
JCY_TX,small_airport,30.251801,-98.622498,1515
PMX_MA,small_airport,42.223301,-72.311401,418
...,...,...,...,...
KKI_AK,small_airport,60.904800,-161.421997,23
BCC_AK,small_airport,63.573316,-156.149454,740
KBC_AK,small_airport,66.274002,-145.824005,450
CZC_AK,small_airport,61.941200,-145.294006,1150


In [7]:
%%time

# Load weather data:
wx_processed_data_dir = './data/converted/weather/NAS/' # Single file for all weather
wx_data = dd.read_parquet(wx_processed_data_dir)


# Pandas can read partitioned parquet directory with option use_legacy_dataset=False.
# https://arrow.apache.org/docs/python/parquet.html#reading-from-partitioned-datasets
# pd.read_parquet('./data/staging_tbl/arpt_demand', use_legacy_dataset=False)

wx_data = wx_data[wx_data['DT_LOCAL_HR'].between(start_date, end_date)]
# wx_data['YEAR'] = wx_data['DT_LOCAL_HR'].dt.year
# wx_data['YEAR'] = wx_data['YEAR'].astype('int16')

wx_data = wx_data.drop(columns='MISSING_WX')
wx_numeric_dtypes = list(wx_data.select_dtypes('number').columns)
wx_cols = list(wx_data.columns)

# # Since wx_data is large, may need to do a partitioned join? Has worker mem issues.
# wx_grp = wx_data.groupby('IATA_ST')
# wx_grp_keys = list(wx_grp.groups.keys())

print('Number of weather attributes:', len(wx_data.columns) - 2) # Two attributes used for merge
wx_data

# TODO: additional feature engineering to get weather in adjacent hour?

Number of weather attributes: 14
CPU times: user 15.7 ms, sys: 2.88 ms, total: 18.6 ms
Wall time: 16.3 ms


Unnamed: 0_level_0,DT_LOCAL_HR,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPressureChange,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,HourlySkyCover_BKN,HourlySkyCover_FEW,HourlySkyCover_OVC,HourlySkyCover_SCT,HourlySkyCover_VV,HourlyPresentWeatherTypeCombo,YEAR,ARPT_NAME
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
,datetime64[ns],float32,float32,float32,float32,int16,float32,int16,int16,int16,int16,int16,int16,object,int16,category[known]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [8]:
# Feature engineering:
from pandas.tseries.holiday import get_calendar, USFederalHolidayCalendar

# Add holidays +- padded days: 
holiday_df = pd.DataFrame(USFederalHolidayCalendar().holidays(start='2003', end='2025', return_name=True)).reset_index()
holiday_df.columns = ['DT_LOCAL_DAY', 'HOLIDAY_NAME']
holiday_pad = pd.concat([pd.DataFrame({'DT_LOCAL_DAY': holiday_df['DT_LOCAL_DAY'] + ii*pd.Timedelta('1D'), 
                                       'HOLIDAY_NAME': holiday_df['HOLIDAY_NAME']}) for ii in [-1,0,1]])
holiday_pad = holiday_pad.sort_values('DT_LOCAL_DAY').reset_index(drop=True)
holiday_pad['DT_LOCAL_DAY'] = holiday_pad['DT_LOCAL_DAY'].dt.round('d')
# holiday_pad = pd.get_dummies(holiday_pad) # Use target encoding for holidays. OHE on sparse feature can make training harder.
holiday_pad['IS_HOLIDAY'] = True
holiday_names = ['HOLIDAY_NAME_' + hn for hn in list(holiday_df['HOLIDAY_NAME'].unique())]
print('Unique holidays:', holiday_names)

holiday_pad = holiday_pad.set_index('DT_LOCAL_DAY')
holiday_pad.head(5)

Unique holidays: ['HOLIDAY_NAME_New Years Day', 'HOLIDAY_NAME_Martin Luther King Jr. Day', 'HOLIDAY_NAME_Presidents Day', 'HOLIDAY_NAME_Memorial Day', 'HOLIDAY_NAME_July 4th', 'HOLIDAY_NAME_Labor Day', 'HOLIDAY_NAME_Columbus Day', 'HOLIDAY_NAME_Veterans Day', 'HOLIDAY_NAME_Thanksgiving', 'HOLIDAY_NAME_Christmas']


Unnamed: 0_level_0,HOLIDAY_NAME,IS_HOLIDAY
DT_LOCAL_DAY,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-12-31,New Years Day,True
2003-01-01,New Years Day,True
2003-01-02,New Years Day,True
2003-01-19,Martin Luther King Jr. Day,True
2003-01-20,Martin Luther King Jr. Day,True


# Merge External Data into Master Flight Table

In [9]:
%%time

# Merge departure airport info:
data_mg = (flt_tbl_clean
           .merge(arpt_db.rename(columns=dict(zip(arpt_db.columns, 'DEP_' + arpt_db.columns))), left_on='ORIGIN', right_on='IATA_ST', how='left')
           )

arr_grp = data_mg.groupby('DEST')
dep_grp = data_mg.groupby('ORIGIN')

# Some airports coords not found. Fill relevant attributes with 0's.
arpt_attr_cols = ['DEP_'+cc for cc in ['ARPT_LAT', 'ARPT_LON', 'ARPT_ELV_FT']]
data_mg[arpt_attr_cols] = data_mg[arpt_attr_cols].fillna(0)
data_mg['DEP_ARPT_ELV_FT'] = data_mg['DEP_ARPT_ELV_FT'].astype('int16')

# Merge arrival airport info:
data_mg = (data_mg
            .merge(arpt_db.rename(columns=dict(zip(arpt_db.columns, 'ARR_' + arpt_db.columns))), left_on='DEST', right_on='IATA_ST', how='left')
           )

arpt_attr_cols = ['ARR_'+cc for cc in ['ARPT_LAT', 'ARPT_LON', 'ARPT_ELV_FT']]
data_mg[arpt_attr_cols] = data_mg[arpt_attr_cols].fillna(0)
data_mg['ARR_ARPT_ELV_FT'] = data_mg['ARR_ARPT_ELV_FT'].astype('int16')


# Create temporal columns for merging various data sources:
for ot in ['ARR', 'DEP']:
    data_mg[ot+'_TIME_DT_LOCAL_QTHR'] = data_mg[ot+'_TIME_DT_LOCAL'].dt.round('15min') # QTHR demand
    data_mg[ot+'_TIME_DT_LOCAL_HR'] = data_mg[ot+'_TIME_DT_LOCAL'].dt.round('h') # Hourly weather
    data_mg[ot+'_TIME_DT_LOCAL_DAY'] = data_mg[ot+'_TIME_DT_LOCAL'].dt.round('d') # Holidays

# Lost left index during merge. Need to reset index prior to merge.

# Merge holidays:
data_mg = data_mg.merge(holiday_pad, left_on='ARR_TIME_DT_LOCAL_DAY', right_on='DT_LOCAL_DAY', how='left')

# Backfill holiday columns after merge:
data_mg['IS_HOLIDAY'] = data_mg['IS_HOLIDAY'].fillna(False)
# data_mg[holiday_names] = data_mg[holiday_names].fillna(0).astype(bool)
data_mg['HOLIDAY_NAME'] = data_mg['HOLIDAY_NAME'].fillna('NA')

# Identify if travel date is on a weekend:
data_mg['IS_WEEKEND'] = data_mg['DAY_OF_WEEK'] >= 6



# Drop unecessary columns to prevent leakage and duplicate data:
data_mg_final = data_mg.drop(columns=['CRS_DEP_TIME_DT_LOCAL', 'CRS_ARR_TIME_DT_LOCAL'])

# Update dtypes: (keep as float for now since fields may contain nan's)
# int_16_cols = ['YEAR', 'CRS_ELAPSED_TIME', 'ARR_DELAY', 'DEP_DELAY', 'DISTANCE', 'DISTANCE_GROUP', 'TAXI_OUT']
# data_mg_final[int_16_cols] = data_mg_final[int_16_cols].astype('int16')

# OHE requires known categories. Using dask .categorize() requires full data scan, which can be expensive to compute.
# data_mg_final = data_mg_final.categorize(columns=['HOLIDAY_NAME', 'DEP_ARPT_TYPE', 'ARR_ARPT_TYPE'])
data_mg_final = data_mg_final.categorize() # Categorize all object columns. Assumed that unique identifiers have already been removed.

data_mg_final['YEAR'] = data_mg_final['YEAR'].astype('int16')

# Generate YYYYMM integer time column to repartition output:
data_mg_final['YYYYMM'] = data_mg_final['UID']//10000000

# TODO: replace .categorize() with deterministic mapping for better performance?
data_mg_final

CPU times: user 505 ms, sys: 35 ms, total: 540 ms
Wall time: 1.68 s


Unnamed: 0_level_0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_YEAR,DAY_OF_WEEK,TAIL_NUM,OD_PAIR,OP_UNIQUE_CARRIER,ORIGIN,DEST,CRS_DEP_TIME_HR,CRS_DEP_TIME_QTHR,DEP_DELAY,DEP_DEL15,ARR_DELAY,ARR_DEL15,TAXI_OUT,TAXI_IN,CRS_ARR_TIME_HR,CRS_ARR_TIME_QTHR,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,DISTANCE_GROUP,UID,DELAY_CAUSES_ENC,ARR_TIME_DT_LOCAL,DEP_TIME_DT_LOCAL,DEP_ARPT_TYPE,DEP_ARPT_LAT,DEP_ARPT_LON,DEP_ARPT_ELV_FT,ARR_ARPT_TYPE,ARR_ARPT_LAT,ARR_ARPT_LON,ARR_ARPT_ELV_FT,ARR_TIME_DT_LOCAL_QTHR,ARR_TIME_DT_LOCAL_HR,ARR_TIME_DT_LOCAL_DAY,DEP_TIME_DT_LOCAL_QTHR,DEP_TIME_DT_LOCAL_HR,DEP_TIME_DT_LOCAL_DAY,HOLIDAY_NAME,IS_HOLIDAY,IS_WEEKEND,YYYYMM
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
,int16,int64,int64,int64,float32,int64,category[known],category[known],category[known],category[known],category[known],int8,int8,float32,bool,float64,bool,float32,float64,int8,int8,float32,float64,float64,float32,float32,int64,category[known],datetime64[ns],datetime64[ns],category[known],float64,float64,int16,category[known],float64,float64,int16,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],category[known],bool,bool,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [10]:
%%time

print('Object columns:', list(data_mg_final.select_dtypes('object').columns))
print('Categorical columns:', list(data_mg_final.select_dtypes('category').columns))

ohe_encode_cols = ['DEP_ARPT_TYPE', 'ARR_ARPT_TYPE']

# cat_dtype = {'HOLIDAY_NAME': CategoricalDtype(holiday_names+['NA']), 
#              'ARR_ARPT_TYPE': ['small_airport', 'medium_airport', 'large_airport'],
#              'DEP_ARPT_TYPE': ['small_airport', 'medium_airport', 'large_airport']
#             }

# Cyclical features need to be cosine xformed. Note that we want to map first element to 0 if it is not already explicit.
# Lat/lon is also cyclical, no internatioal airports considered so we can keep them as decimal degrees.
cyclical_cols = {'QUARTER': 4, 'MONTH': 12, 'DAY_OF_YEAR': 365.25, 'DAY_OF_MONTH': 31, 'DAY_OF_WEEK': 7, 
                 'CRS_DEP_TIME_HR': 24, 'CRS_DEP_TIME_QTHR': 4,
                 'CRS_ARR_TIME_HR': 24, 'CRS_ARR_TIME_QTHR': 4,
                 'HourlyWindDirection': 360
                }

# OHE encode. Default dtype is np.uint8, which spark doesn't understand.
data_mg_final = dd.get_dummies(data_mg_final, columns=ohe_encode_cols, dtype=bool)


# Cosine transform cyclical columns:
def cosine_xform(df, col_name, period, shift=0):
    coef = 2.*np.pi/period
    val_shift = df[col_name] + shift
    df[col_name+'_cos'] = np.cos(coef*val_shift) 
    df[col_name+'_cos'] = df[col_name+'_cos'].astype('float32')
    df[col_name+'_sin'] = np.sin(coef*val_shift)
    df[col_name+'_sin'] = df[col_name+'_sin'].astype('float32')
    df = df.drop(columns=col_name)
    return(df)

# Apply to shifted vars:
for cc in ['QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_YEAR', 'DAY_OF_WEEK']:
    data_mg_final = cosine_xform(data_mg_final, cc, cyclical_cols[cc], shift=-1)
    
# Apply to non-shifted vars:
for cc in ['CRS_DEP_TIME_HR', 'CRS_DEP_TIME_QTHR', 'CRS_ARR_TIME_HR', 'CRS_ARR_TIME_QTHR']: #, 'ORIGIN_HourlyWindDirection', 'DEST_HourlyWindDirection']:
    data_mg_final = cosine_xform(data_mg_final, cc, cyclical_cols[cc], shift=0)
    
# Clean up and data down-casting to save on storage:
# data_mg_final = data_mg_final.drop(columns=['UID'])
fp64_cols = list(data_mg_final.select_dtypes('float64').columns)
data_mg_final[fp64_cols] = data_mg_final[fp64_cols].astype('float32')


# Issue with float16 in older version of pyarrow. RAPIDS conda env is still on v1.0. 
# Cast to float32 so parquet file can be written...
fp16_cols = list(data_mg_final.select_dtypes('float16').columns)
data_mg_final[fp16_cols] = data_mg_final[fp16_cols].astype('float32')

print('Number of features: ', len(data_mg_final.columns))
data_mg_final.columns

Object columns: []
Categorical columns: ['TAIL_NUM', 'OD_PAIR', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DELAY_CAUSES_ENC', 'DEP_ARPT_TYPE', 'ARR_ARPT_TYPE', 'HOLIDAY_NAME']
Number of features:  61
CPU times: user 894 ms, sys: 22.5 ms, total: 916 ms
Wall time: 877 ms


Index(['YEAR', 'TAIL_NUM', 'OD_PAIR', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST',
       'DEP_DELAY', 'DEP_DEL15', 'ARR_DELAY', 'ARR_DEL15', 'TAXI_OUT',
       'TAXI_IN', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME',
       'DISTANCE', 'DISTANCE_GROUP', 'UID', 'DELAY_CAUSES_ENC',
       'ARR_TIME_DT_LOCAL', 'DEP_TIME_DT_LOCAL', 'DEP_ARPT_LAT',
       'DEP_ARPT_LON', 'DEP_ARPT_ELV_FT', 'ARR_ARPT_LAT', 'ARR_ARPT_LON',
       'ARR_ARPT_ELV_FT', 'ARR_TIME_DT_LOCAL_QTHR', 'ARR_TIME_DT_LOCAL_HR',
       'ARR_TIME_DT_LOCAL_DAY', 'DEP_TIME_DT_LOCAL_QTHR',
       'DEP_TIME_DT_LOCAL_HR', 'DEP_TIME_DT_LOCAL_DAY', 'HOLIDAY_NAME',
       'IS_HOLIDAY', 'IS_WEEKEND', 'YYYYMM', 'DEP_ARPT_TYPE_large_airport',
       'DEP_ARPT_TYPE_medium_airport', 'DEP_ARPT_TYPE_small_airport',
       'ARR_ARPT_TYPE_large_airport', 'ARR_ARPT_TYPE_medium_airport',
       'ARR_ARPT_TYPE_small_airport', 'QUARTER_cos', 'QUARTER_sin',
       'MONTH_cos', 'MONTH_sin', 'DAY_OF_MONTH_cos', 'DAY_OF_MONTH_sin',
       'DAY_O

In [11]:
%%time

import pyarrow as pa
import pyarrow.parquet as pq

staging_dir = './data/staging_tbl/'

try:
    # RECURSIVELY DELETE DIRECTORY and then add it
    shutil.rmtree(staging_dir)
except:
    pass

os.mkdir(staging_dir)

# Partitioning based on time (e.g., year, year-month, etc.) would lead to flights getting dropped when pre-filter to limit amount of data during joins.

# Export airport demand table:
print('Exporting airport demand staging tables...')
arpt_demand_pa = arpt_demand
obj_dtypes = list(arpt_demand_pa.select_dtypes('object').columns)
arpt_demand_pa[obj_dtypes] = arpt_demand_pa[obj_dtypes].astype('category')
pq.write_to_dataset(pa.Table.from_pandas(arpt_demand_pa), root_path=staging_dir+'arpt_demand',
                    partition_cols=['ARPT_NAME'], flavor='spark')

# Remove blank airport name. Spark cannot read it as partition key.
try:
    shutil.rmtree(staging_dir+'arpt_demand/ARPT_NAME=') 
except:
    pass

# Export airport weather table:
print('Exporting airport weather staging tables...')
# wx_data = wx_data.categorize()
wx_data = cosine_xform(wx_data, 'HourlyWindDirection', cyclical_cols['HourlyWindDirection'], shift=0)
wx_data.repartition(1).to_parquet(staging_dir+'arpt_weather', engine='pyarrow', overwrite=True, partition_on=['ARPT_NAME'], flavor='spark', write_metadata_file=False)
# repartition(1) requires all wx data fit within single worker.

# Export partial airport flight table. write_metadata_file=False required for dask to read partitioned data.
print('Exporting partially encoded flight staging tables...')
data_mg_final.to_parquet(staging_dir+'nas_flights', engine='pyarrow', overwrite=True, partition_on=['YYYYMM'], flavor='spark', write_metadata_file=False)

Exporting airport demand staging tables...
Exporting airport weather staging tables...
Exporting partially encoded flight staging tables...
CPU times: user 2.43 s, sys: 471 ms, total: 2.9 s
Wall time: 4.98 s


[None]