# Data Conversion to Parquet
The Bureau of Transportation Statistics Airline "On-Time: Reporting Carrier On-Time Performance" data can be obtained from: https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr. All columns were manually selected since the "Prezipped File" option does not appear to be working. Monthly data was downloaded, renamed as YYYYMM.csv, and zipped into yearly archives (YYYY.tar.gz files). The assembled *.tar.gz files are provided as a starting point.  

Here are some important dates on which new columns became available in the data:
- Cause of Delay (Data starts 6/2003)  
- Gate Return Information at Origin Airport (Data starts 10/2008)  
- Diverted Airport Information (Data starts 10/2008)  

Gate returns and cancelled/diverted flights are removed during processing since they are incomplete flights. Handling these flights could be a topic for future work, since they do affect airport demand.

In [1]:
import os
import sys
import glob
import io
import tarfile
import shutil
from functools import reduce
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pandas.api.types import CategoricalDtype
import numpy as np
from time import time
import datetime

import dask
import dask.dataframe as dd
from dask.distributed import Client, wait, progress, get_worker


# Yearly BTS archives (*.tar.gz) found in the raw-data folder, in sorted filename order.
tar_data_dir = './data/raw/bts_carrier_ontime_perf_data/tar'
tar_files = sorted(glob.glob(f'{tar_data_dir}/*.tar.gz'))

# Destination for the parquet files converted from the original raw data.
parquet_output_dir = './data/converted/flight'

# Report the library versions used for this run.
print('pandas version', pd.__version__)
print('dask version', dask.__version__)

# The dask shuffle method can be set globally if required:
# https://docs.dask.org/en/latest/dataframe-groupby.html#selecting-methods
# dask.config.set(shuffle='tasks') # Default is 'disk'.

# Start the local dask cluster. Use mem intensive CPU instances to run (e.g., AWS M-series).
# Alternative worker layouts:
# client = Client(n_workers=4, threads_per_worker=1)
# client = Client(n_workers=6, threads_per_worker=4) # Good for aws xx.8xlarge instances.
client = Client(n_workers=10, threads_per_worker=2) # Good for DSWS.
client

pandas version 1.1.5
dask version 2021.06.1


0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 10
Total threads:  20,Total memory:  251.65 GiB

0,1
Comm: tcp://127.0.0.1:40897,Workers: 10
Dashboard: http://127.0.0.1:8787/status,Total threads:  20
Started:  Just now,Total memory:  251.65 GiB

0,1
Comm: tcp://127.0.0.1:41919,Total threads: 2
Dashboard: http://127.0.0.1:45581/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:39583,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-eli2fdqj,Local directory: /data/airline_delay_causal/dask-worker-space/worker-eli2fdqj
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:44827,Total threads: 2
Dashboard: http://127.0.0.1:33853/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:35459,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-urjso2pj,Local directory: /data/airline_delay_causal/dask-worker-space/worker-urjso2pj
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:41353,Total threads: 2
Dashboard: http://127.0.0.1:40033/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:42303,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-o2tx57gv,Local directory: /data/airline_delay_causal/dask-worker-space/worker-o2tx57gv
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:41233,Total threads: 2
Dashboard: http://127.0.0.1:39153/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:34403,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-p7q5iygg,Local directory: /data/airline_delay_causal/dask-worker-space/worker-p7q5iygg
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:34227,Total threads: 2
Dashboard: http://127.0.0.1:34703/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:37333,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-293axeby,Local directory: /data/airline_delay_causal/dask-worker-space/worker-293axeby
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:32839,Total threads: 2
Dashboard: http://127.0.0.1:46241/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:33719,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-mz02rbau,Local directory: /data/airline_delay_causal/dask-worker-space/worker-mz02rbau
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:44145,Total threads: 2
Dashboard: http://127.0.0.1:40755/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:46559,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-uyzol66j,Local directory: /data/airline_delay_causal/dask-worker-space/worker-uyzol66j
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:36641,Total threads: 2
Dashboard: http://127.0.0.1:46009/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:37643,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-ptah5azx,Local directory: /data/airline_delay_causal/dask-worker-space/worker-ptah5azx
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:34215,Total threads: 2
Dashboard: http://127.0.0.1:41929/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:45237,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-o63u8h3b,Local directory: /data/airline_delay_causal/dask-worker-space/worker-o63u8h3b
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB

0,1
Comm: tcp://127.0.0.1:41103,Total threads: 2
Dashboard: http://127.0.0.1:38595/status,Memory: 25.17 GiB
Nanny: tcp://127.0.0.1:38661,
Local directory: /data/airline_delay_causal/dask-worker-space/worker-5xv0optj,Local directory: /data/airline_delay_causal/dask-worker-space/worker-5xv0optj
GPU: Quadro RTX 8000,GPU memory: 47.46 GiB


In [2]:
%%time

def flt_delay_csv_to_parquet(df, input_csv_file, output_dir):
    """
    Clean one csv file of flight data and write it to parquet.

    Cancelled and diverted flights are removed, missing actual/scheduled HHMM
    times are backfilled from each other, rows whose times are still missing
    are dropped, and the HHMM fields are expanded into hour / minute /
    quarter-hour / local datetime columns before the frame is written to
    <output_dir>/<csv base name>.parquet.

    df: pd.DataFrame
        DataFrame with flight data contained. Assumes data was read externally (e.g., from inside an archive).
        Modified in place: 'UID' is added and 'ORIGIN'/'DEST' are overwritten.
    input_csv_file: str
        Name of csv file. The base name without '.csv' must parse as an integer (e.g., YYYYMM) since it seeds the UID.
    output_dir: str
        Output directory. Created if it does not exist yet.

    Returns an empty tuple.
    """

    def convert_hhmm(df, col_names):
        """
        Convert HHMM time to expanded format.

        For every column cc in col_names, adds cc_HR, cc_MINS, cc_QTHR and cc_DT_LOCAL and drops cc itself.
        """
        for cc in col_names:
            series_dat = df[cc].astype(int).astype(str).str.zfill(4)
            # Hour is clamped to 23 to resolve the issue with 2400 (which therefore becomes 23:00).
            df[cc+'_HR'] = np.minimum(series_dat.str[:2].str.strip().astype('int8'), 23)
            df[cc+'_MINS'] = series_dat.str[2:].astype('int8')
            df[cc+'_QTHR'] = df[cc+'_MINS']//15

            # Only departures have YYMMDD indicated, so every timestamp is assembled on the flight date.
            # The frame is a pandas object, so pandas' own to_datetime is used rather than the dask wrapper.
            date_parts = df[['YEAR', 'MONTH', 'DAY_OF_MONTH', cc+'_HR', cc+'_MINS']].rename(
                columns={'DAY_OF_MONTH': 'DAY', cc+'_HR': 'HOUR', cc+'_MINS': 'MINUTES'})
            df[cc+'_DT_LOCAL'] = pd.to_datetime(date_parts)
            df = df.drop(columns=cc)
        return df

    n_rows_input = len(df)

    # Base name of the csv without extension (expected to be YYYYMM).
    base_name = input_csv_file.split('/')[-1]
    yyyymm_ = base_name.replace('.csv', '')

    # Generate UID by padding YYYYMM and reserving 10M-1 for unique operations.
    df['UID'] = df.index.values + np.int64(yyyymm_)*10000000

    # Update ORIGIN/DEST with state information. IATA 3-digit code has issues with international flights (e.g., Alaska, etc.).
    # A missing code or state makes the concatenation NaN, which is then replaced by an empty string.
    df['ORIGIN'] = (df['ORIGIN'] + '_' + df['ORIGIN_STATE_ABR']).fillna('')
    df['DEST'] = (df['DEST'] + '_' + df['DEST_STATE_ABR']).fillna('')

    # Filter out cancelled or diverted flights.
    # - Remove cancelled flights since they won't have ARR metrics.
    # - Remove diverted flights since they are outliers that don't typically occur.
    df_clean = df[(df['CANCELLED']==0) & (df['DIVERTED']==0)].copy()
    cnt_drop_for_cancel_divert = n_rows_input - len(df_clean)

    # Repair TAIL_NUM:
    df_clean['TAIL_NUM'] = df_clean['TAIL_NUM'].fillna('UNK')

    # Create OD_PAIR feature:
    df_clean['OD_PAIR'] = df_clean['ORIGIN'] + '-' + df_clean['DEST']

    # Timings needed to be as complete as possible. Time series model depends on it.
    important_timing = ['DEP_TIME', 'CRS_DEP_TIME', 'ARR_TIME', 'CRS_ARR_TIME']

    # Backfill missing ARR_TIME and DEP_TIME with scheduled times.
    df_clean['ARR_TIME'] = df_clean['ARR_TIME'].fillna(df_clean['CRS_ARR_TIME'])
    df_clean['DEP_TIME'] = df_clean['DEP_TIME'].fillna(df_clean['CRS_DEP_TIME'])

    # Backfill missing CRS_ARR_TIME and CRS_DEP_TIME with actual times.
    df_clean['CRS_ARR_TIME'] = df_clean['CRS_ARR_TIME'].fillna(df_clean['ARR_TIME'])
    df_clean['CRS_DEP_TIME'] = df_clean['CRS_DEP_TIME'].fillna(df_clean['DEP_TIME'])

    # Drop record if important_timing can't be resolved:
    n_rows_before_timing_drop = len(df_clean)
    df_clean = df_clean.dropna(subset=important_timing)
    cnt_drop_for_missing_times = n_rows_before_timing_drop - len(df_clean)

    # Expand HHMM formatted time:
    df_clean = convert_hhmm(df_clean, important_timing)

    # Create DAY_OF_YEAR field. Captures annual seasonality not included in original data.
    df_clean['DAY_OF_YEAR'] = df_clean['DEP_TIME_DT_LOCAL'].dt.dayofyear

    # Need logic for filling NA's before conversion to int. Until then these are cast to float32 and NA's are dealt with later.
    float32_cols = ['DAY_OF_YEAR', 'YEAR', 'CRS_ELAPSED_TIME', 'DEP_DELAY', 'DISTANCE', 'DISTANCE_GROUP', 'TAXI_OUT']
    bool_cols = ['ARR_DEL15', 'DEP_DEL15']
    df_clean[float32_cols] = df_clean[float32_cols].astype('float32')
    # NOTE(review): astype(bool) turns NaN into True, so a missing delay flag is stored as "delayed" - confirm this is intended.
    df_clean[bool_cols] = df_clean[bool_cols].astype(bool)

    # Arrivals could be next day for overnight flights, which would need time zone and flight duration to resolve.
    # For convenience, all flights are assumed to be same-day.
    # TODO: implement geocoding in airport database and merge to get time-zone info for each airport. Use GMT time and actual flight time to determine if next-day arrival.
    # TODO: downcast to smallest compatible data type. Encode categoricals.

    # Write output to parquet (the directory is created first so a fresh checkout does not fail here):
    os.makedirs(output_dir, exist_ok=True)
    output_file = output_dir + '/' + base_name.replace('.csv', '.parquet')
    df_clean.to_parquet(output_file)
    print('Processed ' + output_file + \
    '\n  Dropped cancelled/diverted flights: ' + str(cnt_drop_for_cancel_divert) + \
    '\n  Dropped unfixable timings: ' + str(cnt_drop_for_missing_times))

    return ()


@dask.delayed
def convert_archive_flt_year(tar_fn, output_dir):
    """
    Read data from within annual tar.gz archive and process files individually.

    Each file member of the archive is parsed as csv and handed to
    flt_delay_csv_to_parquet together with its member name.

    tar_fn: str
        tar file name.
    output_dir: str
        Output directory passed through to flt_delay_csv_to_parquet.

    Returns an empty tuple.
    """

    with tarfile.open(tar_fn, 'r') as tar:
        for member in tar.getmembers():
            handle = tar.extractfile(member)
            # extractfile() yields None for members without file content (e.g., directories); nothing to parse there.
            if handle is None:
                continue

            # Read the member once so the fallback below does not have to decompress it a second time.
            raw_bytes = handle.read()
            try:
                df = pd.read_csv(io.BytesIO(raw_bytes), engine='c', low_memory=False)
            except UnicodeDecodeError:
                # Some files have issue with the default encoding='utf-8'. Using encoding='latin' seems to fix this but can stall certain files from being read.
                df = pd.read_csv(io.BytesIO(raw_bytes), encoding='latin', engine='c', low_memory=False)

            flt_delay_csv_to_parquet(df, member.name, output_dir)

        print('Completed processing ' + tar_fn)
    return ()

# Queue one delayed conversion per yearly BTS archive; members are read in memory and written out as parquet.
dask_tasks = [
    convert_archive_flt_year(archive_path, parquet_output_dir)
    for archive_path in tar_files
]
# Run every queued conversion on the cluster and keep what dask.compute returns.
dask_tasks = dask.compute(dask_tasks)

CPU times: user 56.3 s, sys: 1min 3s, total: 1min 59s
Wall time: 9min 59s
