# Test Train Split
Target encoding is applied to several categorical features. To prevent leakage, the encoder must be fit after the data has been split into train/test sets, using the training set only. The Python `category_encoders` package performs target encoding with pandas on a single CPU core. Because of memory limitations, the implementation below slices only the UID, label column, and target-encoded columns from the master flight table, fits the encoder on the training set, and then applies it to each split separately. Post-processing merges the encoded columns back into the master flight table to produce the fully encoded tables for XGBoost. 

Time series modeling requires that the data be split into contiguous chunks of time to avoid leaking information between neighboring events. With the full-data option, the date ranges are 200306 to 201903 for training and 201904 to 202003 for testing; the `split_dates` setting active below shortens the training window to 200906 to 201903 and assigns 200306 to 200905 to a separate `impute_past` set. Data after 202003 is omitted because of COVID-19: the model may not have enough data to resolve the irregularities in flight demand and schedules. A large number of flights were cancelled during this period, and they may not be adequately represented because cancelled/diverted flights were removed from consideration.  

In [1]:
import gc
import os
import sys
import glob
import shutil
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pandas.api.types import CategoricalDtype
import numpy as np
from time import time
from datetime import datetime
from dateutil.relativedelta import relativedelta

import dask
import dask.dataframe as dd
from dask.distributed import Client, wait, progress, get_worker

import sklearn
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost.dask import DaskDMatrix

print('xgboost version', xgb.__version__)


# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
study_arpt = 'NAS'
run_type = 'cpu'
storage_backend = 'local'

# Choose sensible number that's a multiple of number of nodes in cluster to avoid stragglers.
num_folds = 8
# Number of months to reserve for holdout test. Removed from end of time period of interest.
num_holdout_months = 12
# Run target encoder on partially encoded data.
apply_tgt_enc = True

# Model type, label column and feature lists depend on the study scope.
if study_arpt != 'NAS':
    pred_model = 'binary_class'
    label_col = 'ARR_DEL15'
    excluded_features = [label_col, 'cv_idx', 'UID', 'DEL_ARR_PER_QTHR', 'DEL_DEP_PER_QTHR', 'DEP_DEL15', 'ARR_DEL']
    target_encode_cols = ['OP_UNIQUE_CARRIER', 'ORIGIN', 'HOLIDAY_NAME', 'HourlyPresentWeatherTypeCombo', 'TAIL_NUM']
else:
    # NAS processing excludes weather. Has additional cols for arrival/departure airports.
    pred_model = 'multi_class'
    label_col = 'DELAY_CAUSES_ENC'
    delay_causes_cols = ['LATE_AIRCRAFT_DELAY', 'CARRIER_DELAY', 'NAS_DELAY', 'OTHER_DELAY']
    excluded_features = [label_col, 'cv_idx', 'UID', 'ARR_DEL15', 'DEP_DEL15']
    target_encode_cols = [
        'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'OD_PAIR', 'HOLIDAY_NAME', 'TAIL_NUM',
        'ORIGIN_HourlyPresentWeatherTypeCombo', 'DEST_HourlyPresentWeatherTypeCombo',
    ]

# Pick the xgboost objective that matches the prediction model.
if pred_model == 'multi_class':
    # xgboost auc docs mentioned that: "When used with multi-class classification, objective should be
    # multi:softprob instead of multi:softmax, as the latter doesn't output probability. Also the AUC is
    # calculated by 1-vs-rest with reference class weighted by class prevalence."
    xgb_objective = 'multi:softprob'
elif pred_model == 'binary_class':
    xgb_objective = 'binary:logistic'
# A regression variant (label_col = 'ARR_DELAY', objective 'reg:squarederror') is not wired in.

# Partially encoded data. Target encoding required.
partial_enc_input_dir = f'./data/encoded/{study_arpt}'
# DIRECTORY WILL BE WIPED EACH RUN. Fully encoded data with test/train subfolders.
enc_output_dir = f'./data/staging_tbl/split_target_enc/{study_arpt}'
# DIRECTORY WILL BE WIPED EACH RUN.
fully_enc_output_dir = f'./data/encoded/split/{study_arpt}'

xgb_model_name = f'xgb_{run_type}_airline_delay_{study_arpt}'

# Start a local dask cluster sized for the selected run type.
if run_type == 'cpu':
    client = Client(n_workers=30, threads_per_worker=1)
    # Alternative: attach to an existing scheduler, e.g. Client('tcp://192.168.1.232:8785')
elif run_type == 'gpu':
    client = Client(n_workers=1, threads_per_worker=16)

client

xgboost version 1.5.0-dev


0,1
Client  Scheduler: tcp://127.0.0.1:38999  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 30  Cores: 30  Memory: 251.65 GiB


In [2]:
%%time

def init_output_dirs(output_dir, subdirectory):
    """
    Create or wipe existing directory for outputs. Directory will be wiped.

    Parameters
    ----------
    output_dir : str
        Root output directory. Any existing content is deleted recursively.
    subdirectory : iterable of str
        Names of the subfolders to create directly under ``output_dir``.

    Returns
    -------
    tuple
        Always the empty tuple ``()`` (kept for backward compatibility).

    Raises
    ------
    OSError
        If an existing ``output_dir`` cannot be removed (e.g. permission
        problems) or the folders cannot be created. Only a missing directory
        is tolerated silently.
    """
    from pathlib import Path

    try:
        # RECURSIVELY DELETE DIRECTORY and then add it
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        # Nothing to wipe on a first run. Any other failure is a real problem
        # (stale data would survive), so it is allowed to propagate.
        pass

    # Make sure the root exists even when no subdirectories are requested.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    for sd in subdirectory:
        Path(output_dir+'/'+sd).mkdir(parents=True, exist_ok=True)

    print(output_dir + ' has been initialized.')
    return ()

def get_data_split_files(toc, set_name):
    """
    Look up the files that belong to a data split in the table of contents.

    Rows with ``cv_idx == -1`` form the test set; every other row belongs to
    the train set. ``'cv'`` is a placeholder that currently yields an empty
    list. Any other ``set_name`` raises ``ValueError``.
    """
    if set_name not in ('test', 'train', 'cv'):
        raise ValueError('Case not implemented. Choose test, train, or cv.')

    if set_name == 'cv':
        # TODO: Return list of list containing the folds.
        return []

    # Train set is cv_idx!=-1. Only need to apply target encoder to results since remaining data pre-encoded.
    is_test_row = toc['cv_idx'] == -1
    chosen_rows = toc[is_test_row] if set_name == 'test' else toc[~is_test_row]
    return list(chosen_rows['filename'])


# Build a table of contents: one row per parquet file under the partially encoded input folder.
pq_files = glob.glob(f'{partial_enc_input_dir}/*/*.parquet')
toc_df = pd.DataFrame({'filename': pq_files})

# The text after the first '=' in the path starts with the 6-digit YYYYMM value (e.g. '.../YYYYMM=198710/...').
yyyymm_text = toc_df['filename'].str.split('=').str[1].str[:6]
toc_df['DATE'] = yyyymm_text.astype(int)
toc_df['MONTH'] = toc_df['DATE'].astype(str).str[-2:].astype('int8')
toc_df = toc_df.sort_values(['DATE', 'filename']).reset_index(drop=True)

# Split data into various sets. Test/train sets have known labels.
# Filter for specific dates. Exclude months after March 2020 from data due to COVID.
# Exclude before 200306 due to lack of delay cause attribution. Treat this as if it were test data.
end_date = 202003 # Date when data is not used in test/train set.
holdout_start = pd.to_datetime(end_date, format='%Y%m') - relativedelta(months=num_holdout_months)
start_date = int(holdout_start.strftime('%Y%m'))

# Bin edges for pd.cut below (left-open, right-closed). Alternatives kept for reference:
# split_dates = [198701, 200305, start_date, end_date, 202106] # Full data
# split_dates = [201605, 201805, start_date, end_date, 202106] # Truncated data with ~1 years of training, 1 year test.
# split_dates = [201205, 201405, start_date, end_date, 202106] # Truncated data with ~5 years of training, 1 year test.
# split_dates = [200305, 200505, start_date, end_date, 202106] # Truncated data with ~14 years of training, 1 year test.
split_dates = [200305, 200905, start_date, end_date, 202106] # Truncated data with ~10 years of training, 1 year test.
split_labels = ['impute_past', 'train', 'test', 'predict_future']

# Files dated outside every bin get a missing set_name.
toc_df['set_name'] = pd.cut(toc_df['DATE'], bins=split_dates, labels=split_labels)

# Need train set to be first since target encoder needs to be trained before it can be applied to subsequent data:
split_labels_reorder = ['train', *(label for label in split_labels if label != 'train')]

print(toc_df.groupby('set_name')['DATE'].unique())
print()

toc_df

set_name
impute_past       [200306, 200307, 200308, 200309, 200310, 20031...
train             [200906, 200907, 200908, 200909, 200910, 20091...
test              [201904, 201905, 201906, 201907, 201908, 20190...
predict_future    [202004, 202005, 202006, 202007, 202008, 20200...
Name: DATE, dtype: object

CPU times: user 265 ms, sys: 74.7 ms, total: 340 ms
Wall time: 3.25 s


Unnamed: 0,filename,DATE,MONTH,set_name
0,./data/encoded/NAS/YYYYMM=198710/part-00000-52...,198710,10,
1,./data/encoded/NAS/YYYYMM=198710/part-00001-52...,198710,10,
2,./data/encoded/NAS/YYYYMM=198710/part-00002-52...,198710,10,
3,./data/encoded/NAS/YYYYMM=198710/part-00003-52...,198710,10,
4,./data/encoded/NAS/YYYYMM=198711/part-00000-52...,198711,11,
...,...,...,...,...
1607,./data/encoded/NAS/YYYYMM=202103/part-00003-52...,202103,3,predict_future
1608,./data/encoded/NAS/YYYYMM=202104/part-00000-52...,202104,4,predict_future
1609,./data/encoded/NAS/YYYYMM=202104/part-00001-52...,202104,4,predict_future
1610,./data/encoded/NAS/YYYYMM=202104/part-00002-52...,202104,4,predict_future


In [3]:
%%time

if label_col == 'DELAY_CAUSES_ENC':
    # Reduce number of meta-classes when processing NAS. 2^4 categories not feasible due to data imbalance.
    # Multi-class label_col was converted to bitstring (or bitmask). python built-in bin() function can be used to obtain bitstring representation.
    # Count rows per class over the whole partially encoded dataset.
    meta_class_cnts = dd.read_parquet(partial_enc_input_dir, columns=label_col).value_counts().compute()
    meta_class_cnts = pd.DataFrame(meta_class_cnts).reset_index()
    meta_class_cnts.columns = ['CLASS_STR', 'count']
    meta_class_cnts['CLASS_STR'] = meta_class_cnts['CLASS_STR'].astype(str)
    meta_class_cnts = meta_class_cnts.set_index('CLASS_STR')

    # Number of classes in original label_col:
    num_classes_orig = len(meta_class_cnts)
    # The "no delay" class is written as all dashes, with the same width as the other class strings.
    zero_class_code = '-'*(len(meta_class_cnts.index[0]))

    # Fraction of *delayed* flights per class: the no-delay class is left out of the denominator
    # and its own fraction is forced to 0 so it does not contribute to the cumulative sum.
    meta_class_cnts['delayed_frac'] = meta_class_cnts['count']/meta_class_cnts.loc[meta_class_cnts.index != zero_class_code]['count'].sum()
    meta_class_cnts.loc[zero_class_code, 'delayed_frac'] = 0

    # Relies on the row order produced by value_counts above.
    meta_class_cnts['cumsum'] = meta_class_cnts['delayed_frac'].cumsum()

    # Select cutoff percentile to reduce number of combo-classes.
    cumsum_thresh = 0.95

    # xgboost multi-class requires sequential int codes in label_col. Otherwise, get the following error:
    # ".../src/objective/multiclass_obj.cu:120: SoftmaxMultiClassObj: label must be in [0, num_class)"
    meta_class_cnts['new_int_code'] = np.arange(0, len(meta_class_cnts))
    thresh_mask = meta_class_cnts['cumsum'] >= cumsum_thresh

    # Collapse every class at or beyond the threshold into one shared "other" code.
    # That code is the number of classes kept, i.e. the next sequential int after the kept codes.
    meta_class_cnts.loc[thresh_mask, 'new_int_code'] = np.sum(~thresh_mask)

    delay_causes_remap = dict(zip(meta_class_cnts.index, meta_class_cnts['new_int_code']))
    num_classes_remap = len(meta_class_cnts['new_int_code'].unique())

    print('Number of meta-classes after applying '+str(cumsum_thresh)+' delayed fraction threshold:', num_classes_remap)

    # Persist the class table. Only done here because there is no table in the other branch
    # (calling .to_csv on the np.nan placeholder would raise AttributeError).
    # pandas does not create missing parent folders, so make sure the staging folder exists.
    os.makedirs('./data/staging_tbl', exist_ok=True)
    meta_class_cnts.to_csv('./data/staging_tbl/class_labels.csv')
else:
    # No meta-class table for other label columns; keep the placeholder value.
    meta_class_cnts = np.nan


meta_class_cnts

Number of meta-classes after applying 0.95 delayed fraction threshold: 9
CPU times: user 3.8 s, sys: 632 ms, total: 4.43 s
Wall time: 20.8 s


Unnamed: 0_level_0,count,delayed_frac,cumsum,new_int_code
CLASS_STR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
----,172731139,0.0,0.0,0
--N-,4780127,0.218095,0.218095,1
-C--,3067585,0.13996,0.358055,2
LC--,2828109,0.129033,0.487088,3
L---,2827236,0.128994,0.616082,4
-CN-,2600193,0.118635,0.734716,5
L-N-,2547345,0.116223,0.85094,6
LCN-,1797222,0.081999,0.932939,7
--NO,460206,0.020997,0.953936,8
---O,324927,0.014825,0.968761,8


In [4]:
%%time

def df_to_numeric(ddf):
    """
    Materialize a dask dataframe as pandas and make every column numeric.

    A ``YYYYMM`` column is added to ``ddf`` (integer division of ``UID`` by
    10,000,000) before the frame is computed. Object columns are cast to
    category, then each categorical column is replaced by its integer
    category codes. ``DELAY_CAUSES_ENC`` is the exception: it is mapped
    through the module-level ``delay_causes_remap`` and stored as int8.

    Returns the resulting pandas dataframe.
    """
    ddf['YYYYMM'] = ddf['UID']//10000000

    # category_encoders TargetEncoder doesn't understand dask. Need to convert to pandas prior to encoding.
    # Similarly, cuML TargetEncoder doesn't understand dask_cudf. Need to convert to cudf prior to encoding.
    frame = ddf.compute()

    # Promote remaining object columns to category so they are picked up by the loop below.
    object_cols = list(frame.select_dtypes('object').columns)
    frame[object_cols] = frame[object_cols].astype('category')

    # TargetEncoder doesn't currently work with categorical/object dtypes, so swap in numeric codes.
    for col_name in list(frame.select_dtypes('category').columns):
        if col_name != 'DELAY_CAUSES_ENC':
            frame[col_name] = frame[col_name].cat.codes.values
            continue
        # Apply remapping based on class prevalence to reduce overall number of classes.
        frame[col_name] = frame[col_name].map(delay_causes_remap).astype('int8')

    return frame



# Target-encode each data split and write the result to enc_output_dir/<set_name>.
# TargetEncoder from category_encoders doesn't understand dask.dataframe. Need to convert to pandas with unique index to merge against.
# Also, cuML implementation of TargetEncoder appears to be more complex than the scikit compatible implementation of category_encoders.TargetEncoder(). 
# The encoded results are expected to be different so selecting one of the methods is required to get consistent encodings.
        
if apply_tgt_enc == True:
    from category_encoders import TargetEncoder # sklearn compatible target encoder
    
    # Initialize output directories (both are wiped and recreated with one subfolder per split):
    init_output_dirs(enc_output_dir, split_labels_reorder)
    init_output_dirs(fully_enc_output_dir, split_labels_reorder)
    
    # Run encoder on local dask cluster.
    # NOTE(review): client_local is not referenced again inside this block; presumably it only
    # serves as the active dask client while the block runs - confirm.
    with Client(n_workers=2, threads_per_worker=4) as client_local:
        # Same module that is imported as `dd` at the top of the notebook.
        import dask.dataframe as hw
        
        # Select small subset of columns to limit memory usage.
        sel_cols = ['UID', label_col] + target_encode_cols            
        
        # 'train' is first in split_labels_reorder so the encoder is fitted before any other split is transformed.
        for set_name in split_labels_reorder:
            tic = time()
            # Read only the files of this split, then convert to a fully numeric pandas dataframe.
            ddf = hw.read_parquet(list(toc_df[toc_df['set_name']==set_name]['filename'].values), columns=sel_cols)
            df = df_to_numeric(ddf)
            
            if set_name == 'train':                
                # Train target encoder on training set only. For other data set, we need to apply the learned target encoding to avoid leakage.
                # It is assumed that the other data set are of similar composition for the encoding to be applicable.
                # NOTE(review): tgt_enc_fit is only assigned when run_type == 'cpu'; with any other
                # run_type the transform below raises NameError.
                # NOTE(review): the label is passed as returned by df_to_numeric; for DELAY_CAUSES_ENC
                # that is the remapped integer class code - confirm this is the intended encoding target.
                if run_type == 'cpu':
                    tgt_enc_fit = TargetEncoder(cols=target_encode_cols).fit(df[target_encode_cols], df[label_col])
                
                # TODO: update and fix gpu implementation....
#                 elif run_type == 'gpu':
#                     from cuml.preprocessing.TargetEncoder import TargetEncoder
#                     import cudf

#                     encoder = TargetEncoder(n_folds=5, split_method='continuous', seed=0)
#                     tmp_ = df[target_encode_cols]

#                     # cuML TargetEncoder only considers single column at a time.
#                     # String Arrays is not yet implemented in cudf so can't use .values.
#                     # Categorical dtype not implemented in cudf either. Would need to target encode int values?
#                     for cc in target_encode_cols:
#                         tmp_[cc] = encoder.fit_transform(tmp_[cc], df[label_col])

#                     # Doing direct replacement leads to weird behavior in cudf.
#                     df[target_encode_cols] = tmp_.values
                
            # Assume target encoder has been trained. Now apply it using .transform():
            df[target_encode_cols] = tgt_enc_fit.transform(df[target_encode_cols])
            
            # Casting to reduce storage:
            df[target_encode_cols] = df[target_encode_cols].astype('float32') # Half-float not supported in RAPIDS version of pyarrow yet.
            
            # Add weight column for training set (the other splits get no class_weight column):
            if set_name == 'train':
                from sklearn.utils import compute_sample_weight
                
                # 'balanced' weight in sklearn.utils.class_weight.compute_sample_weight = n_samples / (n_classes * np.bincount(y))
                df['class_weight'] = compute_sample_weight('balanced', df[label_col])
                
            # Write data to disk using dask with partition_on=['YYYYMM'], so each month lands in its own
            # 'YYYYMM=<value>' subfolder. The merge step later reads those subfolders one month at a time.
            dd.from_pandas(df, npartitions=1).to_parquet(enc_output_dir+'/'+set_name, write_metadata_file=False, partition_on=['YYYYMM'], flavor='spark')
            
            # Elapsed seconds for this split (tic was taken before reading the data).
            toc = np.round(time() - tic, 2)
            print('Target encoding completed for ' + enc_output_dir+'/'+set_name +' in '+str(toc)+'s')
        

./data/staging_tbl/split_target_enc/NAS has been initialized.
./data/encoded/split/NAS has been initialized.


Perhaps you already have a cluster running?
Hosting the HTTP server on port 40949 instead
  elif pd.api.types.is_categorical(cols):


Target encoding completed for ./data/staging_tbl/split_target_enc/NAS/train in 246.09s
Target encoding completed for ./data/staging_tbl/split_target_enc/NAS/impute_past in 101.9s
Target encoding completed for ./data/staging_tbl/split_target_enc/NAS/test in 16.62s
Target encoding completed for ./data/staging_tbl/split_target_enc/NAS/predict_future in 14.87s
CPU times: user 4min, sys: 54.7 s, total: 4min 55s
Wall time: 6min 21s


In [5]:
%%time

# Initialize output directories:
# Wipes fully_enc_output_dir and recreates one subfolder per split label, so the
# merged files written below start from an empty directory tree.
init_output_dirs(fully_enc_output_dir, split_labels_reorder)

@dask.delayed
def merge_uid_by_month(set_name, month):
    """
    Join one month of the master flight table with its target-encoded columns.

    Reads the ``YYYYMM=<month>`` folder from both the partially encoded input
    and the target-encoder output of ``set_name``, left-joins them on ``UID``,
    keeps the encoded version of any column present in both tables, drops rows
    containing missing values, downcasts float64 columns to float32 and writes
    ``<fully_enc_output_dir>/<set_name>/<month>.parquet``.

    Returns an empty tuple; the useful result is the file written to disk.
    """
    month_folder = 'YYYYMM=' + str(month)
    flights = pd.read_parquet(partial_enc_input_dir + '/' + month_folder)
    encoded = pd.read_parquet(enc_output_dir + '/' + set_name + '/' + month_folder)

    # Columns shared by both tables: the flight-table copy gets the '_DROP' suffix,
    # the target-encoded copy keeps its original name.
    merged = flights.merge(encoded, on=['UID'], how='left', suffixes=('_DROP', ''))

    # Discard the unencoded copies, then drop rows with missing data.
    # Missing values are most likely due to weather fields missing.
    unencoded_copies = [col for col in merged if col.endswith('_DROP')]
    merged = merged.drop(columns=unencoded_copies).dropna()

    # Down cast float64:
    wide_float_cols = merged.select_dtypes('float64').columns
    merged[wide_float_cols] = merged[wide_float_cols].astype('float32')

    # All data will be read so data alignment not as important for training.
    out_path = fully_enc_output_dir + '/' + set_name + '/' + str(month) + '.parquet'
    merged.to_parquet(out_path, flavor='spark')
    return ()


# Each month is merged with plain pandas inside a dask.delayed task. Data is already aligned by YYYYMM,
# and a dask dataframe merge would add a lot of overhead and memory usage.
print('Merging TargetEncoder results with master flight table.')

for set_name in split_labels_reorder:
    print(f'Processing {set_name} set...')
    tic = time()

    # One delayed merge task per distinct month of this split, executed together.
    set_months = toc_df.loc[toc_df['set_name'] == set_name, 'DATE'].unique()
    dask.compute([merge_uid_by_month(set_name, mm) for mm in set_months])

    elapsed = np.round(time() - tic, 2)
    print(f'  took {elapsed!s}s.')
    print()

./data/encoded/split/NAS has been initialized.
Merging TargetEncoder results with master flight table.
Processing train set...
  took 93.32s.

Processing impute_past set...
  took 89.29s.

Processing test set...
  took 11.84s.

Processing predict_future set...
  took 6.82s.

CPU times: user 20.3 s, sys: 3.77 s, total: 24.1 s
Wall time: 3min 21s


In [6]:
# NOTE(review): `break` outside a loop is a SyntaxError. Presumably placed here on purpose to
# stop a "Run All" before the scratch inspection cells below - confirm.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# Spot check: load the fully encoded test split and count how many columns have each dtype.
df = pd.read_parquet(fully_enc_output_dir+'/test')
df.dtypes.value_counts()

In [None]:
# Show the columns that are still stored as int64.
df.select_dtypes('int64')

In [None]:
# Missing-value count per column, showing the 30 columns with the most missing values.
df.isna().sum().sort_values(ascending=False)[:30]

