# Definitions

In [1]:
from pathlib import Path
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score
import category_encoders as ce
import optuna

In [2]:
data_path = Path('.', 'data')

In [3]:
RANDOM_STATE = 648

In [4]:
target_name = 'dep_delayed_15min'

# Data

In [5]:
train = pd.read_csv(data_path.joinpath('flight_delays_train.csv'))
test = pd.read_csv(data_path.joinpath('flight_delays_test.csv'))

In [6]:
train['route'] = train.Origin + train.Dest

In [7]:
test['route'] = test.Origin + test.Dest

In [8]:
train[target_name] = train[target_name].map({"Y": 1, "N": 0})

## Split Data

In [9]:
X, y = train.drop(target_name, axis=1), train[target_name]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=RANDOM_STATE)

# Preproc

## timestamp decoding

In [11]:
def get_ts(df):
    df['hour'] = df.DepTime // 100
    df['minute'] = df.DepTime % 100
    df['d_month'] = df.DayofMonth.str[2:].astype(int)
    df['d_week'] = df.DayOfWeek.str[2:].astype(int)
    df['month'] = df.Month.str[2:].astype(int)
    
    return df

## timestamp encoding

In [12]:
def ts_features(df):
    X_ = df[df.DepTime < 2400]
    dep_dt = pd.to_datetime('2021-' +
                                  X_.month.astype(str).str.zfill(2) + '-' +
                                  X_.d_month.astype(str).str.zfill(2) + '-' +
                                  X_.hour.astype(str).str.zfill(2) + '-' +
                                  X_.minute.astype(str).str.zfill(2) + '-00',
                                  format='%Y-%m-%d-%H-%M-%S')
    dep_td = dep_dt - pd.to_datetime('2021-01-01')
    df['days_since_ny'] = dep_td.dt.days
    df['secs_since_ny'] = dep_td.dt.seconds
    
    return df

## Category encoding

In [13]:
cat_cols = X_train.select_dtypes(include=[object, 'category']).columns

In [14]:
cat_cols

Index(['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest',
       'route'],
      dtype='object')

### Target encoding

In [15]:
encoders_params = dict(cols=cat_cols,
                        handle_missing='return_nan',
                        handle_unknown='return_nan')

In [16]:
target_encoders = {'helmert': ce.HelmertEncoder(**encoders_params),
                   'js': ce.JamesSteinEncoder(**encoders_params),
                   'loo': ce.LeaveOneOutEncoder(**encoders_params),
                   'target': ce.TargetEncoder(**encoders_params),
                   'woe': ce.WOEEncoder(**encoders_params)}

In [17]:
enc = OrdinalEncoder(cols=cat_cols,
                     handle_missing='return_nan',
                     handle_unknown='return_nan')

In [18]:
%%time
enc.fit(X[cat_cols])
print('Done')

  elif pd.api.types.is_categorical(cols):


Done
CPU times: user 212 ms, sys: 28.2 ms, total: 241 ms
Wall time: 267 ms


## Sin/cos ts processing

In [19]:
t_limits = {'minute': (0, 59),
             'hour': (0, 23),
             'd_week': (1, 7),
             'd_month': (1, 31),
             'month': (1, 12),
             'days_since_ny': (0, 364)}

In [20]:
def sin_cos(df):
    for col_name, limits in t_limits.items():
        max_val = limits[1]
        min_val = limits[0]
        if col_name == 'hour':
            orig_vals = df[col_name].clip(upper=max_val)
        else:
            orig_vals = df[col_name]
        df[col_name + '_sin'] = np.sin(2*np.pi/(max_val - min_val)*orig_vals)
        df[col_name + '_cos'] = np.cos(2*np.pi/(max_val - min_val)*orig_vals)
        
    return df

## Sum up prepocessing

In [21]:
def preproc(df):
    df = get_ts(df)
    df = ts_features(df)
    df = sin_cos(df)
    
#     for name, encoder in target_encoders.items():
#         df[cat_cols + f'_{name}'] = encoder.transform(df[cat_cols])
    df[cat_cols] = enc.transform(df[cat_cols])
    df[cat_cols] = df[cat_cols].astype('category')
    
    # Special features
    df['after_midnight'] = 0
    df.loc[df.DepTime > 2400, 'after_midnight'] = 1
    
    return df

In [22]:
%%time
X_train = preproc(X_train)
X_test = preproc(X_test)
X = preproc(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df.DepTime // 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minute'] = df.DepTime % 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['d_month'] = df.DayofMonth.str[2:].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexe

CPU times: user 3.77 s, sys: 378 ms, total: 4.15 s
Wall time: 6.78 s


# Commit

## Refit on full dataset

In [23]:
lgbm_params = {'boosting': 'gbdt',
               'bagging_freq': 1,
               'n_estimators': 434,
               'random_state': RANDOM_STATE,
               'silent': False,
               'bagging_fraction': 0.638782643397927,
               'feature_fraction': 0.5072049472753989,
               'learning_rate': 0.02238858415024258,
               'max_depth': 10,
               'min_data_in_leaf': 7,
               'reg_alpha': 4.16689412832005,
               'reg_lambda': 3.0220290348080594}

In [24]:
model = lgb.LGBMClassifier()

In [25]:
model.set_params(**lgbm_params)

LGBMClassifier(bagging_fraction=0.638782643397927, bagging_freq=1,
               boosting='gbdt', feature_fraction=0.5072049472753989,
               learning_rate=0.02238858415024258, max_depth=10,
               min_data_in_leaf=7, n_estimators=434, random_state=648,
               reg_alpha=4.16689412832005, reg_lambda=3.0220290348080594)

In [26]:
%%time
model.fit(X, y)

CPU times: user 19.2 s, sys: 993 ms, total: 20.2 s
Wall time: 9.31 s


LGBMClassifier(bagging_fraction=0.638782643397927, bagging_freq=1,
               boosting='gbdt', feature_fraction=0.5072049472753989,
               learning_rate=0.02238858415024258, max_depth=10,
               min_data_in_leaf=7, n_estimators=434, random_state=648,
               reg_alpha=4.16689412832005, reg_lambda=3.0220290348080594)

## Prepare test

In [27]:
test = preproc(test)

In [28]:
test['dep_delayed_15min'] = model.predict_proba(test)[:, 1]

In [29]:
test['dep_delayed_15min'].to_csv(data_path.joinpath('submit.csv'), index_label='id')

# Next Steps

* Grid Search through params
* datetime engineering (sin/cos transform, DepTime limits transform(?))

# Notes

* OHE sucks since model just doesn't use those features
* there are only 1 after 2400

__Best Score__: 0.72762