In [315]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
import catboost as cb
import catboost.utils as cbu
from catboost import Pool
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import sys

In [316]:
train = pd.read_csv('/kaggle/input/flight-delays-fall-2018/flight_delays_train.csv.zip')
test = pd.read_csv('/kaggle/input/flight-delays-fall-2018/flight_delays_test.csv.zip')

In [317]:
train.head(5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [318]:
from sklearn.base import BaseEstimator, TransformerMixin

class SeasonTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        X_transformed['Season'] = X_transformed.apply(self.get_season, axis=1)
        
        return X_transformed
    
    def get_season(self, row):
        if row.Month in ['6', '7', '8']:
            return 'summer'
        elif row.Month in ['9', '10', '11']:
            return 'autumn'
        elif row.Month in ['12', '1', '2']:
            return 'winter'
        else:
            return 'spring'

class DistanceBinningTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        X_transformed['DistBin'] = 'vshort'
        X_transformed.loc[X_transformed.Distance <= 500 , 'DistBin'] = 'vshort'
        X_transformed.loc[(X_transformed.Distance > 500) & (X_transformed.Distance <= 1000), 'DistBin'] = 'short'
        X_transformed.loc[(X_transformed.Distance > 1000) & (X_transformed.Distance <= 1500), 'DistBin'] = 'mid'
        X_transformed.loc[(X_transformed.Distance > 1500) & (X_transformed.Distance <= 2000), 'DistBin'] = 'midlong'
        X_transformed.loc[(X_transformed.Distance > 2000) & (X_transformed.Distance <= 2500), 'DistBin'] = 'long'
        X_transformed.loc[X_transformed.Distance > 2500, 'DistBin'] = 'vlong'
        
        return X_transformed

class CyclicDataTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, max_values):
        self.columns = columns
        self.max_values = max_values
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        for index, column in enumerate(self.columns):
            self.encode_cyclic_data(X_transformed, column, self.max_values[index])
        return X_transformed
    
    def encode_cyclic_data(self, data, col, max_val):
        data[col + '_sin'] = np.sin(2 * np.pi * data[col].astype(float) / max_val)
        data[col + '_cos'] = np.cos(2 * np.pi * data[col].astype(float) / max_val)

        return data
    
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        X_transformed['Hour'] = X_transformed['DepTime'].apply(self.get_hour)
        X_transformed['Minute'] = X_transformed['DepTime'].apply(lambda x: str(x)[-2:])
        X_transformed['Month'] = X_transformed['Month'].apply(lambda x: x.split('-')[1])
        X_transformed['DayofMonth'] = X_transformed['DayofMonth'].apply(lambda x: x.split('-')[1])
        X_transformed['DayOfWeek'] = X_transformed['DayOfWeek'].apply(lambda x: x.split('-')[1])
        
        return X_transformed
        
    def get_hour(self, time):
        if len(str(time)) < 3 or str(time)[:-2] == '25':
            return '1'
        elif str(time)[:-2] == '24':
            return '0'
        else:
            return str(time)[:-2]
        
class ColumnDropperTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        return X.drop(self.columns, axis=1)
    
def get_categorical_features(df):
    return df.select_dtypes(include='object').columns.to_list()

def get_numerical_features(df):
    return df._get_numeric_data().columns.to_list()

In [319]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Separate features and target variable
X = train.drop(columns=['dep_delayed_15min'])
y = train['dep_delayed_15min']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=714)

# Convert target variable to binary numeric values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_valid = label_encoder.transform(y_valid)

base_transformer = Pipeline([
    ('distance binning', DistanceBinningTransformer()),
    ('datetime', DateTimeTransformer()),
    ('season', SeasonTransformer()),
    ('cyclic', CyclicDataTransformer(['Hour', 'Minute', 'Month', 'DayofMonth', 'DayOfWeek'],
                                     [24, 60, 12, 31, 7])),
    ('dropper', ColumnDropperTransformer(['DepTime']))
])

# Bundle preprocessing for numerical and categorical features
column_transformer = ColumnTransformer(
    transformers=[
        ('scaler', MinMaxScaler(), ['Distance']),
        ('target', TargetEncoder(), get_categorical_features),
    ], remainder='passthrough')

In [320]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Define individual models
rf_model = RandomForestClassifier(random_state=714)
gbm_model = GradientBoostingClassifier(random_state=714)
logreg_model = LogisticRegression(max_iter=1000, random_state=714)
xgb_model = xgb.XGBClassifier(random_state=714)

pipeline_xgb = Pipeline([
    ('base', base_transformer),
    ('column', column_transformer),
    ('classifier', xgb_model)
])

pipeline_rf = Pipeline([
    ('base', base_transformer),
    ('column', column_transformer),
    ('classifier', rf_model)
])

pipeline_gbm = Pipeline([
    ('base', base_transformer),
    ('column', column_transformer),
    ('classifier', gbm_model)
])

pipeline_logreg = Pipeline([
    ('base', base_transformer),
    ('column', column_transformer),
    ('classifier', logreg_model)
])

In [321]:
# from sklearn import set_config

# set_config(display='diagram')
# display(pipeline_rf)

## XGB

In [322]:
# # Fit the pipelines
# pipeline_xgb.fit(X_train, y_train)

# # Evaluate each pipeline
# y_pred_proba = pipeline_xgb.predict_proba(X_valid)[:, 1]
# auc_score = roc_auc_score(y_valid, y_pred_proba)

# print(f'ROC-AUC Score: {auc_score}')

ROC-AUC Score: 0.7383033579452116


## RF

In [323]:
# # Fit the pipelines
# pipeline_rf.fit(X_train, y_train)

# # Evaluate each pipeline
# y_pred_proba = pipeline_rf.predict_proba(X_valid)[:, 1]
# auc_score = roc_auc_score(y_valid, y_pred_proba)

# print(f'ROC-AUC Score: {auc_score}')

ROC-AUC Score: 0.7300775510809031


## Catboost

In [324]:
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel

class CatBoostPreprocessingTransformer(BaseEstimator, TransformerMixin):
    categorical_suffix = "_#CAT#"
    
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        X_transformed = X.copy()
        column_names = X_transformed.select_dtypes(include='object').columns.to_list()
        new_column_names = {column: column + self.categorical_suffix for column in column_names}
        
        return X_transformed.rename(columns=new_column_names)
    
class CustomCatBoostClassifier(CatBoostClassifier):
    categorical_suffix = "_#CAT#"
    
    def fit(self, X, y=None, **fit_params):
        return super().fit(
            X,
            y=y,
            cat_features=X.filter(regex=f"{self.categorical_suffix}$").columns.to_list(),
            **fit_params
        )

class CustomFeatureSelection(SelectFromModel):

    def transform(self, X):
        
        # Get indices of important features
        important_features_indices = list(self.get_support(indices=True))

        # Select important features
        _X = X.iloc[:, important_features_indices].copy()

        return _X

In [325]:
column_transformer_cat = ColumnTransformer(
    transformers=[
        ('scaler', MinMaxScaler(), ['Distance'])
    ], remainder='passthrough')

column_transformer_cat.set_output(transform='pandas')

In [326]:
pipeline_cat = Pipeline([
    ('base', base_transformer),
    ('column', column_transformer_cat),
    ('cat preprocessing', CatBoostPreprocessingTransformer()),
    ("feature_selection", CustomFeatureSelection(CustomCatBoostClassifier(
           random_state=714,
           objective='Logloss',
           loss_function='Logloss',
           boosting_type='Plain',
           bootstrap_type='MVS',
           depth=9,
           od_type='IncToDec',
           od_wait=38,
           iterations=247, 
           colsample_bylevel=0.0972476818962196,
           learning_rate=0.18063480545417882,
           l2_leaf_reg=1.5783786142911755,
           random_strength=26,
           eval_metric='AUC',
           logging_level="Silent"))),
    ("estimator", CustomCatBoostClassifier(
           random_state=714,
           objective='Logloss',
           loss_function='Logloss',
           boosting_type='Plain',
           bootstrap_type='MVS',
           depth=9,
           od_type='IncToDec',
           od_wait=38,
           iterations=247, 
           colsample_bylevel=0.0972476818962196,
           learning_rate=0.18063480545417882,
           l2_leaf_reg=1.5783786142911755,
           random_strength=26,
           eval_metric='AUC',
           logging_level="Silent"))
])

In [327]:
# cat_model = pipeline_cat.fit(X_train, y_train)

In [328]:
# from catboost import CatBoostClassifier, Pool

# cat_model = cb.CatBoostClassifier(random_state=714,
#                                    objective='Logloss',
#                                    loss_function='Logloss',
#                                    boosting_type='Plain',
#                                    bootstrap_type='MVS',
#                                    depth=9,
#                                    od_type='IncToDec',
#                                    od_wait=38,
#                                    iterations=247, 
#                                    colsample_bylevel=0.0972476818962196,
#                                    learning_rate=0.18063480545417882,
#                                    l2_leaf_reg=1.5783786142911755,
#                                    random_strength=26,
#                                    eval_metric='AUC',)

# valid_set = Pool(data=X_valid_cat,
#                   label=y_valid,
#                   cat_features=get_categorical_features(X_train_cat)
#                   )


# cat_model.fit(X_train_cat, 
#                y_train, 
#                cat_features=get_categorical_features(X_train_cat),
#                eval_set=valid_set,
#                silent=True)

In [329]:
# y_pred_proba = cat_model.predict_proba(X_valid)[:, 1]
# auc_score = roc_auc_score(y_valid, y_pred_proba)
# print(f'ROC-AUC Score: {auc_score}')

ROC-AUC Score: 0.7602271355932229


In [330]:
# test_cat = pipeline_cat.fit_transform(test)

# # Make predictions on the test set using the best model
# predictions = cat_model.predict_proba(test_cat)[:, 1]

# # Prepare submission file
# submission_df = pd.DataFrame({
#     'id': test.index,
#     'dep_delayed_15min': predictions
# })

# # Save submission file
# submission_df.to_csv('submission.csv', index=False)

## Stacking

In [331]:
estimators = []
estimators.append(('rf', pipeline_rf))
estimators.append(('xgb', pipeline_xgb))
estimators.append(('gbm', pipeline_gbm))
estimators.append(('logreg', pipeline_logreg))
estimators.append(('cat', pipeline_cat))

In [332]:
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(random_state=42)

In [333]:
from sklearn.ensemble import StackingClassifier

SC = StackingClassifier(estimators=estimators, final_estimator=lgb_model, cv=10)
SC.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 15197, number of negative: 64803
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.189963 -> initscore=-1.450254
[LightGBM] [Info] Start training from score -1.450254


In [334]:
print(f"\nStacking classifier training Accuracy: {SC.score(X_train, y_train):0.2f}")
print(f"Stacking classifier test Accuracy: {SC.score(X_valid, y_valid):0.2f}")


Stacking classifier training Accuracy: 0.90
Stacking classifier test Accuracy: 0.82


In [335]:
y_pred_proba = SC.predict_proba(X_valid)[:, 1]
auc_score = roc_auc_score(y_valid, y_pred_proba)
print(f'ROC-AUC Score: {auc_score}')

ROC-AUC Score: 0.768565018958381


In [336]:
# Make predictions on the test set using the best model
predictions = SC.predict_proba(test)[:, 1]

# Prepare submission file
submission_df = pd.DataFrame({
    'id': test.index,
    'dep_delayed_15min': predictions
})

# Save submission file
submission_df.to_csv('submission.csv', index=False)

In [337]:
# categ_feat_idx = np.where((X_train.dtypes=='object') | (X_train.dtypes=='category'))[0]

# train_pool = Pool(X_train, y_train, cat_features=categ_feat_idx)
# valid_pool = Pool(X_valid, y_valid, cat_features=categ_feat_idx)

In [338]:
# def objective(trial):
#     param = {
#         "random_state": 714,
#         "loss_function": "Logloss",
#         "objective": "Logloss",
#         "silent": True,
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#         "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#         "bootstrap_type": trial.suggest_categorical(
#             "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
#         ),
#         'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
#         'od_wait' :trial.suggest_int('od_wait', 10, 50),
#         'iterations': trial.suggest_int('iterations', 500, 1000),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0),
#         'depth': trial.suggest_int('depth', 5, 14),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-4, 10.0),
#         'random_strength': trial.suggest_int('random_strength', 0, 100),
#         'border_count': trial.suggest_int('border_count', 1, 128),
#     }

#     if param["bootstrap_type"] == "Bayesian":
#         param["bagging_temperature"] = trial.suggest_int('bagging_temperature', 0, 10)
#     elif param["bootstrap_type"] == "Bernoulli":
#         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

#     gbm = cb.CatBoostClassifier(**param)

#     gbm.fit(train_pool, eval_set=valid_pool, verbose=0, early_stopping_rounds=100)

#     y_pred_proba = gbm.predict_proba(X_valid)[:, 1]
#     auc_score = roc_auc_score(y_valid, y_pred_proba)
    
#     return auc_score

In [339]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=150)

In [340]:
# study.best_params

In [341]:
# classifier = cb.CatBoostClassifier(random_state=714,
#                                    objective='Logloss',
#                                    loss_function='Logloss',
#                                    boosting_type='Plain',
#                                    bootstrap_type='MVS',
#                                    depth=9,
#                                    od_type='IncToDec',
#                                    od_wait=38,
#                                    iterations=247, 
#                                    colsample_bylevel=0.0972476818962196,
#                                    learning_rate=0.18063480545417882,
#                                    l2_leaf_reg=1.5783786142911755,
#                                    random_strength=26,
#                                    eval_metric='AUC',)

# valid_set = Pool(data=X_valid,
#                   label=y_valid,
#                   cat_features=categ_feat_idx
#                   )


# classifier.fit(X_train, 
#                y_train, 
#                cat_features=categ_feat_idx,
#                eval_set=valid_set,
#                silent=True)

In [342]:
# feature_importance = classifier.feature_importances_
# sorted_idx = np.argsort(feature_importance)
# fig = plt.figure(figsize=(12, 6))
# plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
# plt.yticks(range(len(sorted_idx)), np.array(X_train.columns)[sorted_idx])
# plt.title('Feature Importance')

In [343]:
# perm_importance = permutation_importance(classifier, X_valid, y_valid, n_repeats=10, random_state=1066)
# sorted_idx = perm_importance.importances_mean.argsort()
# fig = plt.figure(figsize=(12, 6))
# plt.barh(range(len(sorted_idx)), perm_importance.importances_mean[sorted_idx], align='center')
# plt.yticks(range(len(sorted_idx)), np.array(X_valid.columns)[sorted_idx])
# plt.title('Permutation Importance')

In [344]:
# y_pred_proba = classifier.predict_proba(X_valid)[:, 1]
# auc_score = roc_auc_score(y_valid, y_pred_proba)
# print(f'ROC-AUC Score: {auc_score}')

In [345]:
# # Make predictions on the test set using the best model
# test = preprocess(test)
# categ_feat_idx = np.where((test.dtypes=='object') | (test.dtypes=='category') )[0]
# predictions = classifier.predict_proba(test)[:, 1]

# # Prepare submission file
# submission_df = pd.DataFrame({
#     'id': test.index,
#     'dep_delayed_15min': predictions
# })

# # Save submission file
# submission_df.to_csv('submission.csv', index=False)