In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from tqdm.notebook import tqdm as tn

import os
# Seed for the stochastic parts of the notebook (forwarded to the model
# during cross-validation).
RANDOM_STATE = 42

# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [2]:
# fixed kfold indices for convenience
!wget https://github.com/andrii0yerko/INT20H-2022-Hackathon/blob/main/data/kfold.pkl?raw=true -O kfold.pkl
kf = pd.read_pickle('kfold.pkl')[['train_idx', 'test_idx']].to_records(index=False)

In [4]:
from sklearn.linear_model import LinearRegression

def get_trend_lr(array, return_intercept=False):
    """Fit a straight line through the non-NaN entries of ``array``.

    The regressor is the position of each entry (0, 1, ..., len(array) - 1);
    NaN entries are left out of the fit.

    Parameters
    ----------
    array : 1-D numeric sequence supporting boolean-mask indexing
        (numpy array or pandas Series).
    return_intercept : bool, default False
        When True, return ``(slope, intercept)`` instead of the slope alone.

    Returns
    -------
    float or tuple of float
        The fitted slope, or ``(slope, intercept)``. NaN (or a pair of NaNs)
        is returned when ``array`` has no non-NaN entry.
    """
    positions = np.arange(len(array))
    observed = ~np.isnan(array)

    # Nothing to fit: report missing values in the requested shape.
    if not observed.any():
        if return_intercept:
            return (np.nan, np.nan)
        return np.nan

    fitted = LinearRegression().fit(positions[observed].reshape(-1, 1), array[observed])
    slope = fitted.coef_[0]
    if return_intercept:
        return (slope, fitted.intercept_)
    return slope

In [6]:
def get_p_df(all_data):  # over time features
    """Pivot the weekly ``P1``..``P27`` columns into one wide row per ``Id``.

    Each output column is named ``<P column>_week<Week value>``. An extra
    ``P_num_nan`` column holds the number of missing cells in the wide row.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Long-format frame with ``Id``, ``Week`` and ``P1``..``P27`` columns.
        # presumably one row per (Id, Week) pair -- unstack needs unique keys.

    Returns
    -------
    pandas.DataFrame indexed by ``Id``.
    """
    p_names = ['P%d' % k for k in range(1, 28)]
    wide = (
        all_data[['Id', 'Week'] + p_names]
        .set_index(['Id', 'Week'])
        .unstack()
    )
    # Flatten the (P column, week) MultiIndex into plain string labels.
    wide.columns = [f"{name}_week{week}" for name, week in wide.columns]
    missing_per_row = wide.isna().sum(axis=1)
    wide['P_num_nan'] = missing_per_row
    return wide


def get_p_trend_df(P_features, n_weeks=4):
    """Summarise each ``P`` variable across its weekly columns.

    For every ``P1``..``P27`` the columns ``<P>_week0`` .. ``<P>_week{n_weeks-1}``
    are reduced row-wise to five features:

    * ``<P>_trend`` / ``<P>_intercept`` -- slope and intercept of the line
      fitted by ``get_trend_lr`` (NaN when every week is missing);
    * ``<P>_nans`` -- number of missing weeks;
    * ``<P>_mean`` / ``<P>_std`` -- mean and sample standard deviation of the
      observed weekly values (NaNs skipped; std is NaN with fewer than two
      observed weeks).

    Parameters
    ----------
    P_features : pandas.DataFrame
        Wide frame holding the ``<P>_week<i>`` columns, as built by ``get_p_df``.
    n_weeks : int, default 4
        Number of weekly columns (``week0`` .. ``week{n_weeks-1}``) to use.

    Returns
    -------
    pandas.DataFrame sharing the index of ``P_features``.
    """
    P_cols = [f'P{i}' for i in range(1, 28)]
    # Collect the columns first and build the frame once: inserting 135
    # columns one at a time fragments the DataFrame.
    columns = {}
    for col in tn(P_cols):
        P_feat = P_features[[f'{col}_week{i}' for i in range(n_weeks)]]
        fit = P_feat.apply(get_trend_lr, return_intercept=True, axis=1, result_type='expand')
        columns[col + '_trend'] = fit.iloc[:, 0]
        columns[col + '_intercept'] = fit.iloc[:, 1]
        columns[col + '_nans'] = P_feat.isna().sum(axis=1)
        # Statistics of the values themselves; the previous code took them
        # from the isna() mask, which merely re-encoded the `_nans` count.
        columns[col + '_mean'] = P_feat.mean(axis=1)
        columns[col + '_std'] = P_feat.std(axis=1)

    return pd.DataFrame(columns, index=P_features.index)


def get_v_df(all_data):  # static features
    """Extract the per-``Id`` static ``V`` features from the ``Week == 0`` rows.

    Columns whose name matches ``V`` followed by one or two digits are kept and
    a ``V_num_nan`` column with the number of missing ``V`` values is appended.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Long-format frame with ``Id``, ``Week`` and ``V<n>`` columns.

    Returns
    -------
    pandas.DataFrame indexed by ``Id``.
    """
    week0 = all_data[all_data.Week == 0]
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    V_features = week0.drop(columns='Week').set_index('Id').filter(regex=r"V\d{1,2}")
    # Count over *all* V columns. The count is computed before `V_num_nan`
    # exists, so slicing off the last column (as the old code did) silently
    # excluded the final V feature from the count.
    return V_features.assign(V_num_nan=V_features.isna().sum(axis=1))


def get_target_df(all_data):
    """Return one ``target`` value per ``Id`` (taken from its first row), indexed by ``Id``."""
    first_row_per_id = all_data.drop_duplicates(subset=['Id'])
    return first_row_per_id[['Id', 'target']].set_index('Id')

In [7]:
def get_X_y(*args):
    """Join feature frames column-wise and split them into train and test parts.

    Rows whose ``target`` is NaN are treated as test rows; all others as
    training rows.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to concatenate along the columns (aligned on their index).
        One of them must provide a ``target`` column.

    Returns
    -------
    tuple
        ``(X, y, X_test, all_data_fe)``: training features, training target,
        test features, and the full concatenated frame (target included).
    """
    all_data_fe = pd.concat(args, axis=1)
    unlabeled = all_data_fe['target'].isna()

    labeled_rows = all_data_fe.loc[~unlabeled]
    X_test = all_data_fe.loc[unlabeled].drop(columns=['target'])

    X = labeled_rows.drop(columns=['target'])
    y = labeled_rows['target']
    return X, y, X_test, all_data_fe

In [8]:
# Directory of the competition dataset inside the Kaggle input mount.
INPUT_DIR = '/kaggle/input/techuklon-int20h'

# Raw competition tables.
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')

# Tables from a separate stacking dataset; they are later concatenated and
# indexed by their 'Id' column.
# NOTE(review): presumably first-level model predictions -- confirm against the
# notebook that produced them.
meta_test = pd.read_csv('/kaggle/input/int20h-gbdt-stacking/meta_test.csv')
meta_train = pd.read_csv('/kaggle/input/int20h-gbdt-stacking/meta_train.csv')

In [9]:
# Stack train and test so every feature is computed identically for both.
# get_X_y later separates them again by whether `target` is NaN
# (presumably test.csv has no `target` column -- TODO confirm).
all_data = pd.concat([train, test])

# Per-Id feature blocks, each indexed by Id.
p_df = get_p_df(all_data)            # weekly P values pivoted wide
p_trend_df = get_p_trend_df(p_df)    # per-P summaries across weeks
v_df = get_v_df(all_data)            # static V values from week 0
target = get_target_df(all_data)     # one target per Id

In [32]:
# Meta-features for train and test stacked into one frame keyed by Id.
# The first column is dropped -- presumably a saved row index; TODO confirm.
meta_features = (
    pd.concat([meta_train, meta_test], axis=0)
    .iloc[:, 1:]
    .set_index('Id')
)
X, y, X_test, all_data_fe = get_X_y(target, v_df, p_df, p_trend_df, meta_features)

In [28]:
def get_feature_pairs_lgbm(model):
    """List the (parent feature, child feature) pairs of a fitted LightGBM model.

    Every split node is paired with each of its direct children that is also
    a split node. Each pair is returned in sorted order, so the same two
    features always produce the same tuple regardless of which one is the
    parent (``tuple(set(...))`` does not guarantee that, and its order can
    change between interpreter runs because string hashing is randomised).

    Parameters
    ----------
    model : fitted estimator exposing ``booster_.trees_to_dataframe()``
        with ``node_index``, ``parent_index`` and ``split_feature`` columns.

    Returns
    -------
    list of tuple
        One 2-tuple of feature names per parent/child split relation, ordered
        by parent node and then by child node as they appear in the dump.
        Repeated pairs are kept.
    """
    trees = model.booster_.trees_to_dataframe()
    # Keep only rows that actually split on a feature; notna() covers both
    # None and NaN placeholders.
    splits = trees[trees['split_feature'].notna()]

    # Index the children once instead of rescanning the frame for every node.
    children_by_parent = {}
    linked = splits[splits['parent_index'].notna()]
    for parent, feature in zip(linked['parent_index'], linked['split_feature']):
        children_by_parent.setdefault(parent, []).append(feature)

    return [
        tuple(sorted((feature, child)))
        for node, feature in zip(splits['node_index'], splits['split_feature'])
        for child in children_by_parent.get(node, ())
    ]

In [37]:
def kfold_loop(X, y, groups, kf, X_test=None, loop_verbose=0, model=None, fit_callbacks=None, *lgbm_args, **lgbm_kwargs):
    """Cross-validate a binary classifier over pre-defined folds.

    Parameters
    ----------
    X, y : pandas.DataFrame, pandas.Series
        Training features and target, sharing the same row order.
    groups : object with ``.isin`` (e.g. ``X.index``)
        One label per row of ``X``; a row belongs to a fold when its label is
        listed in that fold's ``train_idx`` / ``test_idx``.
    kf : iterable of ``(train_idx, test_idx)`` pairs
        Fold definitions expressed as collections of group labels.
    X_test : pandas.DataFrame, optional
        When given, every fold model also predicts on it.
    loop_verbose : int, default 0
        ``> 0`` prints the summary line, ``> 1`` also prints per-fold scores.
    model : estimator, optional
        Estimator to fit; a ``lgbm.LGBMClassifier`` built from
        ``lgbm_args`` / ``lgbm_kwargs`` is used when omitted. The same
        instance is refitted on every fold.
    fit_callbacks : sequence, optional
        Extra LightGBM callbacks appended to the default silent logger
        (e.g. ``lgbm.early_stopping(...)``).
    *lgbm_args, **lgbm_kwargs
        Forwarded to ``lgbm.LGBMClassifier`` when ``model`` is None.
        ``verbose`` defaults to -1 but may be overridden here.

    Returns
    -------
    tuple
        ``(test_pred, mean_auc, std_auc, features, oof_pred_train, model)``:

        * ``test_pred`` -- array of shape (n_test_rows, n_folds) with the
          positive-class probability per fold (empty if ``X_test`` is None);
        * ``mean_auc``, ``std_auc`` -- mean and std of the per-fold ROC AUC;
        * ``features`` -- feature pairs collected from every fold's trees
          (empty for models that expose no ``booster_``);
        * ``oof_pred_train`` -- Series indexed like ``y`` holding the
          out-of-fold positive-class probabilities;
        * ``model`` -- the estimator as fitted on the last fold.
    """
    # A None default avoids sharing one mutable list between calls.
    extra_callbacks = list(fit_callbacks) if fit_callbacks is not None else []
    features = []

    if model is None:
        # Let callers override `verbose` instead of hitting a duplicate-keyword error.
        model_params = {'verbose': -1, **lgbm_kwargs}
        model = lgbm.LGBMClassifier(*lgbm_args, **model_params)

    oof_pred = []  # per-fold predictions on X_test
    oof_pred_train = pd.Series(index=y.index, dtype='float64')
    scores = []
    for fold, (train_idx, test_idx) in enumerate(kf):
        # Fold indices are group labels, so convert them to boolean row masks.
        train_mask = groups.isin(train_idx)
        val_mask = groups.isin(test_idx)
        _X_train, _y_train = X[train_mask], y[train_mask]
        _X_val, _y_val = X[val_mask], y[val_mask]
        if loop_verbose > 1:
            print(f'--- FOLD {fold+1} ---')
        try:  # LGBM-specific
            model.fit(
                _X_train, _y_train,
                eval_set=(_X_val, _y_val),
                callbacks=[lgbm.log_evaluation(period=0)] + extra_callbacks
            )
        except TypeError:  # sklearn general
            # Estimators whose fit() rejects eval_set/callbacks get a plain fit.
            # NOTE(review): this also hides TypeErrors raised inside a LightGBM fit.
            model.fit(_X_train, _y_train)

        # Tree-structure features exist only for LightGBM models; without this
        # guard the generic-sklearn path above would crash right here.
        if hasattr(model, 'booster_'):
            features += get_feature_pairs_lgbm(model)

        _y_pred = model.predict_proba(_X_val)[:, 1]
        oof_pred_train[val_mask] = _y_pred
        fold_score = roc_auc_score(_y_val, _y_pred)
        if loop_verbose > 1:
            print(f'\tAUC score: {fold_score:.5f}')
        scores.append(fold_score)
        if X_test is not None:
            oof_pred.append(model.predict_proba(X_test)[:, 1])
    if loop_verbose > 0:
        print(f'fold-mean AUC score: {np.mean(scores):.5f}\t fold-std AUC score: {np.std(scores):.5f}\n')
    return np.array(oof_pred).T, np.mean(scores), np.std(scores), features, oof_pred_train, model

In [38]:
# Alternative splitters kept for reference; the fixed folds loaded earlier are used instead.
# NOTE(review): kfold_loop iterates `kf` directly and matches fold indices against
# `groups` with isin(), so a sklearn splitter object would first need to be turned
# into (train_idx, test_idx) pairs expressed as group labels.
# kf = RepeatedStratifiedKFold(n_splits=5, random_state=RANDOM_STATE)
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Despite its name, `oof_pred` holds the whole tuple returned by kfold_loop:
# (test predictions per fold, mean AUC, std AUC, feature pairs,
#  out-of-fold train predictions, fitted model).
# `random_state` and `use_missing` are not kfold_loop parameters; they are
# forwarded to the LGBMClassifier constructor.
oof_pred = kfold_loop(
    X=X,
    y=y,
    groups=X.index,
    kf=kf,
    X_test=X_test,
    loop_verbose=2,
    random_state=RANDOM_STATE,
    use_missing=True,
)

In [35]:
def save_sub(predicted, ids, sub_name):
    """Write predictions to ``<sub_name>.csv``, averaged per ``Id``.

    Parameters
    ----------
    predicted : array-like
        One prediction per entry of ``ids``.
    ids : array-like
        Identifier of each prediction; repeated ids are averaged together.
    sub_name : str
        Output path without the ``.csv`` extension.
    """
    frame = pd.DataFrame({'Id': ids, 'Predicted': np.array(predicted)})
    per_id_mean = frame.groupby(by='Id').mean()
    per_id_mean.to_csv(f'{sub_name}.csv')

In [36]:
# Pull the two prediction sets out of the kfold_loop result tuple.
test_pred_per_fold = oof_pred[0]    # one column of test predictions per fold
train_oof_pred = oof_pred[-2]       # out-of-fold predictions for the training rows

# Test submission: average the fold models' predictions for every test row.
save_sub(test_pred_per_fold.mean(axis=1), X_test.index, sub_name='oof_test_es')
# Out-of-fold train predictions, saved alongside the test predictions.
save_sub(train_oof_pred, X.index, sub_name='oof_train')