In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Seed forwarded as `random_state` when the model is built further down.
RANDOM_STATE = 42

import os
# Print the full path of every file found under /kaggle/input.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

/kaggle/input/techuklon-int20h/Samle_Submission.csv
/kaggle/input/techuklon-int20h/train.csv
/kaggle/input/techuklon-int20h/test.csv


In [2]:
!wget https://github.com/andrii0yerko/INT20H-2022-Hackathon/blob/main/data/kfold.pkl?raw=true -O kfold.pkl
kf = pd.read_pickle('kfold.pkl')[['train_idx', 'test_idx']].to_records(index=False)

--2022-01-23 17:53:50--  https://github.com/andrii0yerko/INT20H-2022-Hackathon/blob/main/data/kfold.pkl?raw=true
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/andrii0yerko/INT20H-2022-Hackathon/raw/main/data/kfold.pkl [following]
--2022-01-23 17:53:50--  https://github.com/andrii0yerko/INT20H-2022-Hackathon/raw/main/data/kfold.pkl
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/andrii0yerko/INT20H-2022-Hackathon/main/data/kfold.pkl [following]
--2022-01-23 17:53:50--  https://raw.githubusercontent.com/andrii0yerko/INT20H-2022-Hackathon/main/data/kfold.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercont

In [3]:
# Directory containing the competition CSV files.
INPUT_DIR = '/kaggle/input/techuklon-int20h'

# Load both splits through the same call so they are parsed identically.
train, test = (
    pd.read_csv(f'{INPUT_DIR}/{split}.csv')
    for split in ('train', 'test')
)

In [4]:
# Stack train and test so the missing-value flags are derived in one pass.
all_data = pd.concat((train, test))

# One boolean "<column>_isna" flag per feature column, marking where the raw
# value was missing (Id, Week and target get no flag).
indicators = (
    all_data
    .drop(columns=['Id', 'Week', 'target'])
    .isna()
    .add_suffix('_isna')
)

all_data_with_ind = pd.concat([all_data, indicators], axis=1)

# Rows that carry a target go to the training frame, rows without one to the
# test frame; remaining gaps are filled with the sentinel -1.
train_indic = all_data_with_ind.loc[all_data_with_ind['target'].notna()].fillna(-1)
test_indic = (
    all_data_with_ind
    .loc[all_data_with_ind['target'].isna()]
    .drop(columns=['target'])
    .fillna(-1)
)

In [5]:
def kfold_loop(X, y, groups, kf, X_test=None, loop_verbose=0, model=None, fit_callbacks=None, *lgbm_args, **lgbm_kwargs):
    """Cross-validate a classifier on predefined, group-based folds.

    Every element of ``kf`` is a ``(train_ids, val_ids)`` pair of *group ids*.
    Rows are routed to a side of the split with ``groups.isin(...)``, so rows
    sharing a group id always stay together. The fold score is a ROC AUC
    computed per group: labels and predicted probabilities are averaged within
    each validation group before scoring.

    NOTE(review): the group-mean label is passed straight to
    ``roc_auc_score``, so this presumably relies on every row of a group
    carrying the same label -- confirm against the data.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; filtered with boolean masks derived from ``groups``.
    y : pandas.Series
        Target; its index is reused for the out-of-fold prediction Series.
    groups : pandas.Series
        Group id of every row (same index as ``X`` and ``y``).
    kf : iterable of (train_ids, val_ids)
        Group ids assigned to the training and validation side of each fold.
    X_test : optional
        If given, the model fitted on each fold also predicts on it.
    loop_verbose : int
        ``> 0`` prints the summary line, ``> 1`` also prints per-fold scores.
    model : estimator, optional
        Object exposing ``fit`` and ``predict_proba``. When omitted, an
        ``LGBMClassifier`` is built from ``lgbm_args`` / ``lgbm_kwargs``.
        The same instance is refitted on every fold, so the returned model
        holds the fit of the last fold.
    fit_callbacks : sequence, optional
        Extra callbacks passed to ``fit`` after ``log_evaluation(period=0)``.
    *lgbm_args, **lgbm_kwargs
        Forwarded to ``LGBMClassifier`` when ``model`` is None. ``verbose``
        defaults to -1 but can be overridden by the caller.

    Returns
    -------
    tuple
        ``(test_pred, mean_score, std_score, oof_pred_train, model)`` where
        ``test_pred`` has one column per fold (empty array when ``X_test`` is
        None) and ``oof_pred_train`` is a Series aligned with ``y`` holding
        the validation-fold probability of each row.
    """
    if model is None:
        # setdefault (instead of a hard-coded keyword) lets callers pass their
        # own ``verbose`` without a "multiple values for keyword" TypeError.
        lgbm_kwargs.setdefault('verbose', -1)
        model = lgbm.LGBMClassifier(*lgbm_args, **lgbm_kwargs)
    # Normalise to a fresh list: accepts None or any sequence. A tuple would
    # otherwise raise TypeError on ``list + tuple`` inside the try below and
    # silently trigger the plain-sklearn fallback.
    extra_callbacks = [] if fit_callbacks is None else list(fit_callbacks)

    test_pred = []
    oof_pred_train = pd.Series(index=y.index, dtype='float64')
    scores = []
    for fold, (train_ids, val_ids) in enumerate(kf):
        # Translate group ids into row masks.
        train_mask = groups.isin(train_ids)
        val_mask = groups.isin(val_ids)
        _X_train, _y_train = X[train_mask], y[train_mask]
        _X_val, _y_val = X[val_mask], y[val_mask]

        if loop_verbose > 1:
            print(f'--- FOLD {fold+1} ---')
        try:  # LGBM-specific
            model.fit(
                _X_train, _y_train,
                eval_set=(_X_val, _y_val),
                callbacks=[lgbm.log_evaluation(period=0)] + extra_callbacks
            )
        except TypeError:  # sklearn general
            # NOTE(review): any TypeError raised inside fit also lands here,
            # not only "unexpected keyword argument".
            model.fit(_X_train, _y_train)

        _y_pred = model.predict_proba(_X_val)[:, 1]

        oof_pred_train[val_mask] = _y_pred

        # Score at group level: one label and one probability per group.
        val_groups = groups[val_mask]
        true_labels = pd.Series(_y_val).groupby(val_groups).mean()
        pred_labels = pd.Series(_y_pred, index=_y_val.index).groupby(val_groups).mean()
        fold_score = roc_auc_score(true_labels, pred_labels)
        if loop_verbose > 1:
            print(f'\tAUC score: {fold_score:.5f}')
        scores.append(fold_score)
        if X_test is not None:
            test_pred.append(model.predict_proba(X_test)[:, 1])
    if loop_verbose > 0:
        print(f'fold-mean AUC score: {np.mean(scores):.5f}\t fold-std AUC score: {np.std(scores):.5f}\n')
    return np.array(test_pred).T, np.mean(scores), np.std(scores), oof_pred_train, model

In [6]:

# Run the CV loop with the default LightGBM classifier. The result is the
# 5-tuple returned by kfold_loop: (per-fold test predictions, mean AUC,
# AUC std, out-of-fold train predictions, fitted model).
oof_pred = kfold_loop(
    train_indic.drop(columns=['Id', 'target']),
    train_indic['target'],
    train_indic['Id'],
    kf,
    X_test=test_indic.drop(columns=['Id']),
    loop_verbose=2,
    random_state=RANDOM_STATE,
)

--- FOLD 1 ---
	AUC score: 0.95768
--- FOLD 2 ---
	AUC score: 0.94268
--- FOLD 3 ---
	AUC score: 0.95180
--- FOLD 4 ---
	AUC score: 0.95217
--- FOLD 5 ---
	AUC score: 0.95169
fold-mean AUC score: 0.95121	 fold-std AUC score: 0.00482



In [7]:
def save_sub(predicted, ids, sub_name):
    """Average the predictions per Id and write them to ``<sub_name>.csv``.

    The file has an ``Id`` column (the group key) and a ``Predicted`` column
    holding the mean prediction of all rows sharing that Id.
    """
    frame = pd.DataFrame({'Id': ids, 'Predicted': predicted})
    per_id = frame.groupby('Id').mean()
    per_id.to_csv(f'{sub_name}.csv')

In [8]:
# Test set: first item of the kfold_loop result, averaged across folds.
save_sub(
    predicted=oof_pred[0].mean(axis=1),
    ids=test_indic['Id'],
    sub_name='oof_test',
)
# Train set: second-to-last item of the result, the out-of-fold predictions.
save_sub(
    predicted=oof_pred[-2],
    ids=train_indic['Id'],
    sub_name='oof_train',
)