In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import catboost as ctb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from tqdm.notebook import tqdm


import os
# Lazily enumerate the full path of every file below the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# Seed passed to every splitter / model below that accepts a random_state.
RANDOM_STATE = 42

In [None]:
# Read the raw train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/techuklon-int20h/train.csv',
        '/kaggle/input/techuklon-int20h/test.csv',
    )
)

In [None]:
# cols = train.columns
# train = train.set_index(['Id', 'Week']).unstack()
# train['target'].mean(axis=1).value_counts()
# train.columns = [f"{v}_week{i}" for v, i in train.columns]

# for i, c in enumerate(cols[2:]):
#     t = train[c].dropna().values.T
#     if (t[0] == t).all():
#         print(c)

In [None]:
# Stack train and test so both are reshaped in a single pass.
all_data = pd.concat((train, test))

# Names of the P1..P27 columns that get one copy per week.
P_cols = [f'P{i}' for i in range(1, 28)]

# Long -> wide: index by (Id, Week), then move the Week level into the columns.
P_features = (
    all_data[['Id', 'Week'] + P_cols]
    .set_index(['Id', 'Week'])
    .unstack()
)

# Flatten the (feature, week) column pairs into plain string names.
wide_names = [f"{v}_week{i}" for v, i in P_features.columns]
P_features.columns = wide_names

# Per-Id count of missing cells across all widened P columns.
P_features['P_num_nan'] = P_features.isna().sum(axis=1)

In [None]:
# Everything except the raw P columns, which were widened into P_features.
non_p_columns = [name for name in all_data.columns if name not in P_cols]

all_data = (
    all_data[non_p_columns]
    # Keep only the Week == 0 rows; the Week column is then redundant.
    .loc[lambda frame: frame['Week'] == 0]
    .drop(columns='Week')
    # Missing-value count over every column but the last one.
    # NOTE(review): presumably the last column is `target` (NaN for test rows) - confirm.
    .assign(V_num_nan=lambda frame: frame.iloc[:, :-1].isna().sum(axis=1))
)

In [None]:
# Attach the widened per-week P features to the remaining columns.
# The join key is spelled out: without `on`, merge joins on every column name
# the two frames happen to share, so an accidental name clash would silently
# change the join.
all_data = all_data.merge(P_features.reset_index(), on='Id')

In [None]:
# Display the combined train + test feature frame for a visual sanity check.
all_data

In [None]:
# Rows with no label are the ones that came from the test file.
test_idx = all_data['target'].isna()

# Labelled rows form the training frame; unlabelled rows form the test frame.
train = all_data.loc[~test_idx]
test = all_data.loc[test_idx].drop(columns=['target'])

# 'Id' is deliberately kept in X, test and y: KFoldMetaClassifier selects
# fold members by Id and drops the column itself before fitting.
X = train.drop(columns=['target'])
y = train[['Id', 'target']]

In [None]:
def kfold_loop(X, y, kf, X_test=None, loop_verbose=0, model=None, fit_callbacks=None, *lgbm_args, **lgbm_kwargs):
    """Cross-validate a binary classifier, scoring every fold by ROC AUC.

    The same ``model`` instance is refit on each fold, so the model returned
    in the result is the one trained on the last fold.

    Parameters
    ----------
    X, y : pandas objects
        Features and labels; rows are selected positionally with ``.iloc``.
    kf : splitter
        Object whose ``split(X, y)`` yields ``(train_positions, val_positions)``.
    X_test : optional
        When given, ``predict_proba`` is evaluated on it after every fold.
    loop_verbose : int
        ``> 0`` prints the mean/std summary, ``> 1`` also prints per-fold scores.
    model : optional
        Estimator exposing ``fit`` and ``predict_proba``. Defaults to an
        ``lgbm.LGBMClassifier`` built from ``lgbm_args`` / ``lgbm_kwargs``.
    fit_callbacks : iterable, optional
        Extra LightGBM callbacks appended after ``log_evaluation(period=0)``.
        Any iterable is accepted; ``None`` means no extra callbacks.
    *lgbm_args, **lgbm_kwargs
        Forwarded to ``LGBMClassifier`` when ``model`` is ``None``.
        ``verbose`` defaults to ``-1`` but may be overridden.

    Returns
    -------
    dict
        ``model``, ``test_pred`` (one column per fold; empty array when
        ``X_test`` is ``None``), ``scores``, ``scores_mean``, ``scores_std``.
    """
    if model is None:
        # Quiet by default; setdefault lets a caller pass their own `verbose`
        # instead of hitting a duplicate-keyword TypeError.
        lgbm_kwargs.setdefault('verbose', -1)
        model = lgbm.LGBMClassifier(*lgbm_args, **lgbm_kwargs)

    # Normalised up front so that a tuple/generator of callbacks cannot raise
    # TypeError inside the try-block below and silently trigger the fallback.
    extra_callbacks = [] if fit_callbacks is None else list(fit_callbacks)

    test_preds = []
    scores = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        _X_train, _y_train = X.iloc[train_idx], y.iloc[train_idx]
        _X_val, _y_val = X.iloc[val_idx], y.iloc[val_idx]
        if loop_verbose > 1:
            print(f'--- FOLD {fold+1} ---')

        callbacks = [lgbm.log_evaluation(period=0)] + extra_callbacks
        try:  # LGBM-specific fit signature
            model.fit(
                _X_train, _y_train,
                eval_set=(_X_val, _y_val),
                callbacks=callbacks
            )
        except TypeError:
            # Estimators whose fit() rejects eval_set / callbacks are refit
            # with the plain scikit-learn signature.
            model.fit(_X_train, _y_train)

        fold_score = roc_auc_score(_y_val, model.predict_proba(_X_val)[:, 1])
        if loop_verbose > 1:
            print(f'\tAUC score: {fold_score:.5f}')
        scores.append(fold_score)
        if X_test is not None:
            test_preds.append(model.predict_proba(X_test)[:, 1])

    if loop_verbose > 0:
        print(f'fold-mean AUC score: {np.mean(scores):.5f}\t fold-std AUC score: {np.std(scores):.5f}\n')

    return {
        'model': model,
        # Transposed so that rows follow X_test and columns follow folds.
        'test_pred': np.array(test_preds).T,
        'scores': scores,
        'scores_mean': np.mean(scores),
        'scores_std': np.std(scores)
    }

In [None]:
%%time
kf = RepeatedStratifiedKFold(n_splits=5, random_state=RANDOM_STATE)
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

kfold_result = kfold_loop(
    X=X,
    y=y,
    kf=kf,
    X_test=test,
    loop_verbose=2,
    random_state=RANDOM_STATE,
    use_missing=True,
#     n_estimators=256
)

In [None]:
# Single-axes figure, 15 x 30 inches, for the feature-importance chart.
fig = plt.figure(figsize=(15, 30))
ax = fig.add_subplot(1, 1, 1)

# Importance of the model returned by kfold_loop (the last fold's fit).
lgbm.plot_importance(kfold_result['model'], ax=ax, grid=False)
plt.show()

In [None]:
def save_sub(predicted, sub_name, ids=None):
    """Write a submission CSV with one averaged prediction per Id.

    Parameters
    ----------
    predicted : array-like
        One prediction per row of ``ids``.
    sub_name : str
        Output path without extension; ``.csv`` is appended.
    ids : array-like, optional
        Id of each predicted row. Defaults to ``test['Id']``.

    The default no longer looks rows up through the global ``test_idx``:
    that name is rebound to an integer position array by the fold-building
    loop further down, after which the old lookup selected the wrong rows.
    """
    if ids is None:
        ids = test['Id']
    sub = pd.DataFrame({'Id': ids, 'Predicted': predicted})
    # Rows sharing an Id are collapsed to their mean prediction.
    sub.groupby(by='Id').agg('mean').to_csv(f'{sub_name}.csv')

In [None]:
# Collapse the per-fold test predictions (one column per fold) to their row mean.
fold_mean_pred = kfold_result['test_pred'].mean(axis=1)
save_sub(fold_mean_pred, sub_name='lgbm_time_expanded_repeated_kfold_oof_no_fe')

In [None]:
# Freeze one stratified 5-fold split as arrays of sample Ids, so the same
# folds can be reused by every model.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# The splitter returns row positions within X, so they are translated with
# X's own Ids. Indexing all_data['Id'] with them is only right while every
# labelled row precedes every unlabelled row in all_data.
train_ids = X['Id'].to_numpy()

# A comprehension keeps the loop variables local; the previous for-loop
# rebound the notebook-level `train_idx` / `test_idx` names.
folds = [
    [fold, train_ids[fit_pos], train_ids[oof_pos]]
    for fold, (fit_pos, oof_pos) in enumerate(kf.split(X, y['target']))
]

folds_df = pd.DataFrame(folds, columns=['fold', 'train_idx', 'test_idx'])

folds_df

In [None]:
!wget https://github.com/andrii0yerko/INT20H-2022-Hackathon/blob/main/data/kfold.pkl?raw=true -O kfold.pkl
    

folds_df = pd.read_pickle('kfold.pkl')

In [None]:
# Display the fold table that is handed to KFoldMetaClassifier as kf_split below.
folds_df

In [26]:
class KFoldMetaClassifier:
    """Builds stacking meta-features from a fixed, Id-based K-fold split.

    Parameters
    ----------
    meta_model
        Second-level estimator. It is stored but not used by ``fit_predict``,
        which only produces the meta-feature tables.
    models : list
        Base estimators exposing ``fit`` and ``predict_proba``.
    kf_split : pandas.DataFrame
        One row per fold with ``train_idx`` and ``test_idx`` columns, each
        holding the collection of sample Ids belonging to that part.
    """

    def __init__(self, meta_model, models, kf_split):
        self.meta_model = meta_model
        self.models = models
        self.kf_split = kf_split

    @staticmethod
    def _meta_features_one_model(model, X_train, y_train, X_oof, *fit_args, **fit_kwargs):
        """Fit ``model`` and return its positive-class probability for ``X_oof``."""
        model.fit(X_train, y_train, *fit_args, **fit_kwargs)
        return model.predict_proba(X_oof)[:, 1]

    def fit_predict(self, X, y, X_test, meta_kf, models_args=None, verbose=0):
        """Compute out-of-fold train meta-features and test meta-features.

        Parameters
        ----------
        X, X_test : list of pandas.DataFrame
            One frame per base model, each containing an ``'Id'`` column,
            which is dropped before fitting / predicting.
        y : pandas.DataFrame
            Frame with a ``'target'`` column, row-aligned with every ``X[n]``.
        meta_kf
            Accepted for interface compatibility; currently unused.
        models_args : list, optional
            ``models_args[n]`` is unpacked as extra positional arguments to
            the n-th model's ``fit``. Defaults to no extra arguments.
        verbose : int
            ``> 0`` prints progress and the per-fold AUC of every model.

        Returns
        -------
        (train_meta, test_meta) : tuple of pandas.DataFrame
            Both have an ``'Id'`` column plus one ``model_{n}`` column per
            base model. ``train_meta`` holds out-of-fold predictions;
            ``test_meta`` holds predictions of models refit on all of ``X``.

        Notes
        -----
        Out-of-fold values are written positionally, so every ``X[n]`` must
        list its rows in the same Id order as ``X[0]``. The models are refit
        in place; afterwards they hold the fit on the full training data.
        """
        N = len(self.models)
        if models_args is None:
            models_args = [()] * N

        meta_columns = ['Id'] + [f'model_{i}' for i in range(N)]
        train_meta = pd.DataFrame(columns=meta_columns)
        train_meta['Id'] = X[0]['Id']
        test_meta = pd.DataFrame(columns=meta_columns)
        test_meta['Id'] = X_test[0]['Id']

        if verbose > 0:
            print('making train_meta...')

        fold_members = zip(
            self.kf_split['train_idx'].values,
            self.kf_split['test_idx'].values,
        )
        for fold, (train_idx, test_idx) in enumerate(fold_members):
            if verbose > 0:
                print(f'--- FOLD {fold+1} ---')

            # Rows of train_meta that receive this fold's predictions.
            meta_rows = train_meta['Id'].isin(test_idx)

            for n, model in enumerate(self.models):
                # Masks are computed once and shared by the features and labels.
                in_train = X[n]['Id'].isin(train_idx)
                in_oof = X[n]['Id'].isin(test_idx)

                _X_train = X[n][in_train].drop(columns=['Id'])
                _X_oof = X[n][in_oof].drop(columns=['Id'])
                _y_train = y[in_train]['target']
                _y_oof = y[in_oof]['target']

                meta_features = self._meta_features_one_model(
                    model, _X_train, _y_train, _X_oof, *models_args[n]
                )

                if verbose > 0:
                    print(f'\tmodel {n}, AUC: {roc_auc_score(_y_oof, meta_features):.5f}')

                train_meta.loc[meta_rows, f'model_{n}'] = meta_features

        # Announced right before the work it describes. The former
        # 'fitting meta...' message was dropped because no meta model is
        # fitted here.
        if verbose > 0:
            print('\nmaking test_meta...')

        for n, model in enumerate(self.models):
            # Refit on the full training data, then score the test frame.
            meta_features = self._meta_features_one_model(
                model,
                X[n].drop(columns=['Id']),
                y['target'],
                X_test[n].drop(columns=['Id']),
                *models_args[n]
            )
            # The column starts out as object dtype in the empty frame.
            train_meta[f'model_{n}'] = train_meta[f'model_{n}'].astype(float)
            test_meta[f'model_{n}'] = meta_features

        return train_meta, test_meta
            

In [27]:
# Splitter handed to fit_predict through its meta_kf argument.
meta_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# A single CatBoost base learner, stacked over the pre-computed Id folds.
base_models = [ctb.CatBoostClassifier(random_state=RANDOM_STATE, verbose=0)]
kfmc = KFoldMetaClassifier(
    meta_model=lgbm.LGBMClassifier(random_state=RANDOM_STATE, verbose=-1),
    models=base_models,
    kf_split=folds_df,
)

# fit_predict expects one feature frame per base model, hence the 1-element lists.
kfmc_result = kfmc.fit_predict(X=[X], y=y, X_test=[test], meta_kf=meta_kf, verbose=2)

making train_meta...
--- FOLD 1 ---


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

	model 0, AUC: 0.97998
--- FOLD 2 ---
	model 0, AUC: 0.97344
--- FOLD 3 ---
	model 0, AUC: 0.96522
--- FOLD 4 ---
	model 0, AUC: 0.97768
--- FOLD 5 ---
	model 0, AUC: 0.97322

making test_meta...


fitting meta...


In [28]:
# kfmc_result is the (train_meta, test_meta) pair returned by fit_predict.
train_meta_frame, test_meta_frame = kfmc_result
train_meta_frame.to_pickle('catboost_train_meta.pkl')
test_meta_frame.to_pickle('catboost_test_meta.pkl')

In [None]:
# Display the first element of the fit_predict result (the train meta-feature frame).
kfmc_result[0]