In [5]:
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from copy import deepcopy
import numpy as np

In [18]:
class BaggingModel:
    """Ensemble that averages the ``predict_proba`` output of its members.

    Parameters
    ----------
    classifiers : iterable
        Objects exposing ``fit(X, y)`` and ``predict_proba(X)``. The iterable
        is copied into a list, so one-shot iterables (e.g. generators) are
        safe to pass.
    """

    def __init__(self, classifiers):
        # Materialise once: a generator would otherwise be consumed by the
        # first loop over it and leave nothing to average afterwards.
        self.classifiers = list(classifiers)

    def fit(self, X, y):
        """Fit every member on the same ``(X, y)`` and return ``self``."""
        for classifier in self.classifiers:
            classifier.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the element-wise mean of the members' ``predict_proba(X)``.

        Raises
        ------
        ValueError
            If the ensemble has no members; averaging an empty list would
            otherwise yield a scalar NaN plus a RuntimeWarning.
        """
        if not self.classifiers:
            raise ValueError("BaggingModel has no classifiers to average.")
        return np.mean(
            [classifier.predict_proba(X) for classifier in self.classifiers], axis=0
        )


def create_cv_models(X, y, base_classifier, random_state=42, n_splits=5) -> tuple:
    """Train one copy of ``base_classifier`` per stratified fold.

    Parameters
    ----------
    X, y : array-like
        Features and labels. Both are indexed with integer index arrays
        (``X[train_idx]``), so they must support NumPy-style fancy indexing.
    base_classifier : estimator
        Template exposing ``fit`` and ``predict_proba``; it is deep-copied for
        every fold and never fitted itself.
    random_state : int, default 42
        Seed for the shuffling done by ``StratifiedKFold``. It is not
        forwarded to ``base_classifier``.
    n_splits : int, default 5
        Number of stratified folds, and therefore of ensemble members.

    Returns
    -------
    tuple
        ``(BaggingModel, pred)`` where the ``BaggingModel`` wraps the per-fold
        models and ``pred`` holds, for each sample, column 1 of
        ``predict_proba`` from the model that did not train on that sample.
    """
    # Honour the caller's seed; it used to be hard-coded to 42 here, which
    # made the ``random_state`` argument a no-op.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    models = []
    pred = np.zeros_like(y, dtype=float)
    for train_idx, valid_idx in skf.split(X, y):
        # Fresh copy per fold so the folds never share fitted state.
        model = deepcopy(base_classifier)
        model.fit(X[train_idx], y[train_idx])
        models.append(model)
        # Out-of-fold prediction: each sample is scored exactly once.
        pred[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
    return BaggingModel(models), pred

In [19]:
# Synthetic binary-classification data: 10,000 samples with 20 features,
# of which 4 are informative and 2 are redundant.
train_X, train_y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=4,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)

# Hold out 20% as a test set; `train_X` / `train_y` are rebound to the
# remaining training portion.
train_X, test_X, train_y, test_y = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [20]:
model = CalibratedClassifierCV(
    LGBMClassifier(verbosity=-1, importance_type="gain"), method="sigmoid"
)

In [21]:
model, pred = create_cv_models(train_X, train_y, model)

In [22]:
# `model` is the BaggingModel returned by create_cv_models, so this displays
# the class-probability matrix for the held-out test set, averaged over the
# per-fold models.
model.predict_proba(test_X)

array([[0.03148731, 0.96851269],
       [0.03089386, 0.96910614],
       [0.03617015, 0.96382985],
       ...,
       [0.03125122, 0.96874878],
       [0.03083787, 0.96916213],
       [0.96628298, 0.03371702]])