In [5]:
from copy import deepcopy

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, train_test_split

In [18]:
class BaggingModel:
    def __init__(self, classifiers):
        self.classifiers = classifiers

    def fit(self, X, y):
        for classifier in self.classifiers:
            classifier.fit(X, y)
        return self

    def predict_proba(self, X):
        return np.mean(
            [classifier.predict_proba(X) for classifier in self.classifiers], axis=0
        )


def create_cv_models(X, y, base_classifier, random_state=42) -> tuple:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    models = []
    pred = np.zeros_like(y, dtype=float)
    for train_idx, valid_idx in skf.split(X, y):
        model = deepcopy(base_classifier)
        model.fit(X[train_idx], y[train_idx])
        models.append(model)
        pred[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
    return BaggingModel(models), pred

In [19]:
train_X, train_y = make_classification(
    n_samples=10000, n_features=20, n_classes=2, n_informative=4, n_redundant=2, random_state=42
)
train_X, test_X, train_y, test_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=42
)

In [20]:
model = CalibratedClassifierCV(
    LGBMClassifier(verbosity=-1, importance_type="gain"), method="sigmoid"
)

In [21]:
model, pred = create_cv_models(train_X, train_y, model)

In [22]:
model.predict_proba(test_X)

array([[0.03148731, 0.96851269],
       [0.03089386, 0.96910614],
       [0.03617015, 0.96382985],
       ...,
       [0.03125122, 0.96874878],
       [0.03083787, 0.96916213],
       [0.96628298, 0.03371702]])

In [34]:
a = np.array(
    [
        [0.1, 0.9],
        [0.2, 0.8],
        [0.3, 0.7],
        [0.4, 0.6],
        [0.5, 0.5],
        [0.6, 0.4],
        [0.7, 0.3],
        [0.8, 0.2],
        [0.9, 0.1],
    ]
)

In [None]:
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
for train_idx, valid_idx in skf.split(a, np.array([1, 1, 0, 0, 1, 1, 0, 0, 1])):
    _skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    print(train_idx, valid_idx)
    for train_idx2, valid_idx2 in _skf.split(a[train_idx], np.array([1, 0, 1, 0, 1, 0])):
        print(train_idx2, valid_idx2)

ValueError: Found input variables with inconsistent numbers of samples: [4, 6]

In [None]:
a[[0, 2, 4]]

array([0.5, 0.5])

In [38]:
train_idx

array([1, 3, 7, 8])

In [43]:
a[np.array([0, 2, 4, 6])]

array([[0.1, 0.9],
       [0.3, 0.7],
       [0.5, 0.5],
       [0.7, 0.3]])