In [1]:
import numpy as np 

In [2]:
def mean_predictions(probas):
    """
    Average the predictions of several models for every sample.

    :param probas: 2-d array-like with one row per sample and one column
        per model
    :return: 1-d array holding the mean of each row
    """
    row_means = np.mean(probas, axis=1)
    return row_means

def max_voting(preds):
    """
    Pick, for every sample, the largest value found in its row.

    :param preds: 2-d array with one row per sample and one column per model
    :return: array of shape (n_samples, 1) containing each row's maximum
        (the first occurrence is selected when the maximum is tied)
    """
    # argmax yields one column index per row; take_along_axis needs it as a column.
    best_cols = np.expand_dims(np.argmax(preds, axis=1), axis=1)
    return np.take_along_axis(preds, best_cols, axis=1)

In [6]:
import scipy.stats as stats

In [8]:
def rank_mean(probas, verbose=False):
    """
    Average the per-model ranks of every sample.

    Each column of ``probas`` is converted to ranks with
    ``scipy.stats.rankdata`` (ties share their average rank) and the ranks
    are then averaged across columns.

    :param probas: 2-d array with one row per sample and one column per model
    :param verbose: when True, print each column's ranks; off by default so
        that calling the function does not write to stdout
    :return: 1-d array with the mean rank of each sample
    """
    ranked = []
    for i in range(probas.shape[1]):
        rank_data = stats.rankdata(probas[:, i])
        if verbose:
            print(rank_data)
        ranked.append(rank_data)

    # One column of ranks per model, then average across models for each sample.
    ranked = np.column_stack(ranked)
    return np.mean(ranked, axis=1)


# Demo: verbose=True shows the intermediate per-column ranks.
rank_mean(np.array([[1, 1], [1, 0], [0, 1]]), verbose=True)

[2.5 2.5 1. ]
[2.5 1.  2.5]


array([2.5 , 1.75, 1.75])

We train our random forest, logistic regression and XGBoost models on fold 1 and make predictions on fold 2.

After this, we train the models from scratch on fold 2 and make predictions on fold 1. 

Thus, we have created predictions for all of the training data. 

-----------
Now, to combine these models,
we take fold 1 and all the predictions for fold 1 and create an optimization function that tries to find the best weights so as to minimize error or maximize AUC against the targets for fold 1; the learned weights are then evaluated on the predictions and targets for fold 2.

So, we are kind of training an optimization model on fold 1 with the
predicted probabilities for the three models and evaluating it on fold 2. 

Let’s first look at a class we can use to find the best weights of multiple models to optimize for AUC (or any kind of prediction-metric combination in general).

In [9]:
import numpy as np
from functools import partial
from scipy.optimize import fmin
from sklearn import metrics

In [10]:
class OptimizeAUC:
    """
    Find linear blending weights for several models' predictions that
    maximise ROC AUC.

    ``fit`` searches for the weights with ``scipy.optimize.fmin`` and stores
    them in ``coef_``; ``predict`` returns the weighted sum of the columns.
    """

    def __init__(self):
        # Same attribute name that fit() writes and predict() reads;
        # None means "not fitted yet".
        self.coef_ = None

    def _auc(self, coef, X, y):
        """
        Loss handed to the optimiser: the negative AUC of the weighted sum.

        :param coef: 1-d array of weights, one per column of X
        :param X: 2-d array of predictions, one column per model
        :param y: true targets
        :return: AUC multiplied by -1, because fmin minimises
        """
        x_coef = X * coef
        predictions = np.sum(x_coef, axis=1)
        auc_score = metrics.roc_auc_score(y, predictions)
        return -1.0 * auc_score

    def fit(self, X, y):
        """
        Search for the weights that maximise AUC on (X, y).

        :param X: 2-d array of predictions, one column per model
        :param y: true targets
        :return: self, so calls can be chained
        """
        loss_partial = partial(self._auc, X=X, y=y)
        # Random starting point drawn from a Dirichlet distribution, as a
        # 1-d vector with one entry per model.
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]))
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)
        return self

    def predict(self, X):
        """
        Blend the columns of X with the fitted weights.

        :param X: 2-d array of predictions, one column per model
        :return: weighted sum of the columns for every row
        :raises AttributeError: if called before fit()
        """
        if self.coef_ is None:
            raise AttributeError(
                "OptimizeAUC is not fitted yet; call fit() before predict()."
            )
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions

In [16]:
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

In [19]:
# Synthetic classification dataset; the fixed seed makes every downstream
# number reproducible after a kernel restart.
X, y = make_classification(n_samples=10000, n_features=25, random_state=42)

In [20]:
# Split the data into two equally sized, class-stratified folds. The fixed
# seed keeps fold membership the same on every run.
xfold1, xfold2, yfold1, yfold2 = model_selection.train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

In [21]:
# The three base models to be blended. The tree ensembles are seeded so that
# their predictions (and the AUCs computed from them) are reproducible.
logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier(random_state=42)
xgbc = xgb.XGBClassifier(random_state=42)

In [22]:
# Fit every base model on fold 1 only; fold 2 is held out for prediction.
logreg.fit(xfold1, yfold1)
rf.fit(xfold1, yfold1)
xgbc.fit(xfold1, yfold1)

In [23]:
# Column 1 of predict_proba from each fitted model, evaluated on fold 2.
pred_logreg, pred_rf, pred_xgbc = (
    model.predict_proba(xfold2)[:, 1] for model in (logreg, rf, xgbc)
)

In [24]:
# Equal-weight blend: element-wise mean of the three models' predictions.
avg_pred = (pred_logreg + pred_rf + pred_xgbc) / 3

In [26]:
# One column per predictor, in the order LR, RF, XGB, simple average.
fold2_preds = np.column_stack([pred_logreg, pred_rf, pred_xgbc, avg_pred])

# AUC of every column against the fold-2 targets, in column order.
aucs_fold2 = [
    metrics.roc_auc_score(yfold2, fold2_preds[:, col])
    for col in range(fold2_preds.shape[1])
]

print(f"Fold-2: LR AUC = {aucs_fold2[0]}")
print(f"Fold-2: RF AUC = {aucs_fold2[1]}")
print(f"Fold-2: XGB AUC = {aucs_fold2[2]}")
print(f"Fold-2: Average Pred AUC = {aucs_fold2[3]}")


Fold-2: LR AUC = 0.9560274566795376
Fold-2: RF AUC = 0.9744605232231535
Fold-2: XGB AUC = 0.9744113231523053
Fold-2: Average Pred AUC = 0.9756911649952775


In [27]:
# Second half of the two-fold scheme: fresh models are trained on fold 2 and
# used to predict fold 1. The tree ensembles are seeded for reproducibility.
logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier(random_state=42)
xgbc = xgb.XGBClassifier(random_state=42)

logreg.fit(xfold2, yfold2)
rf.fit(xfold2, yfold2)
xgbc.fit(xfold2, yfold2)

# Column 1 of predict_proba from each model, evaluated on fold 1.
pred_logreg = logreg.predict_proba(xfold1)[:, 1]
pred_rf = rf.predict_proba(xfold1)[:, 1]
pred_xgbc = xgbc.predict_proba(xfold1)[:, 1]

# Equal-weight blend of the three models.
avg_pred = (pred_logreg + pred_rf + pred_xgbc) / 3

# One column per predictor, in the order LR, RF, XGB, simple average.
fold1_preds = np.column_stack((
    pred_logreg,
    pred_rf,
    pred_xgbc,
    avg_pred
))

In [28]:
# AUC of every column of fold1_preds against the fold-1 targets, in column order.
aucs_fold1 = [
    metrics.roc_auc_score(yfold1, fold1_preds[:, col])
    for col in range(fold1_preds.shape[1])
]

print(f"Fold-1: LR AUC = {aucs_fold1[0]}")
print(f"Fold-1: RF AUC = {aucs_fold1[1]}")
print(f"Fold-1: XGB AUC = {aucs_fold1[2]}")
print(f"Fold-1: Average prediction AUC = {aucs_fold1[3]}")

Fold-1: LR AUC = 0.9534254807692307
Fold-1: RF AUC = 0.9722848890493159
Fold-1: XGB AUC = 0.9714193668335791
Fold-1: Average prediction AUC = 0.9725149696383223


# find model weights 

In [29]:
# New optimiser instance; its blending weights are set when fit() is called.
opt = OptimizeAUC()

In [30]:
# Learn the blending weights on the fold-1 predictions and fold-1 targets.
# [:, :-1] drops the last column (the simple average) so that only the three
# base-model columns are weighted.
opt.fit(fold1_preds[:, :-1], yfold1)
# Apply those weights to the fold-2 predictions and score them against the
# fold-2 targets.
opt_preds_fold2 = opt.predict(fold2_preds[:, :-1])
auc = metrics.roc_auc_score(yfold2, opt_preds_fold2)
print(f"Optimized AUC, Fold 2 = {auc}")
print(f"Coefficients = {opt.coef_}")


Optimization terminated successfully.
         Current function value: -0.973388
         Iterations: 39
         Function evaluations: 83
Optimized AUC, Fold 2 = 0.9760217254712846
Coefficients = [0.0652288  0.14094025 0.50889822]


In [31]:
# Mirror of the previous step with the folds swapped: a new optimiser learns
# its weights on the fold-2 predictions and fold-2 targets.
# [:, :-1] drops the last column (the simple average).
opt = OptimizeAUC()
opt.fit(fold2_preds[:, :-1], yfold2)
# Apply those weights to the fold-1 predictions and score them against the
# fold-1 targets.
opt_preds_fold1 = opt.predict(fold1_preds[:, :-1])
auc = metrics.roc_auc_score(yfold1, opt_preds_fold1)
print(f"Optimized AUC, Fold 1 = {auc}")
print(f"Coefficients = {opt.coef_}")

Optimization terminated successfully.
         Current function value: -0.976218
         Iterations: 65
         Function evaluations: 137
Optimized AUC, Fold 1 = 0.9731967313836324
Coefficients = [0.15688259 0.20408024 0.47410531]
