### - Ensemble/Blend the 4 model predictions into a single prediction

In [1]:
import os
import datetime
from time import time
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict
from collections import Counter

In [2]:
from sklearn.metrics import precision_recall_curve,average_precision_score
from sklearn.metrics import log_loss, roc_curve
from sklearn.metrics import auc,roc_auc_score

In [3]:
from numba import njit
from scipy.optimize import minimize, fsolve

In [4]:
file_indicator = "_subsample"
data_dir = pathlib.Path("../2.data_split/model_data")

In [5]:
cp_test = pathlib.Path(f"{data_dir}/cp/test_lvl4_data{file_indicator}.csv.gz")
L1000_test = pathlib.Path(f"{data_dir}/L1/test_lvl4_data.csv.gz")
cp_L1000_test = pathlib.Path(f"{data_dir}/merged/test_lvl4_data.csv.gz")

In [6]:
model_preds_dir = '../L1000_CP_model_predictions/'

In [7]:
df_cp_test = pd.read_csv(cp_test, compression='gzip',low_memory = False)
df_L1000_test = pd.read_csv(L1000_test, compression='gzip',low_memory = False)
df_cp_L1000_test = pd.read_csv(cp_L1000_test, compression='gzip',low_memory = False)

In [8]:
df_cp_L1000_test.shape

(7143, 2313)

In [9]:
##resnet
df_cp_resnet_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_resnet{file_indicator}.csv'))
df_L1000_resnet_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_resnet.csv'))
df_cp_L1000_resnet_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_resnet.csv'))

In [10]:
print(df_cp_L1000_resnet_test.shape)
df_cp_L1000_resnet_test.head()

(7143, 518)


Unnamed: 0,11-beta hydroxysteroid dehydrogenase inhibitor,11-beta-hsd1 inhibitor,"17,20 lyase inhibitor",3-ketoacyl coa thiolase inhibitor,3beta-hydroxy-delta5-steroid dehydrogenase inhibitor,5 alpha reductase inhibitor,abl kinase inhibitor,acat inhibitor,acetylcholine precursor,acetylcholine receptor agonist,...,vegfr inhibitor,vesicular monoamine transporter inhibitor,vitamin b,vitamin d receptor agonist,vitamin k antagonist,voltage-gated sodium channel blocker,voltage-gated sodium channel modulator,wee1 kinase inhibitor,xanthine oxidase inhibitor,xiap inhibitor
0,0.001475,0.00229,0.001914,0.00259,0.002135,0.002601,0.001829,0.00414,0.001667,0.006587,...,0.01115,0.002605,0.002086,0.003864,0.00214,0.001742,0.002277,0.001838,0.003219,0.002229
1,0.001552,0.001856,0.001776,0.001765,0.001695,0.003158,0.001906,0.003203,0.001767,0.005509,...,0.011468,0.001978,0.003318,0.003773,0.002463,0.00167,0.002108,0.001603,0.002322,0.002275
2,0.001683,0.002693,0.002681,0.002943,0.001907,0.002361,0.002621,0.004289,0.001961,0.007017,...,0.020123,0.002729,0.002327,0.004476,0.002184,0.001786,0.002573,0.002114,0.004231,0.002544
3,0.001325,0.001981,0.001679,0.002261,0.001715,0.002629,0.002225,0.002742,0.00202,0.007814,...,0.008979,0.002025,0.002142,0.003789,0.002166,0.001447,0.00196,0.001657,0.002691,0.001653
4,0.001568,0.002059,0.002075,0.002369,0.001714,0.001928,0.001773,0.003076,0.001587,0.007437,...,0.009487,0.002799,0.001749,0.003437,0.00236,0.00157,0.002094,0.001381,0.00344,0.001592


In [11]:
##1-d cnn
df_cp_cnn_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_1dcnn{file_indicator}.csv'))
df_L1000_cnn_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_1dcnn.csv'))
df_cp_L1000_cnn_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_1dcnn.csv'))

In [12]:
print(df_cp_L1000_cnn_test.shape)
df_cp_L1000_cnn_test.head()

(7143, 518)


Unnamed: 0,11-beta hydroxysteroid dehydrogenase inhibitor,11-beta-hsd1 inhibitor,"17,20 lyase inhibitor",3-ketoacyl coa thiolase inhibitor,3beta-hydroxy-delta5-steroid dehydrogenase inhibitor,5 alpha reductase inhibitor,abl kinase inhibitor,acat inhibitor,acetylcholine precursor,acetylcholine receptor agonist,...,vegfr inhibitor,vesicular monoamine transporter inhibitor,vitamin b,vitamin d receptor agonist,vitamin k antagonist,voltage-gated sodium channel blocker,voltage-gated sodium channel modulator,wee1 kinase inhibitor,xanthine oxidase inhibitor,xiap inhibitor
0,0.001584,0.001868,0.001755,0.001875,0.001901,0.002732,0.002316,0.002796,0.001966,0.006341,...,0.011777,0.001868,0.003575,0.004213,0.001955,0.001828,0.001848,0.001903,0.00326,0.002104
1,0.001677,0.001857,0.001688,0.001836,0.001788,0.002498,0.002382,0.00295,0.001887,0.005771,...,0.012437,0.001904,0.004365,0.00334,0.001876,0.001924,0.001986,0.00203,0.002957,0.00236
2,0.001677,0.001837,0.001789,0.001869,0.001858,0.002412,0.002374,0.00324,0.00196,0.00566,...,0.01163,0.001889,0.002951,0.004185,0.00195,0.001961,0.001842,0.001862,0.002986,0.002527
3,0.001567,0.001891,0.001634,0.001874,0.001777,0.002442,0.002361,0.003194,0.001868,0.005244,...,0.01166,0.00193,0.002939,0.003618,0.001778,0.002042,0.001873,0.001875,0.002891,0.002514
4,0.001921,0.001807,0.00219,0.001821,0.00202,0.002473,0.002275,0.003251,0.002087,0.00631,...,0.011577,0.001875,0.002902,0.004009,0.002046,0.001928,0.001956,0.001814,0.003092,0.00263


In [13]:
##tabnet
df_cp_tabnet_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_tabnet{file_indicator}.csv'))
df_L1000_tabnet_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_tabnet.csv'))
df_cp_L1000_tabnet_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_tabnet.csv'))

In [14]:
df_cp_L1000_tabnet_test.shape

(7143, 518)

In [15]:
##stagedNN
df_cp_simplenn_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_simplenn{file_indicator}.csv'))
df_L1000_simplenn_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_simplenn.csv'))
df_cp_L1000_simplenn_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_simplenn.csv'))

In [16]:
df_cp_L1000_simplenn_test.shape

(7143, 518)

In [17]:
df_cp_tst_targets = df_cp_test[df_cp_cnn_test.columns]
df_L1000_tst_targets = df_L1000_test[df_L1000_cnn_test.columns]
df_cp_L1000_tst_targets = df_cp_L1000_test[df_cp_L1000_cnn_test.columns]

In [18]:
df_cp_tst_targets.shape

(7368, 518)

In [19]:
df_L1000_tst_targets.shape

(7219, 518)

In [20]:
df_cp_L1000_tst_targets.shape

(7143, 518)

#### - Resnet, 1d-cnn, Tabnet, Simplenn --> 4 model predictions

In [21]:
# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_true, y_pred):
    y_true_ravel = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    loss = np.where(y_true_ravel == 1, - np.log(y_pred), - np.log(1 - y_pred))
    return loss.mean()

def func_numpy_metric(weights, oof, y_true):
    oof_blend = np.tensordot(weights, oof, axes = ((0), (0)))
    return log_loss_numpy(y_true, oof_blend)

def grad_func(weights, oof, y_true):
    oof_clip = np.clip(oof, 1e-15, 1 - 1e-15)
    gradients = np.zeros(oof.shape[0])
    for i in range(oof.shape[0]):
        a, b, c = y_true, oof_clip[i], np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(oof.shape[0]):
            if j != i:
                c += weights[j] * oof_clip[j]
        gradients[i] = -np.mean((-a*b+(b**2)*weights[i]+b*c)/((b**2)*(weights[i]**2)+2*b*c*weights[i]-b*weights[i]+(c**2)-c))
    return gradients

@njit
def grad_func_jit(weights, oof, y_true):
    oof_clip = np.minimum(1 - 1e-15, np.maximum(oof, 1e-15))
    gradients = np.zeros(oof.shape[0])
    for i in range(oof.shape[0]):
        a, b, c = y_true, oof_clip[i], np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(oof.shape[0]):
            if j != i:
                c += weights[j] * oof_clip[j]
    gradients[i] = -np.mean((-a*b+(b**2)*weights[i]+b*c)/((b**2)*(weights[i]**2)+2*b*c*weights[i]-b*weights[i]+(c**2)-c))
    return gradients

In [22]:
cp_model_preds = [df_cp_cnn_test, df_cp_resnet_test, df_cp_tabnet_test, df_cp_simplenn_test]
L1000_model_preds = [df_L1000_cnn_test, df_L1000_resnet_test, df_L1000_tabnet_test, df_L1000_simplenn_test]
cp_L1000_model_preds = [df_cp_L1000_cnn_test, df_cp_L1000_resnet_test, df_cp_L1000_tabnet_test, df_cp_L1000_simplenn_test]

In [23]:
models_name = ['1d-Cnn', 'Resnet', 'Tabnet', 'SimpleNN']

def get_optmized_blended_weights(model_oofs, df_targets, num_of_models = 4, models_name = models_name):
    """
    This function assign weights to each of the models used in predicting MOAs based on the log-loss obtained 
    when comparing each model prediction results with the actual MOA (Mechanism of actions) test labels.

    for more info:https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook
    """
    model_oof_preds = np.zeros((num_of_models, df_targets.shape[0], df_targets.shape[1]))
    for idx in range(num_of_models):
        model_oof_preds[idx] = model_oofs[idx].values
        score_oof = log_loss_numpy(df_targets, model_oof_preds[idx])
        print(f'{idx} {models_name[idx]}, Test loss:\t', score_oof)
    
    tol = 1e-10
    init_guess = [1 / model_oof_preds.shape[0]] * model_oof_preds.shape[0]
    bnds = [(0, 1) for _ in range(model_oof_preds.shape[0])]
    cons = {
        'type': 'eq',
        'fun': lambda x: np.sum(x) - 1,
        'jac': lambda x: [1] * len(x)
    }
    print('Inital Blend OOF:', func_numpy_metric(init_guess, model_oof_preds, df_targets.values))
    
    start_time = time()
    
    res_scipy = minimize(fun = func_numpy_metric, x0 = init_guess, 
                       args=(model_oof_preds, df_targets.values), 
                       method = 'SLSQP', ##L-BFGS-B ##SLSQP
                       jac = grad_func_jit, # grad_func 
                       bounds = bnds, constraints = cons, tol = tol)
    print(f'[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] Optimised Blend OOF:', res_scipy.fun)
    print('Optimised Weights:', res_scipy.x)
    return model_oof_preds, res_scipy.x

In [24]:
_, L1000_model_weights = get_optmized_blended_weights(L1000_model_preds, df_L1000_tst_targets,)

0 1d-Cnn, Test loss:	 0.014648914719429189
1 Resnet, Test loss:	 0.014334232680115551
2 Tabnet, Test loss:	 0.014635645350260953
3 SimpleNN, Test loss:	 0.013842045801886601
Inital Blend OOF: 0.014183197900880778
[00:03] Optimised Blend OOF: 0.01418319790088078
Optimised Weights: [0.25 0.25 0.25 0.25]


In [25]:
_, cp_model_weights = get_optmized_blended_weights(cp_model_preds, df_cp_tst_targets,)

0 1d-Cnn, Test loss:	 0.014624553413440775
1 Resnet, Test loss:	 0.013885899881088497
2 Tabnet, Test loss:	 0.01425797501525762
3 SimpleNN, Test loss:	 0.014148668688750988
Inital Blend OOF: 0.014005251109212347
[00:02] Optimised Blend OOF: 0.014005251112647181
Optimised Weights: [0.25000007 0.25000007 0.25000007 0.2499998 ]


In [26]:
_, cp_L1000_model_weights = get_optmized_blended_weights(cp_L1000_model_preds, df_cp_L1000_tst_targets)

0 1d-Cnn, Test loss:	 0.014298406016558518
1 Resnet, Test loss:	 0.014070003642352868
2 Tabnet, Test loss:	 0.014405625152026725
3 SimpleNN, Test loss:	 0.01750511199945077
Inital Blend OOF: 0.014366329392756628
[00:05] Optimised Blend OOF: 0.014096815549659757
Optimised Weights: [0.32575892 0.32575892 0.32575892 0.02272325]


In [27]:
def model_eval_results(df_tst, df_tst_y, df_preds):
    """
    This function prints out the model evaluation results from the train and test predictions.
    The evaluation metrics used in assessing the performance of the models are: ROC AUC score,
    log loss and Precision-Recall AUC score
    """
    eval_metrics = ['log loss', 'ROC AUC score', 'PR-AUC/Average_precision_score',]
    moa_class_list = df_tst['moa'].unique()
    val_moas = [moa for moa_list in moa_class_list for moa in moa_list.split('|')]
    print('-' * 10, 'Test data prediction results', '-' * 10)
    print(f'{eval_metrics[0]}:', log_loss(np.ravel(df_tst_y), np.ravel(df_preds)))
    print(f'{eval_metrics[1]}:', roc_auc_score(df_tst_y[val_moas],df_preds[val_moas], average='macro'))
    print(f'{eval_metrics[2]}:', average_precision_score(df_tst_y[val_moas], df_preds[val_moas], average="micro"))

In [28]:
##[1.57502187e-01,1.15142271e-16,0.00000000e+00,8.42497813e-01] <-- modify the model weights
df_L1000_blend = pd.DataFrame(np.zeros(df_L1000_cnn_test.shape), columns = df_L1000_cnn_test.columns)
df_L1000_blend = df_L1000_cnn_test*0.45 + df_L1000_resnet_test*0.05 + df_L1000_tabnet_test*0.05 + df_L1000_simplenn_test*0.45

In [29]:
0.45+(0.05*2)+0.45

1.0

In [30]:
model_eval_results(df_L1000_test, df_L1000_tst_targets, df_L1000_blend)

---------- Test data prediction results ----------
log loss: 0.014078111648250107
ROC AUC score: 0.6400930804272744
PR-AUC/Average_precision_score: 0.18777289927579524


In [31]:
##[4.29598527e-01 3.27312317e-01 2.43089156e-01 5.42101086e-18] <-- modify the model weights
df_cp_blend = pd.DataFrame(np.zeros(df_cp_cnn_test.shape), columns = df_cp_cnn_test.columns)
df_cp_blend = df_cp_cnn_test*0.35 + df_cp_resnet_test*0.35 + df_cp_tabnet_test*0.25 + df_cp_simplenn_test*0.05

In [32]:
0.35+0.35+0.25+0.05

1.0

In [33]:
model_eval_results(df_cp_test, df_cp_tst_targets, df_cp_blend)

---------- Test data prediction results ----------
log loss: 0.014015353367323473
ROC AUC score: 0.6504991202441964
PR-AUC/Average_precision_score: 0.2216802890802162


In [34]:
##[0.28574384 0.09796798 0.06528908 0.5509991 ] <-- modify the model weights
df_cp_L1000_blend = pd.DataFrame(np.zeros(df_cp_L1000_cnn_test.shape), columns = df_cp_L1000_cnn_test.columns)
df_cp_L1000_blend = df_cp_L1000_cnn_test*0.30 + df_cp_L1000_resnet_test*0.20 + df_cp_L1000_tabnet_test*0.15 + df_cp_L1000_simplenn_test*0.35

In [35]:
0.30+0.20+0.15+0.35

1.0

In [36]:
model_eval_results(df_cp_L1000_test, df_cp_L1000_tst_targets, df_cp_L1000_blend)

---------- Test data prediction results ----------
log loss: 0.014573075659141312
ROC AUC score: 0.6575588295326908
PR-AUC/Average_precision_score: 0.17427999450186066


In [37]:
def save_to_csv(df, path, file_name, compress=None):
    """save dataframes to csv"""
    
    if not os.path.exists(path):
        os.mkdir(path)
    
    df.to_csv(os.path.join(path, file_name), index=False, compression=compress)

In [38]:
save_to_csv(df_cp_blend, model_preds_dir, f'cp_test_preds_blend{file_indicator}.csv')
save_to_csv(df_L1000_blend, model_preds_dir, 'L1000_test_preds_blend.csv')
save_to_csv(df_cp_L1000_blend, model_preds_dir, 'cp_L1000_test_preds_blend.csv')