### - Ensemble/Blend the 4 model predictions into a single prediction

In [1]:
import os
import datetime
from time import time
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict
from collections import Counter

In [2]:
from sklearn.metrics import precision_recall_curve,average_precision_score
from sklearn.metrics import log_loss, roc_curve
from sklearn.metrics import auc,roc_auc_score

In [3]:
from numba import njit
from scipy.optimize import minimize, fsolve

In [4]:
# The two options here are "" and "_subsample"
file_indicator = ""
data_dir = pathlib.Path("../2.data_split/model_data")

In [5]:
cp_test = pathlib.Path(f"{data_dir}/cp/test_lvl4_data{file_indicator}.csv.gz")
L1000_test = pathlib.Path(f"{data_dir}/L1/test_lvl4_data.csv.gz")
cp_L1000_test = pathlib.Path(f"{data_dir}/merged/test_lvl4_data.csv.gz")

In [6]:
model_preds_dir = '../L1000_CP_model_predictions/'

In [7]:
df_cp_test = pd.read_csv(cp_test, compression='gzip',low_memory = False)
df_L1000_test = pd.read_csv(L1000_test, compression='gzip',low_memory = False)
df_cp_L1000_test = pd.read_csv(cp_L1000_test, compression='gzip',low_memory = False)

In [8]:
df_cp_L1000_test.shape

(6377, 2296)

In [9]:
##resnet
df_cp_resnet_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_resnet{file_indicator}.csv'))
df_L1000_resnet_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_resnet.csv'))
df_cp_L1000_resnet_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_resnet.csv'))

In [10]:
print(df_cp_L1000_resnet_test.shape)
df_cp_L1000_resnet_test.head()

(6377, 501)


Unnamed: 0,11-beta hydroxysteroid dehydrogenase inhibitor,11-beta-hsd1 inhibitor,"17,20 lyase inhibitor",3-ketoacyl coa thiolase inhibitor,3beta-hydroxy-delta5-steroid dehydrogenase inhibitor,5 alpha reductase inhibitor,abl kinase inhibitor,acat inhibitor,acetylcholine precursor,acetylcholine receptor agonist,...,vasopressin receptor antagonist,vegfr inhibitor,vesicular monoamine transporter inhibitor,vitamin b,vitamin d receptor agonist,vitamin k antagonist,voltage-gated sodium channel blocker,voltage-gated sodium channel modulator,xanthine oxidase inhibitor,xiap inhibitor
0,0.001938,0.002205,0.00307,0.002605,0.002573,0.003133,0.002539,0.002732,0.00333,0.011808,...,0.003956,0.018773,0.003153,0.002224,0.005494,0.002787,0.002587,0.002495,0.003048,0.003461
1,0.001666,0.002112,0.00281,0.00273,0.002381,0.00281,0.002435,0.002561,0.002982,0.01197,...,0.003564,0.014047,0.002938,0.002315,0.005922,0.002802,0.002348,0.00212,0.002625,0.003325
2,0.002164,0.002208,0.002048,0.00279,0.002645,0.002897,0.003285,0.002913,0.002168,0.005874,...,0.007135,0.009742,0.002348,0.005767,0.005344,0.002812,0.002479,0.002817,0.004267,0.002891
3,0.001527,0.002445,0.002152,0.002687,0.0019,0.002352,0.002197,0.002496,0.002658,0.010135,...,0.003992,0.009475,0.002836,0.002488,0.004927,0.002799,0.00247,0.002197,0.002683,0.002621
4,0.001949,0.002003,0.003028,0.002456,0.002507,0.003437,0.002693,0.002691,0.003029,0.011117,...,0.003296,0.021935,0.002958,0.002002,0.005371,0.002576,0.002474,0.002007,0.002835,0.003141


In [11]:
##1-d cnn
df_cp_cnn_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_1dcnn{file_indicator}.csv'))
df_L1000_cnn_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_1dcnn.csv'))
df_cp_L1000_cnn_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_1dcnn.csv'))

In [12]:
print(df_cp_L1000_cnn_test.shape)
df_cp_L1000_cnn_test.head()

(6377, 501)


Unnamed: 0,11-beta hydroxysteroid dehydrogenase inhibitor,11-beta-hsd1 inhibitor,"17,20 lyase inhibitor",3-ketoacyl coa thiolase inhibitor,3beta-hydroxy-delta5-steroid dehydrogenase inhibitor,5 alpha reductase inhibitor,abl kinase inhibitor,acat inhibitor,acetylcholine precursor,acetylcholine receptor agonist,...,vasopressin receptor antagonist,vegfr inhibitor,vesicular monoamine transporter inhibitor,vitamin b,vitamin d receptor agonist,vitamin k antagonist,voltage-gated sodium channel blocker,voltage-gated sodium channel modulator,xanthine oxidase inhibitor,xiap inhibitor
0,0.00198,0.001917,0.001803,0.002748,0.001586,0.002306,0.001961,0.001603,0.002092,0.008626,...,0.004104,0.013053,0.002059,0.003485,0.004394,0.00233,0.00169,0.002255,0.002566,0.001881
1,0.00205,0.001922,0.001779,0.002356,0.001621,0.002425,0.002128,0.001633,0.001961,0.007785,...,0.00442,0.013642,0.002096,0.003694,0.004778,0.002242,0.001784,0.002126,0.002534,0.002302
2,0.00263,0.001854,0.001683,0.00169,0.001845,0.002902,0.002761,0.00179,0.001603,0.005536,...,0.005087,0.014557,0.001859,0.004415,0.00455,0.001797,0.001959,0.001774,0.002339,0.00406
3,0.002078,0.001973,0.001766,0.002216,0.001669,0.002413,0.002231,0.001649,0.001906,0.007271,...,0.004373,0.013555,0.002064,0.003719,0.0046,0.00216,0.001845,0.002129,0.002564,0.002482
4,0.00248,0.001804,0.001815,0.001859,0.001848,0.002976,0.002719,0.001786,0.001884,0.008072,...,0.004241,0.015414,0.001947,0.003798,0.005123,0.001901,0.001831,0.001794,0.002359,0.002465


In [13]:
##tabnet
df_cp_tabnet_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_tabnet{file_indicator}.csv'))
df_L1000_tabnet_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_tabnet.csv'))
df_cp_L1000_tabnet_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_tabnet.csv'))

In [14]:
df_cp_L1000_tabnet_test.shape

(6377, 501)

In [15]:
##stagedNN
df_cp_simplenn_test = pd.read_csv(os.path.join(model_preds_dir, f'cp_test_preds_simplenn{file_indicator}.csv'))
df_L1000_simplenn_test = pd.read_csv(os.path.join(model_preds_dir, 'L1000_test_preds_simplenn.csv'))
df_cp_L1000_simplenn_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_preds_simplenn.csv'))

In [16]:
df_cp_L1000_simplenn_test.shape

(6377, 501)

In [17]:
df_cp_tst_targets = df_cp_test[df_cp_cnn_test.columns]
df_L1000_tst_targets = df_L1000_test[df_L1000_cnn_test.columns]
df_cp_L1000_tst_targets = df_cp_L1000_test[df_cp_L1000_cnn_test.columns]

In [18]:
df_cp_tst_targets.shape

(10788, 501)

In [19]:
df_L1000_tst_targets.shape

(6461, 501)

In [20]:
df_cp_L1000_tst_targets.shape

(6377, 501)

#### - Resnet, 1d-cnn, Tabnet, Simplenn --> 4 model predictions

In [21]:
# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_true, y_pred):
    y_true_ravel = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    loss = np.where(y_true_ravel == 1, - np.log(y_pred), - np.log(1 - y_pred))
    return loss.mean()

def func_numpy_metric(weights, oof, y_true):
    oof_blend = np.tensordot(weights, oof, axes = ((0), (0)))
    return log_loss_numpy(y_true, oof_blend)

def grad_func(weights, oof, y_true):
    oof_clip = np.clip(oof, 1e-15, 1 - 1e-15)
    gradients = np.zeros(oof.shape[0])
    for i in range(oof.shape[0]):
        a, b, c = y_true, oof_clip[i], np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(oof.shape[0]):
            if j != i:
                c += weights[j] * oof_clip[j]
        gradients[i] = -np.mean((-a*b+(b**2)*weights[i]+b*c)/((b**2)*(weights[i]**2)+2*b*c*weights[i]-b*weights[i]+(c**2)-c))
    return gradients

@njit
def grad_func_jit(weights, oof, y_true):
    oof_clip = np.minimum(1 - 1e-15, np.maximum(oof, 1e-15))
    gradients = np.zeros(oof.shape[0])
    for i in range(oof.shape[0]):
        a, b, c = y_true, oof_clip[i], np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(oof.shape[0]):
            if j != i:
                c += weights[j] * oof_clip[j]
    gradients[i] = -np.mean((-a*b+(b**2)*weights[i]+b*c)/((b**2)*(weights[i]**2)+2*b*c*weights[i]-b*weights[i]+(c**2)-c))
    return gradients

In [22]:
cp_model_preds = [df_cp_cnn_test, df_cp_resnet_test, df_cp_tabnet_test, df_cp_simplenn_test]
L1000_model_preds = [df_L1000_cnn_test, df_L1000_resnet_test, df_L1000_tabnet_test, df_L1000_simplenn_test]
cp_L1000_model_preds = [df_cp_L1000_cnn_test, df_cp_L1000_resnet_test, df_cp_L1000_tabnet_test, df_cp_L1000_simplenn_test]

In [23]:
models_name = ['1d-Cnn', 'Resnet', 'Tabnet', 'SimpleNN']

def get_optmized_blended_weights(model_oofs, df_targets, num_of_models = 4, models_name = models_name):
    """
    This function assign weights to each of the models used in predicting MOAs based on the log-loss obtained 
    when comparing each model prediction results with the actual MOA (Mechanism of actions) test labels.

    for more info:https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook
    """
    model_oof_preds = np.zeros((num_of_models, df_targets.shape[0], df_targets.shape[1]))
    for idx in range(num_of_models):
        model_oof_preds[idx] = model_oofs[idx].values
        score_oof = log_loss_numpy(df_targets, model_oof_preds[idx])
        print(f'{idx} {models_name[idx]}, Test loss:\t', score_oof)
    
    tol = 1e-10
    init_guess = [1 / model_oof_preds.shape[0]] * model_oof_preds.shape[0]
    bnds = [(0, 1) for _ in range(model_oof_preds.shape[0])]
    cons = {
        'type': 'eq',
        'fun': lambda x: np.sum(x) - 1,
        'jac': lambda x: [1] * len(x)
    }
    print('Inital Blend OOF:', func_numpy_metric(init_guess, model_oof_preds, df_targets.values))
    
    start_time = time()
    
    res_scipy = minimize(fun = func_numpy_metric, x0 = init_guess, 
                       args=(model_oof_preds, df_targets.values), 
                       method = 'SLSQP', ##L-BFGS-B ##SLSQP
                       jac = grad_func_jit, # grad_func 
                       bounds = bnds, constraints = cons, tol = tol)
    print(f'[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] Optimised Blend OOF:', res_scipy.fun)
    print('Optimised Weights:', res_scipy.x)
    return model_oof_preds, res_scipy.x

In [24]:
_, L1000_model_weights = get_optmized_blended_weights(L1000_model_preds, df_L1000_tst_targets,)

0 1d-Cnn, Test loss:	 0.015094656034591088
1 Resnet, Test loss:	 0.014992913133274473
2 Tabnet, Test loss:	 0.01534340711710992
3 SimpleNN, Test loss:	 0.014535441130296516
Inital Blend OOF: 0.014739868911596956
[00:03] Optimised Blend OOF: 0.014739868911596959
Optimised Weights: [0.25 0.25 0.25 0.25]


In [25]:
_, cp_model_weights = get_optmized_blended_weights(cp_model_preds, df_cp_tst_targets,)

0 1d-Cnn, Test loss:	 0.015287790513669524
1 Resnet, Test loss:	 0.0181778456138349
2 Tabnet, Test loss:	 0.016616414191451667
3 SimpleNN, Test loss:	 0.015324007263338041
Inital Blend OOF: 0.015348981922418412
[00:03] Optimised Blend OOF: 0.01534898192241842
Optimised Weights: [0.25 0.25 0.25 0.25]


In [26]:
_, cp_L1000_model_weights = get_optmized_blended_weights(cp_L1000_model_preds, df_cp_L1000_tst_targets)

0 1d-Cnn, Test loss:	 0.015184273443034357
1 Resnet, Test loss:	 0.015370971334018558
2 Tabnet, Test loss:	 0.015159153936166096
3 SimpleNN, Test loss:	 0.015157582526896347
Inital Blend OOF: 0.014841762416996009
[00:02] Optimised Blend OOF: 0.014841762416996012
Optimised Weights: [0.25 0.25 0.25 0.25]


In [27]:
def model_eval_results(df_tst, df_tst_y, df_preds):
    """
    This function prints out the model evaluation results from the train and test predictions.
    The evaluation metrics used in assessing the performance of the models are: ROC AUC score,
    log loss and Precision-Recall AUC score
    """
    eval_metrics = ['log loss', 'ROC AUC score', 'PR-AUC/Average_precision_score',]
    moa_class_list = df_tst['moa'].unique()
    val_moas = [moa for moa_list in moa_class_list for moa in moa_list.split('|')]
    print('-' * 10, 'Test data prediction results', '-' * 10)
    print(f'{eval_metrics[0]}:', log_loss(np.ravel(df_tst_y), np.ravel(df_preds)))
    print(f'{eval_metrics[1]}:', roc_auc_score(df_tst_y[val_moas],df_preds[val_moas], average='macro'))
    print(f'{eval_metrics[2]}:', average_precision_score(df_tst_y[val_moas], df_preds[val_moas], average="micro"))

In [28]:
##[1.57502187e-01,1.15142271e-16,0.00000000e+00,8.42497813e-01] <-- modify the model weights
df_L1000_blend = pd.DataFrame(np.zeros(df_L1000_cnn_test.shape), columns = df_L1000_cnn_test.columns)
df_L1000_blend = df_L1000_cnn_test*0.45 + df_L1000_resnet_test*0.05 + df_L1000_tabnet_test*0.05 + df_L1000_simplenn_test*0.45

In [29]:
0.45+(0.05*2)+0.45

1.0

In [30]:
model_eval_results(df_L1000_test, df_L1000_tst_targets, df_L1000_blend)

---------- Test data prediction results ----------
log loss: 0.014616374548229026
ROC AUC score: 0.6311056052114028
PR-AUC/Average_precision_score: 0.1214893193877498


In [31]:
##[4.29598527e-01 3.27312317e-01 2.43089156e-01 5.42101086e-18] <-- modify the model weights
df_cp_blend = pd.DataFrame(np.zeros(df_cp_cnn_test.shape), columns = df_cp_cnn_test.columns)
df_cp_blend = df_cp_cnn_test*0.35 + df_cp_resnet_test*0.35 + df_cp_tabnet_test*0.25 + df_cp_simplenn_test*0.05

In [32]:
0.35+0.35+0.25+0.05

1.0

In [33]:
model_eval_results(df_cp_test, df_cp_tst_targets, df_cp_blend)

---------- Test data prediction results ----------
log loss: 0.015516240745277658
ROC AUC score: 0.6097720702817173
PR-AUC/Average_precision_score: 0.08623303304808463


In [34]:
##[0.28574384 0.09796798 0.06528908 0.5509991 ] <-- modify the model weights
df_cp_L1000_blend = pd.DataFrame(np.zeros(df_cp_L1000_cnn_test.shape), columns = df_cp_L1000_cnn_test.columns)
df_cp_L1000_blend = df_cp_L1000_cnn_test*0.30 + df_cp_L1000_resnet_test*0.20 + df_cp_L1000_tabnet_test*0.15 + df_cp_L1000_simplenn_test*0.35

In [35]:
0.30+0.20+0.15+0.35

1.0

In [36]:
model_eval_results(df_cp_L1000_test, df_cp_L1000_tst_targets, df_cp_L1000_blend)

---------- Test data prediction results ----------
log loss: 0.014801540340880218
ROC AUC score: 0.6191562806102473
PR-AUC/Average_precision_score: 0.11195125170174072


In [37]:
def save_to_csv(df, path, file_name, compress=None):
    """save dataframes to csv"""
    
    if not os.path.exists(path):
        os.mkdir(path)
    
    df.to_csv(os.path.join(path, file_name), index=False, compression=compress)

In [38]:
save_to_csv(df_cp_blend, model_preds_dir, f'cp_test_preds_blend{file_indicator}.csv')
save_to_csv(df_L1000_blend, model_preds_dir, 'L1000_test_preds_blend.csv')
save_to_csv(df_cp_L1000_blend, model_preds_dir, 'cp_L1000_test_preds_blend.csv')