### - Ensemble/Blend the 4 model predictions into a single prediction (pathways)

In [2]:
import os
import datetime
from time import time
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict
from collections import Counter

In [3]:
from sklearn.metrics import precision_recall_curve,average_precision_score
from sklearn.metrics import log_loss, roc_curve
from sklearn.metrics import auc,roc_auc_score

In [4]:
from numba import njit
from scipy.optimize import minimize, fsolve

In [6]:
# Root folder of the split data; the cp/ and L1/ test files used below live under it.
data_dir = pathlib.Path("../2.data_split/model_data")

In [7]:
# Gzipped test-set CSVs found under the cp/ and L1/ sub-folders of data_dir.
cp_test = data_dir / "cp/test_lvl4_data_targets_pathways.csv.gz"
L1000_test = data_dir / "L1/test_lvl4_data_targets_pathways.csv.gz"

In [8]:
# Folder the per-model test predictions are read from; the blended predictions are written here as well.
model_preds_dir = '../L1000_CP_model_predictions/'

In [10]:
# Load both gzipped test splits with identical reader options (cp first, then L1000).
df_cp_test, df_L1000_test = (
    pd.read_csv(split_path, compression='gzip', low_memory=False)
    for split_path in (cp_test, L1000_test)
)
# df_cp_L1000_test = pd.read_csv(cp_L1000_test, compression='gzip',low_memory = False)

In [11]:
##resnet
# Read the ResNet test predictions for cp and L1000 from model_preds_dir.
df_cp_resnet_test, df_L1000_resnet_test = (
    pd.read_csv(os.path.join(model_preds_dir, preds_file))
    for preds_file in (
        'cp_test_pathway_preds_resnet.csv',
        'L1000_test_pathway_preds_resnet.csv',
    )
)
# df_cp_L1000_resnet_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_pathway_preds_resnet.csv'))

In [12]:
##1-d cnn
# Read the 1-D CNN test predictions for cp and L1000 from model_preds_dir.
df_cp_cnn_test, df_L1000_cnn_test = (
    pd.read_csv(os.path.join(model_preds_dir, preds_file))
    for preds_file in (
        'cp_test_pathway_preds_1dcnn.csv',
        'L1000_test_pathway_preds_1dcnn.csv',
    )
)
# df_cp_L1000_cnn_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_pathway_preds_1dcnn.csv'))

In [13]:
##tabnet
# Read the TabNet test predictions for cp and L1000 from model_preds_dir.
df_cp_tabnet_test, df_L1000_tabnet_test = (
    pd.read_csv(os.path.join(model_preds_dir, preds_file))
    for preds_file in (
        'cp_test_pathway_preds_tabnet.csv',
        'L1000_test_pathway_preds_tabnet.csv',
    )
)
# df_cp_L1000_tabnet_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_pathway_preds_tabnet.csv'))

In [14]:
##stagedNN (the prediction files carry the 'simplenn' suffix)
# Read the test predictions for cp and L1000 from model_preds_dir.
df_cp_simplenn_test, df_L1000_simplenn_test = (
    pd.read_csv(os.path.join(model_preds_dir, preds_file))
    for preds_file in (
        'cp_test_pathway_preds_simplenn.csv',
        'L1000_test_pathway_preds_simplenn.csv',
    )
)
# df_cp_L1000_simplenn_test = pd.read_csv(os.path.join(model_preds_dir, 'cp_L1000_test_pathway_preds_simplenn.csv'))

In [15]:
# Keep only the test-set columns that the 1-D CNN prediction frames contain,
# in the same column order as those prediction frames.
df_cp_tst_targets = df_cp_test.loc[:, df_cp_cnn_test.columns]
df_L1000_tst_targets = df_L1000_test.loc[:, df_L1000_cnn_test.columns]
# df_cp_L1000_tst_targets = df_cp_L1000_test[df_cp_L1000_cnn_test.columns]

In [16]:
# Shape check of the cp target frame (rows, target columns).
df_cp_tst_targets.shape

(10788, 772)

In [17]:
# Shape check of the L1000 target frame (rows, target columns).
df_L1000_tst_targets.shape

(6461, 772)

In [18]:
# df_cp_L1000_tst_targets.shape

#### - Resnet, 1d-cnn, Tabnet, Simplenn --> 4 model predictions

In [19]:
# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_true, y_pred):
    """Return the mean binary log loss over all (flattened) entries.

    Parameters
    ----------
    y_true : array-like
        Labels; an entry counts as positive only when it equals 1.
    y_pred : array-like
        Predicted probabilities with the same number of entries as ``y_true``.

    Returns
    -------
    float
        Mean of ``-log(p)`` for positive entries and ``-log(1 - p)`` for the
        rest, with ``p`` clipped to ``[1e-15, 1 - 1e-15]``.
    """
    eps = 1e-15
    labels = np.asarray(y_true).ravel()
    # Clip so that log() never sees exactly 0 (or 1 for the negative branch).
    probs = np.clip(np.asarray(y_pred).ravel(), eps, 1 - eps)
    # Probability the prediction assigns to the observed class of each entry.
    likelihood = np.where(labels == 1, probs, 1 - probs)
    return (-np.log(likelihood)).mean()

def func_numpy_metric(weights, oof, y_true):
    """Objective for the weight search: log loss of the weighted blend.

    ``oof`` is contracted with ``weights`` along its first axis (one slice per
    model), and the resulting blended prediction is scored against ``y_true``
    with ``log_loss_numpy``.
    """
    blended = np.tensordot(weights, oof, axes=(0, 0))
    return log_loss_numpy(y_true, blended)

def grad_func(weights, oof, y_true):
    """Gradient of the blended log loss with respect to each model weight.

    With blend ``p = sum_k weights[k] * oof[k]`` and per-entry loss
    ``-(y*log(p) + (1-y)*log(1-p))``, the derivative with respect to
    ``weights[i]`` is ``mean(oof[i] * (p - y) / (p * (1 - p)))``. This is the
    same quantity as the expanded rational expression used by
    ``grad_func_jit``, but the blend is computed once instead of rebuilding
    the "all other models" sum for every model.

    Parameters
    ----------
    weights : array-like, shape (n_models,)
        Current blend weights.
    oof : ndarray, first axis of length n_models
        Per-model predictions; every ``oof[k]`` must broadcast with ``y_true``.
    y_true : ndarray
        Binary targets.

    Returns
    -------
    ndarray, shape (n_models,)
        Partial derivatives of the mean log loss.
    """
    eps = 1e-15
    weights = np.asarray(weights, dtype=float)
    oof_clip = np.clip(oof, eps, 1 - eps)
    # Clip the blend as well: weights outside the simplex (e.g. all zeros) can
    # push it to 0 or 1, which would make the denominator below vanish.
    blend = np.clip(np.tensordot(weights, oof_clip, axes=(0, 0)), eps, 1 - eps)
    # d(loss)/d(blend); shared by every model, so compute it a single time.
    dloss_dblend = (blend - y_true) / (blend * (1 - blend))
    # Chain rule: d(blend)/d(weights[i]) is simply that model's prediction.
    return np.array([np.mean(model_pred * dloss_dblend) for model_pred in oof_clip])
@njit
def grad_func_jit(weights, oof, y_true):
    """Numba-compiled gradient of the blended log loss w.r.t. each weight.

    For model ``i`` the blend is written as ``weights[i] * own + others``,
    where ``others`` is the weighted sum of every other model's clipped
    predictions; the returned entry is the mean over all entries of the
    derivative of the log loss with respect to ``weights[i]``.

    ``oof`` is indexed as a 3-D array whose first axis enumerates the models.
    """
    n_models = oof.shape[0]
    # Same clipping bounds as the loss; written with minimum/maximum.
    preds = np.minimum(1 - 1e-15, np.maximum(oof, 1e-15))
    gradients = np.zeros(n_models)
    for i in range(n_models):
        own = preds[i]
        w_i = weights[i]
        # Weighted sum of all models except model i.
        others = np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(n_models):
            if j == i:
                continue
            others += weights[j] * preds[j]
        numerator = -y_true * own + (own ** 2) * w_i + own * others
        # Expanded form of blend**2 - blend with blend = w_i * own + others.
        denominator = ((own ** 2) * (w_i ** 2) + 2 * own * others * w_i
                       - own * w_i + (others ** 2) - others)
        gradients[i] = -np.mean(numerator / denominator)
    return gradients

In [20]:
# Per-model test predictions for each assay. The list position is used as the
# model index, so the order has to line up with the labels in `models_name`.
cp_model_preds = [
    df_cp_cnn_test,
    df_cp_resnet_test,
    df_cp_tabnet_test,
    df_cp_simplenn_test,
]
L1000_model_preds = [
    df_L1000_cnn_test,
    df_L1000_resnet_test,
    df_L1000_tabnet_test,
    df_L1000_simplenn_test,
]
# cp_L1000_model_preds = [df_cp_L1000_cnn_test, df_cp_L1000_resnet_test, df_cp_L1000_tabnet_test, df_cp_L1000_simplenn_test]

In [21]:
models_name = ['1d-Cnn', 'Resnet', 'Tabnet', 'SimpleNN']
def get_optmized_blended_weights(model_oofs, df_targets, num_of_models = 4, models_name = models_name):
    """
    Find blend weights for the model predictions by minimising the log loss.

    Each model's log loss against ``df_targets`` is printed, then SLSQP searches
    for weights in [0, 1] that sum to 1 and minimise the log loss of the
    weighted sum of the predictions, starting from equal weights.

    for more info:https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0/notebook

    Parameters
    ----------
    model_oofs : sequence of DataFrame
        Per-model predictions; only the first ``num_of_models`` are used and
        each must have the same shape as ``df_targets``.
    df_targets : DataFrame
        Ground-truth labels the predictions are scored against.
    num_of_models : int
        Number of models to blend.
    models_name : sequence of str
        Labels used in the printed per-model scores (same order as ``model_oofs``).

    Returns
    -------
    (ndarray, ndarray)
        The stacked predictions, shape ``(num_of_models, rows, columns)``, and
        the optimised weights, shape ``(num_of_models,)``.

    Warns
    -----
    RuntimeWarning
        If the optimiser reports that it did not terminate successfully; the
        weights are still returned.
    """
    import warnings  # local import: only needed to report a failed optimisation

    y_true = df_targets.values
    model_oof_preds = np.zeros((num_of_models, df_targets.shape[0], df_targets.shape[1]))
    for idx in range(num_of_models):
        model_oof_preds[idx] = model_oofs[idx].values
        score_oof = log_loss_numpy(y_true, model_oof_preds[idx])
        print(f'{idx} {models_name[idx]}, Test loss:\t', score_oof)

    tol = 1e-10
    # Start from the plain average of the models.
    init_guess = [1 / num_of_models] * num_of_models
    bnds = [(0, 1)] * num_of_models
    # Equality constraint: the weights must sum to one.
    cons = {'type': 'eq',
            'fun': lambda x: np.sum(x) - 1,
            'jac': lambda x: [1] * len(x)}
    print('Inital Blend OOF:', func_numpy_metric(init_guess, model_oof_preds, y_true))
    start_time = time()
    res_scipy = minimize(fun = func_numpy_metric, x0 = init_guess,
                         args = (model_oof_preds, y_true),
                         method = 'SLSQP', ##L-BFGS-B ##SLSQP
                         jac = grad_func_jit, # grad_func
                         bounds = bnds, constraints = cons, tol = tol)
    # [2:7] keeps the "MM:SS" part of the "H:MM:SS.ffffff" timedelta string.
    print(f'[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] Optimised Blend OOF:', res_scipy.fun)
    print('Optimised Weights:', res_scipy.x)
    if not res_scipy.success:
        # Do not fail the run, but make an unreliable result visible.
        warnings.warn(f'Blend-weight optimisation did not converge: {res_scipy.message}',
                      RuntimeWarning)
    return model_oof_preds, res_scipy.x

In [22]:
# Optimise blend weights for the four L1000 models; the stacked prediction array is discarded.
_, L1000_model_weights = get_optmized_blended_weights(L1000_model_preds, df_L1000_tst_targets,)

0 1d-Cnn, Test loss:	 0.12563977523946374
1 Resnet, Test loss:	 0.12708090264519167
2 Tabnet, Test loss:	 0.13127478066978776
3 SimpleNN, Test loss:	 0.12491006208593337
Inital Blend OOF: 0.12557674951644074
[00:19] Optimised Blend OOF: 0.12476429006116983
Optimised Weights: [2.86857399e-01 0.00000000e+00 7.99599102e-18 7.13142601e-01]


In [23]:
# Optimise blend weights for the four cp models; the stacked prediction array is discarded.
_, cp_model_weights = get_optmized_blended_weights(cp_model_preds, df_cp_tst_targets,)

0 1d-Cnn, Test loss:	 0.12149796076942113
1 Resnet, Test loss:	 0.12063640984608494
2 Tabnet, Test loss:	 0.12392185165659485
3 SimpleNN, Test loss:	 0.12517063209653637
Inital Blend OOF: 0.11907521667784418
[00:30] Optimised Blend OOF: 0.11882131345311253
Optimised Weights: [0.13123236 0.49630461 0.26578783 0.1066752 ]


In [24]:
# _, cp_L1000_model_weights = get_optmized_blended_weights(cp_L1000_model_preds, df_cp_L1000_tst_targets)

In [25]:
def model_eval_results(df_tst_y, df_preds):
    """
    Print evaluation metrics for test-set predictions.

    Three scores are reported, in this order: log loss over all flattened
    entries, macro-averaged ROC AUC, and micro-averaged average precision
    (PR-AUC).

    df_tst_y: DataFrame of true labels.
    df_preds: DataFrame of predicted probabilities, same layout as df_tst_y.
    """
    print('-' * 10, 'Test data prediction results', '-' * 10)
    # Each metric is computed lazily, right before its line is printed.
    scorers = (
        ('log loss',
         lambda: log_loss(np.ravel(df_tst_y), np.ravel(df_preds))),
        ('ROC AUC score',
         lambda: roc_auc_score(df_tst_y.values, df_preds.values, average='macro')),
        ('PR-AUC/Average_precision_score',
         lambda: average_precision_score(df_tst_y.values, df_preds.values, average="micro")),
    )
    for label, compute in scorers:
        print(f'{label}:', compute())

In [26]:
#[2.86857399e-01 0.00000000e+00 7.99599102e-18 7.13142601e-01] <-- modify the model weights
# Hand-set weights for (1d-CNN, ResNet, TabNet, SimpleNN): 0.45, 0.05, 0.05, 0.45.
# The weighted sum creates the blended frame directly, so no zero-filled
# placeholder is needed. DataFrame arithmetic aligns on index and column labels.
df_L1000_blend = df_L1000_cnn_test*0.45 + df_L1000_resnet_test*0.05 + df_L1000_tabnet_test*0.05 + df_L1000_simplenn_test*0.45

In [27]:
# Sanity check: the hand-set L1000 blend weights sum to 1.
0.45+(0.05*2)+0.45

1.0

In [28]:
# Score the hand-weighted L1000 blend against the L1000 test targets.
model_eval_results(df_L1000_tst_targets, df_L1000_blend)

---------- Test data prediction results ----------
log loss: 0.12490227150443539
ROC AUC score: 0.6044960718529729
PR-AUC/Average_precision_score: 0.12883422672767436


In [29]:
##[0.13123236 0.49630461 0.26578783 0.1066752 ] <-- modify the model weights
# Hand-set weights for (1d-CNN, ResNet, TabNet, SimpleNN): 0.15, 0.45, 0.25, 0.15.
# The weighted sum creates the blended frame directly, so no zero-filled
# placeholder is needed. DataFrame arithmetic aligns on index and column labels.
df_cp_blend = df_cp_cnn_test*0.15 + df_cp_resnet_test*0.45 + df_cp_tabnet_test*0.25 + df_cp_simplenn_test*0.15

In [30]:
# Sanity check: the hand-set cp blend weights sum to 1.
0.15+0.45+0.25+0.15

1.0

In [31]:
# Score the hand-weighted cp blend against the cp test targets.
model_eval_results(df_cp_tst_targets, df_cp_blend)

---------- Test data prediction results ----------
log loss: 0.11883654111668131
ROC AUC score: 0.6187831264088477
PR-AUC/Average_precision_score: 0.13268501471403046


In [32]:
##[0.28574384 0.09796798 0.06528908 0.5509991 ] <-- modify the model weights
# df_cp_L1000_blend = pd.DataFrame(np.zeros(df_cp_L1000_cnn_test.shape), columns = df_cp_L1000_cnn_test.columns)
# df_cp_L1000_blend = df_cp_L1000_cnn_test*0.30 + df_cp_L1000_resnet_test*0.20 + df_cp_L1000_tabnet_test*0.15 + df_cp_L1000_simplenn_test*0.35

In [33]:
# 0.30+0.20+0.15+0.35

In [34]:
# model_eval_results(df_cp_L1000_test, df_cp_L1000_tst_targets, df_cp_L1000_blend)

In [35]:
def save_to_csv(df, path, file_name, compress=None):
    """Write ``df`` to ``path/file_name`` as CSV, without the index.

    Parameters
    ----------
    df : DataFrame
        Data to write.
    path : str or path-like
        Target directory; created (including missing parents) if absent.
    file_name : str
        Name of the CSV file inside ``path``.
    compress : str or None
        Forwarded to ``DataFrame.to_csv`` as ``compression``.
    """
    # makedirs(exist_ok=True) also creates missing parent folders and does not
    # fail when the directory already exists (or appears concurrently), unlike
    # an exists() check followed by mkdir().
    os.makedirs(path, exist_ok=True)

    df.to_csv(os.path.join(path, file_name), index=False, compression=compress)

In [36]:
# Write both blended prediction frames into the folder the per-model predictions were read from.
save_to_csv(df_cp_blend, model_preds_dir, 'cp_test_pathway_preds_blend.csv')
save_to_csv(df_L1000_blend, model_preds_dir, 'L1000_test_pathway_preds_blend.csv')
# save_to_csv(df_cp_L1000_blend, model_preds_dir, 'cp_L1000_test_preds_blend.csv')