#  Colab Setup

In [None]:
# True when running on Google Colab; gates the Drive mount, Kaggle config and download cells.
COLAB = True 
# True to download the competition CSV files with the Kaggle API (only has an effect when COLAB is True).
DOWNLOAD_DATA = True

## Linking personal google drive storage with Google Colab

Mounting is the process by which the operating system makes the files and directories of a storage service (here, Google Drive) available to the user through the computer's file system. You will be asked to log in to your Google account.

In [None]:
if COLAB:
    # Work from /content, then mount Google Drive under /content/gdrive.
    %cd /content
    from google.colab import drive
    drive.mount('/content/gdrive')

## Kaggle API Setup

Run the following cell to point the Kaggle API at the directory that contains `kaggle.json` (your API credentials).

In [None]:
if COLAB:
    import os
    # Directory (on the mounted Drive) in which the Kaggle client should look for kaggle.json.
    os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

## Download the data using the API

Before downloading the data, make sure you are in a directory outside your Google Drive; otherwise the data will be stored in Drive and you will quickly reach its storage limit.

In [None]:
if COLAB and DOWNLOAD_DATA:
    # Create the input folder under /content (outside the mounted Drive) and move into it.
    %cd /content
    !mkdir -p input/siim-isic-melanoma-classification
    %cd /content/input/siim-isic-melanoma-classification

    !pip install --upgrade kaggle
    # The competition page on Kaggle shows the API command that downloads the whole dataset:
    # !kaggle competitions download -c siim-isic-melanoma-classification
    # Instead of downloading all the data, we select specific files.
    !kaggle competitions download siim-isic-melanoma-classification -f train.csv
    !kaggle competitions download siim-isic-melanoma-classification -f test.csv
    !kaggle competitions download siim-isic-melanoma-classification -f sample_submission.csv

    # Unzip the downloaded archives; the archives are deleted only if unzip succeeds (&&).
    !unzip \*.zip  && rm *.zip

    # After downloading the data, go back to the /content directory
    %cd /content

# Ensembling AVG, POW AVG, RANK + BO

## Imports

In [None]:
# Installs the package that provides `bayes_opt`, imported in the next cell.
!pip install bayesian-optimization

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn import metrics

from scipy.stats import rankdata
from bayes_opt import BayesianOptimization


## Load oof and test predictions

In [None]:
# Identifiers of the models to ensemble. Each identifier is used to build the file names
# oof_<id>.csv and submission_<id>.csv that are read from `dirname`.
# NOTE(review): the leading number presumably is the input image size and `efN` the
# EfficientNet variant — confirm against the training notebooks.
models = [
    "128_ef0_01",
    "128_ef1_01",
    "128_ef3_00",
    "128_ef4_01",
    "128_ef6_11",
    #
    "192_ef0_01",
    "192_ef1_01",
    "192_ef3_00",
    "192_ef4_01",
    "192_ef6_11",
    #
    "256_ef3_01",
    "256_ef4_00",
    "256_ef4_01",
    "256_ef6_11",
    #
    "384_ef6_00",
    "384_ef6_01",
    "384_ef6_11",
    "384_ef3_01_upsample",
    #
    "512_ef5_00",
    "512_ef5_01",
    "512_ef6_11",
    "512_ef3_01_upsample",
    #
    "768_ef5_00",
    "768_ef5_11",
    "768_ef5_01"
]



# Drive folder holding the per-model OOF and submission CSVs; the ensemble submissions are written here too.
dirname = '/content/gdrive/My Drive/siim-isic-melanoma-classification/ensembling/'


In [None]:
# Competition metadata: `train` and `test` receive one prediction column per model below.
train = pd.read_csv('./input/siim-isic-melanoma-classification/train.csv')
test = pd.read_csv('./input/siim-isic-melanoma-classification/test.csv')
sub = pd.read_csv('./input/siim-isic-melanoma-classification/sample_submission.csv')

for model in models:
    # Out-of-fold predictions: report the model's own AUC first.
    oof_path = os.path.join(dirname, f"oof_{model}.csv")
    _oof = pd.read_csv(oof_path)
    score = metrics.roc_auc_score(_oof['target'], _oof['pred'])
    print(f"{model}: OOF auc:{score:.4}")

    # Keep only the key and the prediction, stored under the model's name.
    unwanted = ["target", "fold"] if "fold" in _oof.columns else ["target"]
    _oof = _oof.rename(columns={"pred": model}).drop(columns=unwanted)
    train = pd.merge(train, _oof, on="image_name")

    # Test predictions: the two columns become (image_name, <model>).
    sub_path = os.path.join(dirname, f"submission_{model}.csv")
    _sub = pd.read_csv(sub_path)
    _sub.columns = ["image_name", model]
    test = pd.merge(test, _sub, on="image_name")

In [None]:
train.head()

In [None]:
test.head()

## OOF AVG, POW AVG, RANK

In [None]:
# Three unweighted blends of the OOF predictions: rank average, power (squared) average
# and plain average. Every model's contribution is divided by its own maximum.
blend_columns = ("pred_rank", "pred_power", "pred_avg")
for blend in blend_columns:
    train[blend] = 0

for model_name in models:
    preds = train[model_name]
    ranks = preds.rank()
    squares = np.power(preds, 2)
    train["pred_rank"] += ranks / ranks.max()
    train["pred_power"] += squares / squares.max()
    train["pred_avg"] += preds / preds.max()

for blend in blend_columns:
    train[blend] = train[blend] / len(models)

# Report the OOF AUC of each blend.
for label, blend in (("avg", "pred_avg"), ("rank", "pred_rank"), ("pow", "pred_power")):
    score = metrics.roc_auc_score(train['target'], train[blend])
    print(f'OOF {label}_auc:{score}')

## Study cases when target train != oof predictions

In [None]:
# Raises pandas' display limit to 600 rows (a global option, so it also affects later cells).
pd.set_option('display.max_rows', 600)
# Work on a copy so the rounding below does not modify `train`.
train_ = train.copy()
print(len(train_))
# Round every model's prediction to the nearest integer.
train_[models] = np.rint(train_[models])
# Keep only the rows where the rounded `pred_avg` blend disagrees with the target.
train_ = train_.loc[train_['target'] != np.rint(train_['pred_avg'])]
print(len(train_))
# For each model, count how many of those rows its own rounded prediction gets right.
for model in models:
    print(model, len(train_.loc[train_['target'] == np.rint(train_[model])]))

## Test AVG

In [None]:
# Plain average: each model's test predictions are divided by their maximum, then averaged.
test["target"] = 0.0
for model_name in models:
    preds = test[model_name]
    test["target"] = test["target"] + preds / preds.max()
test["target"] /= len(models)

sub = test.loc[:, ["image_name", "target"]]
sub.to_csv(os.path.join(dirname, "submission_avg_all_models.csv"), index=False)
sub.head()

## Test POW AVG

In [None]:
# Power average: square each model's test predictions, divide by the maximum square, then average.
test["target"] = 0.0
for model_name in models:
    squares = np.power(test[model_name], 2)
    test["target"] = test["target"] + squares / squares.max()
test["target"] /= len(models)

sub = test.loc[:, ["image_name", "target"]]
sub.to_csv(os.path.join(dirname, "submission_pow_10m.csv"), index=False)
sub.head()

##  Test RANK

In [None]:
# Rank average: replace each model's test predictions by their rank divided by the top rank, then average.
test["target"] = 0.0
for model_name in models:
    ranks = test[model_name].rank()
    test["target"] = test["target"] + ranks / ranks.max()
test["target"] /= len(models)

sub = test.loc[:, ["image_name", "target"]]
sub.to_csv(os.path.join(dirname, "submission_rank_10m.csv"), index=False)
sub.head()

## Weighted AVG (Bayesian Optimization)


We compute the weights of the ensembling (c0, c1, c2, c3...) using bayesian optimisation.

In order to compute it, we need to have stored the predictions made on the training dataset, which is the oof.csv. 

The number of weights `c` must be equal to the number of models being ensembled.

In [None]:
def dim_optimizer (df_oof, features, init_points = 20, n_iter = 30  ):
    """Search for blending weights that maximise the OOF ROC AUC.

    One weight in [0, 1] is optimised per entry of ``features`` using
    ``BayesianOptimization``. The objective is the ROC AUC of the weighted sum
    of the feature columns, measured against ``df_oof['target']``.

    Args:
        df_oof: DataFrame with a ``target`` column and one prediction column per feature.
        features: Names of the prediction columns to blend. Any number of
            features is accepted (the weights are named c0, c1, ...).
        init_points: Number of random exploration points passed to ``maximize``.
        n_iter: Number of optimisation steps passed to ``maximize``.

    Returns:
        Tuple with the best weight found for each feature, in the order of ``features``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    features = list(features)
    if not features:
        raise ValueError("dim_optimizer needs at least one feature to blend")

    # One bounded weight per feature: c0 belongs to features[0], c1 to features[1], ...
    weight_names = [f'c{i}' for i in range(len(features))]
    pbounds = {name: (0.0, 1.0) for name in weight_names}

    def q (**weights):
        # The optimizer passes the sampled weights as keyword arguments (c0=..., c1=..., ...).
        x = weights[weight_names[0]] * df_oof[features[0]]
        for name, feature in zip(weight_names[1:], features[1:]):
            x = x + weights[name] * df_oof[feature]
        return metrics.roc_auc_score(df_oof['target'], x)

    optimizer = BayesianOptimization(
        f=q,
        pbounds=pbounds,
        random_state=42,
    )

    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    t = best["target"]
    print ( f'bo_avg auc:{t}')

    return tuple(best["params"][name] for name in weight_names)


# Fit one blending weight per model on the OOF predictions. The weights are kept as the
# globals c0..c24 because bo_pred reads them by name.
avg_weights = dim_optimizer (train, models, init_points = 2000, n_iter = 10  )
c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24 = avg_weights
# Report the weight of each model by pairing the two sequences instead of eval()-ing variable names.
for model_name, weight in zip(models, avg_weights):
    print(model_name, weight)

In [None]:
def bo_pred (df):
    """Blend 25 model columns of ``df`` with the global weights c0..c24.

    Returns ``c0*df[models[0]] + c1*df[models[1]] + ... + c24*df[models[24]]``,
    where ``models`` and the weights are read from the notebook's globals.
    """
    coefficients = (c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12,
                    c13, c14, c15, c16, c17, c18, c19, c20, c21, c22, c23, c24)

    # Accumulate the terms left to right, starting from the first model.
    blended = coefficients[0] * df[models[0]]
    for position in range(1, len(coefficients)):
        blended = blended + coefficients[position] * df[models[position]]

    return blended

# Score the weighted blend on the OOF predictions, i.e. on the same data the weights were fitted on.
train["pred"] = bo_pred (train)
score = metrics.roc_auc_score(train['target'], train['pred'])
print(f"OOF bo_avg_auc:{score}")



## Test weighted AVG

In [None]:
# Apply the weights fitted on the OOF predictions to the test predictions and save the submission.
test["target"] = bo_pred (test)
    
sub = test[["image_name","target"]]
sub.to_csv(os.path.join(dirname, "submission_bo_avg_all_models_2000_10.csv") ,index=False)
sub.head()



## Weighted POW AVG (Bayesian Optimization)


We compute the weights of the ensembling (c0, c1, c2, c3...) using bayesian optimisation.

In order to compute it, we need to have stored the predictions made on the training dataset, which is the oof.csv. 

The number of weights `c` must be equal to the number of models being ensembled.

In [None]:
def dim_optimizer (df_oof, features, init_points = 20, n_iter = 30  ):
    """Search for power-average blending weights that maximise the OOF ROC AUC.

    Each feature column is squared and divided by the maximum of its squares;
    one weight in [0, 1] per feature is then optimised with
    ``BayesianOptimization`` so that the weighted sum has the highest ROC AUC
    against ``df_oof['target']``.

    Note: this redefines the ``dim_optimizer`` of the weighted-average section.

    Args:
        df_oof: DataFrame with a ``target`` column and one prediction column per feature.
        features: Names of the prediction columns to blend (any number).
        init_points: Number of random exploration points passed to ``maximize``.
        n_iter: Number of optimisation steps passed to ``maximize``.

    Returns:
        Tuple with the best weight found for each feature, in the order of ``features``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    features = list(features)
    if not features:
        raise ValueError("dim_optimizer needs at least one feature to blend")

    weight_names = [f'c{i}' for i in range(len(features))]
    pbounds = {name: (0.0, 1.0) for name in weight_names}

    # The squares and their maxima do not depend on the weights: compute them once
    # instead of on every evaluation of the objective.
    prepared = []
    for feature in features:
        squared = np.power(df_oof[feature], 2)
        prepared.append((squared, squared.max()))

    def q (**weights):
        # The optimizer passes the sampled weights as keyword arguments (c0=..., c1=..., ...).
        x = None
        for name, (squared, peak) in zip(weight_names, prepared):
            term = weights[name] * squared / peak
            x = term if x is None else x + term
        return metrics.roc_auc_score(df_oof['target'], x)

    optimizer = BayesianOptimization(
        f=q,
        pbounds=pbounds,
        random_state=42,
    )

    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    best_weights = tuple(best["params"][name] for name in weight_names)
    t = best["target"]
    summary = ', '.join(f'{name}:{weight}' for name, weight in zip(weight_names, best_weights))
    print ( f'bo_pow_auc auc:{t}, {summary}' )

    return best_weights


# The bo_pred of this section blends models[0]..models[5] with c0..c5, so exactly those
# six models are passed to the optimizer (the function now honours `features`).
pow_features = models[:6]
c0, c1, c2, c3, c4, c5 = dim_optimizer (train, pow_features, init_points = 300, n_iter = 300  )
for model_name, weight in zip(pow_features, (c0, c1, c2, c3, c4, c5)):
    print (model_name, weight)

In [None]:
def bo_pred (df):
    """Power-average blend of the first six model columns of ``df``.

    Every column ``models[i]`` is squared, multiplied by the global weight
    ``c<i>`` and divided by the maximum of the squares; the six terms are summed.
    """
    weights = (c0, c1, c2, c3, c4, c5)

    terms = []
    for index, weight in enumerate(weights):
        squared = np.power(df[models[index]], 2)
        terms.append(weight * squared / squared.max())

    # Add the terms left to right, starting from the first model.
    total = terms[0]
    for term in terms[1:]:
        total = total + term
    return total

# Score the weighted power-average blend on the OOF predictions the weights were fitted on.
train["pred"] = bo_pred (train)
score = metrics.roc_auc_score(train['target'], train['pred'])
print(f"OOF bo_pow_auc:{score}")



## Test weighted POW AVG

In [None]:
# Apply the fitted power-average weights to the test predictions and save the submission.
test["target"] = bo_pred (test)
    
sub = test[["image_name","target"]]
sub.to_csv(os.path.join(dirname, "submission_bo_pow_avg.csv") ,index=False)
sub.head()

## Weighted RANK (Bayesian Optimization)


We compute the weights of the ensembling (c0, c1, c2, c3...) using bayesian optimisation.

In order to compute it, we need to have stored the predictions made on the training dataset, which is the oof.csv. 

The number of weights `c` must be equal to the number of models being ensembled.

In [None]:
def dim_optimizer (df_oof, features, init_points = 20, n_iter = 30  ):
    """Search for rank-average blending weights that maximise the OOF ROC AUC.

    Each feature column is replaced by its rank divided by the top rank; one
    weight in [0, 1] per feature is then optimised with
    ``BayesianOptimization`` so that the weighted sum has the highest ROC AUC
    against ``df_oof['target']``.

    Note: this redefines the ``dim_optimizer`` of the previous sections.

    Args:
        df_oof: DataFrame with a ``target`` column and one prediction column per feature.
        features: Names of the prediction columns to blend (any number).
        init_points: Number of random exploration points passed to ``maximize``.
        n_iter: Number of optimisation steps passed to ``maximize``.

    Returns:
        Tuple with the best weight found for each feature, in the order of ``features``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    features = list(features)
    if not features:
        raise ValueError("dim_optimizer needs at least one feature to blend")

    weight_names = [f'c{i}' for i in range(len(features))]
    pbounds = {name: (0.0, 1.0) for name in weight_names}

    # The ranks and their maxima do not depend on the weights: compute them once
    # instead of on every evaluation of the objective.
    prepared = []
    for feature in features:
        ranks = df_oof[feature].rank()
        prepared.append((ranks, ranks.max()))

    def q (**weights):
        # The optimizer passes the sampled weights as keyword arguments (c0=..., c1=..., ...).
        x = None
        for name, (ranks, top_rank) in zip(weight_names, prepared):
            term = weights[name] * ranks / top_rank
            x = term if x is None else x + term
        return metrics.roc_auc_score(df_oof['target'], x)

    optimizer = BayesianOptimization(
        f=q,
        pbounds=pbounds,
        random_state=42,
    )

    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    best_weights = tuple(best["params"][name] for name in weight_names)
    t = best["target"]
    summary = ', '.join(f'{name}:{weight}' for name, weight in zip(weight_names, best_weights))
    print ( f'bo_rank auc:{t}, {summary}' )

    return best_weights


# The bo_pred of this section blends models[0]..models[5] with c0..c5, so exactly those
# six models are passed to the optimizer (the function now honours `features`).
rank_features = models[:6]
c0, c1, c2, c3, c4, c5 = dim_optimizer (train, rank_features, init_points = 300, n_iter = 300  )
for model_name, weight in zip(rank_features, (c0, c1, c2, c3, c4, c5)):
    print (model_name, weight)

In [None]:
def bo_pred (df):
    """Rank-average blend of the first six model columns of ``df``.

    Every column ``models[i]`` is replaced by its rank, multiplied by the
    global weight ``c<i>`` and divided by the top rank; the six terms are summed.
    """
    weights = (c0, c1, c2, c3, c4, c5)

    terms = []
    for index, weight in enumerate(weights):
        ranks = df[models[index]].rank()
        terms.append(weight * ranks / ranks.max())

    # Add the terms left to right, starting from the first model.
    total = terms[0]
    for term in terms[1:]:
        total = total + term
    return total

# Score the weighted rank blend on the OOF predictions the weights were fitted on.
train["pred"] = bo_pred (train)
score = metrics.roc_auc_score(train['target'], train['pred'])
print(f"OOF bo_rank_auc:{score}")



## Test weighted RANK

In [None]:
# Apply the fitted rank weights to the test predictions and save the submission.
test["target"] = bo_pred (test)
    
sub = test[["image_name","target"]]
sub.to_csv(os.path.join(dirname, "submission_bo_rank.csv") ,index=False)
sub.head()

# Ensembling MinMax

In [None]:
import numpy as np
import pandas as pd

import numpy as np
import pandas as pd 
import os 

## Load oof and test predictions

In [None]:
# Identifiers of the predictions to stack; the loading cell reads oof_<id>.csv and
# submission_<id>.csv for each of them from `dirname`.
models = [
    "128",
    "192",
    "256",
    "384",
    "512",
    "768"
]

# NOTE(review): `submissions` is not referenced in the visible cells (the loading cell
# builds the file names from `models`) — confirm it is still needed.
submissions = [
    "submission_128.csv",
    "submission_192.csv",
    "submission_256.csv",
    "submission_384.csv",
    "submission_512.csv",
    "submission_768.csv" 
]


# Drive folder holding the OOF and submission CSVs; the stacked submission is written here too.
dirname = '/content/gdrive/My Drive/siim-isic-melanoma-classification/ensembling/'


In [None]:
# Competition metadata: `train` and `test` receive one prediction column per model below.
train = pd.read_csv('./input/siim-isic-melanoma-classification/train.csv')
test = pd.read_csv('./input/siim-isic-melanoma-classification/test.csv')
sub = pd.read_csv('./input/siim-isic-melanoma-classification/sample_submission.csv')

for model in models:
    # Out-of-fold predictions: report the model's own AUC first.
    oof_path = os.path.join(dirname, f"oof_{model}.csv")
    _oof = pd.read_csv(oof_path)
    score = metrics.roc_auc_score(_oof['target'], _oof['pred'])
    print(f"{model}: OOF auc:{score:.4}")

    # Keep only the key and the prediction, stored under the model's name.
    unwanted = ["target", "fold"] if "fold" in _oof.columns else ["target"]
    _oof = _oof.rename(columns={"pred": model}).drop(columns=unwanted)
    train = pd.merge(train, _oof, on="image_name")

    # Test predictions: the two columns become (image_name, <model>).
    sub_path = os.path.join(dirname, f"submission_{model}.csv")
    _sub = pd.read_csv(sub_path)
    _sub.columns = ["image_name", model]
    test = pd.merge(test, _sub, on="image_name")

In [None]:
train.head()

In [None]:
test.head()

In [None]:
def MinMaxBestBaseStacking(df, models, best_model_name, upper_cutoff=0.95, lower_cutoff=0.05):
    """Stack model predictions with the MinMax + BestBase rule.

    For every row:
      * if all models predict above ``upper_cutoff``, use the row maximum;
      * else if all models predict below ``lower_cutoff``, use the row minimum;
      * otherwise fall back to the prediction of ``best_model_name``.

    The helper columns ``is_iceberg_max/min/mean/median/base`` and the result
    column ``pred_MinMax`` are added to ``df`` in place.

    Args:
        df: DataFrame holding one prediction column per entry of ``models``.
        models: Names of the prediction columns to compare.
        best_model_name: Column used when the models do not agree.
        upper_cutoff: Threshold above which unanimous predictions take the row maximum.
        lower_cutoff: Threshold below which unanimous predictions take the row minimum.

    Returns:
        The same ``df`` object, with the extra columns.
    """
    model_preds = df.loc[:, models]
    best_model = df[best_model_name]

    # Row-wise summaries of the model predictions (column names kept for compatibility).
    df['is_iceberg_max'] = model_preds.max(axis=1)
    df['is_iceberg_min'] = model_preds.min(axis=1)
    df['is_iceberg_mean'] = model_preds.mean(axis=1)
    df['is_iceberg_median'] = model_preds.median(axis=1)
    df['is_iceberg_base'] = best_model

    all_high = (model_preds > upper_cutoff).all(axis=1)
    all_low = (model_preds < lower_cutoff).all(axis=1)

    df['pred_MinMax'] = np.where(all_high,
        df['is_iceberg_max'],
        np.where(all_low,
                 df['is_iceberg_min'],
                 df['is_iceberg_base']))

    return df


## OOF

In [None]:
# Stack the OOF predictions, falling back to the '768' column when the models do not agree, and score the result.
train = MinMaxBestBaseStacking(train, models, '768')
score = metrics.roc_auc_score(train['target'], train["pred_MinMax"])
print(f'OOF MinMax_auc:{score}')
train.head()

## Test

In [None]:
# NOTE(review): this zero-filled `target` column is not used below — the submission takes its values from `pred_MinMax`.
test["target"] = 0.0
test = MinMaxBestBaseStacking(test, models, '768')

sub = test[["image_name","pred_MinMax"]]
# Rename the stacked prediction to `target`, the column name used by the other submission files.
sub = sub.rename({"pred_MinMax": "target"}, axis=1)
sub.to_csv(os.path.join(dirname, "submission_MinMax_768.csv"), index=False)
sub.head()

# Ensembling post processing

In [None]:
# General imports
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt

In [None]:
# Subset of models used for post-processing. post_processing() refers to these nine
# column names literally, so the two must be kept in sync.
models = [
    # "128_ef0_01",
    # "128_ef1_01",
    # "128_ef3_00",
    # "128_ef4_01",
    # "128_ef6_11",
    #
    # "192_ef0_01",
    # "192_ef1_01",
    # "192_ef3_00",
    # "192_ef4_01",
    "192_ef6_11",
    #
    # "256_ef3_01",
    # "256_ef4_00",
    # "256_ef4_01",
    "256_ef6_11",
    #
    "384_ef6_00",
    "384_ef6_01",
    "384_ef6_11",
    #
    "512_ef5_00",
    "512_ef5_01",
    "512_ef6_11",
    #
    "768_ef5_11",
]



# Drive folder from which the per-model OOF and submission CSVs are read.
dirname = '/content/gdrive/My Drive/siim-isic-melanoma-classification/ensembling/'


In [None]:
# Competition metadata: `train` and `test` receive one prediction column per model below.
train = pd.read_csv('./input/siim-isic-melanoma-classification/train.csv')
test = pd.read_csv('./input/siim-isic-melanoma-classification/test.csv')
sub = pd.read_csv('./input/siim-isic-melanoma-classification/sample_submission.csv')

for model in models:
    # Out-of-fold predictions: report the model's own AUC first.
    oof_path = os.path.join(dirname, f"oof_{model}.csv")
    _oof = pd.read_csv(oof_path)
    score = metrics.roc_auc_score(_oof['target'], _oof['pred'])
    print(f"{model}: OOF auc:{score:.4}")

    # Keep only the key and the prediction, stored under the model's name.
    unwanted = ["target", "fold"] if "fold" in _oof.columns else ["target"]
    _oof = _oof.rename(columns={"pred": model}).drop(columns=unwanted)
    train = pd.merge(train, _oof, on="image_name")

    # Test predictions: the two columns become (image_name, <model>).
    sub_path = os.path.join(dirname, f"submission_{model}.csv")
    _sub = pd.read_csv(sub_path)
    _sub.columns = ["image_name", model]
    test = pd.merge(test, _sub, on="image_name")

In [None]:
train.head()

In [None]:
test.head()

## OOF

In [None]:
# Factor applied to the mean difference (`diff_avg`) in post_processing().
WEIGHT = 1 # best kept between 1 and 2, according to the original authors

In [None]:
def post_processing(df):
    """Shift the best model's predictions along the mean trend of a chain of models.

    The nine model columns are compared pairwise along a fixed chain
    (``diff_k`` = chain[k-1] - chain[k]); the mean of the eight differences,
    ``diff_avg``, scaled by the global ``WEIGHT``, moves the last model's
    prediction ``sub_best``:

      * ``diff_avg < 0``:  ``sub_new = (1 + WEIGHT*diff_avg) * sub_best``
      * otherwise:         ``sub_new = (1 - WEIGHT*diff_avg) * sub_best + WEIGHT*diff_avg``

    Side effects: adds ``id`` (a copy of the index), ``diff_1``..``diff_8`` and
    ``sub_best`` to ``df`` in place.

    Args:
        df: DataFrame with ``image_name``, ``patient_id`` and the nine model columns.

    Returns:
        New DataFrame (index reset) with ``id``, ``image_name``, ``patient_id``,
        ``sub_best``, every column of ``df`` whose name contains "diff",
        ``diff_avg`` and ``sub_new``.
    """
    # Order in which the models are compared; the last one is the best model.
    chain = ['384_ef6_00', '384_ef6_01', '192_ef6_11', '512_ef5_00', '512_ef6_11',
             '256_ef6_11', '384_ef6_11', '512_ef5_01', '768_ef5_11']

    # Keep the original index labels so the caller can write the result back.
    df['id'] = df.index
    for step, (left, right) in enumerate(zip(chain, chain[1:]), start=1):
        df[f"diff_{step}"] = df[left] - df[right]

    # select model with highest score
    df["sub_best"] = df[chain[-1]]

    col_comment = ["id", "image_name", "patient_id", "sub_best"]
    col_diff = [column for column in df.columns if "diff" in column]
    df_diff = df[col_comment + col_diff].reset_index(drop=True)

    # Mean of the differences: the average trend along the chain.
    df_diff["diff_avg"] = df_diff[col_diff].mean(axis=1)

    # Vectorised form of the two-branch rule (same arithmetic as a row-wise apply).
    trend = df_diff["diff_avg"]
    best = df_diff["sub_best"]
    df_diff["sub_new"] = np.where(
        trend < 0,
        (1 + WEIGHT * trend) * best,
        (1 - WEIGHT * trend) * best + WEIGHT * trend,
    )

    return df_diff
    

In [None]:
df_diff = post_processing(train)
df_diff.head()

In [None]:
# Write the post-processed scores back to `train`; `id` holds the index labels saved by post_processing().
train.loc[train["id"], "pred"] = df_diff["sub_new"].values

In [None]:
score = metrics.roc_auc_score(train['target'], train['pred'])
print(f"OOF post_processing_auc:{score}")

## Test

In [None]:
# best submission 
# NOTE(review): "submission_.csv" looks like a placeholder name (the f-string has no fields) —
# replace it with the file of the best submission before running.
sub_best = pd.read_csv(os.path.join(dirname, f"submission_.csv"))

In [None]:
sub["target"] = sub_best["target"]
sub.head()

In [None]:
df_diff = post_processing(test)
df_diff.head()

In [None]:
# Overwrite the submission targets with the post-processed scores.
# NOTE(review): assumes `sub` and `test` share the same index labels and row order — confirm.
sub.loc[test["id"], "target"] = df_diff["sub_new"].values

In [None]:
# NOTE(review): written to the current working directory, whereas the other sections save under `dirname` — confirm this is intended.
sub.to_csv("submission_post_processing.csv", index=False)
sub.head()

In [None]:
plt.hist(sub.target,bins=100)
plt.show()

# Ensembling with tabular data

In [None]:
import pandas as pd
import os

In [None]:
# NOTE(review): `tabular_models` and `tabular_submissions` are not referenced in the
# visible cells (the blend cell spells the file names out) — confirm they are still needed.
tabular_models = [
    "tabular_1",
    "tabular_2",
    "tabular_3"
]

tabular_submissions = [
    "submission_tabular_1.csv",
    "submission_tabular_2.csv",
    "submission_tabular_3.csv"
]

# NOTE(review): the bare string below is an expression statement with no effect; it looks like a leftover note.
"submission_bo_avg_300"

# Drive folder from which the submissions are read and to which the blend is written.
dirname = '/content/gdrive/My Drive/siim-isic-melanoma-classification/ensembling/'


In [None]:
# Image-model ensemble submission and two tabular-model submissions.
effnet = pd.read_csv(os.path.join(dirname, "submission_bo_avg_best9m_1000_200.csv"))
meta = pd.read_csv(os.path.join(dirname, "submission_tabular_2.csv"))
# NOTE(review): `meta2` is loaded but not used in the visible cells.
meta2 = pd.read_csv(os.path.join(dirname, "submission_tabular_1.csv"))

sample = effnet.copy()
# 90% / 10% blend. The two `target` columns are combined by DataFrame index, not by image name.
# NOTE(review): assumes both files list the images in the same order — confirm.
sample['target'] = (effnet['target'] * 0.9 + meta['target'] * 0.1)

sample.to_csv(os.path.join(dirname, 'submission_bo_avg_best9m_1000_200_t2x1.csv'), index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Display the distribution of the blended predictions (`sample['target']`).

fig, ax = plt.subplots(figsize=(16,6))
# No `ax=` is passed, so seaborn draws on the current axes, i.e. the `ax` created above.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases — consider
# `sns.histplot`/`sns.displot` when upgrading.
sns.distplot(sample['target'], hist_kws={
                 'rwidth': 0.75,
                 'edgecolor': 'black',
                 'alpha': 0.3
             }, color='#C3073F')
ax.set_title('Final Predictions')
# Optional zoom on part of the distribution:
# ax.set_xlim(0.5,1)
# ax.set_ylim(0,0.0001)
plt.show()