In [1]:
# basics
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
from collections import Counter
# import copy
import seaborn as sns
from time import time
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')
import itertools

# sklearn
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

# pyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau, CyclicLR
from torch.utils.data import DataLoader, Dataset

# TabNet
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

# stratified kfold
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0
Processing /kaggle/input/iterative-stratification/iterative-stratification-master
Building wheels for collected packages: iterative-stratification
  Building wheel for iterative-stratification (setup.py) ... [?25l- \ done
[?25h  Created wheel for iterative-stratification: filename=iterative_stratification-0.1.6-py3-none-any.whl size=8401 sha256=17e6006028c5c554df8af9e7108ce082df17fe38fbf83050b943df8b0fbb4909
  Stored in directory: /root/.cache/pip/wheels/b8/47/3f/eb4af42d124f37d23d6f13a4c8bbc32c1d70140e6e1cecb4aa
Successfully built iterative-stratification
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


In [2]:
### General ###
import os
import copy
import tqdm
import pickle
import random
import warnings
warnings.filterwarnings("ignore")
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

### Data Wrangling ###
import numpy as np
import pandas as pd
from scipy import stats

### Machine Learning ###
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Deep Learning ###
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Tabnet 
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

from pickle import load,dump

### Colorize printed output ###
from colorama import Fore
# Short aliases for colorama foreground colours, used to colourise the prints
# in the training and evaluation cells.
c_, m_, r_ = Fore.CYAN, Fore.MAGENTA, Fore.RED
b_, y_, g_ = Fore.BLUE, Fore.YELLOW, Fore.GREEN

In [3]:
# load raw data: every competition table is read from the same input folder
data_dir = '../input/lish-moa/'
train_features = pd.read_csv(f'{data_dir}train_features.csv')
train_targets_scored = pd.read_csv(f'{data_dir}train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(f'{data_dir}train_targets_nonscored.csv')
train_drug = pd.read_csv(f'{data_dir}train_drug.csv')
test_features = pd.read_csv(f'{data_dir}test_features.csv')
sample_submission = pd.read_csv(f'{data_dir}sample_submission.csv')

# additional feature rows kept in a separate dataset
train_features_extra = pd.read_csv('../input/moa-fe-extra-data/MoA_FE_Extra_Data.csv')

# keep only nonscored targets with >0 positive labels
# (column 0 is skipped when counting, hence the +1 shift back to frame positions)
keep_list = (
    np.flatnonzero(train_targets_nonscored.iloc[:, 1:].values.sum(axis=0) > 0) + 1
).tolist()
train_targets_nonscored = train_targets_nonscored.iloc[:, [0] + keep_list]

# report the shape of every table that was loaded
print(f'train_features: {train_features.shape}')
print(f'train_targets_scored: {train_targets_scored.shape}')
print(f'train_targets_nonscored: {train_targets_nonscored.shape}')
print(f'train_drug: {train_drug.shape}')
print(f'test_features: {test_features.shape}')
print(f'train_features_extra: {train_features_extra.shape}')
print(f'sample_submission: {sample_submission.shape}')

train_features: (23814, 876)
train_targets_scored: (23814, 207)
train_targets_nonscored: (23814, 332)
train_drug: (23814, 2)
test_features: (3982, 876)
train_features_extra: (3982, 876)
sample_submission: (3982, 207)


In [4]:
# function to set single seed for everything
def seed_everything(seed_value):
    """Seed Python's ``random``, NumPy and PyTorch with one shared value.

    When CUDA is available the CUDA generators are seeded as well and cuDNN
    is switched to deterministic mode with benchmarking disabled.

    Args:
        seed_value: integer seed handed to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # Nothing GPU-specific to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# set seed to 42
seed_everything(42)

In [5]:
# define g/c features
# Partition the feature columns by their name prefix ('g-' vs 'c-').
g_col = [name for name in train_features.columns if name[:2] == 'g-']
c_col = [name for name in train_features.columns if name[:2] == 'c-']

In [6]:
def feature_engineering(x_train: pd.DataFrame, x_test: pd.DataFrame, seed: int):
    """Build the engineered feature frames for ``x_train`` and ``x_test``.

    Every transformer (quantile scaler, PCA, variance filter, KMeans) is fitted
    on ``x_train`` only and then applied to both frames. The resulting pieces
    are concatenated column-wise in this order: ``sig_id``/``cp_type``, encoded
    ``cp_dose``/``cp_time``, variance-filtered RankGauss + PCA features, one-hot
    cluster memberships (raw g/c, then PCA), and row statistics / products /
    squares. Rows whose ``cp_type`` is ``'ctl_vehicle'`` are then removed and the
    ``cp_type`` column is dropped.

    Uses the module-level ``g_col`` / ``c_col`` column lists and
    ``seed_everything``.

    NOTE(review): ``sig_train``/``sig_test`` and the ``fe_stats`` outputs keep the
    index of the input frames while all other pieces are re-indexed from 0, and
    the final ``pd.concat(..., axis=1)`` aligns on the index. This appears to
    assume both inputs carry a default 0..n-1 index -- confirm against callers.

    Args:
        x_train: frame used to fit all transformers; must contain ``sig_id``,
            ``cp_type``, ``cp_dose``, ``cp_time`` and the ``g_col``/``c_col`` columns.
        x_test: frame with the same columns that is only transformed.
        seed: seed passed to ``seed_everything`` and to every ``random_state``.

    Returns:
        Tuple ``(x_train_fe, x_test_fe)`` of engineered frames with control
        rows removed and the index reset.
    """
    
    # set seed
    seed_everything(seed)

    # sig_id & cp_type
    sig_train = x_train[['sig_id', 'cp_type']]
    sig_test = x_test[['sig_id', 'cp_type']]
    print('sig_id & cp_type', sig_train.shape, sig_test.shape)
    
    # Integer-encode cp_dose and cp_time via fixed lookup tables. Despite the
    # 'OHE' label in the print below, no one-hot encoding is performed here.
    # Values missing from the lookup tables would become NaN through .map().
    cp_dose = {'D1': 0, 'D2': 1}
    cp_time = {24:0, 48:1, 72:2}
    cp_dose_train = x_train.cp_dose.map(cp_dose)
    cp_time_train = x_train.cp_time.map(cp_time)
    cp_dose_test = x_test.cp_dose.map(cp_dose)
    cp_time_test = x_test.cp_time.map(cp_time)
    cp_train = pd.DataFrame({'cp_dose':cp_dose_train, 'cp_time':cp_time_train}).reset_index(drop=True)
    cp_test = pd.DataFrame({'cp_dose':cp_dose_test, 'cp_time':cp_time_test}).reset_index(drop=True)
    print('OHE for cp_time, cp_dose', cp_train.shape, cp_test.shape)
    
    # RankGauss scaling: quantile-transform every g/c column to a normal
    # distribution; the transformer is fitted on x_train only.
    n_quantiles = 100
    qt = QuantileTransformer(n_quantiles=n_quantiles, random_state=seed, output_distribution='normal').fit(x_train[g_col + c_col])
    rg_train = pd.DataFrame(qt.transform(x_train[g_col + c_col]), columns=['rg_' + col for col in g_col + c_col]).reset_index(drop=True)
    rg_test = pd.DataFrame(qt.transform(x_test[g_col + c_col]), columns=['rg_' + col for col in g_col + c_col]).reset_index(drop=True)
    print('RankGauss scaling', rg_train.shape, rg_test.shape)
    print(rg_train.head())
    
    # PCA for g_col (fitted on the RankGauss-scaled training g columns)
    g_n_comp = 600
    rg_g_col = [col for col in rg_train.columns if col.startswith('rg_g')]
    pca = PCA(n_components=g_n_comp, random_state=seed).fit(rg_train[rg_g_col])
    pca_g_train = pd.DataFrame(pca.transform(rg_train[rg_g_col]), columns=['pca_g-' + str(i) for i in range(g_n_comp)]).reset_index(drop=True)
    pca_g_test = pd.DataFrame(pca.transform(rg_test[rg_g_col]), columns=['pca_g-' + str(i) for i in range(g_n_comp)]).reset_index(drop=True)
    print('PCA for g_col', pca_g_train.shape, pca_g_test.shape)
    print(pca_g_train.head())

    # PCA for c_col (fitted on the RankGauss-scaled training c columns)
    c_n_comp = 50
    rg_c_col = [col for col in rg_train.columns if col.startswith('rg_c')]
    pca = PCA(n_components=c_n_comp, random_state=seed).fit(rg_train[rg_c_col])
    pca_c_train = pd.DataFrame(pca.transform(rg_train[rg_c_col]), columns=['pca_c-' + str(i) for i in range(c_n_comp)]).reset_index(drop=True)
    pca_c_test = pd.DataFrame(pca.transform(rg_test[rg_c_col]), columns=['pca_c-' + str(i) for i in range(c_n_comp)]).reset_index(drop=True)
    print('PCA for c_col', pca_c_train.shape, pca_c_test.shape)
    print(pca_c_train.head())

    # combine g & c PCA features
    pca_gc_train = pd.concat((pca_g_train, pca_c_train),axis=1).reset_index(drop=True)
    pca_gc_test  =pd.concat((pca_g_test, pca_c_test),axis=1).reset_index(drop=True)
    print('combine g & c PCA features', pca_gc_train.shape, pca_gc_test.shape)
    print(pca_gc_train.head())

    # FS by Variance Threshold: the filter is fitted on the training columns
    # (RankGauss + PCA) and the same column positions are kept for the test frame.
    thsld = 0.85
    data_train = pd.concat([rg_train, pca_gc_train], axis=1)
    data_test = pd.concat([rg_test, pca_gc_test], axis=1)
    variance_threshold = VarianceThreshold(thsld).fit(data_train)
    # get_support(True) yields integer column positions rather than a boolean mask
    selected_features = variance_threshold.get_support(True).tolist()
    fs_train = data_train.iloc[:,selected_features].reset_index(drop=True)
    fs_test = data_test.iloc[:,selected_features].reset_index(drop=True)
    print('FS by Variance Threshold', fs_train.shape, fs_test.shape)
    print(fs_train.head())

    # function to cluster raw g/c
    n_clusters_g = 22
    n_clusters_c = 4
    def gc_cluster(train, test, n_clusters_g, n_clusters_c, seed):
        """Return one-hot KMeans cluster memberships of the raw g and c columns.

        The g and c column groups are clustered separately and their one-hot
        frames are concatenated column-wise (g first, then c).
        """

        def create_cluster(train, test, features, kind, n_clusters):
            """Fit KMeans on ``train[features]`` and one-hot the cluster ids of train+test.

            ``kind`` only affects the name of the generated columns.
            """
            col_prefix = 'raw_clusters_'
            train_ = train[features].copy()
            test_ = test[features].copy()
            kmeans = KMeans(n_clusters=n_clusters, random_state=seed).fit(train_)
            # Predict on the stacked frame so that get_dummies sees train and test
            # together and both halves end up with the same dummy columns.
            data = pd.concat([train_, test_], axis=0).reset_index(drop=True)
            pred = pd.DataFrame(kmeans.predict(data[features]), columns=[col_prefix + kind])
            pred = pd.get_dummies(pred, columns=[col_prefix + kind])
            # Split the stacked predictions back by row position.
            pred_train = pred.iloc[:len(train),:].reset_index(drop=True)
            pred_test = pred.iloc[len(train):,:].reset_index(drop=True)
            return pred_train, pred_test

        g_pred_train, g_pred_test = create_cluster(train, test, g_col, kind='g', n_clusters=n_clusters_g)
        c_pred_train, c_pred_test = create_cluster(train, test, c_col, kind='c', n_clusters=n_clusters_c)
        gc_pred_train = pd.concat([g_pred_train, c_pred_train], axis=1).reset_index(drop=True)
        gc_pred_test = pd.concat([g_pred_test, c_pred_test], axis=1).reset_index(drop=True)
        return gc_pred_train, gc_pred_test
    
    raw_gc_cluster_train, raw_gc_cluster_test = gc_cluster(x_train, x_test, n_clusters_g=n_clusters_g, n_clusters_c=n_clusters_c, seed=seed)
    print('cluster raw g/c', raw_gc_cluster_train.shape, raw_gc_cluster_test.shape)


    # function to cluster PCA g/c
    n_clusters = 5
    def fe_cluster_pca(train, test, n_clusters, seed):
        """Fit KMeans on ``train`` and return one-hot cluster memberships for train and test."""
        col_prefix = 'pca_clusters'
        kmeans = KMeans(n_clusters=n_clusters, random_state = seed).fit(train)
        # Same stack -> predict -> get_dummies -> split pattern as create_cluster above.
        data = pd.concat([train, test], axis=0).reset_index(drop=True)
        pred = pd.DataFrame(kmeans.predict(data), columns=[col_prefix])
        pred = pd.get_dummies(pred, columns=[col_prefix])
        pred_train = pred.iloc[:len(train),:].reset_index(drop=True)
        pred_test = pred.iloc[len(train):,:].reset_index(drop=True)
        return pred_train, pred_test
    pca_cluster_train, pca_cluster_test = fe_cluster_pca(pca_gc_train, pca_gc_test, n_clusters=n_clusters, seed=seed)
    print('cluster PCA g/c', pca_cluster_train.shape, pca_cluster_test.shape)


    # statistics features
    # Fixed list of g columns whose squares are added as features
    # (the selection criterion is not shown in this notebook).
    gsquarecols = ['g-574','g-211','g-216','g-0','g-255','g-577','g-153','g-389','g-60','g-370','g-248','g-167','g-203',
                   'g-177','g-301','g-332','g-517','g-6','g-744','g-224','g-162','g-3','g-736','g-486','g-283','g-22',
                   'g-359','g-361','g-440','g-335','g-106','g-307','g-745','g-146','g-416','g-298','g-666','g-91','g-17',
                   'g-549','g-145','g-157','g-768','g-568','g-396']
    def fe_stats(df):
        """Compute per-row statistics, pairwise c-column products and squared columns.

        Works on the raw (unscaled) g/c columns of ``df``; the returned frame
        keeps ``df``'s index.
        """
        stat_df = dict()
        # Row-wise summary statistics over the g columns, the c columns, and both together.
        stat_df['g_sum'] = df[g_col].sum(axis = 1)
        stat_df['g_mean'] = df[g_col].mean(axis = 1)
        stat_df['g_std'] = df[g_col].std(axis = 1)
        stat_df['g_kurt'] = df[g_col].kurtosis(axis = 1)
        stat_df['g_skew'] = df[g_col].skew(axis = 1)
        stat_df['c_sum'] = df[c_col].sum(axis = 1)
        stat_df['c_mean'] = df[c_col].mean(axis = 1)
        stat_df['c_std'] = df[c_col].std(axis = 1)
        stat_df['c_kurt'] = df[c_col].kurtosis(axis = 1)
        stat_df['c_skew'] = df[c_col].skew(axis = 1)
        stat_df['gc_sum'] = df[g_col + c_col].sum(axis = 1)
        stat_df['gc_mean'] = df[g_col + c_col].mean(axis = 1)
        stat_df['gc_std'] = df[g_col + c_col].std(axis = 1)
        stat_df['gc_kurt'] = df[g_col + c_col].kurtosis(axis = 1)
        stat_df['gc_skew'] = df[g_col + c_col].skew(axis = 1)

        # Element-wise products of selected pairs of c columns.
        stat_df['c52_c42'] = df['c-52'] * df['c-42']
        stat_df['c13_c73'] = df['c-13'] * df['c-73']
        # NOTE(review): the key says c26 but the product uses 'c-23' -- confirm which
        # column was intended before renaming or changing the value.
        stat_df['c26_c13'] = df['c-23'] * df['c-13']
        stat_df['c33_c6'] = df['c-33'] * df['c-6']
        stat_df['c11_c55'] = df['c-11'] * df['c-55']
        stat_df['c38_c63'] = df['c-38'] * df['c-63']
        stat_df['c38_c94'] = df['c-38'] * df['c-94']
        stat_df['c13_c94'] = df['c-13'] * df['c-94']
        stat_df['c4_c52'] = df['c-4'] * df['c-52']
        stat_df['c4_c42'] = df['c-4'] * df['c-42']
        stat_df['c13_c38'] = df['c-13'] * df['c-38']
        stat_df['c55_c2'] = df['c-55'] * df['c-2']
        stat_df['c55_c4'] = df['c-55'] * df['c-4']
        stat_df['c4_c13'] = df['c-4'] * df['c-13']
        stat_df['c82_c42'] = df['c-82'] * df['c-42']
        stat_df['c66_c42'] = df['c-66'] * df['c-42']
        stat_df['c6_c38'] = df['c-6'] * df['c-38']
        stat_df['c2_c13'] = df['c-2'] * df['c-13']
        stat_df['c62_c42'] = df['c-62'] * df['c-42']
        stat_df['c90_c55'] = df['c-90'] * df['c-55']
        
        # Squares of every c column and of the g columns listed in gsquarecols.
        for feature in c_col:
            stat_df[f'{feature}_squared'] = df[feature] ** 2     
        for feature in gsquarecols:
            stat_df[f'{feature}_squared'] = df[feature] ** 2  
            
        stat_df = pd.DataFrame(stat_df)
        return stat_df

    stat_train, stat_test = fe_stats(x_train), fe_stats(x_test)
    print('statistics features', stat_train.shape, stat_test.shape)

    # combine all FE results (column-wise; pieces are aligned on the index)
    x_train_fe = pd.concat([sig_train, cp_train, fs_train, raw_gc_cluster_train, pca_cluster_train, stat_train], axis=1).reset_index(drop=True)
    x_test_fe = pd.concat([sig_test, cp_test, fs_test, raw_gc_cluster_test, pca_cluster_test, stat_test], axis=1).reset_index(drop=True)
    print('combine all FE results', x_train_fe.shape, x_test_fe.shape)

    # remove ctrl in train and test: control rows are used above when fitting the
    # transformers, but are excluded from the returned frames
    x_train_fe = x_train_fe[x_train_fe.cp_type!='ctl_vehicle']
    x_test_fe = x_test_fe[x_test_fe.cp_type!='ctl_vehicle']
    x_train_fe = x_train_fe.drop('cp_type', axis=1).reset_index(drop=True)
    x_test_fe = x_test_fe.drop('cp_type', axis=1).reset_index(drop=True)
    print('remove ctrl in train and test', x_train_fe.shape, x_test_fe.shape)

    return x_train_fe, x_test_fe

In [7]:
# FE parameter estimation based on provided train and test
# Pool the training features with the extra feature rows; all transformers inside
# feature_engineering() are fitted on this pooled frame (its first argument).
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the resulting frame is identical.
fe_data = pd.concat([train_features, train_features_extra]).reset_index(drop=True)

# Only the second element (the transformed "test" argument) is kept from each call:
# first the training rows, then the test rows, both transformed with the same fit.
_, x_train_fe = feature_engineering(fe_data, train_features, seed=42)
_, x_test_fe = feature_engineering(fe_data, test_features, seed=42)

sig_id & cp_type (27796, 2) (23814, 2)
OHE for cp_time, cp_dose (27796, 2) (23814, 2)
RankGauss scaling (27796, 872) (23814, 872)
     rg_g-0    rg_g-1    rg_g-2    rg_g-3    rg_g-4    rg_g-5    rg_g-6  \
0  1.146806  0.902075 -0.418339 -0.961202 -0.254770 -1.021300 -1.369236   
1  0.128824  0.676862  0.274345  0.090495  1.208863  0.688965  0.316734   
2  0.790372  0.939951  1.428097 -0.121817 -0.002067  1.495091  0.238763   
3 -0.729866 -0.277163 -0.441200  0.766612  2.347817 -0.862761 -2.308829   
4 -0.444558 -0.481202  0.974729  0.977467  1.468304 -0.874772 -0.372682   

     rg_g-7    rg_g-8    rg_g-9  ...   rg_c-90   rg_c-91   rg_c-92   rg_c-93  \
0 -0.029888  0.684319 -0.316668  ...  0.405455  0.362189  1.296097  0.830281   
1  0.556428 -0.539718  0.831972  ... -0.527074  1.127076  0.716060  0.047538   
2  0.363471 -0.003611  1.237966  ... -0.834469 -0.747431  0.952950  0.046551   
3  0.305225 -0.191898 -1.389591  ... -1.429097 -0.762287 -1.653318 -1.259768   
4 -0.212171 -1.0670

In [8]:
# join processed features with the scored targets and drug ids (non-scored targets are not merged here)
# Attach the scored targets and the drug ids to the engineered training rows
# (both joins are keyed on sig_id); the test frame is just an independent copy.
train = (
    x_train_fe
    .merge(train_targets_scored, on='sig_id')
    .merge(train_drug, on='sig_id')
)
test = x_test_fe.copy()

In [9]:
# # stratified k-fold by drug_id

# SEEDS = [20,21,22]
# NFOLDS = 10
# DRUG_THRESH = 18

# def make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH):
#     vc = train.drug_id.value_counts()
#     vc1 = vc.loc[vc <= DRUG_THRESH].index.sort_values()
#     vc2 = vc.loc[vc > DRUG_THRESH].index.sort_values()

#     for seed_id in SEEDS:
#         kfold_col = 'kfold_{}'.format(seed_id)
        
#         # STRATIFY DRUGS 18X OR LESS
#         dct1 = {}
#         dct2 = {}

#         skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
#         tmp = train.groupby('drug_id')[target_cols].mean().loc[vc1]

#         for fold,(idxT, idxV) in enumerate(skf.split(tmp, tmp[target_cols])):
#             dd = {k: fold for k in tmp.index[idxV].values}
#             dct1.update(dd)

#         # STRATIFY DRUGS MORE THAN 18X
#         skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
#         tmp = train.loc[train.drug_id.isin(vc2)].reset_index(drop=True)

#         for fold,(idxT, idxV) in enumerate(skf.split(tmp, tmp[target_cols])):
#             dd = {k: fold for k in tmp.sig_id[idxV].values}
#             dct2.update(dd)

#         # ASSIGN FOLDS
#         train[kfold_col] = train.drug_id.map(dct1)
#         train.loc[train[kfold_col].isna(), kfold_col] = train.loc[train[kfold_col].isna(), 'sig_id'].map(dct2)
#         train[kfold_col] = train[kfold_col].astype('int8')
        
#     return train

# train = make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH)
# train.head()

In [10]:
# Every scored-target column except the row identifier.
target_cols = [name for name in train_targets_scored.columns if name != 'sig_id']

# Model inputs: whatever is left after removing ids, targets and fold markers.
feature_cols = [
    name
    for name in train.columns
    if name not in ('sig_id', 'drug_id')
    and name not in target_cols
    and 'kfold' not in name
]

# Split the merged frame into labels and inputs. Note that `train` is
# overwritten with the feature-only view from here on.
target = train[target_cols]
train = train[feature_cols]
test = test[feature_cols]
X_test = test.values
train.head()

Unnamed: 0,cp_dose,cp_time,rg_g-0,rg_g-1,rg_g-2,rg_g-3,rg_g-4,rg_g-5,rg_g-6,rg_g-7,...,g-298_squared,g-666_squared,g-91_squared,g-17_squared,g-549_squared,g-145_squared,g-157_squared,g-768_squared,g-568_squared,g-396_squared
0,0,0,1.146806,0.902075,-0.418339,-0.961202,-0.25477,-1.0213,-1.369236,-0.029888,...,0.10582,0.127449,1.261129,0.40221,0.929296,0.030906,0.011903,0.090481,0.0113,0.044184
1,0,2,0.128824,0.676862,0.274345,0.090495,1.208863,0.688965,0.316734,0.556428,...,0.204756,0.025091,0.148533,0.080599,0.08191,0.124186,0.01703,0.026439,0.101761,0.506802
2,0,1,0.790372,0.939951,1.428097,-0.121817,-0.002067,1.495091,0.238763,0.363471,...,0.047611,0.013572,0.183698,0.094556,0.888495,5.978025,0.129456,0.336052,0.074693,6.801664
3,0,1,-0.729866,-0.277163,-0.4412,0.766612,2.347817,-0.862761,-2.308829,0.305225,...,0.2304,0.026929,24.167056,0.731709,4.981824,2.982529,0.279524,0.03984,3.243601,0.131044
4,1,2,-0.444558,-0.481202,0.974729,0.977467,1.468304,-0.874772,-0.372682,-0.212171,...,1.127844,0.527657,0.644006,2.920681,0.365662,0.172391,0.0,1.004004,0.091023,1.340964


In [11]:
from torch.nn.modules.loss import _WeightedLoss
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'`` or ``'sum'``; any other value (e.g. ``'none'``)
            returns the unreduced element-wise loss.
        smoothing: smoothing factor, must satisfy ``0 <= smoothing < 1``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets.

        ``n_labels`` is accepted for interface compatibility but is not used.

        Raises:
            AssertionError: if ``smoothing`` is outside ``[0, 1)``.
        """
        assert 0 <= smoothing < 1
        # Smoothing is a fixed transformation of the labels; keep it out of autograd.
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE-with-logits loss for ``inputs`` against ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss explicitly. The functional defaults to
        # reduction='mean', which would collapse the loss to a scalar before the
        # branch below and make 'sum' / 'none' silently behave like 'mean'.
        loss = F.binary_cross_entropy_with_logits(
            inputs, targets, self.weight, reduction='none')

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [12]:
class LogitsLogLoss(Metric):
    """Evaluation metric named "logits_ll": mean log loss computed from raw logits.

    The metric is flagged as one to minimise (``_maximize = False``).
    """

    def __init__(self):
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """Return the mean binary log loss of ``y_pred`` (logits) against ``y_true``."""
        eps = 5e-5  # keeps both log() arguments away from zero
        # Sigmoid turns the logits into probabilities.
        probs = 1 / (1 + np.exp(-y_pred))
        negative_part = (1 - y_true) * np.log(1 - probs + eps)
        positive_part = y_true * np.log(probs + eps)
        return np.mean(-(negative_part + positive_part))

In [13]:
# Upper bound on training epochs per fold.
MAX_EPOCH = 200

# Keyword arguments handed to TabNetRegressor(**tabnet_params).
# The 'seed' entry is overwritten for each seed inside the training loop.
tabnet_params = {
    "n_d": 48,
    "n_a": 24,
    "n_steps": 1,
    "n_independent": 1,
    "n_shared": 1,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "optimizer_fn": optim.Adam,
    "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
    "mask_type": "entmax",
    "scheduler_params": {"mode": "min", "patience": 5, "min_lr": 1e-5, "factor": 0.9},
    "scheduler_fn": ReduceLROnPlateau,
    "seed": 42,
    "verbose": 10,
}

In [14]:
# # model training

# test_cv_preds = []
# oof_preds = []
# oof_targets = []
# scores = []

# for seed in SEEDS:
#     tabnet_params['seed'] = seed
    
#     for fold in range(NFOLDS):
#         print("FOLDS: ",fold,'seed:',seed)

#         tr_idx = train[train['kfold_'+str(seed)]!=fold].index
#         val_idx = train[train['kfold_'+str(seed)]==fold].index
        
#         X_train = train.iloc[tr_idx,:][feature_cols].reset_index(drop=True).values
#         X_val = train.iloc[val_idx,:][feature_cols].reset_index(drop=True).values
#         y_train = train.iloc[tr_idx,:][target_cols].reset_index(drop=True).values
#         y_val = train.iloc[val_idx,:][target_cols].reset_index(drop=True).values
        
#         ### Model ###
#         model = TabNetRegressor(**tabnet_params)
        
#         ### Fit ###
#         model.fit(
#             X_train = X_train,
#             y_train = y_train,
#             eval_set = [(X_val, y_val)],
#             eval_name = ["val"],
#             eval_metric = ["logits_ll"],
#             max_epochs = MAX_EPOCH,
#             patience = 30,
#             batch_size = 1024, 
#             virtual_batch_size = 32,
#             num_workers = 1,
#             drop_last = False,
#             loss_fn = SmoothBCEwLogits(smoothing=5e-5))
    
#         ### Save validation result ###
#         preds_val = model.predict(X_val)
#         preds_val = 1 / (1 + np.exp(-preds_val))
#         oof_preds.append(preds_val)
#         oof_targets.append(y_val)
#         score = np.min(model.history["val_logits_ll"])
#         scores.append(score)
        
#         ### Save model ###
#         saving_path_name = 'TabNet_seed_'+str(seed)+'_fold_'+str(fold)
#         saved_filepath = model.save_model(saving_path_name)  
# #         loaded_model =  TabNetRegressor()
# #         loaded_model.load_model(saved_filepath)

#         ### Predict on test ###
#         model.load_model(saved_filepath)
#         preds_test = model.predict(test[feature_cols].values)
#         preds_test = 1 / (1 + np.exp(-preds_test))
#         test_cv_preds.append(preds_test)

# # oof_preds = np.concatenate(oof_preds)
# # oof_targets = np.concatenate(oof_targets)

# print('CV score:', np.mean(scores))

In [15]:
# Multi-seed, K-fold TabNet training.
# For every (seed, fold) pair: fit a model, record its best validation log loss,
# keep out-of-fold probabilities for later scoring, save the model, and collect
# test-set probabilities that are averaged in the submission cell.
scores_auc_all = []
test_cv_preds = []

NB_SPLITS = 10
# NOTE(review): the splitter's random_state is fixed at 0, so presumably every
# seed below trains on the same fold assignment and only the TabNet seed
# varies -- confirm this is intended.
mskf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, random_state = 0, shuffle = True)

oof_preds = []
oof_targets = []
scores = []
scores_auc = []
SEED = [20,21,22]

# The feature/target arrays do not change across seeds or folds, so build them
# once instead of re-materialising `.values` on every fold.
X_all, y_all = train.values, target.values

for s in SEED:
    tabnet_params['seed'] = s
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train, target)):
        print(b_,"FOLDS: ", r_, fold_nb + 1, y_, 'seed:', tabnet_params['seed'])
        print(g_, '*' * 60, c_)

        X_train, y_train = X_all[train_idx, :], y_all[train_idx, :]
        X_val, y_val = X_all[val_idx, :], y_all[val_idx, :]

        ### Model ###
        model = TabNetRegressor(**tabnet_params)

        ### Fit ###
        model.fit(
            X_train = X_train,
            y_train = y_train,
            eval_set = [(X_val, y_val)],
            eval_name = ["val"],
            eval_metric = ["logits_ll"],
            max_epochs = MAX_EPOCH,
            patience = 20,
            batch_size = 1024,
            virtual_batch_size = 32,
            num_workers = 1,
            drop_last = False,
            loss_fn = SmoothBCEwLogits(smoothing=5e-5))
        print(y_, '-' * 60)

        ### Predict on validation ###
        preds_val = model.predict(X_val)
        # The model is trained with a logits loss, so apply the sigmoid to get probabilities.
        preds = 1 / (1 + np.exp(-preds_val))
        score = np.min(model.history["val_logits_ll"])

        ### Save model ###
        saving_path_name = 'TabNet_seed_'+str(tabnet_params['seed'])+'_fold_'+str(fold_nb+1)
        saved_filepath = model.save_model(saving_path_name)

        ### Save OOF for CV ###
        # Store the probabilities, not the raw logits: the original appended
        # `preds_val` and left the sigmoid output `preds` unused.
        oof_preds.append(preds)
        oof_targets.append(y_val)
        scores.append(score)

        ### Predict on test ###
        # Reload from disk so the test predictions come from the saved artifact.
        model.load_model(saved_filepath)
        preds_test = model.predict(X_test)
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

oof_preds_all = np.concatenate(oof_preds)
oof_targets_all = np.concatenate(oof_targets)


[34m FOLDS:  [31m 1 [33m seed: 20
[32m ************************************************************ [36m
Device used : cuda
epoch 0  | loss: 0.27706 | val_logits_ll: 0.02926 |  0:00:02s
epoch 10 | loss: 0.01874 | val_logits_ll: 0.01835 |  0:00:20s
epoch 20 | loss: 0.01747 | val_logits_ll: 0.01787 |  0:00:37s
epoch 30 | loss: 0.01687 | val_logits_ll: 0.01687 |  0:00:57s
epoch 40 | loss: 0.01645 | val_logits_ll: 0.01668 |  0:01:15s
epoch 50 | loss: 0.01604 | val_logits_ll: 0.01662 |  0:01:32s
epoch 60 | loss: 0.01589 | val_logits_ll: 0.01664 |  0:01:51s
epoch 70 | loss: 0.01565 | val_logits_ll: 0.01672 |  0:02:09s
epoch 80 | loss: 0.01538 | val_logits_ll: 0.01654 |  0:02:27s
epoch 90 | loss: 0.01532 | val_logits_ll: 0.01661 |  0:02:45s

Early stopping occured at epoch 98 with best_epoch = 78 and best_val_logits_ll = 0.0164
Best weights from best epoch are automatically used!
[33m ------------------------------------------------------------
Successfully saved model at TabNet_seed_20

In [16]:
# Per-target ROC AUC over the stacked out-of-fold predictions, then the two
# summary numbers: mean AUC across targets and mean best validation log loss.
aucs = [
    roc_auc_score(
        y_true=oof_targets_all[:, task_id],
        y_score=oof_preds_all[:, task_id],
    )
    for task_id in range(oof_preds_all.shape[1])
]
print(f"{b_}Overall AUC: {r_}{np.mean(aucs)}")
print(f"{b_}Average CV: {r_}{np.mean(scores)}")

[34mOverall AUC: [31m0.7672920606312914
[34mAverage CV: [31m0.016336569404956192


In [17]:
# Sanity check: shapes of the stacked OOF arrays and the last seed that was used.
print(
    oof_preds_all.shape,
    oof_targets_all.shape,
    oof_preds_all.shape,
    tabnet_params['seed'],
    sep='\n',
)

(65844, 206)
(65844, 206)
(65844, 206)
22


In [18]:
# test set prediction and submission
# Average the per-model test probabilities and label them with the target
# names, then put the matching sig_id column in front.
test_cv_preds = pd.concat(
    [
        x_test_fe[['sig_id']],
        pd.DataFrame(np.stack(test_cv_preds).mean(axis=0), columns=target_cols),
    ],
    axis=1,
)

# Left-join onto the sample submission so every expected sig_id is present;
# ids without a prediction come out of the join as NaN and are filled with 0.
sub2 = sample_submission[['sig_id']].merge(test_cv_preds, on="sig_id", how="left")
sub2.fillna(0, inplace=True)
sub2.to_csv("submission.csv", index=None)
print(sub2.shape)
sub2.head()

(3982, 207)


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000853,0.00113,0.002237,0.016781,0.020182,0.004395,0.002568,0.006233,0.000267,...,0.000827,0.00138,0.004683,0.000668,0.00066,0.000603,0.000543,0.002035,0.002883,0.001541
1,id_001897cda,0.000428,0.0009,0.002015,0.00276,0.001541,0.001793,0.002338,0.013677,0.002401,...,0.00081,0.001335,0.004333,0.000901,0.008541,0.000565,0.00609,0.001071,0.00342,0.002619
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000778,0.000885,0.001584,0.008685,0.016844,0.003525,0.002168,0.004139,0.000274,...,0.00064,0.00158,0.002256,0.017213,0.004731,0.00063,0.001566,0.001853,0.000449,0.001711
4,id_0027f1083,0.001467,0.001504,0.001372,0.015553,0.022895,0.004258,0.003497,0.001798,0.000473,...,0.000648,0.000696,0.002504,0.001366,0.00145,0.000665,0.001367,0.001892,0.000241,0.001677
