In [1]:
# import python packages
import os
import sys
import copy
import time
import datetime
import warnings
warnings.filterwarnings('ignore')

import random
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import torch

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
# set parameters
num_fold = 5
seed = 42
model_seeds = [42]
model_seeds = [1234, 5678, 3721, 2020, 1110]

num_quantile = 100
threshold_var = 0.8
pca_gene = 600
pca_cell = 50
cluster_gene = 35
cluster_cell = 5

kaggle_id = 'MOA'
user_id = 'smartzdp'
model_id = 'pytorch-7head'
target_id = 'multilabel'
timestamp_id = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

unique_model_id = f"{kaggle_id}-{user_id}-{model_id}-{target_id}-{timestamp_id}"

path_input = '../input/lish-moa'
path_output = f'./{unique_model_id}'
os.mkdir(path_output)

print(f'unique model id: {unique_model_id}')

unique model id: MOA-smartzdp-pytorch-4head-multilabel-20201110143332325906


In [3]:
# random seed
def seed_everything(seed=2020):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=seed)

In [4]:
# load datasets
df_names = ['train_drug', 'train_features', 'train_targets_scored', 'train_targets_nonscored', 'test_features', 'sample_submission']
df = {}
for name in df_names:
    df[name] = pd.read_csv(f"{path_input}/{name}.csv", index_col=0)
    print(f"{name}: {df[name].shape}")

train_drug: (23814, 1)
train_features: (23814, 875)
train_targets_scored: (23814, 206)
train_targets_nonscored: (23814, 402)
test_features: (3982, 875)
sample_submission: (3982, 206)


In [5]:
# Drug and MultiLabel Stratification Code
# https://www.kaggle.com/ppicheta/lish-moa-drug-aware-multilabelstratifiedkfold
from sklearn.model_selection._split import _BaseKFold

import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master/')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

class DrugAwareMultilabelStratifiedKFold(_BaseKFold):
    SAMPLES_PER_EXPERIMENT = 6
    
    def __init__(self,
                 max_experiment_cnt=3,
                 n_splits=3,
                 shuffle=False,
                 random_state=None):
        super().__init__(n_splits=n_splits, 
                         shuffle=shuffle, 
                         random_state=random_state)
        self._skf = MultilabelStratifiedKFold(n_splits=n_splits, 
                                              shuffle=shuffle, 
                                              random_state=random_state)
        self.drug_threshold = self.SAMPLES_PER_EXPERIMENT * max_experiment_cnt
    
    def _iter_test_indices(self, X=None, y=None, groups=None):
        drug_set = X.merge(y, left_index=True, right_index=True)
        targets = y.columns
        vc = X['drug_id'].value_counts()
        vc1 = vc.loc[vc <= self.drug_threshold].index.sort_values()
        vc2 = vc.loc[vc > self.drug_threshold].index.sort_values()
        
        drug_id_to_fold = {}
        sig_id_to_fold = {}
        if len(vc1) > 0:
            tmp = drug_set.groupby('drug_id')[targets].mean().loc[vc1]
            for fold, (_, idx_val) in enumerate(self._skf.split(tmp, tmp[targets])):
                drug_id_to_fold.update({k: fold for k in tmp.index[idx_val].values})
        
        if len(vc2) > 0:
            tmp = drug_set.loc[drug_set.drug_id.isin(vc2)].reset_index()
            for fold, (_, idx_val) in enumerate(self._skf.split(tmp, tmp[targets])):
                sig_id_to_fold.update({k: fold for k in tmp.sig_id[idx_val].values})
        
        drug_set['fold'] = drug_set.drug_id.map(drug_id_to_fold)
        unset_folds = drug_set.fold.isna()
        drug_set.loc[unset_folds, 'fold'] = drug_set.loc[unset_folds].index.map(sig_id_to_fold)
        test_folds = drug_set.fold.astype('int8').values
        
        for i in range(self.n_splits):
            yield test_folds == i


In [6]:
# assign k-fold
df['train_targets'] = pd.concat([df['train_targets_scored'], df['train_targets_nonscored']], axis=1)
df['train_drug'].loc[:, 'kfold'] = -1

kf = DrugAwareMultilabelStratifiedKFold(n_splits=num_fold, shuffle=True, random_state=seed)
for fold, (_, val) in enumerate(kf.split(X=df['train_drug'], y=df['train_targets'])):
    df['train_drug'].loc[df['train_drug'].index[val], 'kfold'] = fold

print('unique drug_id:')
df['train_drug'].groupby(['kfold'])['drug_id'].nunique()

unique drug_id:


kfold
0    666
1    666
2    666
3    666
4    665
Name: drug_id, dtype: int64

In [8]:
# split the features
df['train_cp'] = df['train_features'].filter(regex='^cp_')
df['train_gene_cell'] = df['train_features'].filter(regex='^(g|c)-')
df['test_cp'] = df['test_features'].filter(regex='^cp_')
df['test_gene_cell'] = df['test_features'].filter(regex='^(g|c)-')

In [19]:
# Gauss Rank Transformation
from sklearn.preprocessing import QuantileTransformer

df['train_gene_cell_qt'] = pd.DataFrame(columns=df['train_gene_cell'].columns, index=df['train_gene_cell'].index)
df['test_gene_cell_qt'] = pd.DataFrame(columns=df['test_gene_cell'].columns, index=df['test_gene_cell'].index)

qt = QuantileTransformer(n_quantiles=num_quantile, random_state=seed, output_distribution='normal')

df['train_gene_cell_qt'].iloc[:, :] = qt.fit_transform(df['train_gene_cell'])
df['test_gene_cell_qt'].iloc[:, :] = qt.transform(df['test_gene_cell'])

In [25]:
# pca features
from sklearn.decomposition import PCA

def generate_pca_feature(df_train, df_test, prefix, num_pca, seed):
    X_pca = PCA(n_components=num_pca, random_state=seed).fit_transform(pd.concat([df_train, df_test]))
    
    col_pca = [f'pca-{prefix}-{i}' for i in range(num_pca)]
    df_train_pca = pd.DataFrame(X_pca[:df_train.shape[0], :], columns=col_pca, index=df_train.index)
    df_test_pca = pd.DataFrame(X_pca[-df_test.shape[0]:, :], columns=col_pca, index=df_test.index)
    
    return df_train_pca, df_test_pca

df['train_gene'] = df['train_gene_cell_qt'].filter(regex='^g-')
df['train_cell'] = df['train_gene_cell_qt'].filter(regex='^c-')
df['test_gene'] = df['test_gene_cell_qt'].filter(regex='^g-')
df['test_cell'] = df['test_gene_cell_qt'].filter(regex='^c-')

df['train_gene_pca'], df['test_gene_pca'] = generate_pca_feature(df['train_gene'], df['test_gene'], 'g', pca_gene, seed)
df['train_cell_pca'], df['test_cell_pca'] = generate_pca_feature(df['train_cell'], df['test_cell'], 'c', pca_cell, seed)

In [26]:
# feature selection
from sklearn.feature_selection import VarianceThreshold

def select_features(df_train, df_test, threshold_var):
    selector = VarianceThreshold(threshold=threshold_var)
    selector.fit(pd.concat([df_train, df_test], axis=0))
    
    return df_train.columns[selector.get_support()].to_list()

gene_cols = select_features(df['train_gene'], df['test_gene'], threshold_var)
cell_cols = select_features(df['train_cell'], df['test_cell'], threshold_var)
gene_pca_cols = select_features(df['train_gene_pca'], df['test_gene_pca'], threshold_var)
cell_pca_cols = select_features(df['train_cell_pca'], df['test_cell_pca'], threshold_var)

print(f'gene features: {len(gene_cols)}')
print(f'cell features: {len(cell_cols)}')
print(f'gene pca features: {len(gene_pca_cols)}')
print(f'cell pca features: {len(cell_pca_cols)}')

gene features: 772
cell features: 100
gene pca features: 156
cell pca features: 8


In [39]:
# cluster features
from sklearn.cluster import KMeans

def generate_cluster_feature(df_train, df_test, prefix, num_cluster, seed):
    y_cluster = KMeans(n_clusters=num_cluster, random_state=seed).fit(pd.concat([df_train, df_test], axis=0)).labels_
    X_cluster = pd.get_dummies(y_cluster).values
    
    col_cluster = [f'cluster-{prefix}-{i}' for i in range(num_cluster)]
    df_train_cluster = pd.DataFrame(X_cluster[:df_train.shape[0], :], columns=col_cluster, index=df_train.index)
    df_test_cluster = pd.DataFrame(X_cluster[-df_test.shape[0]:, :], columns=col_cluster, index=df_test.index)
    
    return df_train_cluster, df_test_cluster

df['train_gene_cluster'], df['test_gene_cluster'] = generate_cluster_feature(df['train_gene'], df['test_gene'], 'g', cluster_gene, seed)
df['train_cell_cluster'], df['test_cell_cluster'] = generate_cluster_feature(df['train_cell'], df['test_cell'], 'c', cluster_cell, seed)

gene_cluster_cols = df['train_gene_cluster'].columns.to_list()
cell_cluster_cols = df['train_cell_cluster'].columns.to_list()

print(f'gene cluster features: {len(gene_cluster_cols)}')
print(f'cell cluster features: {len(cell_cluster_cols)}')

gene cluster features: 35
cell cluster features: 5


In [40]:
# statistics features

def generate_statistics_feature(df_raw, prefix):
    df_stats = pd.DataFrame({f'stats-{prefix}-sum': df_raw.sum(axis=1), 
                             f'stats-{prefix}-mean': df_raw.mean(axis=1), 
                             f'stats-{prefix}-std': df_raw.std(axis=1), 
                             f'stats-{prefix}-kurtosis': df_raw.kurtosis(axis=1), 
                             f'stats-{prefix}-skew': df_raw.skew(axis=1)})
    
    return df_stats

df['train_stats'] = pd.concat([generate_statistics_feature(df['train_gene'], 'g'), 
                               generate_statistics_feature(df['train_cell'], 'c'), 
                               generate_statistics_feature(df['train_gene_cell_qt'], 'gc')], axis=1)
df['test_stats'] = pd.concat([generate_statistics_feature(df['test_gene'], 'g'), 
                              generate_statistics_feature(df['test_cell'], 'c'), 
                              generate_statistics_feature(df['test_gene_cell_qt'], 'gc')], axis=1)

stats_cols = df['train_stats'].columns.to_list()

print(f'statistics features: {len(stats_cols)}')

statistics features: 15


In [41]:
# process cp features
df['train_cpoh'] = pd.get_dummies(df['train_cp'][['cp_time', 'cp_dose']], columns=['cp_time', 'cp_dose'])
df['test_cpoh'] = pd.get_dummies(df['test_cp'][['cp_time', 'cp_dose']], columns=['cp_time', 'cp_dose'])

cpoh_cols = df['train_cpoh'].columns.to_list()
print(f'cpoh features: {len(cpoh_cols)}')

cpoh features: 5


In [42]:
# concatenate the train set
feature_cols = cpoh_cols + gene_cols + cell_cols + gene_pca_cols + cell_pca_cols + gene_cluster_cols + cell_cluster_cols + stats_cols
target_cols = df['train_targets_scored'].columns.to_list()

df['train_concat'] = pd.concat([df['train_drug']['kfold'], df['train_targets_scored'], df['train_cpoh'], 
                                df['train_gene'][gene_cols], df['train_cell'][cell_cols], 
                                df['train_gene_pca'][gene_pca_cols], df['train_cell_pca'][cell_pca_cols], 
                                df['train_gene_cluster'], df['train_cell_cluster'], df['train_stats']], axis=1)

df['test_concat'] = pd.concat([df['test_cpoh'], 
                               df['test_gene'][gene_cols], df['test_cell'][cell_cols], 
                               df['test_gene_pca'][gene_pca_cols], df['test_cell_pca'][cell_pca_cols], 
                               df['test_gene_cluster'], df['test_cell_cluster'], df['test_stats']], axis=1)

# drop ctl_vehicle
df['train_concat'] = df['train_concat'].loc[df['train_cp']['cp_type'] == 'trt_cp']
df['test_concat'] = df['test_concat'].loc[df['test_cp']['cp_type'] == 'trt_cp']

In [43]:
# rename dataset
train = df['train_concat']
test = df['test_concat']

print(f"train shape: {train.shape}")
print(f"test shape: {test.shape}")

print(f"train kfold counts: \n{train['kfold'].value_counts(sort=False)}")

train shape: (21948, 1303)
test shape: (3624, 1096)
train kfold counts: 
0    4394
1    4424
2    4360
3    4395
4    4375
Name: kfold, dtype: int64


In [44]:
# 7-head pytorch model
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, num_gene_features, num_cell_features, num_gene_pca_features, num_cell_pca_features, 
                 num_gene_cluster_features, num_cell_cluster_features, num_stats_features, num_targets, 
                 gene_hidden_size, cell_hidden_size, gene_pca_hidden_size, cell_pca_hidden_size, 
                 gene_cluster_hidden_size, cell_cluster_hidden_size, stats_hidden_size):
        super(Model, self).__init__()
        self.gene_batch_norm1 = nn.BatchNorm1d(num_gene_features)
        self.gene_dropout1 = nn.Dropout(0.20)
        self.gene_dense1 = nn.utils.weight_norm(nn.Linear(num_gene_features, gene_hidden_size))
        self.gene_batch_norm2 = nn.BatchNorm1d(gene_hidden_size)
        self.gene_dropout2 = nn.Dropout(0.30)
        self.gene_dense2 = nn.utils.weight_norm(nn.Linear(gene_hidden_size, gene_hidden_size))
        
        self.cell_batch_norm1 = nn.BatchNorm1d(num_cell_features)
        self.cell_dropout1 = nn.Dropout(0.20)
        self.cell_dense1 = nn.utils.weight_norm(nn.Linear(num_cell_features, cell_hidden_size))
        self.cell_batch_norm2 = nn.BatchNorm1d(cell_hidden_size)
        self.cell_dropout2 = nn.Dropout(0.30)
        self.cell_dense2 = nn.utils.weight_norm(nn.Linear(cell_hidden_size, cell_hidden_size))
        
        self.gene_pca_batch_norm1 = nn.BatchNorm1d(num_gene_pca_features)
        self.gene_pca_dropout1 = nn.Dropout(0.20)
        self.gene_pca_dense1 = nn.utils.weight_norm(nn.Linear(num_gene_pca_features, gene_pca_hidden_size))
        self.gene_pca_batch_norm2 = nn.BatchNorm1d(gene_pca_hidden_size)
        self.gene_pca_dropout2 = nn.Dropout(0.30)
        self.gene_pca_dense2 = nn.utils.weight_norm(nn.Linear(gene_pca_hidden_size, gene_pca_hidden_size))
        
        self.cell_pca_batch_norm1 = nn.BatchNorm1d(num_cell_pca_features)
        self.cell_pca_dropout1 = nn.Dropout(0.20)
        self.cell_pca_dense1 = nn.utils.weight_norm(nn.Linear(num_cell_pca_features, cell_pca_hidden_size))
        self.cell_pca_batch_norm2 = nn.BatchNorm1d(cell_pca_hidden_size)
        self.cell_pca_dropout2 = nn.Dropout(0.30)
        self.cell_pca_dense2 = nn.utils.weight_norm(nn.Linear(cell_pca_hidden_size, cell_pca_hidden_size))
        
        self.gene_cluster_batch_norm1 = nn.BatchNorm1d(num_gene_cluster_features)
        self.gene_cluster_dropout1 = nn.Dropout(0.20)
        self.gene_cluster_dense1 = nn.utils.weight_norm(nn.Linear(num_gene_cluster_features, gene_cluster_hidden_size))
        self.gene_cluster_batch_norm2 = nn.BatchNorm1d(gene_cluster_hidden_size)
        self.gene_cluster_dropout2 = nn.Dropout(0.30)
        self.gene_cluster_dense2 = nn.utils.weight_norm(nn.Linear(gene_cluster_hidden_size, gene_cluster_hidden_size))
        
        self.cell_cluster_batch_norm1 = nn.BatchNorm1d(num_cell_cluster_features)
        self.cell_cluster_dropout1 = nn.Dropout(0.20)
        self.cell_cluster_dense1 = nn.utils.weight_norm(nn.Linear(num_cell_cluster_features, cell_cluster_hidden_size))
        self.cell_cluster_batch_norm2 = nn.BatchNorm1d(cell_cluster_hidden_size)
        self.cell_cluster_dropout2 = nn.Dropout(0.30)
        self.cell_cluster_dense2 = nn.utils.weight_norm(nn.Linear(cell_cluster_hidden_size, cell_cluster_hidden_size))
        
        self.stats_batch_norm1 = nn.BatchNorm1d(num_stats_features)
        self.stats_dropout1 = nn.Dropout(0.20)
        self.stats_dense1 = nn.utils.weight_norm(nn.Linear(num_stats_features, stats_hidden_size))
        self.stats_batch_norm2 = nn.BatchNorm1d(stats_hidden_size)
        self.stats_dropout2 = nn.Dropout(0.30)
        self.stats_dense2 = nn.utils.weight_norm(nn.Linear(stats_hidden_size, stats_hidden_size))
        
        self.batch_norm3 = nn.BatchNorm1d(gene_hidden_size + cell_hidden_size + gene_pca_hidden_size + cell_pca_hidden_size + \
                                          gene_cluster_hidden_size + cell_cluster_hidden_size + stats_hidden_size)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(gene_hidden_size + cell_hidden_size + gene_pca_hidden_size + cell_pca_hidden_size + \
                                                     gene_cluster_hidden_size + cell_cluster_hidden_size + stats_hidden_size, num_targets))
    
    def forward(self, gene_x, cell_x, gene_pca_x, cell_pca_x, gene_cluster_x, cell_cluster_x, stats_x):
        gene_x = self.gene_batch_norm1(gene_x)
        gene_x = self.gene_dropout1(gene_x)
        gene_x = nn.functional.relu(self.gene_dense1(gene_x))
        gene_x = self.gene_batch_norm2(gene_x)
        gene_x = self.gene_dropout2(gene_x)
        gene_x = nn.functional.relu(self.gene_dense2(gene_x))
        
        cell_x = self.cell_batch_norm1(cell_x)
        cell_x = self.cell_dropout1(cell_x)
        cell_x = nn.functional.relu(self.cell_dense1(cell_x))
        cell_x = self.cell_batch_norm2(cell_x)
        cell_x = self.cell_dropout2(cell_x)
        cell_x = nn.functional.relu(self.cell_dense2(cell_x))
        
        gene_pca_x = self.gene_pca_batch_norm1(gene_pca_x)
        gene_pca_x = self.gene_pca_dropout1(gene_pca_x)
        gene_pca_x = nn.functional.relu(self.gene_pca_dense1(gene_pca_x))
        gene_pca_x = self.gene_pca_batch_norm2(gene_pca_x)
        gene_pca_x = self.gene_pca_dropout2(gene_pca_x)
        gene_pca_x = nn.functional.relu(self.gene_pca_dense2(gene_pca_x))
        
        cell_pca_x = self.cell_pca_batch_norm1(cell_pca_x)
        cell_pca_x = self.cell_pca_dropout1(cell_pca_x)
        cell_pca_x = nn.functional.relu(self.cell_pca_dense1(cell_pca_x))
        cell_pca_x = self.cell_pca_batch_norm2(cell_pca_x)
        cell_pca_x = self.cell_pca_dropout2(cell_pca_x)
        cell_pca_x = nn.functional.relu(self.cell_pca_dense2(cell_pca_x))
        
        gene_cluster_x = self.gene_cluster_batch_norm1(gene_cluster_x)
        gene_cluster_x = self.gene_cluster_dropout1(gene_cluster_x)
        gene_cluster_x = nn.functional.relu(self.gene_cluster_dense1(gene_cluster_x))
        gene_cluster_x = self.gene_cluster_batch_norm2(gene_cluster_x)
        gene_cluster_x = self.gene_cluster_dropout2(gene_cluster_x)
        gene_cluster_x = nn.functional.relu(self.gene_cluster_dense2(gene_cluster_x))
        
        cell_cluster_x = self.cell_cluster_batch_norm1(cell_cluster_x)
        cell_cluster_x = self.cell_cluster_dropout1(cell_cluster_x)
        cell_cluster_x = nn.functional.relu(self.cell_cluster_dense1(cell_cluster_x))
        cell_cluster_x = self.cell_cluster_batch_norm2(cell_cluster_x)
        cell_cluster_x = self.cell_cluster_dropout2(cell_cluster_x)
        cell_cluster_x = nn.functional.relu(self.cell_cluster_dense2(cell_cluster_x))
        
        stats_x = self.stats_batch_norm1(stats_x)
        stats_x = self.stats_dropout1(stats_x)
        stats_x = nn.functional.relu(self.stats_dense1(stats_x))
        stats_x = self.stats_batch_norm2(stats_x)
        stats_x = self.stats_dropout2(stats_x)
        stats_x = nn.functional.relu(self.stats_dense2(stats_x))
                
        x = torch.cat((gene_x, cell_x, gene_pca_x, cell_pca_x, gene_cluster_x, cell_cluster_x, stats_x), dim=1)
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x


In [45]:
# 7-head pytorch dataset
class TrainDataSet:
    def __init__(self, gene_features, cell_features, gene_pca_features, cell_pca_features, 
                 gene_cluster_features, cell_cluster_features, stats_features, targets):
        self.gene_features = gene_features
        self.cell_features = cell_features
        self.gene_pca_features = gene_pca_features
        self.cell_pca_features = cell_pca_features
        self.gene_cluster_features = gene_cluster_features
        self.cell_cluster_features = cell_cluster_features
        self.stats_features = stats_features
        self.targets = targets
    
    def __len__(self):
        return (self.gene_features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'gene_x' : torch.tensor(self.gene_features[idx, :], dtype=torch.float), 
            'cell_x' : torch.tensor(self.cell_features[idx, :], dtype=torch.float), 
            'gene_pca_x' : torch.tensor(self.gene_pca_features[idx, :], dtype=torch.float), 
            'cell_pca_x' : torch.tensor(self.cell_pca_features[idx, :], dtype=torch.float), 
            'gene_cluster_x' : torch.tensor(self.gene_cluster_features[idx, :], dtype=torch.float), 
            'cell_cluster_x' : torch.tensor(self.cell_cluster_features[idx, :], dtype=torch.float), 
            'stats_x' : torch.tensor(self.stats_features[idx, :], dtype=torch.float), 
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)
        }
        
        return dct

class TestDataSet:
    def __init__(self, gene_features, cell_features, gene_pca_features, cell_pca_features, 
                 gene_cluster_features, cell_cluster_features, stats_features):
        self.gene_features = gene_features
        self.cell_features = cell_features
        self.gene_pca_features = gene_pca_features
        self.cell_pca_features = cell_pca_features
        self.gene_cluster_features = gene_cluster_features
        self.cell_cluster_features = cell_cluster_features
        self.stats_features = stats_features
        
    def __len__(self):
        return (self.gene_features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'gene_x' : torch.tensor(self.gene_features[idx, :], dtype=torch.float), 
            'cell_x' : torch.tensor(self.cell_features[idx, :], dtype=torch.float), 
            'gene_pca_x' : torch.tensor(self.gene_pca_features[idx, :], dtype=torch.float), 
            'cell_pca_x' : torch.tensor(self.cell_pca_features[idx, :], dtype=torch.float), 
            'gene_cluster_x' : torch.tensor(self.gene_cluster_features[idx, :], dtype=torch.float), 
            'cell_cluster_x' : torch.tensor(self.cell_cluster_features[idx, :], dtype=torch.float), 
            'stats_x' : torch.tensor(self.stats_features[idx, :], dtype=torch.float)
        }
        
        return dct


In [46]:
# 7-head pytorch learning network
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        gene_x = data['gene_x'].to(device)
        cell_x = data['cell_x'].to(device)
        gene_pca_x = data['gene_pca_x'].to(device)
        cell_pca_x = data['cell_pca_x'].to(device)
        gene_cluster_x = data['gene_cluster_x'].to(device)
        cell_cluster_x = data['cell_cluster_x'].to(device)
        stats_x = data['stats_x'].to(device)
        targets = data['y'].to(device)
        
        outputs = model(gene_x, cell_x, gene_pca_x, cell_pca_x, gene_cluster_x, cell_cluster_x, stats_x)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
    
    final_loss /= len(dataloader)
    
    return final_loss

def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        gene_x = data['gene_x'].to(device)
        cell_x = data['cell_x'].to(device)
        gene_pca_x = data['gene_pca_x'].to(device)
        cell_pca_x = data['cell_pca_x'].to(device)
        gene_cluster_x = data['gene_cluster_x'].to(device)
        cell_cluster_x = data['cell_cluster_x'].to(device)
        stats_x = data['stats_x'].to(device)
        targets = data['y'].to(device)
        outputs = model(gene_x, cell_x, gene_pca_x, cell_pca_x, gene_cluster_x, cell_cluster_x, stats_x)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
    
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def test_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        gene_x = data['gene_x'].to(device)
        cell_x = data['cell_x'].to(device)
        gene_pca_x = data['gene_pca_x'].to(device)
        cell_pca_x = data['cell_pca_x'].to(device)
        gene_cluster_x = data['gene_cluster_x'].to(device)
        cell_cluster_x = data['cell_cluster_x'].to(device)
        stats_x = data['stats_x'].to(device)
        
        with torch.no_grad():
            outputs = model(gene_x, cell_x, gene_pca_x, cell_pca_x, gene_cluster_x, cell_cluster_x, stats_x)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds


In [47]:
# 7-head pytorch hyper-parameter
EPOCHS                   = 25
BATCH_SIZE               = 128
LEARNING_RATE            = 1e-3
WEIGHT_DECAY             = 1e-5
EARLY_STOP               = True
EARLY_STOPPING_STEPS     = 10

gene_hidden_size         = 1024
cell_hidden_size         = 512
gene_pca_hidden_size     = 156
cell_pca_hidden_size     = 8
gene_cluster_hidden_size = 35
cell_cluster_hidden_size = 5
stats_hidden_size        = 15

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


In [48]:
# 7-head training function
def run_training(fold, seed, seed_idx):
    seed_everything(seed)
    
    # training
    train_idx = np.where(train['kfold'] != fold)[0]
    valid_idx = np.where(train['kfold'] == fold)[0]
    
    train_df = train[train['kfold'] != fold]
    valid_df = train[train['kfold'] == fold]
    
    gene_x_train = train_df[gene_cols + cpoh_cols].values
    cell_x_train = train_df[cell_cols + cpoh_cols].values
    gene_pca_x_train = train_df[gene_pca_cols + cpoh_cols].values
    cell_pca_x_train = train_df[cell_pca_cols + cpoh_cols].values
    gene_cluster_x_train = train_df[gene_cluster_cols + cpoh_cols].values
    cell_cluster_x_train = train_df[cell_cluster_cols + cpoh_cols].values
    stats_x_train = train_df[stats_cols + cpoh_cols].values
    y_train = train_df[target_cols].values
    
    gene_x_valid = valid_df[gene_cols + cpoh_cols].values
    cell_x_valid = valid_df[cell_cols + cpoh_cols].values
    gene_pca_x_valid = valid_df[gene_pca_cols + cpoh_cols].values
    cell_pca_x_valid = valid_df[cell_pca_cols + cpoh_cols].values
    gene_cluster_x_valid = valid_df[gene_cluster_cols + cpoh_cols].values
    cell_cluster_x_valid = valid_df[cell_cluster_cols + cpoh_cols].values
    stats_x_valid = valid_df[stats_cols + cpoh_cols].values
    y_valid = valid_df[target_cols].values
    
    train_dataset = TrainDataSet(gene_x_train, cell_x_train, gene_pca_x_train, cell_pca_x_train, 
                                 gene_cluster_x_train, cell_cluster_x_train, stats_x_train, y_train)
    valid_dataset = TrainDataSet(gene_x_valid, cell_x_valid, gene_pca_x_valid, cell_pca_x_valid, 
                                 gene_cluster_x_valid, cell_cluster_x_valid, stats_x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_gene_features=len(gene_cols + cpoh_cols), 
        num_cell_features=len(cell_cols + cpoh_cols), 
        num_gene_pca_features=len(gene_pca_cols + cpoh_cols), 
        num_cell_pca_features=len(cell_pca_cols + cpoh_cols), 
        num_gene_cluster_features=len(gene_cluster_cols + cpoh_cols), 
        num_cell_cluster_features=len(cell_cluster_cols + cpoh_cols), 
        num_stats_features=len(stats_cols + cpoh_cols), 
        num_targets=len(target_cols), 
        gene_hidden_size=gene_hidden_size, 
        cell_hidden_size=cell_hidden_size, 
        gene_pca_hidden_size=gene_pca_hidden_size, 
        cell_pca_hidden_size=cell_pca_hidden_size, 
        gene_cluster_hidden_size=gene_cluster_hidden_size, 
        cell_cluster_hidden_size=cell_cluster_hidden_size, 
        stats_hidden_size=stats_hidden_size
    )
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                                    max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    loss_fn = nn.BCEWithLogitsLoss()
    
    early_step = 0
    early_stopping_steps = EARLY_STOPPING_STEPS
    
    oof = np.zeros((train.shape[0], len(target_cols)))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        start = time.time()
        train_loss = train_fn(model, optimizer, scheduler, loss_fn, trainloader, DEVICE)
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        
        elapse = time.time() - start
        print(f"seed: {seed_idx}, fold: {fold}, epoch: {epoch}, train_loss: {train_loss}, valid_loss: {valid_loss}, elapse: {elapse:0.2f} sec")
        
        if valid_loss < best_loss:
            best_loss = valid_loss
            oof[valid_idx] = valid_preds
            torch.save(model.state_dict(), f"{path_output}/seed{seed_idx}-fold{fold}.pth")
        elif(EARLY_STOP == True):
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
    
    # prediction
    gene_x_test = test[gene_cols + cpoh_cols].values
    cell_x_test = test[cell_cols + cpoh_cols].values
    gene_pca_x_test = test[gene_pca_cols + cpoh_cols].values
    cell_pca_x_test = test[cell_pca_cols + cpoh_cols].values
    gene_cluster_x_test = test[gene_cluster_cols + cpoh_cols].values
    cell_cluster_x_test = test[cell_cluster_cols + cpoh_cols].values
    stats_x_test = test[stats_cols + cpoh_cols].values
    
    test_dataset = TestDataSet(gene_x_test, cell_x_test, gene_pca_x_test, cell_pca_x_test, 
                               gene_cluster_x_test, cell_cluster_x_test, stats_x_test)
    testloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_gene_features=len(gene_cols + cpoh_cols), 
        num_cell_features=len(cell_cols + cpoh_cols), 
        num_gene_pca_features=len(gene_pca_cols + cpoh_cols), 
        num_cell_pca_features=len(cell_pca_cols + cpoh_cols), 
        num_gene_cluster_features=len(gene_cluster_cols + cpoh_cols), 
        num_cell_cluster_features=len(cell_cluster_cols + cpoh_cols), 
        num_stats_features=len(stats_cols + cpoh_cols), 
        num_targets=len(target_cols), 
        gene_hidden_size=gene_hidden_size, 
        cell_hidden_size=cell_hidden_size, 
        gene_pca_hidden_size=gene_pca_hidden_size, 
        cell_pca_hidden_size=cell_pca_hidden_size, 
        gene_cluster_hidden_size=gene_cluster_hidden_size, 
        cell_cluster_hidden_size=cell_cluster_hidden_size, 
        stats_hidden_size=stats_hidden_size
    )
    model.load_state_dict(torch.load(f"{path_output}/seed{seed_idx}-fold{fold}.pth"))
    model.to(DEVICE)
    
    pred = np.zeros((test.shape[0], len(target_cols)))
    pred = test_fn(model, testloader, DEVICE)
    
    return oof, pred


In [49]:
def run_kfold(num_fold, seed, seed_idx):
    oof = np.zeros((train.shape[0], len(target_cols)))
    pred = np.zeros((test.shape[0], len(target_cols)))
    
    for fold in range(num_fold):
        oof_, pred_ = run_training(fold, seed, seed_idx)
        
        oof += oof_
        pred += pred_ / num_fold
        
    return oof, pred

In [50]:
%%time

# start calculation on multiple seeds
oof = np.zeros((train.shape[0], len(target_cols)))
pred = np.zeros((test.shape[0], len(target_cols)))

for idx, seed in enumerate(model_seeds):
    oof_, pred_ = run_kfold(num_fold, seed, idx)
    oof += oof_ / len(model_seeds)
    pred += pred_ / len(model_seeds)


seed: 0, fold: 0, epoch: 0, train_loss: 0.47928728122750053, valid_loss: 0.02427878874753203, elapse: 5.37 sec
seed: 0, fold: 0, epoch: 1, train_loss: 0.020743040435448074, valid_loss: 0.019137578617249216, elapse: 4.42 sec
seed: 0, fold: 0, epoch: 2, train_loss: 0.018492117497152176, valid_loss: 0.018708557954856327, elapse: 3.83 sec
seed: 0, fold: 0, epoch: 3, train_loss: 0.017567428503779396, valid_loss: 0.017998169868120126, elapse: 4.43 sec
seed: 0, fold: 0, epoch: 4, train_loss: 0.017091220325749855, valid_loss: 0.01766360950257097, elapse: 4.11 sec
seed: 0, fold: 0, epoch: 5, train_loss: 0.01688156658918529, valid_loss: 0.017925729363092353, elapse: 4.77 sec
seed: 0, fold: 0, epoch: 6, train_loss: 0.017046048852574568, valid_loss: 0.017943310711000648, elapse: 4.95 sec
seed: 0, fold: 0, epoch: 7, train_loss: 0.017049757250841114, valid_loss: 0.01808332505502871, elapse: 4.68 sec
seed: 0, fold: 0, epoch: 8, train_loss: 0.017143429182739794, valid_loss: 0.01789156049489975, elapse

In [51]:
# build prediction dataframe
df['train_predict'] = pd.DataFrame(columns=target_cols, index=df['train_features'].index).fillna(0)
df['test_predict'] = pd.DataFrame(columns=target_cols, index=df['test_features'].index).fillna(0)

df['train_predict'][df['train_cp']['cp_type'] == 'trt_cp'] = oof
df['test_predict'][df['test_cp']['cp_type'] == 'trt_cp'] = pred

In [52]:
# compute cv logloss
from sklearn.metrics import log_loss

logloss = log_loss(df['train_targets_scored'].values.ravel(), df['train_predict'].values.ravel())
print(f"cv log loss: {logloss}")

cv log loss: 0.01567781859401251


In [53]:
# create submission file
df['test_predict'].to_csv(f"{path_output}/submission.csv", index=True)
df['test_predict'].to_csv(f"{path_output}/{path_output}-submission.csv", index=True)