# Model 1 - NN

In [1]:
# basics
import numpy as np
import itertools
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
from collections import Counter
# import copy
import seaborn as sns
from time import time
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

# sklearn
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

# pyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss

# stratified kfold
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

Processing /kaggle/input/iterative-stratification/iterative-stratification-master
Building wheels for collected packages: iterative-stratification
  Building wheel for iterative-stratification (setup.py) ... [?25l- \ done
[?25h  Created wheel for iterative-stratification: filename=iterative_stratification-0.1.6-py3-none-any.whl size=8401 sha256=c92c4154e7174f6c70c0898c996b0b5ceb6179b09811b8948c17109198455ef7
  Stored in directory: /root/.cache/pip/wheels/b8/47/3f/eb4af42d124f37d23d6f13a4c8bbc32c1d70140e6e1cecb4aa
Successfully built iterative-stratification
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


In [2]:
# read the raw CSV inputs
data_dir = '../input/lish-moa/'
train_features = pd.read_csv(data_dir + 'train_features.csv')
train_targets_scored = pd.read_csv(data_dir + 'train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(data_dir + 'train_targets_nonscored.csv')
train_drug = pd.read_csv(data_dir + 'train_drug.csv')
test_features = pd.read_csv(data_dir + 'test_features.csv')
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')

# additional feature rows stored in a separate dataset
train_features_extra = pd.read_csv('../input/moa-fe-extra-data/MoA_FE_Extra_Data.csv')

# drop non-scored target columns that never have a positive label;
# keep_list holds positional column indices (column 0, the id column, is always kept)
nonscored_positive_counts = train_targets_nonscored.iloc[:, 1:].values.sum(axis=0)
keep_list = [pos + 1 for pos in np.flatnonzero(nonscored_positive_counts > 0).tolist()]
train_targets_nonscored = train_targets_nonscored.iloc[:, [0] + keep_list]

# report the shape of every table
print('train_features: {}'.format(train_features.shape))
print('train_targets_scored: {}'.format(train_targets_scored.shape))
print('train_targets_nonscored: {}'.format(train_targets_nonscored.shape))
print('train_drug: {}'.format(train_drug.shape))
print('test_features: {}'.format(test_features.shape))
print('train_features_extra: {}'.format(train_features_extra.shape))
print('sample_submission: {}'.format(sample_submission.shape))

train_features: (23814, 876)
train_targets_scored: (23814, 207)
train_targets_nonscored: (23814, 332)
train_drug: (23814, 2)
test_features: (3982, 876)
train_features_extra: (3982, 876)
sample_submission: (3982, 207)


In [3]:
# function to set single seed for everything
def seed_everything(seed_value):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    When CUDA is available the CUDA generators are seeded as well, and cuDNN
    is switched to deterministic mode with benchmarking turned off.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # nothing GPU-related to configure on a CPU-only machine
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# set seed to 42
seed_everything(42)

In [4]:
# split the feature columns by name prefix: 'g-' columns and 'c-' columns
g_col = list(filter(lambda name: name.startswith('g-'), train_features.columns))
c_col = list(filter(lambda name: name.startswith('c-'), train_features.columns))

In [5]:
def feature_engineering(x_train, x_test, seed):
    """Build the engineered feature tables for `x_train` and `x_test`.

    Every fitted transform (QuantileTransformer, both PCAs, VarianceThreshold
    and all KMeans models) is fit on `x_train` only and then applied to both
    frames. Shapes and `head()` of the intermediate tables are printed along
    the way.

    Relies on the module-level `g_col` / `c_col` column lists and on
    `seed_everything`.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame the transforms are fit on. The code reads the columns `sig_id`,
        `cp_type`, `cp_dose`, `cp_time` and every column in `g_col` / `c_col`.
    x_test : pandas.DataFrame
        Frame with the same columns; it is only transformed, never fit on.
    seed : int
        Passed to `seed_everything` and used as `random_state` for the
        QuantileTransformer, PCA and KMeans estimators.

    Returns
    -------
    (x_train_fe, x_test_fe) : tuple of pandas.DataFrame
        Engineered features for each input, with rows where
        `cp_type == 'ctl_vehicle'` removed, the `cp_type` column dropped and
        the index reset. `sig_id` is kept as the first column.
    """

    # set seed
    seed_everything(seed)

    # sig_id & cp_type (kept so that rows can be identified / filtered at the end)
    sig_train = x_train[['sig_id', 'cp_type']]
    sig_test = x_test[['sig_id', 'cp_type']]
    print('sig_id & cp_type', sig_train.shape, sig_test.shape)

    # Integer-encode cp_dose and cp_time. The printed label says "OHE", but this step
    # only maps the raw values to 0/1(/2); values missing from the dicts become NaN via .map.
    cp_dose = {'D1': 0, 'D2': 1}
    cp_time = {24:0, 48:1, 72:2}
    cp_dose_train = x_train.cp_dose.map(cp_dose)
    cp_time_train = x_train.cp_time.map(cp_time)
    cp_dose_test = x_test.cp_dose.map(cp_dose)
    cp_time_test = x_test.cp_time.map(cp_time)
    cp_train = pd.DataFrame({'cp_dose':cp_dose_train, 'cp_time':cp_time_train}).reset_index(drop=True)
    cp_test = pd.DataFrame({'cp_dose':cp_dose_test, 'cp_time':cp_time_test}).reset_index(drop=True)
    print('OHE for cp_time, cp_dose', cp_train.shape, cp_test.shape)

    # RankGauss scaling: quantile-transform all g/c columns to a normal output
    # distribution; the transformer is fit on x_train and applied to both frames.
    # Output columns are the input names prefixed with 'rg_'.
    n_quantiles = 100
    qt = QuantileTransformer(n_quantiles=n_quantiles, random_state=seed, output_distribution='normal').fit(x_train[g_col + c_col])
    rg_train = pd.DataFrame(qt.transform(x_train[g_col + c_col]), columns=['rg_' + col for col in g_col + c_col]).reset_index(drop=True)
    rg_test = pd.DataFrame(qt.transform(x_test[g_col + c_col]), columns=['rg_' + col for col in g_col + c_col]).reset_index(drop=True)
    print('RankGauss scaling', rg_train.shape, rg_test.shape)
    print(rg_train.head())

    # PCA for g_col: 600 components fit on the RankGauss-scaled g columns of x_train
    g_n_comp = 600
    rg_g_col = [col for col in rg_train.columns if col.startswith('rg_g')]
    pca = PCA(n_components=g_n_comp, random_state=seed).fit(rg_train[rg_g_col])
    pca_g_train = pd.DataFrame(pca.transform(rg_train[rg_g_col]), columns=['pca_g-' + str(i) for i in range(g_n_comp)]).reset_index(drop=True)
    pca_g_test = pd.DataFrame(pca.transform(rg_test[rg_g_col]), columns=['pca_g-' + str(i) for i in range(g_n_comp)]).reset_index(drop=True)
    print('PCA for g_col', pca_g_train.shape, pca_g_test.shape)
    print(pca_g_train.head())

    # PCA for c_col: 50 components fit on the RankGauss-scaled c columns of x_train
    # (`pca` is rebound here; the g-column PCA object is no longer needed)
    c_n_comp = 50
    rg_c_col = [col for col in rg_train.columns if col.startswith('rg_c')]
    pca = PCA(n_components=c_n_comp, random_state=seed).fit(rg_train[rg_c_col])
    pca_c_train = pd.DataFrame(pca.transform(rg_train[rg_c_col]), columns=['pca_c-' + str(i) for i in range(c_n_comp)]).reset_index(drop=True)
    pca_c_test = pd.DataFrame(pca.transform(rg_test[rg_c_col]), columns=['pca_c-' + str(i) for i in range(c_n_comp)]).reset_index(drop=True)
    print('PCA for c_col', pca_c_train.shape, pca_c_test.shape)
    print(pca_c_train.head())

    # combine g & c PCA features
    pca_gc_train = pd.concat((pca_g_train, pca_c_train),axis=1).reset_index(drop=True)
    pca_gc_test  =pd.concat((pca_g_test, pca_c_test),axis=1).reset_index(drop=True)
    print('combine g & c PCA features', pca_gc_train.shape, pca_gc_test.shape)
    print(pca_gc_train.head())

    # FS by Variance Threshold: the selector is fit on the x_train-derived
    # RankGauss + PCA columns; the same column positions are then kept for both frames.
    thsld = 0.85
    data_train = pd.concat([rg_train, pca_gc_train], axis=1)
    data_test = pd.concat([rg_test, pca_gc_test], axis=1)
    variance_threshold = VarianceThreshold(thsld).fit(data_train)
    # get_support(True) returns integer column positions, used with .iloc below
    selected_features = variance_threshold.get_support(True).tolist()
    fs_train = data_train.iloc[:,selected_features].reset_index(drop=True)
    fs_test = data_test.iloc[:,selected_features].reset_index(drop=True)
    print('FS by Variance Threshold', fs_train.shape, fs_test.shape)
    print(fs_train.head())

    # function to cluster raw g/c
    n_clusters_g = 22
    n_clusters_c = 4
    def gc_cluster(train, test, n_clusters_g, n_clusters_c, seed):
        """Return one-hot KMeans cluster memberships of the raw g and c columns for train and test."""

        def create_cluster(train, test, features, kind, n_clusters):
            """Fit KMeans on train[features], then one-hot encode cluster ids for train and test.

            `seed` is taken from the enclosing gc_cluster call.
            """
            col_prefix = 'raw_clusters_'
            train_ = train[features].copy()
            test_ = test[features].copy()
            kmeans = KMeans(n_clusters=n_clusters, random_state=seed).fit(train_)
            # predict on train and test stacked together so that get_dummies
            # produces the same dummy columns for both parts
            data = pd.concat([train_, test_], axis=0).reset_index(drop=True)
            pred = pd.DataFrame(kmeans.predict(data[features]), columns=[col_prefix + kind])
            pred = pd.get_dummies(pred, columns=[col_prefix + kind])
            # split the stacked result back into its train / test rows
            pred_train = pred.iloc[:len(train),:].reset_index(drop=True)
            pred_test = pred.iloc[len(train):,:].reset_index(drop=True)
            return pred_train, pred_test

        g_pred_train, g_pred_test = create_cluster(train, test, g_col, kind='g', n_clusters=n_clusters_g)
        c_pred_train, c_pred_test = create_cluster(train, test, c_col, kind='c', n_clusters=n_clusters_c)
        gc_pred_train = pd.concat([g_pred_train, c_pred_train], axis=1).reset_index(drop=True)
        gc_pred_test = pd.concat([g_pred_test, c_pred_test], axis=1).reset_index(drop=True)
        return gc_pred_train, gc_pred_test

    raw_gc_cluster_train, raw_gc_cluster_test = gc_cluster(x_train, x_test, n_clusters_g=n_clusters_g, n_clusters_c=n_clusters_c, seed=seed)
    print('cluster raw g/c', raw_gc_cluster_train.shape, raw_gc_cluster_test.shape)


    # function to cluster PCA g/c
    n_clusters = 5
    def fe_cluster_pca(train, test, n_clusters, seed):
        """Fit KMeans on the PCA features of train and return one-hot cluster ids for train and test."""
        col_prefix = 'pca_clusters'
        kmeans = KMeans(n_clusters=n_clusters, random_state = seed).fit(train)
        # same stack-predict-split pattern as for the raw g/c clusters
        data = pd.concat([train, test], axis=0).reset_index(drop=True)
        pred = pd.DataFrame(kmeans.predict(data), columns=[col_prefix])
        pred = pd.get_dummies(pred, columns=[col_prefix])
        pred_train = pred.iloc[:len(train),:].reset_index(drop=True)
        pred_test = pred.iloc[len(train):,:].reset_index(drop=True)
        return pred_train, pred_test
    pca_cluster_train, pca_cluster_test = fe_cluster_pca(pca_gc_train, pca_gc_test, n_clusters=n_clusters, seed=seed)
    print('cluster PCA g/c', pca_cluster_train.shape, pca_cluster_test.shape)


    # statistics features
    # hand-picked g columns that additionally get a squared feature in fe_stats
    gsquarecols = ['g-574','g-211','g-216','g-0','g-255','g-577','g-153','g-389','g-60','g-370','g-248','g-167','g-203',
                   'g-177','g-301','g-332','g-517','g-6','g-744','g-224','g-162','g-3','g-736','g-486','g-283','g-22',
                   'g-359','g-361','g-440','g-335','g-106','g-307','g-745','g-146','g-416','g-298','g-666','g-91','g-17',
                   'g-549','g-145','g-157','g-768','g-568','g-396']
    def fe_stats(df):
        """Compute per-row statistics, pairwise c-column products and squared features from the raw columns of `df`.

        The returned frame keeps the index of `df`.
        """
        stat_df = dict()
        # row-wise summary statistics over the g columns, the c columns and both together
        stat_df['g_sum'] = df[g_col].sum(axis = 1)
        stat_df['g_mean'] = df[g_col].mean(axis = 1)
        stat_df['g_std'] = df[g_col].std(axis = 1)
        stat_df['g_kurt'] = df[g_col].kurtosis(axis = 1)
        stat_df['g_skew'] = df[g_col].skew(axis = 1)
        stat_df['c_sum'] = df[c_col].sum(axis = 1)
        stat_df['c_mean'] = df[c_col].mean(axis = 1)
        stat_df['c_std'] = df[c_col].std(axis = 1)
        stat_df['c_kurt'] = df[c_col].kurtosis(axis = 1)
        stat_df['c_skew'] = df[c_col].skew(axis = 1)
        stat_df['gc_sum'] = df[g_col + c_col].sum(axis = 1)
        stat_df['gc_mean'] = df[g_col + c_col].mean(axis = 1)
        stat_df['gc_std'] = df[g_col + c_col].std(axis = 1)
        stat_df['gc_kurt'] = df[g_col + c_col].kurtosis(axis = 1)
        stat_df['gc_skew'] = df[g_col + c_col].skew(axis = 1)

        # element-wise products of selected pairs of c columns
        stat_df['c52_c42'] = df['c-52'] * df['c-42']
        stat_df['c13_c73'] = df['c-13'] * df['c-73']
        # NOTE(review): the key says 'c26' but the product uses 'c-23' (not 'c-26').
        # Left untouched: changing it would alter the values of an existing feature — confirm intent.
        stat_df['c26_c13'] = df['c-23'] * df['c-13']
        stat_df['c33_c6'] = df['c-33'] * df['c-6']
        stat_df['c11_c55'] = df['c-11'] * df['c-55']
        stat_df['c38_c63'] = df['c-38'] * df['c-63']
        stat_df['c38_c94'] = df['c-38'] * df['c-94']
        stat_df['c13_c94'] = df['c-13'] * df['c-94']
        stat_df['c4_c52'] = df['c-4'] * df['c-52']
        stat_df['c4_c42'] = df['c-4'] * df['c-42']
        stat_df['c13_c38'] = df['c-13'] * df['c-38']
        stat_df['c55_c2'] = df['c-55'] * df['c-2']
        stat_df['c55_c4'] = df['c-55'] * df['c-4']
        stat_df['c4_c13'] = df['c-4'] * df['c-13']
        stat_df['c82_c42'] = df['c-82'] * df['c-42']
        stat_df['c66_c42'] = df['c-66'] * df['c-42']
        stat_df['c6_c38'] = df['c-6'] * df['c-38']
        stat_df['c2_c13'] = df['c-2'] * df['c-13']
        stat_df['c62_c42'] = df['c-62'] * df['c-42']
        stat_df['c90_c55'] = df['c-90'] * df['c-55']

        # squared features: every c column, plus the g columns listed in gsquarecols
        for feature in c_col:
            stat_df[f'{feature}_squared'] = df[feature] ** 2
        for feature in gsquarecols:
            stat_df[f'{feature}_squared'] = df[feature] ** 2

        stat_df = pd.DataFrame(stat_df)
        return stat_df

    stat_train, stat_test = fe_stats(x_train), fe_stats(x_test)
    print('statistics features', stat_train.shape, stat_test.shape)

    # combine all FE results
    # NOTE(review): sig_* and stat_* keep the caller's index while every other part was
    # reset to 0..n-1, so this column-wise concat only lines up row-by-row when x_train /
    # x_test carry a default 0..n-1 index — confirm callers always pass such frames.
    x_train_fe = pd.concat([sig_train, cp_train, fs_train, raw_gc_cluster_train, pca_cluster_train, stat_train], axis=1).reset_index(drop=True)
    x_test_fe = pd.concat([sig_test, cp_test, fs_test, raw_gc_cluster_test, pca_cluster_test, stat_test], axis=1).reset_index(drop=True)
    print('combine all FE results', x_train_fe.shape, x_test_fe.shape)

    # remove ctrl in train and test, then drop the now-unneeded cp_type column
    x_train_fe = x_train_fe[x_train_fe.cp_type!='ctl_vehicle']
    x_test_fe = x_test_fe[x_test_fe.cp_type!='ctl_vehicle']
    x_train_fe = x_train_fe.drop('cp_type', axis=1).reset_index(drop=True)
    x_test_fe = x_test_fe.drop('cp_type', axis=1).reset_index(drop=True)
    print('remove ctrl in train and test', x_train_fe.shape, x_test_fe.shape)

    return x_train_fe, x_test_fe

In [6]:
# FE parameter estimation based on provided train and test:
# every transform is fit on train_features + train_features_extra (fe_data) and then
# applied to the train and test frames in two separate calls. The first return value
# (the features of fe_data itself) is not needed and is discarded. Each call refits
# all transforms on fe_data with the same seed.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
fe_data = pd.concat([train_features, train_features_extra], ignore_index=True)
_, x_train_fe = feature_engineering(fe_data, train_features, seed=42)
_, x_test_fe = feature_engineering(fe_data, test_features, seed=42)

sig_id & cp_type (27796, 2) (23814, 2)
OHE for cp_time, cp_dose (27796, 2) (23814, 2)
RankGauss scaling (27796, 872) (23814, 872)
     rg_g-0    rg_g-1    rg_g-2    rg_g-3    rg_g-4    rg_g-5    rg_g-6  \
0  1.146806  0.902075 -0.418339 -0.961202 -0.254770 -1.021300 -1.369236   
1  0.128824  0.676862  0.274345  0.090495  1.208863  0.688965  0.316734   
2  0.790372  0.939951  1.428097 -0.121817 -0.002067  1.495091  0.238763   
3 -0.729866 -0.277163 -0.441200  0.766612  2.347817 -0.862761 -2.308829   
4 -0.444558 -0.481202  0.974729  0.977467  1.468304 -0.874772 -0.372682   

     rg_g-7    rg_g-8    rg_g-9  ...   rg_c-90   rg_c-91   rg_c-92   rg_c-93  \
0 -0.029888  0.684319 -0.316668  ...  0.405455  0.362189  1.296097  0.830281   
1  0.556428 -0.539718  0.831972  ... -0.527074  1.127076  0.716060  0.047538   
2  0.363471 -0.003611  1.237966  ... -0.834469 -0.747431  0.952950  0.046551   
3  0.305225 -0.191898 -1.389591  ... -1.429097 -0.762287 -1.653318 -1.259768   
4 -0.212171 -1.0670

In [7]:
# basic one-hot encoding for the cp features (cp_time, cp_dose)
def process_data(data):
    """Return a copy of `data` with `cp_time` and `cp_dose` replaced by one-hot dummy columns."""
    return pd.get_dummies(data, columns=['cp_time', 'cp_dose'])

train = process_data(x_train_fe)
test = process_data(x_test_fe)

In [8]:
# attach scored targets, non-scored targets and drug ids to the processed features;
# each merge is pandas' default inner join on sig_id
train = (
    train
    .merge(train_targets_scored, on='sig_id')
    .merge(train_targets_nonscored, on='sig_id')
    .merge(train_drug, on='sig_id')
)

In [9]:
# target column sets: scored targets, auxiliary (non-scored) targets, and both combined
target_cols = [name for name in train_targets_scored.columns if name != 'sig_id']
aux_target_cols = [name for name in train_targets_nonscored.columns if name != 'sig_id']
all_target_cols = target_cols + aux_target_cols

# sizes of each set
num_targets, num_aux_targets = len(target_cols), len(aux_target_cols)
num_all_targets = len(all_target_cols)

print('num_targets: {}'.format(num_targets))
print('num_aux_targets: {}'.format(num_aux_targets))
print('num_all_targets: {}'.format(num_all_targets))

num_targets: 206
num_aux_targets: 331
num_all_targets: 537


In [10]:
# NN input features: every train column that is neither a target nor a bookkeeping column
feature_cols = [
    name for name in train.columns
    if name not in all_target_cols and name not in ('kfold', 'sig_id', 'drug_id')
]
num_features = len(feature_cols)
num_features

1240

In [11]:
print(train.shape)
print(test.shape)
print(sample_submission.shape)

(21948, 1779)
(3624, 1241)
(3982, 207)


In [12]:
# dataset classes
# remarks: rows are converted to tensors one at a time, on access, to save memory

class MoADataset:
    """Indexable (features, targets) pair that yields one sample at a time as float tensors.

    `features` and `targets` are stored as given and indexed with `[idx, :]`;
    the length is taken from `features.shape[0]`.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # conversion to tensors happens here, per sample
        feature_row = self.features[idx, :]
        target_row = self.targets[idx, :]
        return {
            'x': torch.tensor(feature_row, dtype=torch.float),
            'y': torch.tensor(target_row, dtype=torch.float),
        }
    
class TestDataset:
    """Indexable feature-only dataset that yields one sample at a time as a float tensor.

    `features` is stored as given and indexed with `[idx, :]`; the length is
    taken from `features.shape[0]`.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # conversion to a tensor happens here, per sample
        feature_row = self.features[idx, :]
        return {'x': torch.tensor(feature_row, dtype=torch.float)}

In [13]:
# remarks: "inference" here means running a trained model to produce predictions
def inference_fn(model, dataloader, device):
    """Run `model` over every batch of `dataloader` and return sigmoid outputs as one NumPy array.

    The model is put into eval mode. Each batch is expected to be a mapping
    with an 'x' entry, which is moved to `device` before the forward pass
    (run without gradient tracking). Per-batch results are concatenated
    along the first axis.
    """
    model.eval()

    def predict_batch(batch):
        batch_inputs = batch['x'].to(device)
        with torch.no_grad():
            logits = model(batch_inputs)
        return logits.sigmoid().detach().cpu().numpy()

    return np.concatenate([predict_batch(batch) for batch in dataloader])

In [14]:
class Model(nn.Module):
    """Fully connected network: five Linear layers with BatchNorm before each and Dropout before layers 2-5.

    Layer widths are num_features -> 1500 -> 1250 -> 1000 -> 750 -> num_targets.
    The output layer is wrapped in weight normalisation and returns raw
    logits (no sigmoid is applied here).

    The attribute names (batch_norm1, dense1, ...) determine the state_dict
    keys, so they must stay as they are for saved checkpoints to load.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        self.hidden_size = [1500, 1250, 1000, 750]
        self.dropout_value = [0.5, 0.35, 0.3, 0.25]
        width1, width2, width3, width4 = self.hidden_size
        drop2, drop3, drop4, drop5 = self.dropout_value

        # stage 1: input normalisation + first projection (no dropout)
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, width1)

        # stage 2
        self.batch_norm2 = nn.BatchNorm1d(width1)
        self.dropout2 = nn.Dropout(drop2)
        self.dense2 = nn.Linear(width1, width2)

        # stage 3
        self.batch_norm3 = nn.BatchNorm1d(width2)
        self.dropout3 = nn.Dropout(drop3)
        self.dense3 = nn.Linear(width2, width3)

        # stage 4
        self.batch_norm4 = nn.BatchNorm1d(width3)
        self.dropout4 = nn.Dropout(drop4)
        self.dense4 = nn.Linear(width3, width4)

        # stage 5: weight-normalised output layer
        self.batch_norm5 = nn.BatchNorm1d(width4)
        self.dropout5 = nn.Dropout(drop5)
        self.dense5 = nn.utils.weight_norm(nn.Linear(width4, num_targets))

    def forward(self, x):
        # stage 1 has no dropout
        x = F.leaky_relu(self.dense1(self.batch_norm1(x)))

        # stages 2-4 share the pattern: batch norm -> dropout -> linear -> leaky ReLU
        hidden_stages = (
            (self.batch_norm2, self.dropout2, self.dense2),
            (self.batch_norm3, self.dropout3, self.dense3),
            (self.batch_norm4, self.dropout4, self.dense4),
        )
        for norm, drop, dense in hidden_stages:
            x = F.leaky_relu(dense(drop(norm(x))))

        # output stage: no activation, logits are returned as-is
        return self.dense5(self.dropout5(self.batch_norm5(x)))
 

In [15]:
# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
# EPOCHS = 24
BATCH_SIZE = 128

# WEIGHT_DECAY = {'ALL_TARGETS': 1e-5, 'SCORED_ONLY': 3e-6}
# MAX_LR = {'ALL_TARGETS': 1e-2, 'SCORED_ONLY': 3e-3}
# DIV_FACTOR = {'ALL_TARGETS': 1e3, 'SCORED_ONLY': 1e2}
# PCT_START = 0.1

In [16]:
# Show model architecture
model = Model(num_features, num_all_targets)
model

Model(
  (batch_norm1): BatchNorm1d(1240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense1): Linear(in_features=1240, out_features=1500, bias=True)
  (batch_norm2): BatchNorm1d(1500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (dense2): Linear(in_features=1500, out_features=1250, bias=True)
  (batch_norm3): BatchNorm1d(1250, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout3): Dropout(p=0.35, inplace=False)
  (dense3): Linear(in_features=1250, out_features=1000, bias=True)
  (batch_norm4): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout4): Dropout(p=0.3, inplace=False)
  (dense4): Linear(in_features=1000, out_features=750, bias=True)
  (batch_norm5): BatchNorm1d(750, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout5): Dropout(p=0.25, inplace=False)
  (dense5): Linear(in_features=750, out_features=537, 

In [17]:
# Average the sigmoid outputs of every pretrained (seed, fold) checkpoint on the test set.

# The test matrix, dataset and loader are the same for every checkpoint, so build them once.
# Casting to float32 here keeps the array numeric even when the dummy columns are bool
# (the get_dummies default since pandas 2.0), which would otherwise turn `.values` into an
# object array that torch.tensor cannot convert.
x_test = test[feature_cols].to_numpy(dtype=np.float32)
testdataset = TestDataset(x_test)
testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

pred1 = []

# 7 seeds x 7 folds = 49 checkpoints
for seed, fold in itertools.product(range(7), range(7)):

    # Load the fine-tuned model with the best loss
    model = Model(num_features, num_targets)
    checkpoint_path = f'../input/moa-nn-pretrained/NN_SCORED_ONLY_SEED{seed}_FOLD{fold}_.pth'
    # map_location lets checkpoints that were saved from a GPU load on a CPU-only machine
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    model.to(DEVICE)

    predictions = inference_fn(model, testloader, DEVICE)
    pred1.append(predictions)

# create submission file
pred1 = np.stack(pred1).mean(axis=0)
pred1 = pd.DataFrame(pred1, columns=target_cols)
# align ids and predictions by position, independent of whatever index `test` carries
pred1 = pd.concat([test[['sig_id']].reset_index(drop=True), pred1], axis=1)
# ids that are absent from `test` (the ctl_vehicle rows removed during feature
# engineering) have no prediction after the left join and are filled with 0
sub1 = test_features[['sig_id']].merge(pred1, on='sig_id', how='left').fillna(0).reset_index(drop=True)
print(sub1.shape)
sub1.head()

(3982, 207)


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000902,0.000955,0.002047,0.017687,0.0215,0.004837,0.003854,0.005194,0.000397,...,0.001267,0.000958,0.004336,0.001701,0.00132,0.000593,0.001325,0.002217,0.002516,0.001712
1,id_001897cda,0.000626,0.000829,0.001074,0.002075,0.001293,0.001704,0.003664,0.014857,0.026774,...,0.000581,0.000903,0.001758,0.000693,0.010186,0.000667,0.00599,0.00172,0.001191,0.00374
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000744,0.000865,0.001587,0.011485,0.020147,0.004411,0.003006,0.005945,0.000454,...,0.000594,0.00121,0.002089,0.003233,0.004167,0.000535,0.001421,0.001841,0.001978,0.001835
4,id_0027f1083,0.002174,0.001318,0.001602,0.020017,0.025909,0.005222,0.003786,0.002311,0.000471,...,0.000888,0.000812,0.002916,0.001612,0.001579,0.000635,0.001098,0.002022,0.000831,0.001618


# Model 2 - TabNet

In [18]:
# basics
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
from collections import Counter
# import copy
import seaborn as sns
from time import time
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')
import itertools

# sklearn
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

# pyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau, CyclicLR
from torch.utils.data import DataLoader, Dataset

# TabNet
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

# stratified kfold
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0
Processing /kaggle/input/iterative-stratification/iterative-stratification-master
Building wheels for collected packages: iterative-stratification
  Building wheel for iterative-stratification (setup.py) ... [?25l- done
[?25h  Created wheel for iterative-stratification: filename=iterative_stratification-0.1.6-py3-none-any.whl size=8401 sha256=ed9fc024b4eb8c8b1b3f3e03f7b6e57fe417b26b01089b6009ac28914efc553e
  Stored in directory: /root/.cache/pip/wheels/b8/47/3f/eb4af42d124f37d23d6f13a4c8bbc32c1d70140e6e1cecb4aa
Successfully built iterative-stratification
Installing collected packages: iterative-stratification
  Attempting uninstall: iterative-stratification
    Found existing installation: iterative-stratification 0.1.6
  

In [19]:
### General ###
import os
import copy
import tqdm
import pickle
import random
import warnings
warnings.filterwarnings("ignore")
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

### Data Wrangling ###
import numpy as np
import pandas as pd
from scipy import stats

### Machine Learning ###
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Deep Learning ###
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Tabnet 
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

from pickle import load,dump

### Make prettier the prints ###
from colorama import Fore
c_ = Fore.CYAN
m_ = Fore.MAGENTA
r_ = Fore.RED
b_ = Fore.BLUE
y_ = Fore.YELLOW
g_ = Fore.GREEN

In [20]:
# read the raw CSV inputs
data_dir = '../input/lish-moa/'
train_features = pd.read_csv(data_dir + 'train_features.csv')
train_targets_scored = pd.read_csv(data_dir + 'train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(data_dir + 'train_targets_nonscored.csv')
train_drug = pd.read_csv(data_dir + 'train_drug.csv')
test_features = pd.read_csv(data_dir + 'test_features.csv')
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')

# additional feature rows stored in a separate dataset
train_features_extra = pd.read_csv('../input/moa-fe-extra-data/MoA_FE_Extra_Data.csv')

# drop non-scored target columns that never have a positive label;
# keep_list holds positional column indices (column 0, the id column, is always kept)
nonscored_positive_counts = train_targets_nonscored.iloc[:, 1:].values.sum(axis=0)
keep_list = [pos + 1 for pos in np.flatnonzero(nonscored_positive_counts > 0).tolist()]
train_targets_nonscored = train_targets_nonscored.iloc[:, [0] + keep_list]

# report the shape of every table
print('train_features: {}'.format(train_features.shape))
print('train_targets_scored: {}'.format(train_targets_scored.shape))
print('train_targets_nonscored: {}'.format(train_targets_nonscored.shape))
print('train_drug: {}'.format(train_drug.shape))
print('test_features: {}'.format(test_features.shape))
print('train_features_extra: {}'.format(train_features_extra.shape))
print('sample_submission: {}'.format(sample_submission.shape))

train_features: (23814, 876)
train_targets_scored: (23814, 207)
train_targets_nonscored: (23814, 332)
train_drug: (23814, 2)
test_features: (3982, 876)
train_features_extra: (3982, 876)
sample_submission: (3982, 207)


In [21]:
# function to set single seed for everything
def seed_everything(seed_value):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    When CUDA is available the CUDA generators are seeded as well, and cuDNN
    is switched to deterministic mode with benchmarking turned off.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # nothing GPU-related to configure on a CPU-only machine
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# set seed to 42
seed_everything(42)

In [22]:
# split the feature columns by name prefix: 'g-' columns and 'c-' columns
g_col = list(filter(lambda name: name.startswith('g-'), train_features.columns))
c_col = list(filter(lambda name: name.startswith('c-'), train_features.columns))

In [23]:
def feature_engineering(x_train, x_test, seed):
    """Fit the feature pipeline on ``x_train`` and apply it to both frames.

    Steps, in order: integer encoding of cp_dose / cp_time, RankGauss
    (quantile -> normal) scaling of the g-/c- columns, PCA on the scaled g and
    c blocks, variance-threshold selection over scaled + PCA features, KMeans
    cluster indicators (on the raw g/c columns and on the PCA features), and
    row-wise statistics plus hand-picked products / squares of raw columns.
    Every fitted object (QuantileTransformer, PCA, VarianceThreshold, KMeans)
    is fitted on ``x_train`` only and then applied to ``x_test``.

    Relies on the module-level ``g_col`` / ``c_col`` column lists and on
    ``seed_everything``. Progress and intermediate heads are printed.

    Parameters
    ----------
    x_train : pd.DataFrame
        Frame used to fit the transformers. Must contain ``sig_id``,
        ``cp_type``, ``cp_dose``, ``cp_time`` and every ``g_col`` / ``c_col``.
    x_test : pd.DataFrame
        Frame with the same columns; transformed with the fitted objects.
    seed : int
        Passed to ``seed_everything`` and used as ``random_state``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        Engineered train and test features. ``sig_id`` is kept, ``cp_type``
        is dropped and rows with ``cp_type == 'ctl_vehicle'`` are removed.
    """

    # set seed
    seed_everything(seed)

    # sig_id & cp_type
    # index is reset so the axis=1 concat at the end aligns by position even
    # when the caller passes a frame with a non-default index
    sig_train = x_train[['sig_id', 'cp_type']].reset_index(drop=True)
    sig_test = x_test[['sig_id', 'cp_type']].reset_index(drop=True)
    print('sig_id & cp_type', sig_train.shape, sig_test.shape)

    # OHE for cp_time, cp_dose
    # NOTE: despite the label this is an integer (ordinal) mapping, not one-hot
    cp_dose = {'D1': 0, 'D2': 1}
    cp_time = {24:0, 48:1, 72:2}
    cp_dose_train = x_train.cp_dose.map(cp_dose)
    cp_time_train = x_train.cp_time.map(cp_time)
    cp_dose_test = x_test.cp_dose.map(cp_dose)
    cp_time_test = x_test.cp_time.map(cp_time)
    cp_train = pd.DataFrame({'cp_dose':cp_dose_train, 'cp_time':cp_time_train}).reset_index(drop=True)
    cp_test = pd.DataFrame({'cp_dose':cp_dose_test, 'cp_time':cp_time_test}).reset_index(drop=True)
    print('OHE for cp_time, cp_dose', cp_train.shape, cp_test.shape)

    # RankGauss scaling
    n_quantiles = 100
    qt = QuantileTransformer(n_quantiles=n_quantiles, random_state=seed, output_distribution='normal').fit(x_train[g_col + c_col])
    rg_train = pd.DataFrame(qt.transform(x_train[g_col + c_col]), columns=['rg_' + col for col in g_col + c_col]).reset_index(drop=True)
    rg_test = pd.DataFrame(qt.transform(x_test[g_col + c_col]), columns=['rg_' + col for col in g_col + c_col]).reset_index(drop=True)
    print('RankGauss scaling', rg_train.shape, rg_test.shape)
    print(rg_train.head())

    # PCA for g_col
    g_n_comp = 600
    rg_g_col = [col for col in rg_train.columns if col.startswith('rg_g')]
    pca = PCA(n_components=g_n_comp, random_state=seed).fit(rg_train[rg_g_col])
    pca_g_train = pd.DataFrame(pca.transform(rg_train[rg_g_col]), columns=['pca_g-' + str(i) for i in range(g_n_comp)]).reset_index(drop=True)
    pca_g_test = pd.DataFrame(pca.transform(rg_test[rg_g_col]), columns=['pca_g-' + str(i) for i in range(g_n_comp)]).reset_index(drop=True)
    print('PCA for g_col', pca_g_train.shape, pca_g_test.shape)
    print(pca_g_train.head())

    # PCA for c_col
    c_n_comp = 50
    rg_c_col = [col for col in rg_train.columns if col.startswith('rg_c')]
    pca = PCA(n_components=c_n_comp, random_state=seed).fit(rg_train[rg_c_col])
    pca_c_train = pd.DataFrame(pca.transform(rg_train[rg_c_col]), columns=['pca_c-' + str(i) for i in range(c_n_comp)]).reset_index(drop=True)
    pca_c_test = pd.DataFrame(pca.transform(rg_test[rg_c_col]), columns=['pca_c-' + str(i) for i in range(c_n_comp)]).reset_index(drop=True)
    print('PCA for c_col', pca_c_train.shape, pca_c_test.shape)
    print(pca_c_train.head())

    # combine g & c PCA features
    pca_gc_train = pd.concat((pca_g_train, pca_c_train),axis=1).reset_index(drop=True)
    pca_gc_test  =pd.concat((pca_g_test, pca_c_test),axis=1).reset_index(drop=True)
    print('combine g & c PCA features', pca_gc_train.shape, pca_gc_test.shape)
    print(pca_gc_train.head())

    # FS by Variance Threshold
    # the selector is fitted on train; the same column positions are kept in test
    thsld = 0.85
    data_train = pd.concat([rg_train, pca_gc_train], axis=1)
    data_test = pd.concat([rg_test, pca_gc_test], axis=1)
    variance_threshold = VarianceThreshold(thsld).fit(data_train)
    selected_features = variance_threshold.get_support(True).tolist()
    fs_train = data_train.iloc[:,selected_features].reset_index(drop=True)
    fs_test = data_test.iloc[:,selected_features].reset_index(drop=True)
    print('FS by Variance Threshold', fs_train.shape, fs_test.shape)
    print(fs_train.head())

    # function to cluster raw g/c
    n_clusters_g = 22
    n_clusters_c = 4
    def gc_cluster(train, test, n_clusters_g, n_clusters_c, seed):
        """One-hot KMeans cluster ids computed separately on raw g and raw c columns."""

        def create_cluster(train, test, features, kind, n_clusters):
            """Fit KMeans on train[features]; return one-hot cluster ids for train and test."""
            col_prefix = 'raw_clusters_'
            train_ = train[features].copy()
            test_ = test[features].copy()
            kmeans = KMeans(n_clusters=n_clusters, random_state=seed).fit(train_)
            # train and test are stacked so both get the same set of dummy columns
            data = pd.concat([train_, test_], axis=0).reset_index(drop=True)
            pred = pd.DataFrame(kmeans.predict(data[features]), columns=[col_prefix + kind])
            # dtype pinned: pandas >= 2.0 defaults to bool dummies, which would
            # turn the final feature matrix (.values) into an object array
            pred = pd.get_dummies(pred, columns=[col_prefix + kind], dtype=np.uint8)
            pred_train = pred.iloc[:len(train),:].reset_index(drop=True)
            pred_test = pred.iloc[len(train):,:].reset_index(drop=True)
            return pred_train, pred_test

        g_pred_train, g_pred_test = create_cluster(train, test, g_col, kind='g', n_clusters=n_clusters_g)
        c_pred_train, c_pred_test = create_cluster(train, test, c_col, kind='c', n_clusters=n_clusters_c)
        gc_pred_train = pd.concat([g_pred_train, c_pred_train], axis=1).reset_index(drop=True)
        gc_pred_test = pd.concat([g_pred_test, c_pred_test], axis=1).reset_index(drop=True)
        return gc_pred_train, gc_pred_test

    raw_gc_cluster_train, raw_gc_cluster_test = gc_cluster(x_train, x_test, n_clusters_g=n_clusters_g, n_clusters_c=n_clusters_c, seed=seed)
    print('cluster raw g/c', raw_gc_cluster_train.shape, raw_gc_cluster_test.shape)


    # function to cluster PCA g/c
    n_clusters = 5
    def fe_cluster_pca(train, test, n_clusters, seed):
        """Fit KMeans on the PCA features of train; return one-hot cluster ids for train and test."""
        col_prefix = 'pca_clusters'
        kmeans = KMeans(n_clusters=n_clusters, random_state = seed).fit(train)
        data = pd.concat([train, test], axis=0).reset_index(drop=True)
        pred = pd.DataFrame(kmeans.predict(data), columns=[col_prefix])
        # dtype pinned for the same reason as in create_cluster
        pred = pd.get_dummies(pred, columns=[col_prefix], dtype=np.uint8)
        pred_train = pred.iloc[:len(train),:].reset_index(drop=True)
        pred_test = pred.iloc[len(train):,:].reset_index(drop=True)
        return pred_train, pred_test
    pca_cluster_train, pca_cluster_test = fe_cluster_pca(pca_gc_train, pca_gc_test, n_clusters=n_clusters, seed=seed)
    print('cluster PCA g/c', pca_cluster_train.shape, pca_cluster_test.shape)


    # statistics features
    # hand-picked g columns that additionally get a squared feature
    gsquarecols = ['g-574','g-211','g-216','g-0','g-255','g-577','g-153','g-389','g-60','g-370','g-248','g-167','g-203',
                   'g-177','g-301','g-332','g-517','g-6','g-744','g-224','g-162','g-3','g-736','g-486','g-283','g-22',
                   'g-359','g-361','g-440','g-335','g-106','g-307','g-745','g-146','g-416','g-298','g-666','g-91','g-17',
                   'g-549','g-145','g-157','g-768','g-568','g-396']
    def fe_stats(df):
        """Row-wise statistics, selected pairwise c products and squared columns of the raw data."""
        stat_df = dict()
        stat_df['g_sum'] = df[g_col].sum(axis = 1)
        stat_df['g_mean'] = df[g_col].mean(axis = 1)
        stat_df['g_std'] = df[g_col].std(axis = 1)
        stat_df['g_kurt'] = df[g_col].kurtosis(axis = 1)
        stat_df['g_skew'] = df[g_col].skew(axis = 1)
        stat_df['c_sum'] = df[c_col].sum(axis = 1)
        stat_df['c_mean'] = df[c_col].mean(axis = 1)
        stat_df['c_std'] = df[c_col].std(axis = 1)
        stat_df['c_kurt'] = df[c_col].kurtosis(axis = 1)
        stat_df['c_skew'] = df[c_col].skew(axis = 1)
        stat_df['gc_sum'] = df[g_col + c_col].sum(axis = 1)
        stat_df['gc_mean'] = df[g_col + c_col].mean(axis = 1)
        stat_df['gc_std'] = df[g_col + c_col].std(axis = 1)
        stat_df['gc_kurt'] = df[g_col + c_col].kurtosis(axis = 1)
        stat_df['gc_skew'] = df[g_col + c_col].skew(axis = 1)

        stat_df['c52_c42'] = df['c-52'] * df['c-42']
        stat_df['c13_c73'] = df['c-13'] * df['c-73']
        # NOTE(review): the name says c26 but the product uses c-23. Left
        # unchanged on purpose - the pre-trained models loaded later in the
        # notebook presumably saw this exact feature; confirm before changing.
        stat_df['c26_c13'] = df['c-23'] * df['c-13']
        stat_df['c33_c6'] = df['c-33'] * df['c-6']
        stat_df['c11_c55'] = df['c-11'] * df['c-55']
        stat_df['c38_c63'] = df['c-38'] * df['c-63']
        stat_df['c38_c94'] = df['c-38'] * df['c-94']
        stat_df['c13_c94'] = df['c-13'] * df['c-94']
        stat_df['c4_c52'] = df['c-4'] * df['c-52']
        stat_df['c4_c42'] = df['c-4'] * df['c-42']
        stat_df['c13_c38'] = df['c-13'] * df['c-38']
        stat_df['c55_c2'] = df['c-55'] * df['c-2']
        stat_df['c55_c4'] = df['c-55'] * df['c-4']
        stat_df['c4_c13'] = df['c-4'] * df['c-13']
        stat_df['c82_c42'] = df['c-82'] * df['c-42']
        stat_df['c66_c42'] = df['c-66'] * df['c-42']
        stat_df['c6_c38'] = df['c-6'] * df['c-38']
        stat_df['c2_c13'] = df['c-2'] * df['c-13']
        stat_df['c62_c42'] = df['c-62'] * df['c-42']
        stat_df['c90_c55'] = df['c-90'] * df['c-55']

        for feature in c_col:
            stat_df[f'{feature}_squared'] = df[feature] ** 2
        for feature in gsquarecols:
            stat_df[f'{feature}_squared'] = df[feature] ** 2

        stat_df = pd.DataFrame(stat_df)
        return stat_df

    # index reset for positional alignment with the other feature groups
    stat_train = fe_stats(x_train).reset_index(drop=True)
    stat_test = fe_stats(x_test).reset_index(drop=True)
    print('statistics features', stat_train.shape, stat_test.shape)

    # combine all FE results
    x_train_fe = pd.concat([sig_train, cp_train, fs_train, raw_gc_cluster_train, pca_cluster_train, stat_train], axis=1).reset_index(drop=True)
    x_test_fe = pd.concat([sig_test, cp_test, fs_test, raw_gc_cluster_test, pca_cluster_test, stat_test], axis=1).reset_index(drop=True)
    print('combine all FE results', x_train_fe.shape, x_test_fe.shape)

    # remove ctrl in train and test
    x_train_fe = x_train_fe[x_train_fe.cp_type!='ctl_vehicle']
    x_test_fe = x_test_fe[x_test_fe.cp_type!='ctl_vehicle']
    x_train_fe = x_train_fe.drop('cp_type', axis=1).reset_index(drop=True)
    x_test_fe = x_test_fe.drop('cp_type', axis=1).reset_index(drop=True)
    print('remove ctrl in train and test', x_train_fe.shape, x_test_fe.shape)

    return x_train_fe, x_test_fe

In [24]:
# FE parameter estimation: the transformers are fitted on train_features plus
# the extra rows; each call returns the engineered version of its 2nd argument.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
fe_data = pd.concat([train_features, train_features_extra]).reset_index(drop=True)
_, x_train_fe = feature_engineering(fe_data, train_features, seed=42)
_, x_test_fe = feature_engineering(fe_data, test_features, seed=42)

sig_id & cp_type (27796, 2) (23814, 2)
OHE for cp_time, cp_dose (27796, 2) (23814, 2)
RankGauss scaling (27796, 872) (23814, 872)
     rg_g-0    rg_g-1    rg_g-2    rg_g-3    rg_g-4    rg_g-5    rg_g-6  \
0  1.146806  0.902075 -0.418339 -0.961202 -0.254770 -1.021300 -1.369236   
1  0.128824  0.676862  0.274345  0.090495  1.208863  0.688965  0.316734   
2  0.790372  0.939951  1.428097 -0.121817 -0.002067  1.495091  0.238763   
3 -0.729866 -0.277163 -0.441200  0.766612  2.347817 -0.862761 -2.308829   
4 -0.444558 -0.481202  0.974729  0.977467  1.468304 -0.874772 -0.372682   

     rg_g-7    rg_g-8    rg_g-9  ...   rg_c-90   rg_c-91   rg_c-92   rg_c-93  \
0 -0.029888  0.684319 -0.316668  ...  0.405455  0.362189  1.296097  0.830281   
1  0.556428 -0.539718  0.831972  ... -0.527074  1.127076  0.716060  0.047538   
2  0.363471 -0.003611  1.237966  ... -0.834469 -0.747431  0.952950  0.046551   
3  0.305225 -0.191898 -1.389591  ... -1.429097 -0.762287 -1.653318 -1.259768   
4 -0.212171 -1.0670

In [25]:
# attach the scored targets and the drug ids to the engineered training
# features (default inner joins on sig_id); test is a copy of its features
train = (
    x_train_fe
    .merge(train_targets_scored, on='sig_id')
    .merge(train_drug, on='sig_id')
)
test = x_test_fe.copy()

In [26]:
# scored target names: every column of the targets file except the id
target_cols = [name for name in train_targets_scored.columns if name != 'sig_id']

# feature names: whatever is left in train after removing ids, targets and
# any column whose name contains 'kfold'
non_feature_cols = {'sig_id', 'drug_id', *target_cols}
feature_cols = [
    name for name in train.columns
    if name not in non_feature_cols and 'kfold' not in name
]

# split targets from features and build the test matrix fed to the models
target = train[target_cols]
train, test = train[feature_cols], test[feature_cols]
X_test = test.values

In [27]:
# inference with the saved TabNet models: one model per (seed, fold) pair
SEED = [20, 21, 22]
FOLDS = list(range(1, 11))

pred2 = []
for seed in SEED:
    for fold in FOLDS:
        model = TabNetRegressor()
        model.load_model('../input/moa-tabnet-correct/TabNet_seed_'+str(seed)+'_fold_'+str(fold)+'.zip')
        raw_preds = model.predict(X_test)
        # squash the raw model outputs through a sigmoid
        pred2.append(1 / (1 + np.exp(-raw_preds)))

# average over all models and label the columns with the target names
pred2 = pd.DataFrame(np.mean(np.stack(pred2), axis=0), columns=target_cols)
pred2 = pd.concat([x_test_fe[['sig_id']], pred2], axis=1).reset_index(drop=True)

# left-join onto the full test id list; ids without a prediction (the control
# rows dropped during feature engineering) get 0 for every target
sub2 = (
    test_features[['sig_id']]
    .merge(pred2, on='sig_id', how='left')
    .fillna(0)
    .reset_index(drop=True)
)
print(sub2.shape)
sub2.head()


Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used 

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000853,0.00113,0.002237,0.016781,0.020182,0.004395,0.002568,0.006233,0.000267,...,0.000827,0.00138,0.004683,0.000668,0.00066,0.000603,0.000543,0.002035,0.002883,0.001541
1,id_001897cda,0.000428,0.0009,0.002015,0.00276,0.001541,0.001793,0.002338,0.013677,0.002401,...,0.00081,0.001335,0.004333,0.000901,0.008541,0.000565,0.00609,0.001071,0.00342,0.002619
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000778,0.000885,0.001584,0.008685,0.016844,0.003525,0.002168,0.004139,0.000274,...,0.00064,0.00158,0.002256,0.017213,0.004731,0.00063,0.001566,0.001853,0.000449,0.001711
4,id_0027f1083,0.001467,0.001504,0.001372,0.015553,0.022895,0.004258,0.003497,0.001798,0.000473,...,0.000648,0.000696,0.002504,0.001366,0.00145,0.000665,0.001367,0.001892,0.000241,0.001677


# Final blending and submission

In [28]:
# weighted blend of the two submissions: 75% sub1 + 25% sub2
# (sub1 comes from an earlier cell; rows are combined by position)
blend_weights = (0.75, 0.25)
sub1_part = blend_weights[0] * sub1.iloc[:, 1:].values
sub2_part = blend_weights[1] * sub2.iloc[:, 1:].values

final_sub = pd.DataFrame(sub1_part + sub2_part, columns=target_cols)
final_sub = pd.concat([test_features[['sig_id']], final_sub], axis=1).reset_index(drop=True)

# write the submission file and show a preview
final_sub.to_csv('submission.csv', index=False)
print(final_sub.shape)
final_sub.head()

(3982, 207)


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.00089,0.000999,0.002095,0.017461,0.021171,0.004727,0.003533,0.005454,0.000365,...,0.001157,0.001063,0.004423,0.001443,0.001155,0.000596,0.00113,0.002172,0.002608,0.001669
1,id_001897cda,0.000576,0.000847,0.001309,0.002246,0.001355,0.001726,0.003332,0.014562,0.02068,...,0.000638,0.001011,0.002402,0.000745,0.009775,0.000641,0.006015,0.001558,0.001748,0.00346
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.000753,0.00087,0.001586,0.010785,0.019321,0.004189,0.002797,0.005494,0.000409,...,0.000605,0.001302,0.002131,0.006728,0.004308,0.000559,0.001458,0.001844,0.001596,0.001804
4,id_0027f1083,0.001997,0.001364,0.001545,0.018901,0.025156,0.004981,0.003714,0.002183,0.000472,...,0.000828,0.000783,0.002813,0.001551,0.001546,0.000642,0.001166,0.001989,0.000684,0.001632
