In [1]:
import os
import sys
import numpy as np
import pandas as pd
import random

import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA

In [2]:
# Requires iterative-stratification by Y.Nakama
# The package is not imported from site-packages: its source folder (relative path
# below) is appended to sys.path so that `iterstrat` becomes importable.
# NOTE(review): the path looks like an attached Kaggle input dataset; confirm it is attached before running.
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
def seed_everything(seed):
    """Seed every random source this notebook touches with the same value.

    Sets the PYTHONHASHSEED environment variable and seeds Python's `random`
    module, NumPy's global generator and TensorFlow's global seed.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

seed_everything(42)

## 1. Loading dataframes

In [4]:
# Folder holding the competition CSVs; the trailing slash is kept because
# file names are appended to it by plain string concatenation.
moa_dir = '/kaggle/input/lish-moa/'


def _read_moa_csv(file_name):
    """Read one CSV located directly under `moa_dir` into a DataFrame."""
    return pd.read_csv(moa_dir + file_name)


train_features = _read_moa_csv('train_features.csv')
train_targets = _read_moa_csv('train_targets_scored.csv')
test_features = _read_moa_csv('test_features.csv')
train_drug = _read_moa_csv('train_drug.csv')

## 2. Data processing

### 2.1 Feature engineering

In [5]:
# Build the sig_id -> drug_id lookup once. The previous per-row lambda re-scanned
# the whole train_drug frame for every signature (quadratic in the row count);
# mapping through an indexed Series is a single hash join.
# drop_duplicates keeps the first occurrence, matching the old `.values[0]`.
sig_to_drug = train_drug.drop_duplicates('sig_id').set_index('sig_id')['drug_id']

# The per-row lookup raised IndexError for a signature without a drug entry;
# Series.map would silently yield NaN instead, so keep failing loudly.
unmapped = ~train_features['sig_id'].isin(sig_to_drug.index)
if unmapped.any():
    raise IndexError(
        f"{int(unmapped.sum())} sig_id value(s) in train_features have no entry in train_drug"
    )

# Adding drug_id to train_features (the sig_id column is replaced in place, so
# drug_id keeps the column position sig_id had)
edrug_train = (
    train_features
    .assign(sig_id=train_features['sig_id'].map(sig_to_drug))
    .rename(columns={'sig_id': 'drug_id'})
)

# One Hot Encoding
ohe_edrug_train = pd.get_dummies(edrug_train, columns=['cp_type', 'cp_dose'])

# group by drug_id using mean: one averaged feature row per drug, drug_id itself is discarded
grouped_ohe_edrug_train = ohe_edrug_train.groupby('drug_id').mean().reset_index(drop=True)

# Append rows of grouped drug features to original features dataset.
# Row order is unchanged (per-drug rows first, then the original rows);
# ignore_index avoids the duplicate index labels a plain concat would leave behind.
temp_features = pd.get_dummies(train_features, columns=['cp_type', 'cp_dose'])
full_edrug_train = pd.concat(
    [grouped_ohe_edrug_train, temp_features.drop(columns='sig_id')],
    ignore_index=True,
)

In [6]:
# Build the sig_id -> drug_id lookup once. The previous per-row lambda re-scanned
# the whole train_drug frame for every signature (quadratic in the row count);
# mapping through an indexed Series is a single hash join.
# drop_duplicates keeps the first occurrence, matching the old `.values[0]`.
sig_to_drug = train_drug.drop_duplicates('sig_id').set_index('sig_id')['drug_id']

# The per-row lookup raised IndexError for a signature without a drug entry;
# Series.map would silently yield NaN instead, so keep failing loudly.
unmapped = ~train_targets['sig_id'].isin(sig_to_drug.index)
if unmapped.any():
    raise IndexError(
        f"{int(unmapped.sum())} sig_id value(s) in train_targets have no entry in train_drug"
    )

# Adding drug_id to train_targets (the sig_id column is replaced in place, so
# drug_id keeps the column position sig_id had)
edrug_targets = (
    train_targets
    .assign(sig_id=train_targets['sig_id'].map(sig_to_drug))
    .rename(columns={'sig_id': 'drug_id'})
)

# group by drug_id using mean: one averaged target row per drug, drug_id itself is discarded
grouped_edrug_targets = edrug_targets.groupby('drug_id').mean().reset_index(drop=True)

# Append rows of grouped drug targets to original targets dataset.
# Row order is unchanged (per-drug rows first, then the original rows);
# ignore_index avoids the duplicate index labels a plain concat would leave behind.
full_edrug_targets = pd.concat(
    [grouped_edrug_targets, train_targets.drop(columns='sig_id')],
    ignore_index=True,
)

In [7]:
# One Hot Encoding - convert categorical variable into dummy/indicator variables
train_features = pd.get_dummies(train_features, columns=['cp_type', 'cp_dose'])
test_features = pd.get_dummies(test_features, columns=['cp_type', 'cp_dose'])

# The two frames are encoded independently, so a category that occurs in only one
# of them would give train and test different dummy columns and break the model's
# input width later on. Align test to the train schema: indicator columns missing
# from test are added as all-zero, columns unknown to train are dropped.
# When both frames already share the same columns this is a no-op.
test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

In [8]:
# Remove sig_id
# The tuple holds references to the frames themselves, so the in-place drop
# below changes train_features / train_targets / test_features directly.
all_dfs = (train_features, train_targets, test_features)

for df in all_dfs:
    # errors='ignore' makes this a no-op for a frame that has no sig_id column
    df.drop(columns='sig_id', errors='ignore', inplace=True)

## 3. Principal Component Analysis (PCA)

In [9]:
def _columns_with_prefix(frame, prefix):
    """Return the column names of `frame` that start with `prefix`, in frame order."""
    return [name for name in frame.columns if name.startswith(prefix)]


# Gene ('g-') and cell ('c-') column names, taken from the training frame
GENE_COLS = _columns_with_prefix(train_features, 'g-')
CELL_COLS = _columns_with_prefix(train_features, 'c-')

# Per-group column subsets of both splits (test is sliced with the train-derived names)
train_features_gene, train_features_cell = train_features[GENE_COLS], train_features[CELL_COLS]
test_features_gene, test_features_cell = test_features[GENE_COLS], test_features[CELL_COLS]

In [10]:
"""
0.9 var is 354 genes, 13 cells
0.95 var is 513 genes, 46 cells
0.65 var is 50 genes

variations to try:
    gene 50, cell 15
    gene 11, cell 10
    gene 30, cell 10
"""

N_COMP_GENE = 50
N_COMP_CELL = 15


def _pca_train_test(train_block, test_block, n_components, prefix):
    """Fit PCA on the training block and project both blocks with that one fit.

    Fitting a second, independent PCA on the test block (as was done before)
    yields components in a different basis, so column `pc-*-i` would not mean
    the same thing in train and test. Here the test block is only transformed.

    Args:
        train_block: training columns the PCA is fitted on.
        test_block: test columns with the same layout as `train_block`.
        n_components: number of principal components to keep.
        prefix: column-name prefix; component i is named f'{prefix}{i}'.

    Returns:
        (train_components, test_components) as DataFrames with a default index.
    """
    pca = PCA(n_components=n_components)
    component_names = [f'{prefix}{i}' for i in range(n_components)]
    train_components = pd.DataFrame(data=pca.fit_transform(train_block), columns=component_names)
    test_components = pd.DataFrame(data=pca.transform(test_block), columns=component_names)
    return train_components, test_components


# PCA for gene
pca_gene_train, pca_gene_test = _pca_train_test(
    train_features_gene, test_features_gene, N_COMP_GENE, 'pc-g-')

# PCA for cell
pca_cell_train, pca_cell_test = _pca_train_test(
    train_features_cell, test_features_cell, N_COMP_CELL, 'pc-c-')

In [11]:
def get_pca_dataset(dataset, pca_gene, pca_cell, concat=True):
    """Attach the gene and cell principal components to `dataset` column-wise.

    With `concat` truthy the original columns are kept and the components are
    appended (more columns). Otherwise the columns listed in GENE_COLS and
    CELL_COLS are dropped first, so the components replace them.
    """
    base = dataset if concat else dataset.drop(columns=GENE_COLS + CELL_COLS)
    return pd.concat([base, pca_gene, pca_cell], axis=1)

In [12]:
# Widen both splits with their principal components; concat=True keeps the original columns
pca_train = get_pca_dataset(train_features, pca_gene_train, pca_cell_train, concat=True)
pca_test = get_pca_dataset(test_features, pca_gene_test, pca_cell_test, concat=True)

## 4. Model-building function

In [13]:
def get_model(train_df, *units):
    """Build and compile a feed-forward Keras classifier.

    Args:
        train_df: 2-D array-like; only `train_df.shape[1]` is read, as the input width.
        *units: layer widths. Every value but the last becomes a hidden block
            (Dense with ReLU -> BatchNormalization -> Dropout(0.5)); the last
            value is the width of the sigmoid output layer.

    Returns:
        A `tf.keras.Sequential` named "naiveboi", compiled with binary
        cross-entropy loss and the Adam optimizer.
    """
    keras_layers = tf.keras.layers
    n_inputs = train_df.shape[1]

    model = tf.keras.Sequential(name="naiveboi")
    add_layer = model.add

    # Input followed by a normalisation of the raw features
    add_layer(keras_layers.Input(shape=(n_inputs,)))
    add_layer(keras_layers.BatchNormalization())

    # Hidden blocks
    for hidden_width in units[:-1]:
        add_layer(keras_layers.Dense(units=hidden_width, activation='relu'))
        add_layer(keras_layers.BatchNormalization())
        add_layer(keras_layers.Dropout(0.5))

    # Output layer: one independent probability per unit
    add_layer(keras_layers.Dense(units=units[-1], activation="sigmoid"))

    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

## 5. Preparing datasets and prediction buffers

In [14]:
### MODIFY THIS FOR DIFFERENT DATASETS ###

THE_TRAIN = train_features
THE_TEST = test_features
THE_TARGET = train_targets

##########################################

# Fail early on inconsistent inputs instead of deep inside model.fit / predict
assert THE_TRAIN.shape[0] == THE_TARGET.shape[0], "train features and targets must have the same number of rows"
assert THE_TRAIN.shape[1] == THE_TEST.shape[1], "train and test must have the same number of feature columns"

# Width of the prediction buffers follows the chosen target frame
# (previously hard-coded to 206), so swapping THE_TARGET cannot desynchronise them.
N_TARGETS = THE_TARGET.shape[1]

# Out-of-fold predictions for the training rows / fold-averaged predictions for the test rows
pred_val = np.zeros((THE_TRAIN.shape[0], N_TARGETS))
pred_test = np.zeros((THE_TEST.shape[0], N_TARGETS))

# Plain NumPy views of the frames, indexed with fold index arrays during training
features = THE_TRAIN.values
targets = THE_TARGET.values
tests = THE_TEST.values

# One validation loss per fold is appended here while the folds are trained
validation_scores = []

## 6. Run model

In [15]:
def run(features, targets, tests, pred, pe, n_split=5, stratified=False):
    """Train one model per cross-validation fold and collect its predictions.

    Args:
        features: training feature matrix; rows are selected with fold index arrays.
        targets: training target matrix, row-aligned with `features`; its
            second dimension sets the width of the model's output layer.
        tests: test feature matrix passed to `model.predict`.
        pred: buffer for out-of-fold predictions; the rows of each validation
            fold are overwritten in place.
        pe: buffer for the test prediction; every fold adds its test
            prediction divided by `n_split` in place.
        n_split: number of folds.
        stratified: if truthy, split with MultilabelStratifiedKFold on
            (features, targets); otherwise use a plain KFold on features.

    Returns:
        The tuple (pred, pe), i.e. the same buffers that were passed in.

    Side effects:
        Appends each fold's validation loss to the module-level
        `validation_scores` list and prints progress to stdout.
    """
    if stratified:
        print(f"Using stratified KFold ({n_split})\n")
        kf = MultilabelStratifiedKFold(n_splits=n_split)
        kf_split = kf.split(features, targets)
    else:
        print(f"Using KFold ({n_split})\n")
        kf = KFold(n_split)
        kf_split = kf.split(features)

    # Hidden widths are fixed; the output width follows the target matrix
    # instead of a hard-coded 206 so other target sets work unchanged.
    nn_layers = (512, 1024, targets.shape[1])

    for kfoldnumber, (train_index, validation_index) in enumerate(kf_split, start=1):
        print(f'{"#" * 30} Fold {kfoldnumber} {"#" * 30}')

        # Slice each fold once instead of re-indexing for fit / evaluate / predict
        x_train, y_train = features[train_index], targets[train_index]
        x_val, y_val = features[validation_index], targets[validation_index]

        # A fresh callback per fold: it keeps internal state about the best loss seen.
        # `min_delta` is the current name of the deprecated `epsilon` argument.
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss',
                                           factor=0.1, patience=3, verbose=0,
                                           min_delta=1e-4, mode='min')

        model = get_model(features, *nn_layers)

        model.fit(x_train,
                  y_train,
                  batch_size=128,
                  epochs=35,
                  validation_data=(x_val, y_val),
                  verbose=0,
                  callbacks=[reduce_lr_loss])

        print('\ntrain loss:\t', model.evaluate(x_train, y_train,
                                                verbose=0, batch_size=128))

        validate_score = model.evaluate(x_val, y_val,
                                        verbose=0, batch_size=128)
        print('validate loss:\t', validate_score)
        validation_scores.append(validate_score)

        print('\npredicting validation...')

        pred[validation_index] = model.predict(x_val, verbose=0, batch_size=128)

        print('predicting test...\n')

        # Running average over folds: each fold contributes 1 / n_split
        pe += model.predict(tests, verbose=0, batch_size=128) / n_split

    print('###########################################################################\n\nFIN')

    return pred, pe

In [16]:
# Run model with the defaults of run(): plain (non-stratified) KFold with 5 splits
pred_val, pred_test = run(features, targets, tests, pred=pred_val, pe=pred_test)

Using KFold (5)

############################## Fold 1 ##############################

train loss:	 0.010042713023722172
validate loss:	 0.015097200870513916

predicting validation...
predicting test...

############################## Fold 2 ##############################

train loss:	 0.013382095843553543
validate loss:	 0.015531923621892929

predicting validation...
predicting test...

############################## Fold 3 ##############################

train loss:	 0.011185313574969769
validate loss:	 0.014890827238559723

predicting validation...
predicting test...

############################## Fold 4 ##############################

train loss:	 0.013323813676834106
validate loss:	 0.015441926196217537

predicting validation...
predicting test...

############################## Fold 5 ##############################

train loss:	 0.013228744268417358
validate loss:	 0.015688350424170494

predicting validation...
predicting test...

################################################

In [17]:
# Mean of the per-fold validation losses collected during training
n_scored_folds = len(validation_scores)
mean_validation_score = sum(validation_scores) / n_scored_folds
print(mean_validation_score)

0.015330045670270919



## 7. Submission

In [18]:
# Target column names come from the scored-targets file (without its sig_id column)
columns = pd.read_csv(moa_dir + "train_targets_scored.csv").drop(columns='sig_id')
# The sample submission supplies the sig_id values for the test rows
sample = pd.read_csv(moa_dir + "sample_submission.csv")

# Fold-averaged test predictions, labelled with the target names, sig_id as first column
submission = pd.DataFrame(data=pred_test, columns=columns.columns)
submission.insert(0, column='sig_id', value=sample['sig_id'])

submission.to_csv('submission.csv', index=False)

In [19]:
# Display the frame that was just written to submission.csv as a final visual check
submission

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000838,0.001092,0.001142,0.015157,0.029094,0.004719,0.002404,0.003873,0.000187,...,0.000353,0.000757,0.002410,0.001695,0.000286,0.000320,0.000371,0.001670,0.002145,0.001511
1,id_001897cda,0.000247,0.000692,0.001400,0.001946,0.003007,0.001901,0.001763,0.012937,0.003083,...,0.000361,0.001409,0.001673,0.000435,0.015093,0.000353,0.009957,0.000836,0.001391,0.001238
2,id_002429b5b,0.000005,0.000004,0.000056,0.000376,0.000198,0.000040,0.000097,0.000066,0.000010,...,0.000007,0.000022,0.000110,0.000039,0.000162,0.000004,0.000112,0.000048,0.000020,0.000030
3,id_00276f245,0.000279,0.000495,0.001560,0.007956,0.007886,0.003552,0.001641,0.003648,0.000279,...,0.000212,0.001270,0.002634,0.032975,0.008590,0.000223,0.002812,0.001133,0.001366,0.001293
4,id_0027f1083,0.001248,0.001157,0.001599,0.012579,0.021428,0.003806,0.005118,0.003001,0.000241,...,0.000378,0.000762,0.006634,0.005553,0.000526,0.000413,0.000802,0.001257,0.000815,0.001293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.000221,0.000730,0.001241,0.004662,0.007864,0.002074,0.000961,0.006762,0.000513,...,0.000236,0.002074,0.001543,0.010592,0.005199,0.000374,0.003328,0.000977,0.000698,0.000706
3978,id_ff925dd0d,0.002526,0.001475,0.000708,0.004719,0.032519,0.004803,0.004367,0.003628,0.000553,...,0.000210,0.000596,0.002439,0.000751,0.001253,0.000333,0.002649,0.001248,0.000460,0.001337
3979,id_ffb710450,0.000760,0.000623,0.000547,0.008706,0.029259,0.004488,0.002794,0.004285,0.000225,...,0.000146,0.000252,0.001058,0.000538,0.000864,0.000244,0.000381,0.000779,0.000598,0.000904
3980,id_ffbb869f2,0.000701,0.000688,0.001048,0.013988,0.023366,0.006923,0.003021,0.003326,0.000335,...,0.000208,0.000363,0.002311,0.000468,0.000796,0.000261,0.001332,0.001220,0.000685,0.002341
