In [None]:
import os
import numpy as np
import pandas as pd
import random

import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau

from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from statistics import *

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Seed value shared by all generators.
    """
    # NOTE(review): PYTHONHASHSEED is normally read when the interpreter
    # starts, so setting it here presumably only reaches child processes.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


SEED = 42
seed_everything(SEED)

## 1. Loading dataframes

In [None]:
# Read the five input tables in one pass; the names on the left line up
# one-to-one with the paths below.
# NOTE(review): gene_cluster.csv is read from 'list-moa' while every other
# file comes from 'lish-moa' - confirm this is a separate dataset, not a typo.
(train_features,
 train_targets,
 test_features,
 train_drug,
 gene_cluster) = map(pd.read_csv, (
    '/kaggle/input/lish-moa/train_features.csv',
    '/kaggle/input/lish-moa/train_targets_scored.csv',
    '/kaggle/input/lish-moa/test_features.csv',
    '/kaggle/input/lish-moa/train_drug.csv',
    '/kaggle/input/list-moa/gene_cluster.csv',
))

## 2. Data processing

In [None]:
# One-hot encode the categorical columns, applying the same call to the
# train and test frames.
CATEGORICAL_COLS = ['cp_type', 'cp_dose']

train_features, test_features = (
    pd.get_dummies(frame, columns=CATEGORICAL_COLS)
    for frame in (train_features, test_features)
)

In [None]:
# Strip the sig_id identifier column from every frame that still carries it.
# The frames are modified in place, so the existing names keep pointing at
# the trimmed data.
all_dfs = (train_features, train_targets, test_features)

for frame in all_dfs:
    frame.drop(columns='sig_id', errors='ignore', inplace=True)

In [None]:
# Gene clustering
# Adds one feature per gene cluster: the row-wise mean of the gene columns
# that belong to that cluster.
GENE_COLS = [col for col in train_features.columns if col.startswith('g-')]

CLUSTER_LABELS = ('1', '2', '3', '4')

# Compare labels as text so the match does not depend on whether read_csv
# parsed the 'cluster' column as strings or as integers.
_cluster_as_text = gene_cluster['cluster'].astype(str)


def _gene_cols_in_cluster(label):
    """Return the GENE_COLS entries whose gene_cluster row carries `label`.

    Row i of gene_cluster is taken to describe GENE_COLS[i].
    NOTE(review): assumes gene_cluster rows follow GENE_COLS order - confirm.

    Raises:
        ValueError: If no gene carries the label, which would otherwise
            yield an all-NaN feature column.
    """
    positions = np.flatnonzero((_cluster_as_text == label).to_numpy())
    cols = [GENE_COLS[i] for i in positions]
    if not cols:
        raise ValueError(
            f"no genes found for cluster label {label!r}; check gene_cluster['cluster']"
        )
    return cols


GENE_COLS_1, GENE_COLS_2, GENE_COLS_3, GENE_COLS_4 = map(_gene_cols_in_cluster, CLUSTER_LABELS)

_cluster_columns = dict(
    zip(CLUSTER_LABELS, (GENE_COLS_1, GENE_COLS_2, GENE_COLS_3, GENE_COLS_4))
)

# Vectorized row means replace the per-row Python loops; train is processed
# before test and the new columns are named cluster1..cluster4 as before.
for frame in (train_features, test_features):
    for label, cols in _cluster_columns.items():
        # skipna=False: a missing gene value makes the row mean NaN instead
        # of being silently skipped.
        frame[f'cluster{label}'] = frame[cols].mean(axis=1, skipna=False)

## 3. PCA

In [None]:
CELL_COLS = [col for col in train_features.columns if col.startswith('c-')]

train_features_gene = train_features.loc[:, GENE_COLS]
test_features_gene = test_features.loc[:, GENE_COLS]
train_features_cell = train_features.loc[:, CELL_COLS]
test_features_cell = test_features.loc[:, CELL_COLS]

N_COMP_GENE = 30
N_COMP_CELL = 10
PCA_SEED = 42


def _pca_train_test(train_block, test_block, n_components, prefix):
    """Fit PCA on the training block and project both blocks with it.

    The decomposition is fitted once, on `train_block`, and the same fitted
    components are applied to `test_block`. Fitting a second PCA on the test
    block would place its components in a different basis, so a column such
    as 'pc-g-0' would not mean the same thing in train and test.

    Args:
        train_block: DataFrame of training columns to decompose.
        test_block: DataFrame holding the same columns for the test rows.
        n_components: Number of principal components to keep.
        prefix: Column-name prefix; components are named prefix0, prefix1, ...

    Returns:
        (train_components, test_components) as DataFrames that keep the row
        index of their respective input block.
    """
    pca = PCA(n_components=n_components, random_state=PCA_SEED)
    names = [f'{prefix}{i}' for i in range(n_components)]
    train_components = pd.DataFrame(
        data=pca.fit_transform(train_block), columns=names, index=train_block.index
    )
    test_components = pd.DataFrame(
        data=pca.transform(test_block), columns=names, index=test_block.index
    )
    return train_components, test_components


# PCA for gene
pca_gene_train, pca_gene_test = _pca_train_test(
    train_features_gene, test_features_gene, N_COMP_GENE, 'pc-g-'
)

# PCA for cell
pca_cell_train, pca_cell_test = _pca_train_test(
    train_features_cell, test_features_cell, N_COMP_CELL, 'pc-c-'
)

In [None]:
# Place the gene and cell principal components to the right of the existing
# feature columns, for train and test alike.
pca_train = pd.concat(
    [train_features.copy(), pca_gene_train, pca_cell_train],
    axis=1,
)
pca_test = pd.concat(
    [test_features.copy(), pca_gene_test, pca_cell_test],
    axis=1,
)

## 4. Model-building function

In [None]:
def get_model(train_df, *units):
    """Build and compile a dense feed-forward network.

    Layout: input -> batch norm -> one (dense relu -> batch norm ->
    dropout 0.55) group per hidden width -> dense sigmoid output.

    Args:
        train_df: Object whose ``shape[1]`` is the number of input features.
        *units: Layer widths. Every value except the last is a hidden layer
            width; the last value is the width of the output layer.

    Returns:
        The ``tf.keras.Sequential`` model, compiled with binary
        cross-entropy loss and the Adam optimizer.
    """
    layers = tf.keras.layers
    n_inputs = train_df.shape[1]

    # Layers are created in the same order in which they are stacked.
    stack = [layers.Input(shape=(n_inputs,)), layers.BatchNormalization()]
    for width in units[:-1]:
        stack.append(layers.Dense(units=width, activation='relu'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(0.55))
    stack.append(layers.Dense(units=units[-1], activation="sigmoid"))

    model = tf.keras.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

## 5. Preparing datasets and prediction buffers

In [None]:
### MODIFY THIS FOR DIFFERENT DATASETS ###

THE_TRAIN = pca_train
THE_TEST = pca_test

##########################################

# The model is fed plain arrays, so train and test must present the same
# columns in the same order.
assert list(THE_TRAIN.columns) == list(THE_TEST.columns), \
    "train and test feature columns differ"

# Width of the prediction buffers follows the target frame instead of a literal.
N_TARGETS = train_targets.shape[1]

# Out-of-fold predictions (one row per training sample) and the accumulated
# test predictions (one row per test sample).
pred_val = np.zeros((THE_TRAIN.shape[0], N_TARGETS))
pred_test = np.zeros((THE_TEST.shape[0], N_TARGETS))

# An explicit float dtype keeps the arrays numeric even when the one-hot
# columns are boolean (the get_dummies default in pandas >= 2.0); a plain
# `.values` on such a mixed frame yields an object array.
features = THE_TRAIN.to_numpy(dtype=float)
targets = train_targets.to_numpy()
tests = THE_TEST.to_numpy(dtype=float)

# One validation loss per fold, appended by run().
validation_scores = []

## 6. Run model

In [None]:
def run(features, targets, tests, pred, pe, n_split=5):
    """Train one model per K-fold split and collect its predictions.

    For every fold a fresh model from ``get_model`` is fitted on the training
    rows, evaluated on the training and validation rows, and used to predict
    both the validation rows and the full test set.

    Args:
        features: 2-D array of training features.
        targets: 2-D array of training targets, row-aligned with `features`.
        tests: 2-D array of test features.
        pred: Array receiving out-of-fold predictions; the rows of each
            validation fold are overwritten in place.
        pe: Array accumulating test predictions in place; each fold adds its
            prediction divided by `n_split`, so it should start at zero.
        n_split: Number of K-fold splits (KFold is otherwise left at its
            default settings).

    Returns:
        The ``(pred, pe)`` pair after all folds have run.

    Side effects:
        Appends each fold's validation loss to the module-level
        ``validation_scores`` list and prints progress to stdout.
    """
    # Hidden widths are fixed; the output width follows the target matrix
    # rather than a hard-coded column count.
    nn_layers = (512, 1024, targets.shape[1])

    folds = KFold(n_split).split(features)
    for kfoldnumber, (train_index, validation_index) in enumerate(folds, start=1):
        print(f'{"#" * 30} Fold number {kfoldnumber} {"#" * 30}')

        # `min_delta` is the supported name for the improvement threshold;
        # the older `epsilon` spelling is deprecated.
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss',
                                           factor=0.1, patience=3, verbose=0,
                                           min_delta=1e-4, mode='min')

        model = get_model(features, *nn_layers)

        # Slice each fold once and reuse the slices below.
        train_x, train_y = features[train_index], targets[train_index]
        val_x, val_y = features[validation_index], targets[validation_index]

        model.fit(train_x,
                  train_y,
                  batch_size=128,
                  epochs=35,
                  validation_data=(val_x, val_y),
                  verbose=0,
                  callbacks=[reduce_lr_loss])

        print()
        print('train loss:\t', model.evaluate(train_x, train_y,
                                              verbose=0, batch_size=128))

        validate_score = model.evaluate(val_x, val_y,
                                        verbose=0, batch_size=128)
        print('validate loss:\t', validate_score)
        validation_scores.append(validate_score)

        print()
        print('predict validation...')
        pred[validation_index] = model.predict(val_x, verbose=0, batch_size=128)

        print('predict test...')
        # Each fold contributes 1/n_split, so the sum is the fold average.
        pe += model.predict(tests, verbose=0, batch_size=128) / n_split
        print()

    print('###########################################################################\n\nFIN')
    return pred, pe

In [None]:
# Run model
# run() adds into the test buffer and appends to validation_scores, so start
# from zeroed buffers and an empty score list. This keeps the cell safe to
# re-run without mixing in results from an earlier execution.
validation_scores.clear()
pred_val, pred_test = run(
    features, targets, tests,
    np.zeros_like(pred_val), np.zeros_like(pred_test),
)

In [None]:
# Mean of the per-fold validation losses collected while running the model
fold_count = len(validation_scores)
print(sum(validation_scores) / fold_count)


## 7. Submission

In [None]:
# Same absolute location the input frames were loaded from, so this cell
# does not depend on the current working directory.
moa_dir = '/kaggle/input/lish-moa/'

# Only the header row is needed to recover the target column names.
columns = pd.read_csv(moa_dir + "train_targets_scored.csv", nrows=0).drop(columns='sig_id')
sample = pd.read_csv(moa_dir + "sample_submission.csv")

# NOTE(review): assumes sample_submission rows are in the same order as the
# test feature rows the predictions were made from - confirm.
if len(sample) != len(pred_test):
    raise ValueError(
        f'sample_submission has {len(sample)} rows but pred_test has {len(pred_test)}'
    )

submission = pd.DataFrame(data=pred_test, columns=columns.columns)
# Insert the ids by position so the result does not depend on index alignment.
submission.insert(0, column='sig_id', value=sample['sig_id'].to_numpy())

submission.to_csv('submission.csv', index=False)
print(submission)