In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization

from sklearn.model_selection import StratifiedKFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/test_features.csv


Reference for the ensembling approach (weighted averaging of model predictions): https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/ <br>
<br>
Notebooks used for individual models:<br>
<br>
Logistic Regression:<br>
https://www.kaggle.com/sg1993/logistic-regression-model/notebook<br>
https://www.kaggle.com/barteksadlej123/basic-logistic-regression<br>
<br>
LightGBM:<br>
https://www.kaggle.com/nroman/moa-lightgbm-206-models<br>
https://www.kaggle.com/pavelvpster/moa-lgb-optuna<br>
<br>
FNN:<br>
https://www.kaggle.com/simakov/keras-multilabel-neural-network-v1-2/notebook<br>
https://www.kaggle.com/elcaiseri/moa-keras-multilabel-classifier-nn-starter/notebook<br>
https://www.kaggle.com/gogo827jz/moa-lstm-pure-transformer-fast-and-not-bad/notebook?scriptVersionId=42679125<br>

## Reading data

In [2]:
# Load the competition tables from the Kaggle input directory.
path = "/kaggle/input/lish-moa/"
train_features = pd.read_csv(path+"train_features.csv")
test_features = pd.read_csv(path+"test_features.csv")
train_targets_scored = pd.read_csv(path+"train_targets_scored.csv")
sub = pd.read_csv(path+"sample_submission.csv")

# From https://www.kaggle.com/carlmcbrideellis/moa-setting-ctl-vehicle-0-improves-score
# On rows whose cp_type contains 'ctl_vehicle', zero every column whose name
# contains a '-'.
# `.loc` is the accessor for boolean-mask / multi-column assignment; `.at` is
# documented for single-cell access only and recent pandas versions reject a
# mask + column-list key.
for features in (train_features, test_features):
    ctl_rows = features['cp_type'].str.contains('ctl_vehicle')
    dashed_cols = features.filter(regex='-.*').columns
    features.loc[ctl_rows, dashed_cols] = 0.0

## General data processing

In [3]:
def onehotencode(data):
    """One-hot encode ``data`` with ``pd.get_dummies``.

    Columns that ``pd.get_dummies`` does not treat as categorical are passed
    through unchanged.

    The indicator dtype is pinned to ``uint8``. pandas 2.0 changed the default
    from ``uint8`` to ``bool``; a frame mixing float and bool columns converts
    to an object array, which numeric consumers cannot ingest.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with indicator columns of dtype ``uint8``.
    """
    return pd.get_dummies(data, dtype=np.uint8)

In [4]:
# Dropping sig_id columns and onehot encoding

# `drop` returns new frames, so the raw tables loaded earlier stay untouched.
X_train = train_features.drop("sig_id", axis=1)
y_train = train_targets_scored.drop("sig_id", axis=1)
X_test = test_features.copy()

sig_id = X_test["sig_id"]  # getting the ids
X_test = X_test.drop("sig_id", axis=1)

# changing cp_time to categorical before onehotencoding
X_train['cp_time'] = X_train['cp_time'].astype(str)
X_test['cp_time'] = X_test['cp_time'].astype(str)

X_train = onehotencode(X_train)
X_test = onehotencode(X_test)

# Train and test are encoded independently, so a category present in only one
# of them would yield different column sets. Align test to the training layout
# (missing indicator columns become 0) so every model sees identical features.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

columns = X_test.columns  # getting the feature columns

## Logistic Regression

In [5]:
# Fit the scaler on the training matrix only, then apply the same
# transformation to both the training and the test matrix.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# A single logistic-regression configuration, wrapped so that it is fitted
# once per target column of y_train.
base_lr = LogisticRegression(
    C=0.01,
    solver="sag",
    max_iter=200,
    random_state=0,
)
model_lr = MultiOutputClassifier(base_lr, n_jobs=-1)
model_lr.fit(X_train_scaled, y_train)

MultiOutputClassifier(estimator=LogisticRegression(C=0.01, max_iter=200,
                                                   random_state=0,
                                                   solver='sag'),
                      n_jobs=-1)

In [7]:
# Logistic-regression part of the blend, scaled by its weight of 0.27.
sub1 = sub.copy()
target_cols = sub.columns.to_list()[1:]  # every column after the first (sig_id)

# Stack the per-target outputs into one 3-D array and keep index 1 of the last
# axis (presumably the positive-class probability - verify against
# model_lr.classes_).
preds1 = np.array(model_lr.predict_proba(X_test_scaled))
preds1 = preds1[:, :, 1] * 0.27

# Transposed so that rows line up with the submission rows.
sub1[target_cols] = preds1.T
sub1.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.000935,0.000873,0.000999,0.004248,0.005246,0.000992,0.000499,0.001237,0.000558,...,0.001118,0.002949,0.001203,0.000533,0.000367,0.000789,0.001445,0.001714,0.00283,0.000842
1,id_001897cda,0.000356,0.000704,0.000488,0.000365,0.001236,0.002581,0.000976,0.002127,0.001231,...,0.000783,0.000673,0.001021,0.000255,0.001483,0.000661,0.000952,0.000783,0.001099,0.001271
2,id_002429b5b,0.000699,0.000662,0.000706,0.000873,0.000998,0.000797,0.000731,0.000785,0.000655,...,0.00064,0.00063,0.000689,0.000982,0.000697,0.000654,0.000724,0.000683,0.000683,0.000652
3,id_00276f245,0.000545,0.000658,0.000817,0.001728,0.000805,0.001726,0.00048,0.000945,0.000509,...,0.000683,0.000502,0.001621,0.0067,0.002463,0.000637,0.000626,0.000722,0.000709,0.000893
4,id_0027f1083,0.001089,0.001166,0.001683,0.002999,0.00595,0.00057,0.001605,0.000784,0.000707,...,0.000847,0.000293,0.002168,0.001209,0.000401,0.000924,0.001812,0.000969,0.000477,0.001038


## LightGBM

In [8]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': "gbdt",
    'num_leaves': 500,
    'min_child_weight': 0.01,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.6,
    'min_data_in_leaf': 100,
    'max_depth': -1,
    'learning_rate': 0.01,
    'bagging_seed': 11,
    'verbosity': 0,
    'reg_alpha': 0.4,
    'reg_lambda': 0.6,
    'random_state': 0
         }

skf = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

sub2 = sub.copy()


def _lgb_train_kwargs(stopping_rounds=20):
    """Return the early-stopping / silent-logging arguments for ``lgb.train``.

    The ``early_stopping_rounds`` and ``verbose_eval`` keyword arguments were
    removed from ``lgb.train`` in LightGBM 4.0 in favour of callbacks. Where
    the callback API is available it is used; older releases fall back to the
    legacy keywords. Fresh callback objects are built on every call so that no
    early-stopping state is shared between separate training runs.
    """
    if hasattr(lgb, "log_evaluation"):
        return {
            "callbacks": [
                lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=False),
                lgb.log_evaluation(period=0),
            ]
        }
    return {"verbose_eval": 0, "early_stopping_rounds": stopping_rounds}


# 206 different models. One for each label
for target in y_train.columns:
    y = y_train[target]
    preds = np.zeros(X_test.shape[0])
    # Out-of-fold predictions for the current target; overwritten for every
    # target and not consumed by the cells shown in this notebook.
    oof = np.zeros(X_train.shape[0])

    for trn_idx, test_idx in skf.split(X_train, y):

        trn_data = lgb.Dataset(X_train.iloc[trn_idx], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[test_idx], label=y.iloc[test_idx])
        # Up to 10000 boosting rounds, stopped early after 20 rounds without
        # improvement on the validation fold.
        clf = lgb.train(params, trn_data, 10000, valid_sets=[trn_data, val_data], **_lgb_train_kwargs(20))
        oof[test_idx] = clf.predict(X_train.iloc[test_idx])
        # Average the test predictions over the folds.
        preds += clf.predict(X_test) / skf.n_splits

    # LightGBM part of the blend, scaled by its weight of 0.33.
    sub2[target] = preds*0.33



In [9]:
# Bare last expression: rendered with the notebook's rich table display,
# matching how the logistic-regression predictions are shown.
sub2.head()

         sig_id  5-alpha_reductase_inhibitor  11-beta-hsd1_inhibitor  \
0  id_0004d9e33                     0.000123                0.000219   
1  id_001897cda                     0.000105                0.000217   
2  id_002429b5b                     0.000097                0.000216   
3  id_00276f245                     0.000103                0.000218   
4  id_0027f1083                     0.000128                0.000217   

   acat_inhibitor  acetylcholine_receptor_agonist  \
0        0.000311                        0.001984   
1        0.000317                        0.002118   
2        0.000310                        0.001149   
3        0.000325                        0.002093   
4        0.000313                        0.002400   

   acetylcholine_receptor_antagonist  acetylcholinesterase_inhibitor  \
0                           0.004111                        0.000927   
1                           0.002706                        0.001002   
2                           0.00

## Feed Forward Neural Network

In [10]:
def create_model(n_input, n_output=206):
    """Build and compile the feed-forward multilabel network.

    Architecture: batch normalisation and 30% dropout on the inputs, then
    three sigmoid dense layers (300, 300 and 700 units), each followed by 60%
    dropout, and a sigmoid output layer with one unit per label.

    Parameters
    ----------
    n_input : int
        Number of input features.
    n_output : int, optional
        Number of labels predicted by the output layer (default 206).

    Returns
    -------
    A compiled ``Sequential`` model using binary cross-entropy loss, the
    'adamax' optimizer and the 'accuracy' metric.
    """
    model = Sequential()
    # `shape` is documented as a tuple; a bare int is not accepted by every
    # Keras version, so wrap the feature count explicitly.
    model.add(Input(shape=(n_input,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(300, kernel_initializer='he_uniform', activation='sigmoid'))
    model.add(Dropout(0.6))
    model.add(Dense(300, activation='sigmoid'))
    model.add(Dropout(0.6))
    model.add(Dense(700, activation='sigmoid'))
    model.add(Dropout(0.6))
    model.add(Dense(n_output, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adamax', metrics=['accuracy'])
    return model

In [11]:
# Size the network input to the number of feature columns, then show the
# layer-by-layer summary.
n_features = len(columns)
model = create_model(n_features)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization (BatchNo (None, 879)               3516      
_________________________________________________________________
dropout (Dropout)            (None, 879)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               264000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               90300     
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 700)               2

In [12]:
# Train for 100 epochs in batches of 32, with 20% of the training rows used
# for validation; the returned history object is kept in `hist`.
hist = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [13]:
# Neural-network part of the blend, scaled by its weight of 0.4.
preds3 = model.predict(X_test) * 0.4

# Wrap the prediction matrix in a frame, put the ids in front and reuse the
# column names of the sample submission.
sub3 = pd.DataFrame.from_records(preds3)
sub3.insert(loc=0, column="sig_id", value=sig_id)
sub3.columns = sub.columns

## Combining into submission

In [14]:
# Sum the three already-weighted prediction tables (0.27 LR + 0.33 LightGBM +
# 0.4 NN) into the final submission.
# The id column is removed into fresh names rather than by overwriting
# sub1/sub2/sub3, so this cell can be re-run without a KeyError on "sig_id".
lr_part = sub1.drop("sig_id", axis=1)
lgb_part = sub2.drop("sig_id", axis=1)
nn_part = sub3.drop("sig_id", axis=1)

submission = lr_part.add(lgb_part, fill_value=0)
submission = submission.add(nn_part, fill_value=0)
submission.insert(0, "sig_id", sig_id)

print(submission.head())
print(nn_part.shape)
print(submission.shape)

         sig_id  5-alpha_reductase_inhibitor  11-beta-hsd1_inhibitor  \
0  id_0004d9e33                     0.001526                0.001528   
1  id_001897cda                     0.000733                0.001025   
2  id_002429b5b                     0.000796                0.000878   
3  id_00276f245                     0.000998                0.001212   
4  id_0027f1083                     0.001743                0.001756   

   acat_inhibitor  acetylcholine_receptor_agonist  \
0        0.002107                        0.013758   
1        0.001055                        0.003278   
2        0.001016                        0.002022   
3        0.002629                        0.012193   
4        0.002530                        0.010436   

   acetylcholine_receptor_antagonist  acetylcholinesterase_inhibitor  \
0                           0.019733                        0.004189   
1                           0.005710                        0.004266   
2                           0.00

In [15]:
# Write the blended predictions to the working directory, without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)