In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

import warnings
warnings.simplefilter('ignore')
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/test_features.csv


This notebook is based on the following notebooks:<br>
https://www.kaggle.com/nroman/moa-lightgbm-206-models<br>
https://www.kaggle.com/pavelvpster/moa-lgb-optuna<br>

## Reading data

In [2]:
# Directory holding the competition CSV files.
path = "/kaggle/input/lish-moa/"
train_features = pd.read_csv(path+"train_features.csv")
test_features = pd.read_csv(path+"test_features.csv")
train_targets_scored = pd.read_csv(path+"train_targets_scored.csv")

# From https://www.kaggle.com/carlmcbrideellis/moa-setting-ctl-vehicle-0-improves-score
# For rows whose cp_type contains 'ctl_vehicle', set every column whose name
# contains a '-' to 0.0.
# `.loc` is required here: `.at` only supports access to a single scalar cell
# and rejects a boolean row mask / list of columns in current pandas releases.
train_features.loc[train_features['cp_type'].str.contains('ctl_vehicle'), train_features.filter(regex='-.*').columns] = 0.0
test_features.loc[test_features['cp_type'].str.contains('ctl_vehicle'), test_features.filter(regex='-.*').columns] = 0.0

## Data Preparation

In [3]:
def onehotencode(data):
    """Return ``data`` one-hot encoded via ``pd.get_dummies`` with default arguments.

    The input object is not modified; the encoded result is returned.
    """
    return pd.get_dummies(data)

In [4]:
# Training features and targets, both without the "sig_id" identifier column.
# `drop` returns a new frame, so the frames loaded from disk stay untouched.
X = onehotencode(train_features.drop("sig_id", axis=1))
y_train = train_targets_scored.drop("sig_id", axis=1)

# Test features; the identifiers are kept separately in `sig_id`.
sig_id = test_features["sig_id"]
X_test = onehotencode(test_features.drop("sig_id", axis=1))

# Train and test are one-hot encoded independently, so a category that occurs
# in only one of them would produce different columns. Force the test matrix
# onto the training columns (same set, same order); dummy columns missing from
# the test data are filled with 0. This is a no-op when the columns already match.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

columns = X_test.columns
# Submission template; its target columns are overwritten with predictions later.
sub = pd.read_csv(path+"sample_submission.csv")

## Modelling

In [5]:
# LightGBM parameters shared by every per-target binary classifier.
# NOTE(review): no 'bagging_freq' is set. Per the LightGBM parameter docs,
# 'bagging_fraction' only takes effect when 'bagging_freq' is non-zero, so
# bagging appears to be disabled here - confirm whether that is intended.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': "gbdt",
    'num_leaves': 500,
    'min_child_weight': 0.01,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.6,
    'min_data_in_leaf': 100,
    'max_depth': -1,
    'learning_rate': 0.01,
    'bagging_seed': 11,
    'verbosity': 0,
    'reg_alpha': 0.4,
    'reg_lambda': 0.6,
    'random_state': 0
         }

# Running sum of the per-target out-of-fold log losses.
accumulative_loss = 0
skf = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

# 206 different models. One for each label
for model_num, target in enumerate(y_train, 1):
    y = y_train[target]
    preds = np.zeros(X_test.shape[0])  # fold-averaged test predictions
    oof = np.zeros(X.shape[0])         # out-of-fold predictions on the training rows

    for trn_idx, val_idx in skf.split(X, y):
        trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X.iloc[val_idx], label=y.iloc[val_idx])
        # Early stopping and log silencing are passed as callbacks (available
        # from LightGBM 3.3): the `early_stopping_rounds` / `verbose_eval`
        # keyword arguments were removed from lgb.train() in LightGBM 4.0.
        # Fresh callback objects are created for every fold.
        clf = lgb.train(
            params,
            trn_data,
            10000,
            valid_sets=[trn_data, val_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=20, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )
        oof[val_idx] = clf.predict(X.iloc[val_idx])
        preds += clf.predict(X_test) / skf.n_splits

    sub[target] = preds
    # Explicit labels keep log_loss defined even if a target column happens to
    # contain a single class only.
    loss = log_loss(y, oof, labels=[0, 1])
    accumulative_loss += loss
    print(f"num: {model_num}, loss: {loss}")

num: 1, loss: 0.005500134934250741
num: 2, loss: 0.006127405857487018
num: 3, loss: 0.007958126867749563
num: 4, loss: 0.04568846247628025
num: 5, loss: 0.06682891170129239
num: 6, loss: 0.020697632301132855
num: 7, loss: 0.01587163125010799
num: 8, loss: 0.02596268290381516
num: 9, loss: 0.0033754323265396415
num: 10, loss: 0.05852526625181986
num: 11, loss: 0.07681585052995592
num: 12, loss: 0.014610311026591525
num: 13, loss: 0.0020186398188088354
num: 14, loss: 0.011400896074435753
num: 15, loss: 0.004318534089361443
num: 16, loss: 0.004273171671320986
num: 17, loss: 0.014515755752692844
num: 18, loss: 0.024580478736376768
num: 19, loss: 0.022421103393187647
num: 20, loss: 0.011312791607561087
num: 21, loss: 0.011566051499098586
num: 22, loss: 0.020205856465480315
num: 23, loss: 0.0023393104996041717
num: 24, loss: 0.012675414367280667
num: 25, loss: 0.004298916916073611
num: 26, loss: 0.004507812422094568
num: 27, loss: 0.004304433207179347
num: 28, loss: 0.00618326038239716
num: 

In [6]:
# Mean of the per-target log losses accumulated in the training loop. Divide by
# the actual number of target columns instead of a hard-coded 206 so the
# average stays correct if the set of targets changes.
print('Overall loss: {:.5f}'.format(accumulative_loss / y_train.shape[1]))

Overall loss: 0.01617


In [7]:
# Write the filled-in submission frame to submission.csv, omitting the index.
sub.to_csv(path_or_buf='submission.csv', index=False)