In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


import warnings
warnings.simplefilter('ignore')
import os
# Collect every file under the Kaggle input directory, then print one path per line.
input_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for file_path in input_files:
    print(file_path)

/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/test_features.csv


Reference notebooks this work is based on:<br>
https://www.kaggle.com/sg1993/logistic-regression-model/notebook<br>
https://www.kaggle.com/barteksadlej123/basic-logistic-regression

## Reading data

In [2]:
path = "/kaggle/input/lish-moa/"
train_features = pd.read_csv(path+"train_features.csv")
test_features = pd.read_csv(path+"test_features.csv")
train_targets_scored = pd.read_csv(path+"train_targets_scored.csv")

# From https://www.kaggle.com/carlmcbrideellis/moa-setting-ctl-vehicle-0-improves-score
# For rows whose cp_type contains 'ctl_vehicle', overwrite every column whose
# name contains a hyphen (selected by the '-.*' regex) with 0.0.
#
# `.loc` is used instead of `.at`: `.at` is a scalar-only accessor, and passing
# it a boolean mask plus an Index of columns only worked on older pandas
# releases through an internal fallback (newer releases raise instead).
for features in (train_features, test_features):
    is_control = features['cp_type'].str.contains('ctl_vehicle')
    hyphenated_columns = features.filter(regex='-.*').columns
    features.loc[is_control, hyphenated_columns] = 0.0

## Data Preparations

In [3]:
def onehotencode(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with its categorical columns one-hot encoded.

    This is a thin wrapper around ``pd.get_dummies`` called with its default
    arguments; the input frame itself is not modified.
    """
    return pd.get_dummies(data)

In [4]:
# Dropping sig_id columns and onehot encoding

# `drop` returns a new frame, so the raw train/test frames stay untouched.
X_train = train_features.drop("sig_id", axis=1)
y_train = train_targets_scored.drop("sig_id", axis=1)
X_test = test_features.copy()

# changing cp_time to categorical before onehotencoding
# (as strings, get_dummies treats the values as categories rather than numbers)
X_train['cp_time'] = X_train['cp_time'].astype(str)
X_test['cp_time'] = X_test['cp_time'].astype(str)

X_train = onehotencode(X_train)

sig_id = X_test["sig_id"]  # getting the ids
X_test = X_test.drop("sig_id", axis=1)
X_test = onehotencode(X_test)

# Train and test are encoded independently, so a category missing from one of
# them would give the two frames different columns. Reindex test to the train
# layout (absent dummy columns become 0) so the scaler and the model see the
# same features in the same order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

columns = X_test.columns  # getting the feature columns
sub = pd.read_csv(path+"sample_submission.csv")

In [5]:
# Standardise the features: statistics are learned from the training set only
# and then applied unchanged to both the training and the test matrices.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Modelling

In [6]:
# Base estimator: L2-regularised logistic regression trained with the SAG solver.
base_lr = LogisticRegression(C=0.01, solver="sag", max_iter=200, random_state=0)

# Wrap the base estimator so it is applied to every target column, using all cores.
model_lr = MultiOutputClassifier(base_lr, n_jobs=-1)

# Optional (disabled): 3-fold CV of the multi-output logistic regression
# kf = KFold(n_splits=3, shuffle=True)
# score = cross_val_score(model_lr, X_train, y_train, cv=kf, n_jobs=-1)
# print(score)

In [7]:
# Train the multi-output logistic regression on the scaled training features
# and the scored target columns.
model_lr.fit(X_train, y_train)

MultiOutputClassifier(estimator=LogisticRegression(C=0.01, max_iter=200,
                                                   random_state=0,
                                                   solver='sag'),
                      n_jobs=-1)

In [8]:
# predict_proba yields one probability array per target; keep only column 1
# of each, giving `preds` the shape (n_targets, n_samples).
per_target_probs = model_lr.predict_proba(X_test)
preds = np.array([target_probs[:, 1] for target_probs in per_target_probs])

# Every submission column after the first (sig_id) receives one target's
# probabilities; transpose so rows are samples and columns are targets.
target_columns = list(sub.columns[1:])
sub[target_columns] = preds.T
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.003464,0.003232,0.0037,0.015732,0.019428,0.003676,0.00185,0.004582,0.002065,...,0.00414,0.010924,0.004457,0.001976,0.001357,0.002923,0.005352,0.006348,0.010482,0.003119
1,id_001897cda,0.001319,0.002607,0.001806,0.00135,0.004579,0.00956,0.003614,0.007876,0.004559,...,0.002899,0.002493,0.003783,0.000943,0.005494,0.002449,0.003527,0.0029,0.004072,0.004707
2,id_002429b5b,0.002589,0.00245,0.002615,0.003232,0.003695,0.002953,0.002708,0.002908,0.002426,...,0.002371,0.002332,0.002551,0.003636,0.002582,0.002423,0.002681,0.00253,0.002528,0.002416
3,id_00276f245,0.002019,0.002437,0.003028,0.006401,0.002983,0.006393,0.001779,0.003501,0.001886,...,0.002529,0.00186,0.006003,0.024815,0.009123,0.002359,0.00232,0.002674,0.002625,0.003309
4,id_0027f1083,0.004032,0.00432,0.006233,0.011106,0.022038,0.002113,0.005945,0.002903,0.002617,...,0.003135,0.001085,0.00803,0.00448,0.001485,0.003423,0.006711,0.003588,0.001768,0.003843


In [9]:
# Write the filled-in submission frame to submission.csv, omitting the row index.
sub.to_csv('submission.csv',index=False)