In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import prepare

In [2]:
# Read the four competition CSV files; column 0 of each file becomes the index
X_train, Y_train, X_test, sample = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        'train_features.csv',        # training features
        'train_targets_scored.csv',  # training targets
        'test_features.csv',         # test features
        'sample_submission.csv',     # submission sample
    )
)

# Display the shape of every frame
tuple(frame.shape for frame in (X_train, Y_train, X_test, sample))

((23814, 875), (23814, 206), (3982, 875), (3982, 206))

In [3]:
# Take a look at X_train
X_train.head()

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [4]:
# Take a look at Y_train
Y_train.head()

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_000779bfc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_000a6266a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_0015fd391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_001626bd3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Take a look at X_test
X_test.head()

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.55,-0.1644,-0.214,...,0.0981,0.7978,-0.143,-0.2067,-0.2303,-0.1193,0.021,-0.0502,0.151,-0.775
id_001897cda,trt_cp,72,D1,-0.1829,0.232,1.208,-0.4522,-0.3652,-0.3319,-1.882,...,-0.119,-0.1852,-1.031,-1.367,-0.369,-0.5382,0.0359,-0.4764,-1.381,-0.73
id_002429b5b,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.131,-1.438,0.2455,-0.339,...,-0.2261,0.337,-1.384,0.8604,-1.953,-1.014,0.8662,1.016,0.4924,-0.1942
id_00276f245,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.202,0.5998,...,0.126,0.157,-0.1784,-1.12,-0.4325,-0.9005,0.8131,-0.1305,0.5645,-0.5809
id_0027f1083,trt_cp,48,D1,-0.3979,-1.268,1.913,0.2057,-0.5864,-0.0166,0.5128,...,0.4965,0.7578,-0.158,1.051,0.5742,1.09,-0.2962,-0.5313,0.9931,1.838


## Baseline Model 1

### Data Preparation

In [27]:
# Boolean mask over the training rows: True where cp_type is not 'ctl_vehicle'
mask = X_train.cp_type.ne('ctl_vehicle')
mask.shape

(23814,)

In [28]:
# Keep only the rows selected by the mask, in both the features and the targets
X_train, Y_train = X_train.loc[mask], Y_train.loc[mask]

# Print the shape of X_train and Y_train, one per line
print(X_train.shape, Y_train.shape, sep='\n')

(21948, 875)
(21948, 206)


In [29]:
# Boolean mask over the test rows: True where cp_type is not 'ctl_vehicle'
mask = X_test.cp_type.ne('ctl_vehicle')
mask.shape

(3982,)

In [30]:
# Keep only the test rows selected by the mask
X_test = X_test.loc[mask]

# Print the shape of X_test
print(X_test.shape)

(3624, 875)
(3624, 206)


In [76]:
# Filter the submission sample with the current row mask
# NOTE(review): relies on `mask` still being the one built from X_test — confirm before re-running
sample = sample.loc[mask]

# Display the shape of the filtered submission sample
sample.shape

(3624, 206)

In [31]:
# Remove the experimental-condition columns from X_train and X_test

remove_cols = ['cp_type', 'cp_time', 'cp_dose']

# Reassign rather than drop in place, and pass errors='ignore' so that
# re-running this cell after the columns are gone is a no-op instead of
# raising KeyError.
X_train = X_train.drop(columns=remove_cols, errors='ignore')
X_test = X_test.drop(columns=remove_cols, errors='ignore')

# Print the shapes of X_train and X_test
print(X_train.shape)
print(X_test.shape)

(21948, 872)
(3624, 872)


**Takeaways**: These preparation steps are repetitive; build a helper function for them.

### Independent models for the 206 labels
- OneVsRestClassifier with LogisticRegression as estimator

In [39]:
# Convert every dataset to a NumPy array
X_train, Y_train, X_test = X_train.values, Y_train.values, X_test.values

# Display the Python type of each converted dataset
tuple(type(arr) for arr in (X_train, Y_train, X_test))

In [46]:
# Create a scaler object
scaler = StandardScaler()

In [47]:
# Fit the scaler on the training data only, then apply that same fitted
# scaler to both the training and the test data
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [52]:
# One independent logistic-regression model per class, via the OneVsRestClassifier wrapper.

# Base estimator: LogisticRegression with max_iter raised to 10000
base_lr = LogisticRegression(max_iter=10000)

# One-vs-rest wrapper built around the base estimator
ovr = OneVsRestClassifier(estimator=base_lr)

In [53]:
# fit on train
ovr.fit(X_train_scaled, Y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=10000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [79]:
# Predict Y
Y_pred_ovr = ovr.predict(X_test_scaled)

In [80]:
# Put the predictions in submission format: rows and columns labelled like `sample`
# NOTE(review): the author's note on this call read "Don't remove the controls", yet the
# recorded shape (3624 rows) suggests `sample` was already filtered — confirm intent
df_sub = pd.DataFrame(
    data=Y_pred_ovr,
    index=sample.index,
    columns=sample.columns,
)

# Print the shape of df_sub
print(df_sub.shape)

# Preview the first rows of df_sub
df_sub.head()

(3624, 206)


Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_001897cda,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_00276f245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_0027f1083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_006fc47b8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
# Compute the counts for each drug target

df_sub.sum(axis=0).sort_values(ascending=False)

nfkb_inhibitor                           161
proteasome_inhibitor                     142
egfr_inhibitor                            44
tubulin_inhibitor                         44
adrenergic_receptor_agonist               41
                                        ... 
lxr_agonist                                0
mdm_inhibitor                              0
mek_inhibitor                              0
mineralocorticoid_receptor_antagonist      0
5-alpha_reductase_inhibitor                0
Length: 206, dtype: int64

### Model: OVR with LR as estimator
- The strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes.
- It also supports multilabel classification. To use this feature, feed the classifier an indicator matrix, in which cell [i, j] indicates the presence of label j in sample i.

In [6]:
# Prepare scaled train and test datasets for modeling
# NOTE(review): the recorded output (23814 / 3982 rows, and the .head() calls that follow)
# suggests this cell was executed on the freshly loaded DataFrames, not on the
# control-filtered NumPy arrays produced in the Baseline Model 1 section — confirm the
# intended execution order before running the notebook top to bottom.
scaler, X_train_scaled, X_test_scaled = prepare.prep_moa_v1(X_train, X_test)

# Print scaler and the shapes of the datasets
scaler, X_train_scaled.shape, X_test_scaled.shape

(MinMaxScaler(copy=True, feature_range=(0, 1)), (23814, 876), (3982, 876))

In [7]:
# Take a look at X_train_scaled
X_train_scaled.head()

Unnamed: 0_level_0,cp_type_trt_cp,cp_time_48,cp_time_72,cp_dose_D2,g-0_scaled,g-1_scaled,g-2_scaled,g-3_scaled,g-4_scaled,g-5_scaled,...,c-90_scaled,c-91_scaled,c-92_scaled,c-93_scaled,c-94_scaled,c-95_scaled,c-96_scaled,c-97_scaled,c-98_scaled,c-99_scaled
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,1,0,0,0,0.423838,0.584141,0.510115,0.336117,0.377213,0.520079,...,0.731125,0.734842,0.776018,0.776133,0.713523,0.831778,0.710989,0.790305,0.791709,0.754625
id_000779bfc,1,0,1,0,0.360169,0.570314,0.541622,0.378697,0.451341,0.608766,...,0.680468,0.770365,0.751835,0.737202,0.748942,0.818628,0.751736,0.783357,0.809069,0.777769
id_000a6266a,1,1,0,0,0.395862,0.586368,0.613905,0.370146,0.387116,0.65033,...,0.659251,0.671225,0.761851,0.737151,0.63112,0.755627,0.692951,0.756832,0.655328,0.774582
id_0015fd391,1,1,0,0,0.322259,0.509271,0.509095,0.407976,0.637241,0.531796,...,0.561589,0.670193,0.313779,0.634157,0.66464,0.679881,0.620437,0.705788,0.733155,0.66531
id_001626bd3,1,0,1,1,0.334403,0.495184,0.580266,0.418171,0.475716,0.530934,...,0.711081,0.716676,0.765922,0.814137,0.767607,0.756743,0.748567,0.796077,0.733842,0.775987


In [8]:
# Take a look at X_test_scaled
X_test_scaled.head()

Unnamed: 0_level_0,cp_type_trt_cp,cp_time_48,cp_time_72,cp_dose_D2,g-0_scaled,g-1_scaled,g-2_scaled,g-3_scaled,g-4_scaled,g-5_scaled,...,c-90_scaled,c-91_scaled,c-92_scaled,c-93_scaled,c-94_scaled,c-95_scaled,c-96_scaled,c-97_scaled,c-98_scaled,c-99_scaled
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,1,0,0,0,0.320196,0.544506,0.494816,0.402475,0.48378,0.569124,...,0.717755,0.773481,0.707762,0.720307,0.710679,0.771086,0.742021,0.76987,0.774235,0.668236
id_001897cda,1,0,1,0,0.343589,0.553916,0.593975,0.346656,0.366779,0.559432,...,0.702324,0.703066,0.644001,0.634966,0.700589,0.738396,0.743125,0.736893,0.657387,0.671496
id_002429b5b,0,0,0,0,0.367318,0.519358,0.501866,0.38311,0.30124,0.592842,...,0.694712,0.740473,0.618654,0.798794,0.585364,0.701264,0.804606,0.852368,0.800275,0.710308
id_00276f245,1,0,0,1,0.386502,0.550529,0.546426,0.40145,0.35332,0.509085,...,0.719738,0.727579,0.70522,0.653133,0.69597,0.710122,0.800674,0.763657,0.805774,0.682296
id_0027f1083,1,1,0,0,0.32973,0.414718,0.634583,0.38778,0.353265,0.577676,...,0.746073,0.770616,0.706685,0.812813,0.769201,0.86546,0.718534,0.732645,0.838464,0.857515


In [11]:
# Convert every dataset to a NumPy array
X_train_scaled, Y_train, X_test_scaled = (
    X_train_scaled.values,
    Y_train.values,
    X_test_scaled.values,
)

# Display the Python type of each converted dataset
tuple(type(arr) for arr in (X_train_scaled, Y_train, X_test_scaled))

(numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [13]:
# Base estimator: logistic regression with max_iter set to 1000
base_lr = LogisticRegression(max_iter=1000)

# One-vs-rest wrapper built around the base estimator
ovr = OneVsRestClassifier(estimator=base_lr)

In [14]:
# Fit on train
ovr.fit(X_train_scaled, Y_train) # Took 13 mins to complete

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [87]:
ovr.estimators_[:5]

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 

In [89]:
ovr.n_classes_

206

In [91]:
ovr.multilabel_

True

In [66]:
# Predict Y
Y_pred_ovrlr = ovr.predict(X_test_scaled)

# Print the shape
Y_pred_ovrlr.shape

(3982, 206)

In [67]:
# Put the predictions in submission format: rows and columns labelled like `sample`
sub_ovrlr = pd.DataFrame(
    data=Y_pred_ovrlr,
    index=sample.index,
    columns=sample.columns,
)

# Print the shape of sub_ovrlr
print(sub_ovrlr.shape)

# Preview the first rows of sub_ovrlr
sub_ovrlr.head()

(3982, 206)


Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_001897cda,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_002429b5b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_00276f245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_0027f1083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
# How many ids have more than 1 MoA annotation?
# The row filter is built from sub_ovrlr itself, so this cell does not depend on
# sub_moclr, which is only created in a later cell.
sub_ovrlr[(sub_ovrlr.sum(axis=1) > 1)].sum(axis=1) # 169

sig_id
id_02072e865    2
id_07d981850    2
id_09415477c    2
id_0a77c28d0    3
id_0be3355d6    3
               ..
id_faade4732    2
id_fb082adbd    2
id_fbdcbf21b    3
id_fcb6dfb60    2
id_fd272822d    2
Length: 169, dtype: int64

In [68]:
# Compute the counts for each moa annotation
sub_ovrlr.sum(axis=0).sort_values(ascending=False)

nfkb_inhibitor                           142
proteasome_inhibitor                     141
tubulin_inhibitor                         47
glucocorticoid_receptor_agonist           40
egfr_inhibitor                            31
                                        ... 
mineralocorticoid_receptor_antagonist      0
membrane_integrity_inhibitor               0
mek_inhibitor                              0
mdm_inhibitor                              0
5-alpha_reductase_inhibitor                0
Length: 206, dtype: int64

### Model: MO with LR as estimator
- Multioutput classification support can be added to any classifier with MultiOutputClassifier.
- This strategy consists of fitting one classifier per target. 
- ? It is thus comparable to running n_classes binary classification tasks, for example with sklearn.multioutput.MultiOutputClassifier.
- ? This approach treats each label independently whereas multilabel classifiers may treat the multiple classes simultaneously, accounting for correlated behavior among them.

In [36]:
from sklearn.multioutput import MultiOutputClassifier

In [39]:
# Base estimator: logistic regression with max_iter set to 1000
base_lr = LogisticRegression(max_iter=1000)

# Multi-output wrapper built around the base estimator, with n_jobs=2
moc = MultiOutputClassifier(estimator=base_lr, n_jobs=2)

In [40]:
# Fit on train
moc.fit(X_train_scaled, Y_train) # Took 18 mins to complete. 

MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,
                                                   intercept_scaling=1,
                                                   l1_ratio=None, max_iter=1000,
                                                   multi_class='auto',
                                                   n_jobs=None, penalty='l2',
                                                   random_state=None,
                                                   solver='lbfgs', tol=0.0001,
                                                   verbose=0,
                                                   warm_start=False),
                      n_jobs=2)

In [69]:
# Predict Y
Y_pred_moclr = moc.predict(X_test_scaled)

# Print the shape
Y_pred_moclr.shape

(3982, 206)

In [70]:
# Put the predictions in submission format: rows and columns labelled like `sample`
sub_moclr = pd.DataFrame(
    data=Y_pred_moclr,
    index=sample.index,
    columns=sample.columns,
)

# Print the shape of sub_moclr
print(sub_moclr.shape)

# Preview the first rows of sub_moclr
sub_moclr.head()

(3982, 206)


Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_001897cda,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_002429b5b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_00276f245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_0027f1083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
sub_moclr[(sub_moclr.sum(axis=1) > 1)].sum(axis=1)

sig_id
id_02072e865    2
id_07d981850    2
id_09415477c    2
id_0a77c28d0    3
id_0be3355d6    3
               ..
id_faade4732    2
id_fb082adbd    2
id_fbdcbf21b    3
id_fcb6dfb60    2
id_fd272822d    2
Length: 169, dtype: int64

In [71]:
# Compute the counts for each drug target
sub_moclr.sum(axis=0).sort_values(ascending=False)

nfkb_inhibitor                           142
proteasome_inhibitor                     141
tubulin_inhibitor                         47
glucocorticoid_receptor_agonist           40
egfr_inhibitor                            31
                                        ... 
mineralocorticoid_receptor_antagonist      0
membrane_integrity_inhibitor               0
mek_inhibitor                              0
mdm_inhibitor                              0
5-alpha_reductase_inhibitor                0
Length: 206, dtype: int64

In [77]:
# Does any prediction differ between the OVR and the MOC model?
# True means at least one cell differs; False means the two frames are identical.
(sub_ovrlr != sub_moclr).values.any()

False

**Takeaways**
1. (moc + lr) returns exactly the same labeling as baseline model 2 (ovr + lr).
2. (moc + lr) took longer to fit than baseline model 2.
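3. moc doesn't have a `predict_proba` that returns a single probability array the way ovr does; per the scikit-learn documentation, `MultiOutputClassifier.predict_proba` returns a list with one probability array per output, so the probabilities have to be assembled label by label.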