In [30]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, log_loss

### Preprocess

In [2]:
# Load train, validate and test

X_train_cos = pd.read_csv('X_train_cos.csv', index_col=0)
Y_train = pd.read_csv('Y_train.csv', index_col=0)

X_validate_cos = pd.read_csv('X_validate_cos.csv', index_col=0)
Y_validate = pd.read_csv('Y_validate.csv', index_col=0)

X_test_cos = pd.read_csv('X_test_cos.csv', index_col=0)
Y_test = pd.read_csv('Y_test.csv', index_col=0)

# Print their shapes
X_train_cos.shape, Y_train.shape, X_validate_cos.shape, Y_validate.shape, X_test_cos.shape, Y_test.shape

((15802, 50), (15802, 206), (3951, 50), (3951, 206), (2195, 50), (2195, 206))

In [3]:
# Take the top 5 labels from Y_train

Y_train = Y_train.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]

Y_validate = Y_validate.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]

Y_test = Y_test.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]

In [4]:
# Print the shapes
Y_train.shape, Y_validate.shape, Y_test.shape

((15802, 5), (3951, 5), (2195, 5))

In [5]:
# Check if index of Y matches index of X
(X_train_cos.index == Y_train.index).sum() # Match

15802

In [6]:
# Check if index of Y matches index of X
(X_validate_cos.index == Y_validate.index).sum() # Match

3951

In [7]:
# Check if index of Y matches index of X
(X_test_cos.index == Y_test.index).sum() # Match

2195

In [8]:
# Take a peek at X_train_cos
X_train_cos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
id_887911684,-0.113079,-0.126488,-0.045746,-0.065418,0.009277,0.058472,-0.015255,0.286149,-0.071498,0.029081,...,-0.004725,-0.046098,-0.000499,0.0327,-0.090504,0.031419,-0.122123,0.024048,-0.122287,-0.012305
id_294d38ce4,-0.23977,-0.242513,0.469707,0.132345,-0.079247,0.053615,-0.062062,0.159616,-0.169681,0.026081,...,-0.023376,0.004647,0.002614,0.044699,0.061037,-0.002453,-0.014456,-0.020474,0.008282,0.037586
id_b12cf3d9c,-0.116704,-0.175076,-0.216695,0.093239,0.098658,-0.023396,-0.040551,-0.069963,0.081209,-0.049178,...,0.071413,-0.054937,0.077372,-0.072166,-0.020397,-0.001381,-0.006411,-0.01482,-0.020163,0.062042
id_d8f5ca938,-0.108337,0.198954,-0.001508,0.331887,-0.138065,-0.079667,-0.033503,-0.174493,-0.137442,0.065832,...,-0.014569,0.025528,0.02239,-0.048285,0.023898,0.037983,-0.026742,0.069053,0.004214,0.06217
id_c4eaa645d,0.113585,-0.318411,0.101633,-0.034343,0.128269,-0.208788,-0.105685,0.149461,-0.098587,0.329119,...,0.014009,-0.024623,0.016682,0.044054,0.003013,-0.019702,-0.029526,0.003733,-0.052697,0.008851


In [9]:
# Take a peek at Y_train
Y_train.head()

Unnamed: 0_level_0,nfkb_inhibitor,proteasome_inhibitor,cyclooxygenase_inhibitor,dopamine_receptor_antagonist,dna_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_887911684,0,0,0,0,0
id_294d38ce4,0,0,0,0,0
id_b12cf3d9c,0,0,0,0,0
id_d8f5ca938,0,0,0,0,0
id_c4eaa645d,0,0,0,0,0


In [10]:
# Create the order for the 5 labels

order= [2,3,4,0,1]

### Classifier Chain

In [11]:
# Create estimator
clf = MLPClassifier(random_state=123)

In [12]:
# Create Classifier Chain Object
chain = ClassifierChain(clf, order=order, random_state=123)
chain

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[2, 3, 4, 0, 1], random_state=123)

### Model on Train

In [13]:
# Fit the chain on train
chain.fit(X_train_cos, Y_train) # Took 2.0 mins

# Print out the order
chain.order_

[2, 3, 4, 0, 1]

In [14]:
# Use chain to make a prediction

Y_pred = chain.predict(X_train_cos)
Y_proba = chain.predict_proba(X_train_cos)

In [15]:
# Take a look at Y_pred
# Print the size of Y_pred

print(Y_pred.size)
Y_pred

79010


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [16]:
# Take a look at Y_proba
# Print the size of Y_proba

print(Y_proba.size)
Y_proba

79010


array([[1.20664410e-04, 3.67213568e-05, 4.40077611e-02, 7.53040934e-05,
        3.94730253e-03],
       [2.85779216e-05, 3.49705562e-05, 9.33335080e-04, 1.33202254e-02,
        2.06955642e-03],
       [6.40683407e-06, 1.03271232e-04, 4.82104750e-02, 9.63632918e-04,
        3.22295132e-03],
       ...,
       [3.39517857e-04, 4.48546379e-05, 1.99760640e-01, 2.93465749e-03,
        3.41742376e-04],
       [1.78814470e-03, 3.24517350e-05, 3.44829733e-02, 1.18535904e-01,
        7.74326172e-03],
       [6.78940857e-05, 6.11574654e-05, 1.50048899e-04, 2.95036616e-04,
        2.75934220e-03]])

In [17]:
# Take a look at Y_train
# Print the size of Y_train

print(Y_train.values.size)
Y_train.values

79010


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [18]:
# Compute jaccard score
jaccard_score = jaccard_score(Y_train, Y_pred, average='samples')
jaccard_score

0.04892840568704384

In [19]:
# Compute log loss
log_loss = log_loss(Y_train, Y_proba)
log_loss

0.0612450250619657

In [20]:
# Compare predicted Y_train ith Y_train
(Y_train.values == Y_pred)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [21]:
# How many labels the model predicing right
(Y_train.values == Y_pred).sum()

78287

In [22]:
# Summarize Y_train

features_train = Y_train.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_train:', features_train.sum())
features_train.head()

The total count of targets in Y_train: 2003


nfkb_inhibitor                  598
proteasome_inhibitor            526
cyclooxygenase_inhibitor        304
dopamine_receptor_antagonist    302
dna_inhibitor                   273
dtype: int64

In [23]:
# Summarize Y_pred

features_chain = pd.DataFrame(Y_pred, columns=Y_train.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain.sum())
features_chain.head()

The total count of targets: 1320.0


nfkb_inhibitor                  569.0
proteasome_inhibitor            531.0
dopamine_receptor_antagonist     99.0
dna_inhibitor                    93.0
cyclooxygenase_inhibitor         28.0
dtype: float64

In [24]:
# Concat features_train, features_baseline and features_chain

features_sum = pd.concat([features_train, features_chain], axis=1)
features_sum.head()

Unnamed: 0,0,1
nfkb_inhibitor,598,569.0
proteasome_inhibitor,526,531.0
cyclooxygenase_inhibitor,304,28.0
dopamine_receptor_antagonist,302,99.0
dna_inhibitor,273,93.0


### Model on Validate

In [25]:
# Predict the labels of Y_validate
Y_pred_v = chain.predict(X_validate_cos)

# Predcit the probability of Y
Y_proba_v = chain.predict_proba(X_validate_cos) 

In [26]:
# Take a look at Y_pred_v
# Print the size of Y_pred_v

print(Y_pred_v.size)
Y_pred_v

19755


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [27]:
# Take a look at Y_proba_v
# Print the size of Y_proba_v

print(Y_proba_v.size)
Y_proba_v

19755


array([[1.19849853e-02, 1.64321446e-04, 4.29146398e-03, 1.48855055e-03,
        5.09377576e-02],
       [1.08652435e-04, 4.16563757e-05, 1.38079430e-02, 1.74233826e-03,
        7.88779200e-03],
       [4.33824381e-06, 1.08219649e-03, 3.57879095e-06, 3.47904463e-10,
        4.76668849e-05],
       ...,
       [1.34807974e-04, 1.17740245e-04, 7.57295927e-03, 1.07910187e-02,
        6.50334737e-02],
       [1.12219579e-04, 2.53831593e-04, 1.96021452e-02, 7.03434646e-03,
        1.56103716e-02],
       [2.66423573e-03, 8.73690303e-05, 4.64739891e-04, 1.37899568e-02,
        1.14579976e-02]])

In [28]:
# Take a look at Y_validate
# Print the size of Y_validate

print(Y_validate.values.size)
Y_validate.values

19755


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [31]:
# Compute jaccard score for validate

jaccard_score_v = jaccard_score(Y_validate, Y_pred_v, average='samples')
jaccard_score_v

0.03416856492027335

In [32]:
# Compute log loss for validate

log_loss_v = log_loss(Y_validate, Y_proba_v)
log_loss_v

0.15800981086902433

In [33]:
# Compare Y_validate and Y_pred_v
(Y_validate.values == Y_pred_v)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [34]:
# How many labels are predicted right
(Y_validate.values == Y_pred_v).sum()

19467

In [35]:
# Summarize Y_validate

features_validate = Y_validate.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_validate:', features_validate.sum())
features_validate.head()

The total count of targets in Y_validate: 520


nfkb_inhibitor                  149
proteasome_inhibitor            128
cyclooxygenase_inhibitor         89
dna_inhibitor                    82
dopamine_receptor_antagonist     72
dtype: int64

In [36]:
# Summarize Y_pred_v

features_chain_v = pd.DataFrame(Y_pred_v, columns=Y_validate.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain_v.sum())
features_chain_v.head()

The total count of targets: 294.0


proteasome_inhibitor            133.0
nfkb_inhibitor                  133.0
dna_inhibitor                    18.0
dopamine_receptor_antagonist      5.0
cyclooxygenase_inhibitor          5.0
dtype: float64

In [37]:
# Add features_validate and features_chain_v to feature_sum

features_sum = pd.concat([features_sum, features_validate, features_chain_v], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1
nfkb_inhibitor,598,569.0,149,133.0
proteasome_inhibitor,526,531.0,128,133.0
cyclooxygenase_inhibitor,304,28.0,89,5.0
dopamine_receptor_antagonist,302,99.0,72,5.0
dna_inhibitor,273,93.0,82,18.0


### Model on Test

In [38]:
# Predict the labels of Y_test
Y_pred_t = chain.predict(X_test_cos)

# Predcit the probability of Y_test
Y_proba_t = chain.predict_proba(X_test_cos) 

In [39]:
# Take a look at Y_pred_t

print(Y_pred_t.size)
Y_pred_t

10975


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [40]:
# Take a look at Y_proba_t

print(Y_proba_t.size)
Y_proba_t

10975


array([[4.00309489e-03, 6.53229596e-05, 1.24926873e-02, 6.16755707e-03,
        1.57650939e-02],
       [9.59797244e-06, 6.24568664e-06, 7.13755466e-06, 6.35337289e-07,
        1.48307187e-05],
       [7.11597563e-02, 2.02506188e-04, 7.48941918e-04, 7.71075251e-04,
        5.00107194e-05],
       ...,
       [1.17767270e-04, 9.21378870e-05, 3.77296121e-05, 4.22801336e-08,
        4.07402462e-08],
       [3.67988834e-05, 2.32703150e-04, 8.98009860e-03, 3.07739159e-02,
        2.29119645e-03],
       [5.99814288e-04, 3.75208030e-04, 3.60636936e-02, 2.72446408e-02,
        4.72938809e-02]])

In [41]:
# Take a look at Y_test
# Print the size of Y_test

print(Y_test.values.size)
Y_test.values

10975


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0]])

In [42]:
# Compute jaccard score for test

jaccard_score_t = jaccard_score(Y_test, Y_pred_t, average='samples')
jaccard_score_t

0.03416856492027335

In [43]:
# Compute log loss for test

log_loss_t = log_loss(Y_test, Y_proba_t)
log_loss_t

0.18197595264134137

In [44]:
# Compare predicted test and Y_test
(Y_test.values == Y_pred_t)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True, False,  True],
       [ True,  True, False,  True,  True]])

In [45]:
# How many labels are predicted right
(Y_test.values == Y_pred_t).sum()

10818

In [46]:
# Summarize Y_test

features_test = Y_test.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_test:', features_test.sum())
features_test.head()

The total count of targets in Y_test: 296


nfkb_inhibitor                  85
proteasome_inhibitor            72
dopamine_receptor_antagonist    50
dna_inhibitor                   47
cyclooxygenase_inhibitor        42
dtype: int64

In [47]:
# Summarize Y_pred_t

features_chain_t = pd.DataFrame(Y_pred_t, columns=Y_test.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of predicted targets:', features_chain_t.sum())
features_chain_t.head()

The total count of predicted targets: 153.0


proteasome_inhibitor            72.0
nfkb_inhibitor                  72.0
dna_inhibitor                    6.0
dopamine_receptor_antagonist     2.0
cyclooxygenase_inhibitor         1.0
dtype: float64

In [48]:
# Add features_test and _chain_t to features_sum

features_sum = pd.concat([features_sum, features_test, features_chain_t], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1,0.2,1.2
nfkb_inhibitor,598,569.0,149,133.0,85,72.0
proteasome_inhibitor,526,531.0,128,133.0,72,72.0
cyclooxygenase_inhibitor,304,28.0,89,5.0,42,1.0
dopamine_receptor_antagonist,302,99.0,72,5.0,50,2.0
dna_inhibitor,273,93.0,82,18.0,47,6.0


In [49]:
# Print chain object to double check
chain

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[2, 3, 4, 0, 1], random_state=123)