In [30]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, log_loss

### Preprocess

In [2]:
# Load train, validate and test

X_train_rbf = pd.read_csv('X_train_rbf.csv', index_col=0)
Y_train = pd.read_csv('Y_train.csv', index_col=0)

X_validate_rbf = pd.read_csv('X_validate_rbf.csv', index_col=0)
Y_validate = pd.read_csv('Y_validate.csv', index_col=0)

X_test_rbf = pd.read_csv('X_test_rbf.csv', index_col=0)
Y_test = pd.read_csv('Y_test.csv', index_col=0)

# Print their shapes
X_train_rbf.shape, Y_train.shape, X_validate_rbf.shape, Y_validate.shape, X_test_rbf.shape, Y_test.shape

((15802, 50), (15802, 206), (3951, 50), (3951, 206), (2195, 50), (2195, 206))

In [3]:
# Take the top 5 labels from Y_train

Y_train = Y_train.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]

Y_validate = Y_validate.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]

Y_test = Y_test.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]

In [4]:
# Print the shapes
Y_train.shape, Y_validate.shape, Y_test.shape

((15802, 5), (3951, 5), (2195, 5))

In [5]:
# Check if index of Y matches index of X
(X_train_rbf.index == Y_train.index).sum() # Match

15802

In [6]:
# Check if index of Y matches index of X
(X_validate_rbf.index == Y_validate.index).sum() # Match

3951

In [7]:
# Check if index of Y matches index of X
(X_test_rbf.index == Y_test.index).sum() # Match

2195

In [8]:
# Take a peek at X_train_cos
X_train_rbf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
id_887911684,-0.113804,0.073874,-0.045844,-0.040177,0.001239,0.028486,0.008133,0.189038,0.009369,-0.035972,...,0.021074,0.019273,0.040316,-0.04419,0.035163,-0.068163,0.031149,-0.00528,-0.018714,0.04059
id_294d38ce4,-0.081761,0.238309,0.27361,0.066148,0.035802,0.028016,0.053274,0.117625,0.037382,-0.037918,...,0.010674,-0.008108,0.014344,-0.045436,-0.047832,-0.014416,-0.014711,-0.018614,-0.024334,-0.018353
id_b12cf3d9c,-0.042848,0.095801,-0.151614,0.068988,-0.04379,0.014354,-0.053366,-0.053073,-0.046212,0.059299,...,0.006034,-0.001349,0.000459,0.006673,0.086122,0.047384,0.031068,-0.020613,0.052809,0.019886
id_d8f5ca938,-0.095446,-0.102518,0.033974,0.220547,0.048482,-0.078481,0.082272,-0.107104,0.021217,-0.071515,...,-0.026697,-0.005765,-0.018127,-0.011354,0.033982,0.016215,-0.022585,0.028705,0.034875,0.01463
id_c4eaa645d,0.034598,0.148927,0.002449,-0.061712,-0.151287,-0.147584,0.114287,0.100925,-0.1384,-0.179451,...,0.025211,0.027269,-0.002506,0.00164,0.002665,-0.007431,-0.037124,-0.05339,0.005366,0.024452


In [9]:
# Take a peek at Y_train
Y_train.head()

Unnamed: 0_level_0,nfkb_inhibitor,proteasome_inhibitor,cyclooxygenase_inhibitor,dopamine_receptor_antagonist,dna_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_887911684,0,0,0,0,0
id_294d38ce4,0,0,0,0,0
id_b12cf3d9c,0,0,0,0,0
id_d8f5ca938,0,0,0,0,0
id_c4eaa645d,0,0,0,0,0


In [10]:
# Create the order for the 5 labels

order= [2,3,4,0,1]

### Classifier Chain

In [11]:
# Create estimator
clf = MLPClassifier(random_state=123)

In [12]:
# Create Classifier Chain Object
chain = ClassifierChain(clf, order=order, random_state=123)
chain

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[2, 3, 4, 0, 1], random_state=123)

### Model on Train

In [13]:
# Fit the chain on train
chain.fit(X_train_rbf, Y_train)

# Print out the order
chain.order_

[2, 3, 4, 0, 1]

In [14]:
# Use chain to make a prediction

Y_pred = chain.predict(X_train_rbf)
Y_proba = chain.predict_proba(X_train_rbf)

In [15]:
# Take a look at Y_pred
# Print the size of Y_pred

print(Y_pred.size)
Y_pred

79010


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [16]:
# Take a look at Y_proba
# Print the size of Y_proba

print(Y_proba.size)
Y_proba

79010


array([[3.10423218e-03, 2.06716155e-05, 3.42682072e-02, 8.01847782e-03,
        5.00111578e-03],
       [8.25290740e-04, 4.45080977e-05, 1.19255223e-02, 1.58991892e-02,
        1.07058778e-03],
       [8.21365076e-04, 3.24737466e-05, 2.34425551e-02, 1.14306672e-02,
        2.58725835e-03],
       ...,
       [3.55303065e-04, 8.40092305e-05, 1.00718807e-02, 9.88236342e-03,
        6.77548013e-03],
       [2.98231174e-03, 5.87781465e-05, 2.07179584e-02, 3.18585511e-02,
        7.33179668e-03],
       [3.04410638e-04, 1.20570437e-04, 4.09771175e-03, 5.81717230e-03,
        1.53891030e-02]])

In [17]:
# Take a look at Y_train
# Print the size of Y_train

print(Y_train.values.size)
Y_train.values

79010


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [18]:
# Compute jaccard score
jaccard_score = jaccard_score(Y_train, Y_pred, average='samples')
jaccard_score

0.030681770240054

In [19]:
# Compute log loss
log_loss = log_loss(Y_train, Y_proba)
log_loss

0.10958224017602107

In [20]:
# Compare predicted Y_train ith Y_train
(Y_train.values == Y_pred)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [21]:
# How many labels the model predicing right
(Y_train.values == Y_pred).sum()

77820

In [22]:
# Summarize Y_train

features_train = Y_train.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_train:', features_train.sum())
features_train.head()

The total count of targets in Y_train: 2003


nfkb_inhibitor                  598
proteasome_inhibitor            526
cyclooxygenase_inhibitor        304
dopamine_receptor_antagonist    302
dna_inhibitor                   273
dtype: int64

In [23]:
# Summarize Y_pred

features_chain = pd.DataFrame(Y_pred, columns=Y_train.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain.sum())
features_chain.head()

The total count of targets: 1069.0


nfkb_inhibitor                  521.0
proteasome_inhibitor            517.0
dna_inhibitor                    31.0
dopamine_receptor_antagonist      0.0
cyclooxygenase_inhibitor          0.0
dtype: float64

In [24]:
# Concat features_train, features_baseline and features_chain

features_sum = pd.concat([features_train, features_chain], axis=1)
features_sum.head()

Unnamed: 0,0,1
nfkb_inhibitor,598,521.0
proteasome_inhibitor,526,517.0
cyclooxygenase_inhibitor,304,0.0
dopamine_receptor_antagonist,302,0.0
dna_inhibitor,273,31.0


### Model on Validate

In [25]:
# Predict the labels of Y_validate
Y_pred_v = chain.predict(X_validate_rbf)

# Predcit the probability of Y
Y_proba_v = chain.predict_proba(X_validate_rbf) 

In [26]:
# Take a look at Y_pred_v
# Print the size of Y_pred_v

print(Y_pred_v.size)
Y_pred_v

19755


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [27]:
# Take a look at Y_proba_v
# Print the size of Y_proba_v

print(Y_proba_v.size)
Y_proba_v

19755


array([[5.55575731e-03, 5.02536296e-05, 2.39367134e-02, 1.91649295e-02,
        6.09324199e-02],
       [1.71962554e-03, 1.17785301e-04, 9.09422930e-03, 1.79701807e-02,
        8.45634335e-03],
       [3.68213845e-07, 2.58065784e-04, 1.28452817e-03, 1.75486940e-03,
        1.91010852e-02],
       ...,
       [1.11196120e-03, 4.34534513e-05, 4.61921881e-02, 1.82623665e-02,
        5.02494939e-02],
       [8.56855386e-05, 1.13963591e-04, 4.00267790e-02, 2.77107718e-02,
        2.19957498e-02],
       [1.24549245e-02, 1.89616780e-05, 1.09938171e-02, 2.65851276e-02,
        3.70644185e-02]])

In [28]:
# Take a look at Y_validate
# Print the size of Y_validate

print(Y_validate.values.size)
Y_validate.values

19755


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [31]:
# Compute jaccard score for validate

jaccard_score_v = jaccard_score(Y_validate, Y_pred_v, average='samples')
jaccard_score_v

0.029064371888973255

In [32]:
# Compute log loss for validate

log_loss_v = log_loss(Y_validate, Y_proba_v)
log_loss_v

0.14312491122491783

In [33]:
# Compare Y_validate and Y_pred_v
(Y_validate.values == Y_pred_v)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [34]:
# How many labels are predicted right
(Y_validate.values == Y_pred_v).sum()

19437

In [35]:
# Summarize Y_validate

features_validate = Y_validate.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_validate:', features_validate.sum())
features_validate.head()

The total count of targets in Y_validate: 520


nfkb_inhibitor                  149
proteasome_inhibitor            128
cyclooxygenase_inhibitor         89
dna_inhibitor                    82
dopamine_receptor_antagonist     72
dtype: int64

In [36]:
# Summarize Y_pred_v

features_chain_v = pd.DataFrame(Y_pred_v, columns=Y_validate.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain_v.sum())
features_chain_v.head()

The total count of targets: 250.0


proteasome_inhibitor            122.0
nfkb_inhibitor                  122.0
dna_inhibitor                     6.0
dopamine_receptor_antagonist      0.0
cyclooxygenase_inhibitor          0.0
dtype: float64

In [37]:
# Add features_validate and features_chain_v to feature_sum

features_sum = pd.concat([features_sum, features_validate, features_chain_v], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1
nfkb_inhibitor,598,521.0,149,122.0
proteasome_inhibitor,526,517.0,128,122.0
cyclooxygenase_inhibitor,304,0.0,89,0.0
dopamine_receptor_antagonist,302,0.0,72,0.0
dna_inhibitor,273,31.0,82,6.0


### Model on Test

In [38]:
# Predict the labels of Y_test
Y_pred_t = chain.predict(X_test_rbf)

# Predcit the probability of Y_test
Y_proba_t = chain.predict_proba(X_test_rbf) 

In [39]:
# Take a look at Y_pred_t

print(Y_pred_t.size)
Y_pred_t

10975


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [40]:
# Take a look at Y_proba_t

print(Y_proba_t.size)
Y_proba_t

10975


array([[3.72349672e-03, 4.61308313e-05, 3.10618244e-02, 4.24709296e-02,
        1.05834322e-02],
       [1.95090493e-03, 1.86721371e-05, 2.27973907e-03, 1.11638443e-03,
        1.27249305e-03],
       [1.91492935e-03, 4.14481409e-05, 5.42273041e-03, 3.45301006e-02,
        3.05837362e-04],
       ...,
       [2.24030066e-06, 1.90255078e-04, 9.43527542e-04, 1.92874108e-03,
        7.52260980e-04],
       [5.21590841e-03, 5.08147232e-04, 7.37691580e-03, 6.75223475e-03,
        1.11485151e-02],
       [4.99436507e-04, 8.80194854e-05, 1.65065971e-02, 1.07451699e-02,
        7.15922512e-03]])

In [41]:
# Take a look at Y_test
# Print the size of Y_test

print(Y_test.values.size)
Y_test.values

10975


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0]])

In [42]:
# Compute jaccard score for test

jaccard_score_t = jaccard_score(Y_test, Y_pred_t, average='samples')
jaccard_score_t

0.028701594533029614

In [43]:
# Compute log loss for test

log_loss_t = log_loss(Y_test, Y_proba_t)
log_loss_t

0.1481303926836591

In [44]:
# Compare predicted test and Y_test
(Y_test.values == Y_pred_t)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True, False,  True],
       [ True,  True, False,  True,  True]])

In [45]:
# How many labels are predicted right
(Y_test.values == Y_pred_t).sum()

10790

In [46]:
# Summarize Y_test

features_test = Y_test.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_test:', features_test.sum())
features_test.head()

The total count of targets in Y_test: 296


nfkb_inhibitor                  85
proteasome_inhibitor            72
dopamine_receptor_antagonist    50
dna_inhibitor                   47
cyclooxygenase_inhibitor        42
dtype: int64

In [47]:
# Summarize Y_pred_t

features_chain_t = pd.DataFrame(Y_pred_t, columns=Y_test.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of predicted targets:', features_chain_t.sum())
features_chain_t.head()

The total count of predicted targets: 139.0


proteasome_inhibitor            69.0
nfkb_inhibitor                  69.0
dna_inhibitor                    1.0
dopamine_receptor_antagonist     0.0
cyclooxygenase_inhibitor         0.0
dtype: float64

In [48]:
# Add features_test and _chain_t to features_sum

features_sum = pd.concat([features_sum, features_test, features_chain_t], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1,0.2,1.2
nfkb_inhibitor,598,521.0,149,122.0,85,69.0
proteasome_inhibitor,526,517.0,128,122.0,72,69.0
cyclooxygenase_inhibitor,304,0.0,89,0.0,42,0.0
dopamine_receptor_antagonist,302,0.0,72,0.0,50,0.0
dna_inhibitor,273,31.0,82,6.0,47,1.0


In [49]:
# Print chain object to double check
chain

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[2, 3, 4, 0, 1], random_state=123)