In [58]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import permutations

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, log_loss

import prepare, model

### Preprocess

In [2]:
# Load train, target and test datasets

X_train = pd.read_csv('train_features.csv', index_col=0)
Y_train = pd.read_csv('train_targets_scored.csv', index_col=0)
X_test = pd.read_csv('test_features.csv', index_col=0)

In [3]:
# Print their shapes
X_train.shape, Y_train.shape, X_test.shape # Correct

((23814, 875), (23814, 206), (3982, 875))

In [4]:
# Take the top 5 labels from Y_train

screen_order = Y_train.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]
screen_order.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23814 entries, id_000644bb2 to id_ffffdd77b
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   nfkb_inhibitor                23814 non-null  int64
 1   proteasome_inhibitor          23814 non-null  int64
 2   cyclooxygenase_inhibitor      23814 non-null  int64
 3   dopamine_receptor_antagonist  23814 non-null  int64
 4   dna_inhibitor                 23814 non-null  int64
dtypes: int64(5)
memory usage: 1.1+ MB


In [5]:
# Concat to X_train

train = pd.concat([X_train, screen_order], axis=1)
train.head() # Success

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-95,c-96,c-97,c-98,c-99,nfkb_inhibitor,proteasome_inhibitor,cyclooxygenase_inhibitor,dopamine_receptor_antagonist,dna_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.6584,-0.3981,0.2139,0.3801,0.4176,0,0,0,0,0
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,0.4899,0.1522,0.1241,0.6077,0.7371,0,0,0,0,0
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.3174,-0.6417,-0.2187,-1.408,0.6931,0,0,0,0,0
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-1.288,-1.621,-0.8784,-0.3876,-0.8154,0,0,0,0,0
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,-0.3031,0.1094,0.2885,-0.3786,0.7125,0,0,0,0,0


In [6]:
# train.to_csv('screen_order.csv')

In [7]:
# Preprocess X_train to X_train_scaled, X_validate_scaled and X_test_scaled
# encoding, splitting, and scaling

scaler, train_scaled, validate_scaled, test_scaled = prepare.prep_moa_v2(train)

# Print the shapes
scaler, train_scaled.shape, validate_scaled.shape, test_scaled.shape

(MinMaxScaler(), (17145, 881), (4287, 881), (2382, 881))

In [28]:
# Create X

X_train_scaled = train_scaled.iloc[:, 5:]
X_validate_scaled = validate_scaled.iloc[:, 5:]
X_test_scaled = test_scaled.iloc[:, 5:]

In [29]:
# Create Y

Y_train_scaled = train_scaled.iloc[:, 0:5]
Y_validate_scaled = validate_scaled.iloc[:, 0:5]
Y_test_scaled = test_scaled.iloc[:, 0:5]

In [30]:
# Check if index of Y matches index of X
(X_train_scaled.index == Y_train_scaled.index).sum() # Match

17145

In [31]:
# Check if index of Y matches index of X
(X_validate_scaled.index == Y_validate_scaled.index).sum() # Match

4287

In [32]:
# Check if index of Y matches index of X
(X_test_scaled.index == Y_test_scaled.index).sum() # Match

2382

In [33]:
# Create all possible orders for the 5 labels

order= [0,1,2,3,4]
orders = list(permutations(order, 5))

### Classifier Chain

In [14]:
# Create estimator
clf = MLPClassifier(random_state=123)

In [15]:
# Create Classifier chain object

chains=[]

for order in orders:
    chain = ClassifierChain(clf, order=order, random_state=123)
    chains.append(chain)

In [16]:
# Take a peek at the chains object

# Print the size of chains
print(len(chains))

# Print the first chain
print(chains[0])

# Print the middle chain
print(chains[60])

# Print the last chain
print(chains[-1])

120
ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=(0, 1, 2, 3, 4), random_state=123)
ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=(2, 3, 0, 1, 4), random_state=123)
ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=(4, 3, 2, 1, 0), random_state=123)


### Model on Train

In [17]:
# Fit on Train

for chain in chains:
    chain.fit(X_train_scaled, Y_train_scaled) # Took 11 hours and 20 mins

In [84]:
[chain.predict(X_train_scaled) for chain in chains]

IndexError: too many indices for array

In [83]:
chains[0].predict_proba(X_train_scaled)

IndexError: too many indices for array

**Takeways**: When use the chain in chain to make predictions, error message popped up: too many indices for array.<br> 
**Create a new chain with random order to trouble shoot the problem**

In [37]:
# Recreate a chain object with random order

chain_r = ClassifierChain(clf, order='random', random_state=123)
chain_r

ClassifierChain(base_estimator=MLPClassifier(random_state=123), order='random',
                random_state=123)

In [38]:
# Fit the chain_r on train
chain_r.fit(X_train_scaled, Y_train_scaled) # Took 6.5 mins

ClassifierChain(base_estimator=MLPClassifier(random_state=123), order='random',
                random_state=123)

In [39]:
# Print out the order
chain_r.order_

array([1, 3, 4, 0, 2])

In [40]:
# Use chain_r to make a prediction
Y_pred = chain_r.predict(X_train_scaled)

In [51]:
Y_proba = chain_r.predict_proba(X_train_scaled)

In [41]:
# Take a look at Y_pred
# Print the size of Y_pred

print(Y_pred.size)
Y_pred

85725


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [78]:
# Take a look at Y_proba
# Print the size of Y_proba

print(Y_proba.size)
Y_proba

85725


array([[3.80740754e-03, 3.50371415e-08, 2.85480471e-02, 5.15094714e-02,
        1.15988865e-02],
       [1.69899899e-03, 7.32099648e-09, 2.20993100e-02, 4.23131672e-02,
        1.17459434e-02],
       [1.98984981e-03, 1.08405033e-08, 1.87564904e-02, 1.74022194e-02,
        7.43830887e-03],
       ...,
       [2.93402955e-03, 2.36182873e-08, 1.34361063e-02, 2.28784220e-02,
        1.01187946e-02],
       [7.53845272e-03, 8.81307389e-09, 1.60369504e-02, 4.43373204e-02,
        6.83014999e-03],
       [4.24962933e-04, 1.50954572e-08, 5.74066418e-04, 7.61592321e-04,
        7.91854560e-04]])

In [42]:
# Take a look at Y_train
# Print the size of Y_train

print(Y_train_scaled.values.size)
Y_train_scaled.values

85725


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int64)

In [43]:
# Compute jaccard score
jaccard_score = jaccard_score(Y_train_scaled, Y_pred, average='samples')
jaccard_score

0.03041703120443278

In [52]:
# Compute log loss
log_loss = log_loss(Y_train_scaled, Y_proba)
log_loss

0.10504347197453764

In [45]:
# Compare predicted Y_train ith Y_train
(Y_train_scaled.values == Y_pred)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [46]:
# How many labels the model predicing right
(Y_train_scaled.values == Y_pred).sum()

84738

In [47]:
# Summarize Y_train

features_train = Y_train_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_train:', features_train.sum())
features_train.head()

The total count of targets in Y_train: 2022


nfkb_inhibitor                  597
proteasome_inhibitor            517
cyclooxygenase_inhibitor        315
dopamine_receptor_antagonist    302
dna_inhibitor                   291
dtype: int64

In [48]:
# Summarize Y_pred

features_chain = pd.DataFrame(Y_pred, columns=Y_train_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain.sum())
features_chain.head()

The total count of targets: 1035.0


nfkb_inhibitor                  518.0
proteasome_inhibitor            517.0
dna_inhibitor                     0.0
dopamine_receptor_antagonist      0.0
cyclooxygenase_inhibitor          0.0
dtype: float64

In [53]:
# Concat features_train, features_baseline and features_chain

features_sum = pd.concat([features_train, features_chain], axis=1)
features_sum.head()

Unnamed: 0,0,1
nfkb_inhibitor,597,518.0
proteasome_inhibitor,517,517.0
cyclooxygenase_inhibitor,315,0.0
dopamine_receptor_antagonist,302,0.0
dna_inhibitor,291,0.0


### Model on Validate

In [54]:
# Predict the labels of Y_validate
Y_pred_v = chain_r.predict(X_validate_scaled)

# Predcit the probability of Y
Y_proba_v = chain_r.predict_proba(X_validate_scaled) # About 4 mins

In [55]:
# Take a look at Y_pred_v
# Print the size of Y_pred_v

print(Y_pred_v.size)
Y_pred_v

21435


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [79]:
# Take a look at Y_proba_v
# Print the size of Y_proba_v

print(Y_proba_v.size)
Y_proba_v

21435


array([[5.59203680e-03, 7.53417982e-09, 2.17772697e-02, 4.74093903e-02,
        1.00454734e-02],
       [4.01957798e-03, 1.98804704e-08, 1.62761037e-02, 2.53972637e-02,
        5.69432434e-03],
       [9.99996286e-01, 9.99999849e-01, 4.30624729e-04, 6.95806474e-06,
        3.55770194e-05],
       ...,
       [1.34263874e-02, 4.01113197e-09, 2.63551937e-02, 1.11885357e-01,
        6.59367018e-03],
       [1.89402526e-03, 4.50299180e-08, 9.84245529e-03, 1.25203338e-02,
        8.87057532e-03],
       [3.48561503e-03, 4.78200976e-08, 1.50316158e-02, 2.80639143e-02,
        8.51549186e-03]])

In [56]:
# Take a look at Y_validate
# Print the size of Y_validate

print(Y_validate_scaled.values.size)
Y_validate_scaled.values

21435


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int64)

In [59]:
# Compute jaccard score for validate

jaccard_score_v = jaccard_score(Y_validate_scaled, Y_pred_v, average='samples')
jaccard_score_v

0.03184044786564031

In [60]:
# Compute log loss for validate

log_loss_v = log_loss(Y_validate_scaled, Y_proba_v)
log_loss_v

0.11438182726587211

In [61]:
# Compare Y_validate and Y_pred_v
(Y_validate_scaled.values == Y_pred_v)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [62]:
# How many labels are predicted right
(Y_validate_scaled.values == Y_pred_v).sum()

21173

In [63]:
# Summarize Y_validate

features_validate = Y_validate_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_validate:', features_validate.sum())
features_validate.head()

The total count of targets in Y_validate: 530


nfkb_inhibitor                  153
proteasome_inhibitor            137
cyclooxygenase_inhibitor         83
dopamine_receptor_antagonist     82
dna_inhibitor                    75
dtype: int64

In [64]:
# Summarize Y_pred_v

features_chain_v = pd.DataFrame(Y_pred_v, columns=Y_validate_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain_v.sum())
features_chain_v.head()

The total count of targets: 278.0


proteasome_inhibitor            139.0
nfkb_inhibitor                  139.0
dna_inhibitor                     0.0
dopamine_receptor_antagonist      0.0
cyclooxygenase_inhibitor          0.0
dtype: float64

In [65]:
# Add features_validate and features_chain_v to feature_sum

features_sum = pd.concat([features_sum, features_validate, features_chain_v], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1
nfkb_inhibitor,597,518.0,153,139.0
proteasome_inhibitor,517,517.0,137,139.0
cyclooxygenase_inhibitor,315,0.0,83,0.0
dopamine_receptor_antagonist,302,0.0,82,0.0
dna_inhibitor,291,0.0,75,0.0


### Model on Test

In [66]:
# Predict the labels of Y_test
Y_pred_t = chain_r.predict(X_test_scaled)

# Predcit the probability of Y_test
Y_proba_t = chain_r.predict_proba(X_test_scaled) # About 4 mins

In [67]:
# Take a look at Y_pred_t

print(Y_pred_t.size)
Y_pred_t

11910


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [76]:
# Take a look at Y_proba_t

print(Y_proba_t.size)
Y_proba_t

11910


array([[4.65716089e-04, 2.69918352e-08, 4.58916675e-03, 1.07403624e-02,
        2.70670839e-03],
       [2.90847600e-03, 1.60278413e-08, 1.54614577e-02, 4.13487272e-02,
        6.56738131e-03],
       [1.55735710e-03, 1.05691674e-08, 1.34849083e-02, 1.58770497e-03,
        2.02286843e-02],
       ...,
       [2.04847053e-03, 3.61435930e-08, 4.62353825e-03, 9.26023712e-03,
        3.23750735e-02],
       [3.26690681e-03, 2.71768762e-08, 3.66799044e-02, 5.19981525e-02,
        6.78502468e-03],
       [2.22132214e-03, 4.84425333e-08, 2.31865735e-02, 2.06335212e-02,
        9.30943798e-03]])

In [68]:
# Take a look at Y_test
# Print the size of Y_test

print(Y_test_scaled.values.size)
Y_test_scaled.values

11910


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int64)

In [69]:
# Compute jaccard score for test

jaccard_score_t = jaccard_score(Y_test_scaled, Y_pred_t, average='samples')
jaccard_score_t

0.030226700251889168

In [70]:
# Compute log loss for test

log_loss_t = log_loss(Y_test_scaled, Y_proba_t)
log_loss_t

0.11079877965284753

In [71]:
# Compare predicted test and Y_test
(Y_test_scaled.values == Y_pred_t)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True, False],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [72]:
# How many labels are predicted right
(Y_test_scaled.values == Y_pred_t).sum()

11785

In [73]:
# Summarize Y_test

features_test = Y_test_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_test:', features_test.sum())
features_test.head()

The total count of targets in Y_test: 267


nfkb_inhibitor                  82
proteasome_inhibitor            72
dopamine_receptor_antagonist    40
cyclooxygenase_inhibitor        37
dna_inhibitor                   36
dtype: int64

In [74]:
# Summarize Y_pred_t

features_chain_t = pd.DataFrame(Y_pred_t, columns=Y_test_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of predicted targets:', features_chain_t.sum())
features_chain_t.head()

The total count of predicted targets: 146.0


proteasome_inhibitor            73.0
nfkb_inhibitor                  73.0
dna_inhibitor                    0.0
dopamine_receptor_antagonist     0.0
cyclooxygenase_inhibitor         0.0
dtype: float64

In [75]:
# Add features_test and _chain_t to features_sum

features_sum = pd.concat([features_sum, features_test, features_chain_t], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1,0.2,1.2
nfkb_inhibitor,597,518.0,153,139.0,82,73.0
proteasome_inhibitor,517,517.0,137,139.0,72,73.0
cyclooxygenase_inhibitor,315,0.0,83,0.0,37,0.0
dopamine_receptor_antagonist,302,0.0,82,0.0,40,0.0
dna_inhibitor,291,0.0,75,0.0,36,0.0


In [80]:
# # Conver to csv
# features_sum.to_csv('error_analysis_v3.csv')