In [35]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import permutations

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, log_loss

import prepare, model

### Preprocess

In [2]:
# Load train, target and test datasets

X_train = pd.read_csv('train_features.csv', index_col=0)
Y_train = pd.read_csv('train_targets_scored.csv', index_col=0)
X_test = pd.read_csv('test_features.csv', index_col=0)

In [3]:
# Print their shapes
X_train.shape, Y_train.shape, X_test.shape # Correct

((23814, 875), (23814, 206), (3982, 875))

In [4]:
# Take the top 5 labels from Y_train

screen_order = Y_train.loc[:, ['nfkb_inhibitor', 'proteasome_inhibitor', 'cyclooxygenase_inhibitor', 
                               'dopamine_receptor_antagonist', 'dna_inhibitor']]
screen_order.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23814 entries, id_000644bb2 to id_ffffdd77b
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   nfkb_inhibitor                23814 non-null  int64
 1   proteasome_inhibitor          23814 non-null  int64
 2   cyclooxygenase_inhibitor      23814 non-null  int64
 3   dopamine_receptor_antagonist  23814 non-null  int64
 4   dna_inhibitor                 23814 non-null  int64
dtypes: int64(5)
memory usage: 1.1+ MB


In [5]:
# Concat to X_train

train = pd.concat([X_train, screen_order], axis=1)
train.head() # Success

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-95,c-96,c-97,c-98,c-99,nfkb_inhibitor,proteasome_inhibitor,cyclooxygenase_inhibitor,dopamine_receptor_antagonist,dna_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.6584,-0.3981,0.2139,0.3801,0.4176,0,0,0,0,0
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,0.4899,0.1522,0.1241,0.6077,0.7371,0,0,0,0,0
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.3174,-0.6417,-0.2187,-1.408,0.6931,0,0,0,0,0
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-1.288,-1.621,-0.8784,-0.3876,-0.8154,0,0,0,0,0
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,-0.3031,0.1094,0.2885,-0.3786,0.7125,0,0,0,0,0


In [6]:
# train.to_csv('screen_order.csv')

In [6]:
# Preprocess X_train to X_train_scaled, X_validate_scaled and X_test_scaled
# encoding, splitting, and scaling

scaler, train_scaled, validate_scaled, test_scaled = prepare.prep_moa_v2(train)

# Print the shapes
scaler, train_scaled.shape, validate_scaled.shape, test_scaled.shape

(MinMaxScaler(), (17145, 881), (4287, 881), (2382, 881))

In [7]:
# Create X

X_train_scaled = train_scaled.iloc[:, 5:]
X_validate_scaled = validate_scaled.iloc[:, 5:]
X_test_scaled = test_scaled.iloc[:, 5:]

In [8]:
# Create Y

Y_train_scaled = train_scaled.iloc[:, 0:5]
Y_validate_scaled = validate_scaled.iloc[:, 0:5]
Y_test_scaled = test_scaled.iloc[:, 0:5]

In [9]:
# Check if index of Y matches index of X
(X_train_scaled.index == Y_train_scaled.index).sum() # Match

17145

In [10]:
# Check if index of Y matches index of X
(X_validate_scaled.index == Y_validate_scaled.index).sum() # Match

4287

In [11]:
# Check if index of Y matches index of X
(X_test_scaled.index == Y_test_scaled.index).sum() # Match

2382

In [12]:
X_train_scaled.head()

Unnamed: 0_level_0,cp_type_trt_cp,cp_time_48,cp_time_72,cp_dose_D2,g-0_scaled,g-1_scaled,g-2_scaled,g-3_scaled,g-4_scaled,g-5_scaled,...,c-90_scaled,c-91_scaled,c-92_scaled,c-93_scaled,c-94_scaled,c-95_scaled,c-96_scaled,c-97_scaled,c-98_scaled,c-99_scaled
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_5c0fbdb9b,1,0,0,1,0.299741,0.492758,0.505973,0.351994,0.363954,0.608909,...,0.744161,0.741918,0.718058,0.709238,0.711884,0.766857,0.739467,0.798042,0.774442,0.750025
id_c92a6fd38,1,1,0,1,0.323171,0.502637,0.576781,0.309164,0.338914,0.650012,...,0.780873,0.764288,0.740569,0.775235,0.779463,0.854768,0.819474,0.816636,0.803919,0.775393
id_d2330add2,1,0,0,0,0.295927,0.522906,0.506693,0.374041,0.379288,0.590328,...,0.746421,0.728843,0.7095,0.702324,0.752156,0.800289,0.772484,0.816102,0.743364,0.78126
id_d41fa7215,1,0,1,1,0.366381,0.537196,0.528213,0.437242,0.433703,0.626875,...,0.756102,0.77688,0.526675,0.811047,0.728781,0.486655,0.659237,0.791396,0.438768,0.713756
id_f529f3643,1,0,1,1,0.388768,0.55637,0.500968,0.397568,0.456466,0.597125,...,0.719212,0.653875,0.632656,0.785297,0.781053,0.785313,0.702073,0.79747,0.818303,0.718435


In [13]:
Y_train_scaled.head()

Unnamed: 0_level_0,nfkb_inhibitor,proteasome_inhibitor,cyclooxygenase_inhibitor,dopamine_receptor_antagonist,dna_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_5c0fbdb9b,0,0,0,0,0
id_c92a6fd38,0,0,0,0,0
id_d2330add2,0,0,0,0,0
id_d41fa7215,0,0,0,0,0
id_f529f3643,0,0,0,0,0


In [14]:
# Create all possible orders for the 5 labels

order= [2,3,4,0,1]

### Classifier Chain

In [15]:
# Create estimator
clf = MLPClassifier(random_state=123)

In [16]:
# Create Classifier Chain Object
chain = ClassifierChain(clf, order=order, random_state=123)
chain

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[2, 3, 4, 0, 1], random_state=123)

### Model on Train

In [17]:
# Fit the chain on train
chain.fit(X_train_scaled, Y_train_scaled) # Took 4.0 mins

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[2, 3, 4, 0, 1], random_state=123)

In [18]:
# Print out the order
chain.order_

[2, 3, 4, 0, 1]

In [19]:
# Use chain_r to make a prediction

Y_pred = chain.predict(X_train_scaled)
Y_proba = chain.predict_proba(X_train_scaled)

In [20]:
# Take a look at Y_pred
# Print the size of Y_pred

print(Y_pred.size)
Y_pred

85725


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [21]:
# Take a look at Y_proba
# Print the size of Y_proba

print(Y_proba.size)
Y_proba

85725


array([[2.02562938e-03, 6.26366711e-08, 3.70351979e-02, 5.83633508e-02,
        1.17760973e-02],
       [1.02375468e-03, 9.39225251e-09, 3.54529467e-02, 4.53328134e-02,
        1.18520546e-02],
       [1.25216842e-03, 1.19988485e-08, 3.17464731e-02, 1.91179580e-02,
        7.50318001e-03],
       ...,
       [2.07363806e-03, 4.71412030e-08, 2.78261728e-02, 2.43456735e-02,
        9.94205636e-03],
       [1.92180744e-03, 1.52459759e-08, 3.03575624e-02, 4.71829009e-02,
        6.83465714e-03],
       [2.02170874e-04, 1.64556081e-08, 4.59123165e-03, 7.64342197e-04,
        8.15491428e-04]])

In [22]:
# Take a look at Y_train
# Print the size of Y_train

print(Y_train_scaled.values.size)
Y_train_scaled.values

85725


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int64)

In [23]:
# Compute jaccard score
jaccard_score = jaccard_score(Y_train_scaled, Y_pred, average='samples')
jaccard_score

0.030212890055409742

In [24]:
# Compute log loss
log_loss = log_loss(Y_train_scaled, Y_proba)
log_loss

0.11236622350079215

In [25]:
# Compare predicted Y_train ith Y_train
(Y_train_scaled.values == Y_pred)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [26]:
# How many labels the model predicing right
(Y_train_scaled.values == Y_pred).sum()

84732

In [27]:
# Summarize Y_train

features_train = Y_train_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_train:', features_train.sum())
features_train.head()

The total count of targets in Y_train: 2022


nfkb_inhibitor                  597
proteasome_inhibitor            517
cyclooxygenase_inhibitor        315
dopamine_receptor_antagonist    302
dna_inhibitor                   291
dtype: int64

In [28]:
# Summarize Y_pred

features_chain = pd.DataFrame(Y_pred, columns=Y_train_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain.sum())
features_chain.head()

The total count of targets: 1033.0


proteasome_inhibitor            517.0
nfkb_inhibitor                  516.0
dna_inhibitor                     0.0
dopamine_receptor_antagonist      0.0
cyclooxygenase_inhibitor          0.0
dtype: float64

In [29]:
# Concat features_train, features_baseline and features_chain

features_sum = pd.concat([features_train, features_chain], axis=1)
features_sum.head()

Unnamed: 0,0,1
nfkb_inhibitor,597,516.0
proteasome_inhibitor,517,517.0
cyclooxygenase_inhibitor,315,0.0
dopamine_receptor_antagonist,302,0.0
dna_inhibitor,291,0.0


### Model on Validate

In [30]:
# Predict the labels of Y_validate
Y_pred_v = chain.predict(X_validate_scaled)

# Predcit the probability of Y
Y_proba_v = chain.predict_proba(X_validate_scaled) 

In [31]:
# Take a look at Y_pred_v
# Print the size of Y_pred_v

print(Y_pred_v.size)
Y_pred_v

21435


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [32]:
# Take a look at Y_proba_v
# Print the size of Y_proba_v

print(Y_proba_v.size)
Y_proba_v

21435


array([[1.99436375e-03, 1.15146355e-08, 3.66049652e-02, 4.95106528e-02,
        1.02401151e-02],
       [1.58781961e-03, 2.63961301e-08, 3.14979219e-02, 2.63569142e-02,
        5.73716988e-03],
       [9.99733190e-01, 9.99999515e-01, 1.97646412e-03, 1.23693726e-05,
        3.57742708e-04],
       ...,
       [2.37929516e-03, 6.24812045e-09, 3.63847201e-02, 1.24564278e-01,
        6.68258364e-03],
       [2.27633587e-03, 8.01497506e-08, 2.33929351e-02, 1.29661002e-02,
        8.47169428e-03],
       [2.75104476e-03, 6.91228536e-08, 2.85786682e-02, 3.24666759e-02,
        8.43623454e-03]])

In [33]:
# Take a look at Y_validate
# Print the size of Y_validate

print(Y_validate_scaled.values.size)
Y_validate_scaled.values

21435


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int64)

In [36]:
# Compute jaccard score for validate

jaccard_score_v = jaccard_score(Y_validate_scaled, Y_pred_v, average='samples')
jaccard_score_v

0.03184044786564031

In [37]:
# Compute log loss for validate

log_loss_v = log_loss(Y_validate_scaled, Y_proba_v)
log_loss_v

0.11817462650170078

In [38]:
# Compare Y_validate and Y_pred_v
(Y_validate_scaled.values == Y_pred_v)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [39]:
# How many labels are predicted right
(Y_validate_scaled.values == Y_pred_v).sum()

21171

In [40]:
# Summarize Y_validate

features_validate = Y_validate_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_validate:', features_validate.sum())
features_validate.head()

The total count of targets in Y_validate: 530


nfkb_inhibitor                  153
proteasome_inhibitor            137
cyclooxygenase_inhibitor         83
dopamine_receptor_antagonist     82
dna_inhibitor                    75
dtype: int64

In [41]:
# Summarize Y_pred_v

features_chain_v = pd.DataFrame(Y_pred_v, columns=Y_validate_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain_v.sum())
features_chain_v.head()

The total count of targets: 280.0


proteasome_inhibitor            140.0
nfkb_inhibitor                  140.0
dna_inhibitor                     0.0
dopamine_receptor_antagonist      0.0
cyclooxygenase_inhibitor          0.0
dtype: float64

In [42]:
# Add features_validate and features_chain_v to feature_sum

features_sum = pd.concat([features_sum, features_validate, features_chain_v], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1
nfkb_inhibitor,597,516.0,153,140.0
proteasome_inhibitor,517,517.0,137,140.0
cyclooxygenase_inhibitor,315,0.0,83,0.0
dopamine_receptor_antagonist,302,0.0,82,0.0
dna_inhibitor,291,0.0,75,0.0


### Model on Test

In [43]:
# Predict the labels of Y_test
Y_pred_t = chain.predict(X_test_scaled)

# Predcit the probability of Y_test
Y_proba_t = chain.predict_proba(X_test_scaled) 

In [44]:
# Take a look at Y_pred_t

print(Y_pred_t.size)
Y_pred_t

11910


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [45]:
# Take a look at Y_proba_t

print(Y_proba_t.size)
Y_proba_t

11910


array([[8.39234330e-04, 3.18925501e-08, 1.72952067e-02, 1.07155256e-02,
        2.50697358e-03],
       [2.54721533e-03, 2.15705525e-08, 3.09936882e-02, 4.19231246e-02,
        6.24185428e-03],
       [1.00329653e-03, 1.30664807e-08, 2.29674556e-02, 1.79768151e-03,
        2.29204515e-02],
       ...,
       [3.02966210e-03, 4.37416069e-08, 1.27301417e-02, 1.04128829e-02,
        3.06809369e-02],
       [2.43376673e-03, 3.70091865e-08, 4.20924203e-02, 5.47910437e-02,
        6.80082336e-03],
       [1.87298486e-03, 7.36979137e-08, 3.27919202e-02, 2.25978254e-02,
        9.28717505e-03]])

In [46]:
# Take a look at Y_test
# Print the size of Y_test

print(Y_test_scaled.values.size)
Y_test_scaled.values

11910


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int64)

In [47]:
# Compute jaccard score for test

jaccard_score_t = jaccard_score(Y_test_scaled, Y_pred_t, average='samples')
jaccard_score_t

0.030226700251889168

In [48]:
# Compute log loss for test

log_loss_t = log_loss(Y_test_scaled, Y_proba_t)
log_loss_t

0.11346444950312891

In [49]:
# Compare predicted test and Y_test
(Y_test_scaled.values == Y_pred_t)

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True, False],
       ...,
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [50]:
# How many labels are predicted right
(Y_test_scaled.values == Y_pred_t).sum()

11785

In [51]:
# Summarize Y_test

features_test = Y_test_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_test:', features_test.sum())
features_test.head()

The total count of targets in Y_test: 267


nfkb_inhibitor                  82
proteasome_inhibitor            72
dopamine_receptor_antagonist    40
cyclooxygenase_inhibitor        37
dna_inhibitor                   36
dtype: int64

In [52]:
# Summarize Y_pred_t

features_chain_t = pd.DataFrame(Y_pred_t, columns=Y_test_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of predicted targets:', features_chain_t.sum())
features_chain_t.head()

The total count of predicted targets: 146.0


proteasome_inhibitor            73.0
nfkb_inhibitor                  73.0
dna_inhibitor                    0.0
dopamine_receptor_antagonist     0.0
cyclooxygenase_inhibitor         0.0
dtype: float64

In [53]:
# Add features_test and _chain_t to features_sum

features_sum = pd.concat([features_sum, features_test, features_chain_t], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1,0.2,1.2
nfkb_inhibitor,597,516.0,153,140.0,82,73.0
proteasome_inhibitor,517,517.0,137,140.0,72,73.0
cyclooxygenase_inhibitor,315,0.0,83,0.0,37,0.0
dopamine_receptor_antagonist,302,0.0,82,0.0,40,0.0
dna_inhibitor,291,0.0,75,0.0,36,0.0


In [54]:
# Print chain object to double check
chain

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[2, 3, 4, 0, 1], random_state=123)

In [55]:
# # Conver to csv
# features_sum.to_csv('error_analysis_v4.csv')