In [1]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning raised by the
# libraries used below (e.g. while fitting or scoring) is hidden as well —
# consider narrowing it to the specific categories that are known noise.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import permutations

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, log_loss

import prepare, model

### Preprocess

In [2]:
# Read the training features, the scored training targets and the test
# features; the first CSV column becomes the index of each frame.
X_train, Y_train, X_test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('train_features.csv', 'train_targets_scored.csv', 'test_features.csv')
)

In [3]:
# Show the (rows, columns) shape of each loaded frame, in load order
tuple(frame.shape for frame in (X_train, Y_train, X_test))

((23814, 875), (23814, 206), (3982, 875))

In [3]:
# Display the target frame (one row per sig_id, one column per label)
Y_train

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_000779bfc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_000a6266a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_0015fd391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_001626bd3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_fffb1ceed,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_fffb70c0c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_fffc1c3f4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
id_fffcb9e7c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Relative frequency of each value of the nfkb_inhibitor label
# (normalize=True returns proportions instead of raw counts)
Y_train['nfkb_inhibitor'].value_counts(normalize=True)

0    0.965063
1    0.034937
Name: nfkb_inhibitor, dtype: float64

In [4]:
# Keep only the five label columns chosen as the "top 5" of Y_train
selected_labels = [
    'nfkb_inhibitor',
    'proteasome_inhibitor',
    'cyclooxygenase_inhibitor',
    'dopamine_receptor_antagonist',
    'dna_inhibitor',
]
screen_order = Y_train[selected_labels]

# Summarise dtypes and non-null counts of the selection
screen_order.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23814 entries, id_000644bb2 to id_ffffdd77b
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   nfkb_inhibitor                23814 non-null  int64
 1   proteasome_inhibitor          23814 non-null  int64
 2   cyclooxygenase_inhibitor      23814 non-null  int64
 3   dopamine_receptor_antagonist  23814 non-null  int64
 4   dna_inhibitor                 23814 non-null  int64
dtypes: int64(5)
memory usage: 1.1+ MB


In [5]:
# Append the selected label columns to the feature frame, side by side
# (pandas aligns the two frames on their index).
train = pd.concat((X_train, screen_order), axis='columns')

# Preview the first rows of the combined frame
train.head()

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-95,c-96,c-97,c-98,c-99,nfkb_inhibitor,proteasome_inhibitor,cyclooxygenase_inhibitor,dopamine_receptor_antagonist,dna_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.6584,-0.3981,0.2139,0.3801,0.4176,0,0,0,0,0
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,0.4899,0.1522,0.1241,0.6077,0.7371,0,0,0,0,0
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.3174,-0.6417,-0.2187,-1.408,0.6931,0,0,0,0,0
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-1.288,-1.621,-0.8784,-0.3876,-0.8154,0,0,0,0,0
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,-0.3031,0.1094,0.2885,-0.3786,0.7125,0,0,0,0,0


In [6]:
# Disabled export of the combined frame to CSV; uncomment to write the file.
# train.to_csv('screen_order.csv')

In [6]:
# Run the project's preprocessing helper on the combined frame.
# Per the original note it performs encoding, splitting, and scaling;
# NOTE(review): the helper lives in the local `prepare` module — confirm there.
# The call is unpacked into a scaler object plus train / validate / test frames.

scaler, train_scaled, validate_scaled, test_scaled = prepare.prep_moa_v2(train)

# Display the scaler and the shape of each split
scaler, train_scaled.shape, validate_scaled.shape, test_scaled.shape

(MinMaxScaler(), (17145, 881), (4287, 881), (2382, 881))

In [7]:
# Create X: keep every column from position 5 onward in each split.
# NOTE(review): the first five columns appear to be the label columns —
# confirm against the frame layout returned by prepare.prep_moa_v2.
X_train_scaled, X_validate_scaled, X_test_scaled = (
    split.iloc[:, 5:] for split in (train_scaled, validate_scaled, test_scaled)
)

In [8]:
# Take a peek at the first rows of X_train_scaled
X_train_scaled.head()

Unnamed: 0_level_0,cp_type_trt_cp,cp_time_48,cp_time_72,cp_dose_D2,g-0_scaled,g-1_scaled,g-2_scaled,g-3_scaled,g-4_scaled,g-5_scaled,...,c-90_scaled,c-91_scaled,c-92_scaled,c-93_scaled,c-94_scaled,c-95_scaled,c-96_scaled,c-97_scaled,c-98_scaled,c-99_scaled
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_5c0fbdb9b,1,0,0,1,0.299741,0.492758,0.505973,0.351994,0.363954,0.608909,...,0.744161,0.741918,0.718058,0.709238,0.711884,0.766857,0.739467,0.798042,0.774442,0.750025
id_c92a6fd38,1,1,0,1,0.323171,0.502637,0.576781,0.309164,0.338914,0.650012,...,0.780873,0.764288,0.740569,0.775235,0.779463,0.854768,0.819474,0.816636,0.803919,0.775393
id_d2330add2,1,0,0,0,0.295927,0.522906,0.506693,0.374041,0.379288,0.590328,...,0.746421,0.728843,0.7095,0.702324,0.752156,0.800289,0.772484,0.816102,0.743364,0.78126
id_d41fa7215,1,0,1,1,0.366381,0.537196,0.528213,0.437242,0.433703,0.626875,...,0.756102,0.77688,0.526675,0.811047,0.728781,0.486655,0.659237,0.791396,0.438768,0.713756
id_f529f3643,1,0,1,1,0.388768,0.55637,0.500968,0.397568,0.456466,0.597125,...,0.719212,0.653875,0.632656,0.785297,0.781053,0.785313,0.702073,0.79747,0.818303,0.718435


In [9]:
# Create Y (cyclooxygenase inhibitor only)
# The label is selected by column name instead of by position, so the
# target stays correct even if the column order produced upstream changes.
target_label = 'cyclooxygenase_inhibitor'

Y_train_scaled = train_scaled[target_label]
Y_validate_scaled = validate_scaled[target_label]
Y_test_scaled = test_scaled[target_label]

In [10]:
# Take a peek at the first rows of Y_train_scaled
Y_train_scaled.head()

sig_id
id_5c0fbdb9b    0
id_c92a6fd38    0
id_d2330add2    0
id_d41fa7215    0
id_f529f3643    0
Name: cyclooxygenase_inhibitor, dtype: int64

In [11]:
# Count the positions at which the train X and Y indexes agree;
# a full match equals the number of training rows.
train_index_match = X_train_scaled.index == Y_train_scaled.index
train_index_match.sum()

17145

In [12]:
# Count the positions at which the validate X and Y indexes agree;
# a full match equals the number of validate rows.
validate_index_match = X_validate_scaled.index == Y_validate_scaled.index
validate_index_match.sum()

4287

In [13]:
# Count the positions at which the test X and Y indexes agree;
# a full match equals the number of test rows.
test_index_match = X_test_scaled.index == Y_test_scaled.index
test_index_match.sum()

2382

### Neural Net

In [14]:
# Create the classifier object: a multi-layer perceptron with a fixed
# random_state; every other hyperparameter keeps its library default.
clf = MLPClassifier(random_state=123)

### Model on Train

In [15]:
# Fit the MLP classifier on the training split
clf.fit(X_train_scaled, Y_train_scaled) 
# Display the estimator
clf

MLPClassifier(random_state=123)

In [16]:
# Print selected attributes of the fitted classifier, one per line:
# class labels, layer count, output count and output activation name.
for attr_name in ('classes_', 'n_layers_', 'n_outputs_', 'out_activation_'):
    print(getattr(clf, attr_name))

[0 1]
3
1
logistic


In [17]:
# Use the fitted classifier to predict the labels and the class
# probabilities for the training split

Y_pred = clf.predict(X_train_scaled)
Y_proba = clf.predict_proba(X_train_scaled)

In [18]:
# Take a look at Y_pred (predicted labels for the training split)
# Print the number of elements in Y_pred, then display the array

print(Y_pred.size)
Y_pred

17145


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
# Take a look at Y_proba (predicted class probabilities for the training split)
# Print the total number of elements in Y_proba (.size counts every cell of
# the array, not just its rows), then display the array

print(Y_proba.size)
Y_proba

34290


array([[0.9629648 , 0.0370352 ],
       [0.96454705, 0.03545295],
       [0.96825353, 0.03174647],
       ...,
       [0.97217383, 0.02782617],
       [0.96964244, 0.03035756],
       [0.99540877, 0.00459123]])

In [20]:
# Take a look at the true training labels (Y_train_scaled)
# Print the number of labels, then display them as an array

print(Y_train_scaled.values.size)
Y_train_scaled.values

17145


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
# Compute the Jaccard score of the predicted labels on the training split.
# The result is bound to its own name so that the imported `jaccard_score`
# function is not overwritten and stays callable in later cells.
jaccard_score_train = jaccard_score(Y_train_scaled, Y_pred)
jaccard_score_train

0.0

In [23]:
# Compute the log loss of the predicted probabilities on the training split.
# The result is bound to its own name so that the imported `log_loss`
# function is not overwritten and stays callable in later cells.
log_loss_train = log_loss(Y_train_scaled, Y_proba)
log_loss_train

0.0901423060652396

In [24]:
# Element-wise comparison of the true training labels with the predictions
np.equal(Y_train_scaled.values, Y_pred)

array([ True,  True,  True, ...,  True,  True,  True])

In [25]:
# How many training labels the model predicts correctly
train_hits = Y_train_scaled.values == Y_pred
train_hits.sum()

16830

In [42]:
# Summarize Y_train: wrap the label series in a one-column frame and
# print the column sum, i.e. the number of positive training rows.
features_train = Y_train_scaled.to_frame()
print('The total count of targets in Y_train:', features_train.sum())

The total count of targets in Y_train: cyclooxygenase_inhibitor    315
dtype: int64


In [45]:
# Summarize Y_pred: put the predicted training labels in a one-column frame
# that shares the training index, then print the number of predicted positives.
features_clf = pd.DataFrame(
    {'cyclooxygenase_inhibitor': Y_pred},
    index=Y_train_scaled.index,
)
print('The total count of targets:', features_clf.sum())

The total count of targets: cyclooxygenase_inhibitor    0
dtype: int64


In [75]:
# Does the model predict the positive class for any training row?
np.any(Y_pred == 1)

False

### Model on Validate

In [49]:
# Predict the labels for the validate split
Y_pred_v = clf.predict(X_validate_scaled)

# Predict the class probabilities for the validate split
Y_proba_v = clf.predict_proba(X_validate_scaled) 

In [50]:
# Take a look at Y_pred_v (predicted labels for the validate split)
# Print the number of elements in Y_pred_v, then display the array

print(Y_pred_v.size)
Y_pred_v

4287


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [51]:
# Take a look at Y_proba_v (predicted class probabilities for the validate split)
# Print the total number of elements in Y_proba_v, then display the array

print(Y_proba_v.size)
Y_proba_v

8574


array([[0.96339503, 0.03660497],
       [0.96850208, 0.03149792],
       [0.99802354, 0.00197646],
       ...,
       [0.96361528, 0.03638472],
       [0.97660706, 0.02339294],
       [0.97142133, 0.02857867]])

In [52]:
# Take a look at the true validate labels (Y_validate_scaled)
# Print the number of labels, then display them as an array

print(Y_validate_scaled.values.size)
Y_validate_scaled.values

4287


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [55]:
# Jaccard score of the predicted labels on the validate split
jaccard_score_v = jaccard_score(y_true=Y_validate_scaled, y_pred=Y_pred_v)
jaccard_score_v

0.0

In [56]:
# Log loss of the predicted probabilities on the validate split
log_loss_v = log_loss(y_true=Y_validate_scaled, y_pred=Y_proba_v)
log_loss_v

0.0936482056010045

In [57]:
# Element-wise comparison of the true validate labels with the predictions
np.equal(Y_validate_scaled.values, Y_pred_v)

array([ True,  True,  True, ...,  True,  True,  True])

In [58]:
# How many validate labels are predicted correctly
validate_hits = Y_validate_scaled.values == Y_pred_v
validate_hits.sum()

4204

In [59]:
# Summarize Y_validate: the sum of the label series is the number of
# positive rows in the validate split.
features_validate = Y_validate_scaled
validate_target_total = features_validate.sum()
print('The total count of targets in Y_validate:', validate_target_total)

The total count of targets in Y_validate: 83


In [60]:
# Summarize Y_pred_v: put the predicted validate labels in a one-column frame
# that shares the validate index, then print the number of predicted positives.
features_chain_v = pd.DataFrame(
    {'cyclooxygenase_inhibitor': Y_pred_v},
    index=Y_validate_scaled.index,
)
print('The total count of targets:', features_chain_v.sum())

The total count of targets: cyclooxygenase_inhibitor    0
dtype: int64


### Model on Test

In [61]:
# Predict the labels for the test split
Y_pred_t = clf.predict(X_test_scaled)

# Predict the class probabilities for the test split
Y_proba_t = clf.predict_proba(X_test_scaled) 

In [62]:
# Take a look at Y_pred_t (predicted labels for the test split)
# Print the number of elements in Y_pred_t, then display the array

print(Y_pred_t.size)
Y_pred_t

2382


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [63]:
# Take a look at Y_proba_t (predicted class probabilities for the test split)
# Print the total number of elements in Y_proba_t, then display the array

print(Y_proba_t.size)
Y_proba_t

4764


array([[0.98270479, 0.01729521],
       [0.96900631, 0.03099369],
       [0.97703254, 0.02296746],
       ...,
       [0.98726986, 0.01273014],
       [0.95790758, 0.04209242],
       [0.96720808, 0.03279192]])

In [64]:
# Take a look at the true test labels (Y_test_scaled)
# Print the number of labels, then display them as an array

print(Y_test_scaled.values.size)
Y_test_scaled.values

2382


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [65]:
# Jaccard score of the predicted labels on the test split
jaccard_score_t = jaccard_score(y_true=Y_test_scaled, y_pred=Y_pred_t)
jaccard_score_t

0.0

In [66]:
# Log loss of the predicted probabilities on the test split
log_loss_t = log_loss(y_true=Y_test_scaled, y_pred=Y_proba_t)
log_loss_t

0.08150506925877025

In [67]:
# Element-wise comparison of the true test labels with the predictions
np.equal(Y_test_scaled.values, Y_pred_t)

array([ True,  True,  True, ...,  True,  True,  True])

In [68]:
# How many test labels are predicted correctly
test_hits = Y_test_scaled.values == Y_pred_t
test_hits.sum()

2345

In [69]:
# Summarize Y_test: the sum of the label series is the number of
# positive rows in the test split.
features_test = Y_test_scaled
test_target_total = features_test.sum()
print('The total count of targets in Y_test:', test_target_total)

The total count of targets in Y_test: 37


In [71]:
# Summarize Y_pred_t: put the predicted test labels in a one-column frame
# that shares the test index, then print the number of predicted positives.
features_chain_t = pd.DataFrame(
    {'cyclooxygenase_inhibitor': Y_pred_t},
    index=Y_test_scaled.index,
)
print('The total count of predicted targets:', features_chain_t.sum())

The total count of predicted targets: cyclooxygenase_inhibitor    0
dtype: int64


In [72]:
# Display the classifier object to double check its configuration
clf

MLPClassifier(random_state=123)