In [96]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, log_loss

import prepare, model

### Preprocess

In [2]:
# Load train, target and test datasets

X_train = pd.read_csv('train_features.csv', index_col=0)
Y_train = pd.read_csv('train_targets_scored.csv', index_col=0)
X_test = pd.read_csv('test_features.csv', index_col=0)

In [145]:
# Print their shapes
X_train.shape, Y_train.shape, X_test.shape # Correct

((23814, 875), (23814, 206), (3982, 875))

In [146]:
# Preprocess X_train to X_train_scaled, X_validate_scaled and X_test_scaled
# encoding, splitting, and scaling

scaler, X_train_scaled, X_validate_scaled, X_test_scaled = prepare.prep_moa_v2(X_train)

# Print the shapes
scaler, X_train_scaled.shape, X_validate_scaled.shape, X_test_scaled.shape

(MinMaxScaler(), (17145, 876), (4287, 876), (2382, 876))

In [147]:
# Preprocess Y_train to Y_train_scaled, Y_validate_scaled and Y_test_scaled
# Y has not been scaled but just for matching X's names

Y_train_scaled = Y_train.loc[X_train_scaled.index, :]
Y_validate_scaled = Y_train.loc[X_validate_scaled.index, :]
Y_test_scaled = Y_train.loc[X_test_scaled.index, :]

In [148]:
# Check if index of Y matches index of X
(X_train_scaled.index == Y_train_scaled.index).sum() # Match

17145

In [149]:
# Check if index of Y matches index of X
(X_validate_scaled.index == Y_validate_scaled.index).sum() # Match

4287

In [150]:
# Check if index of Y matches index of X
(X_test_scaled.index == Y_test_scaled.index).sum() # Match

2382

In [151]:
# Create a optimal order
# Integer column index and text colum index

col_index = pd.DataFrame(Y_train.columns, columns=['target'])

# Count the MoA annotations

count_targets = pd.DataFrame(Y_train.sum(axis=0)).reset_index()
count_targets.rename(columns={'index':'target_1', 0: 'counts'}, inplace=True)

# Concat two dfs together
col_index_counts = pd.concat([col_index, count_targets], axis=1)

order = col_index_counts.sort_values(by='counts', ascending=False).index

# Conver order to a list
order = order.to_list()

order[:5]

[136, 163, 71, 79, 177]

### Baseline Model

In [18]:
# Create estimator
clf = MLPClassifier(random_state=123)

In [19]:
# Create OneVsRest Ojbect
ovr = OneVsRestClassifier(clf)

In [20]:
# Fit on Train
ovr.fit(X_train_scaled, Y_train_scaled) # Took 2 hours and 10 mins

OneVsRestClassifier(estimator=MLPClassifier(random_state=123))

In [152]:
# Print first 2 estimators in ovr
print(ovr.estimators_[:2])

# Print n_classes_
print(ovr.n_classes_)

# Print the type of classification
print(ovr.multilabel_)

[MLPClassifier(random_state=123), MLPClassifier(random_state=123)]
206
True


In [24]:
# Predcit the labels of Y
Y_pred_baseline = ovr.predict(X_train_scaled)

# Predcit the probability of Y
Y_proba_baseline = ovr.predict_proba(X_train_scaled)

In [92]:
# ??? Return the mean accuracy on the given data and labels
ovr.score(X_train_scaled, Y_train_scaled)

0.5153105861767279

In [153]:
# Take a look at Y_pred
# Print the size of Y_pred

print(Y_pred_baseline.size)
Y_pred_baseline

3531870


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [154]:
# Take a look at Y_train
# Print the size of Y_train

print(Y_train_scaled.values.size)
Y_train_scaled.values

3531870


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [155]:
# Compute jaccard score
baseline_jaccard_score = jaccard_score(Y_train_scaled, Y_pred_baseline, average='samples')

# Print jaccard score
baseline_jaccard_score

0.13193600799900013

In [156]:
# Compute log loss

baseline_log_loss = log_loss(Y_train_scaled, Y_proba_baseline)
baseline_log_loss

2.409867604707532

In [157]:
# Compare baseline prediction with Y_train
Y_train_scaled.values == Y_pred_baseline

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [158]:
# How many labels are predicted right
(Y_train_scaled.values == Y_pred_baseline).sum() # 3531870 - 3522776 = 9094

3522776

In [186]:
# Summarize Y_train

print('The total counts of targets:', Y_train_scaled.sum(axis=0).sum())
features_train = Y_train_scaled.sum(axis=0).sort_values(ascending=False)
features_train.head()

The total counts of targets: 12157


nfkb_inhibitor                  597
proteasome_inhibitor            517
cyclooxygenase_inhibitor        315
dopamine_receptor_antagonist    302
dna_inhibitor                   291
dtype: int64

In [187]:
# Summarize Y_pred_baseline

features_baseline = pd.DataFrame(Y_pred_baseline, 
                                 columns=Y_train_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_baseline.sum())
features_baseline.head()

The total count of targets: 3407


nfkb_inhibitor          520
proteasome_inhibitor    517
tubulin_inhibitor       214
cdk_inhibitor           203
pdgfr_inhibitor         200
dtype: int64

### Classifier Chain
### Model on Train

In [32]:
# Create Classifier Chain object
chain = ClassifierChain(clf, order=order, random_state=123)

In [33]:
# Fit on Train
chain.fit(X_train_scaled, Y_train_scaled) # Took 3 hours 40 min

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[136, 163, 71, 79, 177, 77, 99, 10, 63, 80, 199, 4, 149,
                       109, 54, 89, 119, 9, 182, 96, 151, 105, 176, 169, 43, 3,
                       202, 94, 83, 153, ...],
                random_state=123)

In [34]:
# Print the first 5 elements in the order
chain.order_[:5]

[136, 163, 71, 79, 177]

In [36]:
# Predcit the labels of Y
Y_pred = chain.predict(X_train_scaled)

# Predcit the probability of Y
Y_proba = chain.predict_proba(X_train_scaled) # About 5 mins

In [88]:
# Calculate score
chain.score(X_train_scaled, Y_train_scaled)

0.5192767570720327

In [86]:
# Take a look at Y_pred
# Print the size of Y_pred

print(Y_pred.size)
Y_pred

3531870


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [80]:
# Take a look at Y_train
# Print the size of Y_train

print(Y_train_scaled.values.size)
Y_train_scaled.values

3531870


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [87]:
# Compute jaccard score
jaccard_score = jaccard_score(Y_train_scaled, Y_pred, average='samples')
jaccard_score

0.13342984904664695

In [40]:
# Compute log loss
log_loss = log_loss(Y_train_scaled, Y_proba)
log_loss

2.324064384932827

In [101]:
# Compare predicted Y_train ith Y_train
(Y_train_scaled.values == Y_pred)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [102]:
# How many labels the model predicing right
(Y_train_scaled.values == Y_pred).sum()

3522689

In [188]:
# Summarize Y_train

print('The total count of targets in Y_train:', Y_train_scaled.sum(axis=0).sum())
features_train.head()

The total count of targets in Y_train: 12157


nfkb_inhibitor                  597
proteasome_inhibitor            517
cyclooxygenase_inhibitor        315
dopamine_receptor_antagonist    302
dna_inhibitor                   291
dtype: int64

In [189]:
# Summarize Y_pred

features_chain = pd.DataFrame(Y_pred, columns=Y_train_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain.sum())
features_chain.head()

The total count of targets: 3342.0


nfkb_inhibitor          520.0
proteasome_inhibitor    517.0
cdk_inhibitor           211.0
raf_inhibitor           200.0
tubulin_inhibitor       193.0
dtype: float64

In [190]:
# Concat features_train, features_baseline and features_chain

features_sum = pd.concat([features_train, features_baseline, features_chain], axis=1)
features_sum.head()

Unnamed: 0,0,1,2
nfkb_inhibitor,597,520,520.0
proteasome_inhibitor,517,517,517.0
cyclooxygenase_inhibitor,315,0,0.0
dopamine_receptor_antagonist,302,0,0.0
dna_inhibitor,291,8,16.0


### Model on Validate

In [41]:
# Predict the labels of Y_validate
Y_pred_v = chain.predict(X_validate_scaled)

# Predcit the probability of Y
Y_proba_v = chain.predict_proba(X_validate_scaled) # About 5 mins

In [90]:
# Calculate score
chain.score(X_validate_scaled, Y_validate_scaled)

0.4996501049685094

In [78]:
# Take a look at Y_pred_v
# Print the size of Y_pred_v

print(Y_pred_v.size)
Y_pred_v

883122


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [76]:
# Take a look at Y_validate
# Print the size of Y_validate

print(Y_validate_scaled.values.size)
Y_validate_scaled.values

883122


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [63]:
# Compute jaccard score for validate

jaccard_score_v = jaccard_score(Y_validate_scaled, Y_pred_v, average='samples')
jaccard_score_v

0.11785853132949003

In [64]:
# Compute log loss for validate

log_loss_v = log_loss(Y_validate_scaled, Y_proba_v)
log_loss_v

2.7553418258852296

In [99]:
# Compare Y_validate and Y_pred_v
(Y_validate_scaled.values == Y_pred_v)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [100]:
# How many labels are predicted right
(Y_validate_scaled.values == Y_pred_v).sum()

880686

In [191]:
# Summarize Y_validate

features_validate = Y_validate_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_validate:', features_validate.sum())
features_validate.head()

The total count of targets in Y_validate: 3049


nfkb_inhibitor                   153
proteasome_inhibitor             137
serotonin_receptor_antagonist     85
cyclooxygenase_inhibitor          83
dopamine_receptor_antagonist      82
dtype: int64

In [192]:
# Summarize Y_pred_v

features_chain_v = pd.DataFrame(Y_pred_v, columns=Y_validate_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain_v.sum())
features_chain_v.head()

The total count of targets: 805.0


nfkb_inhibitor                     140.0
proteasome_inhibitor               139.0
glucocorticoid_receptor_agonist     54.0
cdk_inhibitor                       52.0
egfr_inhibitor                      47.0
dtype: float64

In [193]:
# Add features_validate and features_chain_v to feature_sum

features_sum = pd.concat([features_sum, features_validate, features_chain_v], axis=1)
features_sum.head()

Unnamed: 0,0,1,2,0.1,1.1
nfkb_inhibitor,597,520,520.0,153,140.0
proteasome_inhibitor,517,517,517.0,137,139.0
cyclooxygenase_inhibitor,315,0,0.0,83,0.0
dopamine_receptor_antagonist,302,0,0.0,82,0.0
dna_inhibitor,291,8,16.0,75,4.0


### Model on Test

In [58]:
# Predict the labels of Y_test
Y_pred_t = chain.predict(X_test_scaled)

# Predcit the probability of Y_test
Y_proba_t = chain.predict_proba(X_test_scaled) # About 5 mins

In [105]:
# Calculate score
chain.score(X_test_scaled, Y_test_scaled)

0.5037783375314862

In [179]:
# Take a look at Y_pred_t

print(Y_pred_t.size)
Y_pred_t

490692


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [73]:
# Take a look at Y_test
# Print the size of Y_test

print(Y_test_scaled.values.size)
Y_test_scaled.values

490692


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [61]:
# Compute jaccard score for test

jaccard_score_t = jaccard_score(Y_test_scaled, Y_pred_t, average='samples')
jaccard_score_t

0.10612930310663309

In [62]:
# Compute log loss for test

log_loss_t = log_loss(Y_test_scaled, Y_proba_t)
log_loss_t

2.6754036024829264

In [103]:
# Compare predicted test and Y_test
(Y_test_scaled.values == Y_pred_t)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [104]:
# How many labels are predicted right
(Y_test_scaled.values == Y_pred_t).sum()

489367

In [182]:
# Summarize Y_test

features_test = Y_test_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_test:', features_test.sum())
features_test.head()

The total count of targets in Y_test: 1638


nfkb_inhibitor                    82
proteasome_inhibitor              72
dopamine_receptor_antagonist      40
adrenergic_receptor_antagonist    40
serotonin_receptor_antagonist     39
dtype: int64

In [181]:
# Summarize Y_pred_t

features_chain_t = pd.DataFrame(Y_pred_t, columns=Y_test_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of predicted targets:', features_chain_t.sum())
features_chain_t.head()

The total count of predicted targets: 393.0


proteasome_inhibitor    73.0
nfkb_inhibitor          73.0
hmgcr_inhibitor         30.0
raf_inhibitor           30.0
egfr_inhibitor          22.0
dtype: float64

In [194]:
# Add features_test and _chain_t to features_sum

features_sum = pd.concat([features_sum, features_test, features_chain_t], axis=1)
features_sum.head()

Unnamed: 0,0,1,2,0.1,1.1,0.2,1.2
nfkb_inhibitor,597,520,520.0,153,140.0,82,73.0
proteasome_inhibitor,517,517,517.0,137,139.0,72,73.0
cyclooxygenase_inhibitor,315,0,0.0,83,0.0,37,0.0
dopamine_receptor_antagonist,302,0,0.0,82,0.0,40,0.0
dna_inhibitor,291,8,16.0,75,4.0,36,1.0


In [196]:
# Rename features_sum()

features_sum.rename(columns={0: 'train', 1: 'baseline', 2: 'chain_t',
                             0: 'validate', 1: 'chain_v',
                             0: 'test', 1: 'chain_t'}, inplace=True)
features_sum.head()

Unnamed: 0,test,chain_t,chain_t.1,test.1,chain_t.2,test.2,chain_t.3
nfkb_inhibitor,597,520,520.0,153,140.0,82,73.0
proteasome_inhibitor,517,517,517.0,137,139.0,72,73.0
cyclooxygenase_inhibitor,315,0,0.0,83,0.0,37,0.0
dopamine_receptor_antagonist,302,0,0.0,82,0.0,40,0.0
dna_inhibitor,291,8,16.0,75,4.0,36,1.0


In [199]:
# # Conver to csv
# features_sum.to_csv('error_analysis_v1.csv')