In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

from sklearn.metrics import classification_report, confusion_matrix, jaccard_score, log_loss

import prepare, model

### Preprocess

In [2]:
# Load train, target and test datasets

X_train = pd.read_csv('train_features.csv', index_col=0)
Y_train = pd.read_csv('train_targets_scored.csv', index_col=0)
X_test = pd.read_csv('test_features.csv', index_col=0)

In [3]:
# Print their shapes
X_train.shape, Y_train.shape, X_test.shape # Correct

((23814, 875), (23814, 206), (3982, 875))

In [4]:
# Preprocess X_train to X_train_scaled, X_validate_scaled and X_test_scaled
# encoding, splitting, and scaling

scaler, X_train_scaled, X_validate_scaled, X_test_scaled = prepare.prep_moa_v2(X_train)

# Print the shapes
scaler, X_train_scaled.shape, X_validate_scaled.shape, X_test_scaled.shape

(MinMaxScaler(), (17145, 876), (4287, 876), (2382, 876))

In [5]:
# Preprocess Y_train to Y_train_scaled, Y_validate_scaled and Y_test_scaled
# Y has not been scaled but just for matching X's names

Y_train_scaled = Y_train.loc[X_train_scaled.index, :]
Y_validate_scaled = Y_train.loc[X_validate_scaled.index, :]
Y_test_scaled = Y_train.loc[X_test_scaled.index, :]

In [6]:
# Check if index of Y matches index of X
(X_train_scaled.index == Y_train_scaled.index).sum() # Match

17145

In [7]:
# Check if index of Y matches index of X
(X_validate_scaled.index == Y_validate_scaled.index).sum() # Match

4287

In [8]:
# Check if index of Y matches index of X
(X_test_scaled.index == Y_test_scaled.index).sum() # Match

2382

In [9]:
# Create a optimal order
# Integer column index and text colum index

col_index = pd.DataFrame(Y_train.columns, columns=['target'])

# Load the target_dep_counts.csv
target_dep_counts = pd.read_csv('target_dep_counts.csv')

# Merge two dfs together
col_index_dep_counts = pd.merge(col_index, target_dep_counts, left_on='target', right_on='target')

# Rank by dep_counts and the index is the order
order = col_index_dep_counts.sort_values(by='dep_counts', ascending=False).index.to_list()

order[:5]

[82, 34, 75, 125, 33]

### Classifier Chain
### Model on Train

In [10]:
# Create estimator
clf = MLPClassifier(random_state=123)

In [13]:
# Create Classifier chain object
chain = ClassifierChain(clf, order=order, random_state=123)

In [14]:
# Take a peek at the chain object
chain

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[82, 34, 75, 125, 33, 39, 81, 141, 165, 22, 172, 137, 53,
                       185, 201, 69, 196, 120, 121, 12, 192, 46, 139, 14, 26, 8,
                       24, 15, 170, 60, ...],
                random_state=123)

In [15]:
# Fit on Train
chain.fit(X_train_scaled, Y_train_scaled) # Took 2 hours and 50 mins

ClassifierChain(base_estimator=MLPClassifier(random_state=123),
                order=[82, 34, 75, 125, 33, 39, 81, 141, 165, 22, 172, 137, 53,
                       185, 201, 69, 196, 120, 121, 12, 192, 46, 139, 14, 26, 8,
                       24, 15, 170, 60, ...],
                random_state=123)

In [57]:
# Print the first 5 elements in the order
chain.order_[:5]

[82, 34, 75, 125, 33]

In [17]:
# Predcit the labels of Y
Y_pred = chain.predict(X_train_scaled)

# Predcit the probability of Y
Y_proba = chain.predict_proba(X_train_scaled) # About 4 mins

In [18]:
# Calculate score
chain.score(X_train_scaled, Y_train_scaled)

0.5126275882181394

In [19]:
# Take a look at Y_pred
# Print the size of Y_pred

print(Y_pred.size)
Y_pred

3531870


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Take a look at Y_train
# Print the size of Y_train

print(Y_train_scaled.values.size)
Y_train_scaled.values

3531870


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
# Compute jaccard score
jaccard_score = jaccard_score(Y_train_scaled, Y_pred, average='samples')
jaccard_score

0.127888597258676

In [22]:
# Compute log loss
log_loss = log_loss(Y_train_scaled, Y_proba)
log_loss

2.4134392680465564

In [23]:
# Compare predicted Y_train ith Y_train
(Y_train_scaled.values == Y_pred)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [24]:
# How many labels the model predicing right
(Y_train_scaled.values == Y_pred).sum()

3522589

In [29]:
# Summarize Y_train

features_train = Y_train_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_train:', features_train.sum())
features_train.head()

The total count of targets in Y_train: 12157


nfkb_inhibitor                  597
proteasome_inhibitor            517
cyclooxygenase_inhibitor        315
dopamine_receptor_antagonist    302
dna_inhibitor                   291
dtype: int64

In [26]:
# Summarize Y_pred

features_chain = pd.DataFrame(Y_pred, columns=Y_train_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain.sum())
features_chain.head()

The total count of targets: 3412.0


nfkb_inhibitor          527.0
proteasome_inhibitor    519.0
egfr_inhibitor          245.0
tubulin_inhibitor       211.0
cdk_inhibitor           189.0
dtype: float64

In [30]:
# Concat features_train, features_baseline and features_chain

features_sum = pd.concat([features_train, features_chain], axis=1)
features_sum.head()

Unnamed: 0,0,1
nfkb_inhibitor,597,527.0
proteasome_inhibitor,517,519.0
cyclooxygenase_inhibitor,315,1.0
dopamine_receptor_antagonist,302,0.0
dna_inhibitor,291,13.0


### Model on Validate

In [32]:
# Predict the labels of Y_validate
Y_pred_v = chain.predict(X_validate_scaled)

# Predcit the probability of Y
Y_proba_v = chain.predict_proba(X_validate_scaled) # About 4 mins

In [55]:
# Calculate score
chain.score(X_validate_scaled, Y_validate_scaled)

0.49615115465360393

In [33]:
# Take a look at Y_pred_v
# Print the size of Y_pred_v

print(Y_pred_v.size)
Y_pred_v

883122


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
# Take a look at Y_validate
# Print the size of Y_validate

print(Y_validate_scaled.values.size)
Y_validate_scaled.values

883122


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [37]:
# Compute jaccard score for validate

jaccard_score_v = jaccard_score(Y_validate_scaled, Y_pred_v, average='samples')
jaccard_score_v

0.11474613171604074

In [38]:
# Compute log loss for validate

log_loss_v = log_loss(Y_validate_scaled, Y_proba_v)
log_loss_v

2.799741345974077

In [39]:
# Compare Y_validate and Y_pred_v
(Y_validate_scaled.values == Y_pred_v)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [40]:
# How many labels are predicted right
(Y_validate_scaled.values == Y_pred_v).sum()

880648

In [41]:
# Summarize Y_validate

features_validate = Y_validate_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_validate:', features_validate.sum())
features_validate.head()

The total count of targets in Y_validate: 3049


nfkb_inhibitor                   153
proteasome_inhibitor             137
serotonin_receptor_antagonist     85
cyclooxygenase_inhibitor          83
dopamine_receptor_antagonist      82
dtype: int64

In [42]:
# Summarize Y_pred_v

features_chain_v = pd.DataFrame(Y_pred_v, columns=Y_validate_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of targets:', features_chain_v.sum())
features_chain_v.head()

The total count of targets: 845.0


proteasome_inhibitor               140.0
nfkb_inhibitor                     138.0
egfr_inhibitor                      62.0
glucocorticoid_receptor_agonist     53.0
flt3_inhibitor                      45.0
dtype: float64

In [43]:
# Add features_validate and features_chain_v to feature_sum

features_sum = pd.concat([features_sum, features_validate, features_chain_v], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1
nfkb_inhibitor,597,527.0,153,138.0
proteasome_inhibitor,517,519.0,137,140.0
cyclooxygenase_inhibitor,315,1.0,83,0.0
dopamine_receptor_antagonist,302,0.0,82,0.0
dna_inhibitor,291,13.0,75,2.0


### Model on Test

In [44]:
# Predict the labels of Y_test
Y_pred_t = chain.predict(X_test_scaled)

# Predcit the probability of Y_test
Y_proba_t = chain.predict_proba(X_test_scaled) # About 4 mins

In [56]:
# Calculate score
chain.score(X_test_scaled, Y_test_scaled)

0.5037783375314862

In [45]:
# Take a look at Y_pred_t

print(Y_pred_t.size)
Y_pred_t

490692


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [46]:
# Take a look at Y_test
# Print the size of Y_test

print(Y_test_scaled.values.size)
Y_test_scaled.values

490692


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
# Compute jaccard score for test

jaccard_score_t = jaccard_score(Y_test_scaled, Y_pred_t, average='samples')
jaccard_score_t

0.10498180800447802

In [48]:
# Compute log loss for test

log_loss_t = log_loss(Y_test_scaled, Y_proba_t)
log_loss_t

2.7239517391467976

In [49]:
# Compare predicted test and Y_test
(Y_test_scaled.values == Y_pred_t)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [50]:
# How many labels are predicted right
(Y_test_scaled.values == Y_pred_t).sum()

489365

In [51]:
# Summarize Y_test

features_test = Y_test_scaled.sum(axis=0).sort_values(ascending=False)
print('The total count of targets in Y_test:', features_test.sum())
features_test.head()

The total count of targets in Y_test: 1638


nfkb_inhibitor                    82
proteasome_inhibitor              72
dopamine_receptor_antagonist      40
adrenergic_receptor_antagonist    40
serotonin_receptor_antagonist     39
dtype: int64

In [52]:
# Summarize Y_pred_t

features_chain_t = pd.DataFrame(Y_pred_t, columns=Y_test_scaled.columns).sum(axis=0).sort_values(ascending=False)
print('The total count of predicted targets:', features_chain_t.sum())
features_chain_t.head()

The total count of predicted targets: 405.0


nfkb_inhibitor          73.0
proteasome_inhibitor    73.0
egfr_inhibitor          31.0
raf_inhibitor           25.0
tubulin_inhibitor       23.0
dtype: float64

In [53]:
# Add features_test and _chain_t to features_sum

features_sum = pd.concat([features_sum, features_test, features_chain_t], axis=1)
features_sum.head()

Unnamed: 0,0,1,0.1,1.1,0.2,1.2
nfkb_inhibitor,597,527.0,153,138.0,82,73.0
proteasome_inhibitor,517,519.0,137,140.0,72,73.0
cyclooxygenase_inhibitor,315,1.0,83,0.0,37,0.0
dopamine_receptor_antagonist,302,0.0,82,0.0,40,0.0
dna_inhibitor,291,13.0,75,2.0,36,0.0


In [54]:
# # Conver to csv
# features_sum.to_csv('error_analysis_v2.csv')