In [1]:
%matplotlib inline
%config InlineBackend.figure_formats={'png','retina'}
import numpy as np
np.set_printoptions(precision=3, suppress=True)
import pandas as pd
import json

import torch
import torch.nn as nn

from scipy.stats import zscore
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor

from utils import calculate_descriptors
from model import model_utils

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Report the environment so the saved output records what produced the results.
print(f'Torch version: {torch.__version__}', f'Device: {DEVICE}', sep='\n')

Optuna version: 3.2.0
Torch version: 2.0.0
Device: cpu


## CycPeptMP

#### Import data and settings

In [7]:
# Load the CycPeptMP configuration (its 'data' and 'model' sections are read
# by the cells below).
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
config_path = 'config/CycPeptMP.json'
with open(config_path, 'r') as config_file:
    config = json.load(config_file)

In [8]:
# Model variant to evaluate; used to build dataset and weight/prediction paths.
MODEL_TYPE = 'Fusion'
# OPTIMIZE: augmentation count (number of replicas)
REPLICA_NUM = 60

# Seed everything from the config so repeated runs are reproducible
seed = config['data']['seed']
model_utils.set_seed(seed)

# Evaluation sets, duplicates already removed.
# WARNING: conf matrix of atom model is huge
set_list = ['Test', 'Caco2', 'MDCK', 'RRCK']
# Load the sets in the order of `set_list` so both lists stay aligned.
dataset_list = [
    model_utils.load_dataset(MODEL_TYPE, REPLICA_NUM, set_name)
    for set_name in set_list
]
dataset_test, dataset_caco2, dataset_mdck, dataset_rrck = dataset_list

# Hyperparameters chosen beforehand, stored in the 'model' section of the config
best_trial = config['model']

#### Prediction

In [38]:
# Predict every evaluation set with each of the three cross-validation models
# and write the replica-averaged predictions to one CSV per (set, fold).
#
# Hyperparameters are read from `best_trial` (bound to config['model'] in the
# setup cell) rather than from the global `config`, because the Baselines
# section rebinds `config` to a different JSON file.
use_auxiliary = best_trial['use_auxiliary']
gamma_layer = best_trial['gamma_layer']
gamma_subout = best_trial['gamma_subout']

for cv in range(3):
    # Load trained weights.
    # map_location remaps the stored tensors onto DEVICE, so a checkpoint
    # written on a GPU machine can still be loaded on a CPU-only host.
    model_path = f'weight/{MODEL_TYPE}/{MODEL_TYPE}-{REPLICA_NUM}_cv{cv}.cpt'
    checkpoint = torch.load(model_path, map_location=DEVICE)
    model = model_utils.create_model(best_trial, DEVICE, use_auxiliary)
    model.load_state_dict(checkpoint['model_state_dict'])
    model = nn.DataParallel(model)
    model.to(DEVICE)
    # NOTE(review): confirm that predict_valid(istrain=False) puts the model in
    # eval mode and disables gradients; nothing in this cell does so.

    for set_name, dataset_now in zip(set_list, dataset_list):
        # shuffle=False keeps a deterministic sample order across runs.
        dataloader_now = torch.utils.data.DataLoader(dataset_now, batch_size=256, shuffle=False)
        ids, exps, preds = model_utils.predict_valid(DEVICE, model, dataloader_now, None, istrain=False,
                                                     use_auxiliary=use_auxiliary, gamma_layer=gamma_layer, gamma_subout=gamma_subout)
        now_pred = pd.DataFrame(preds, columns=['pred'])
        now_pred['exp'] = exps
        now_pred['ID'] = ids

        # NOTE: Can save all predicted values of all replicas
        # now_pred.to_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/{set_name}_cv{cv}_allrep.csv')

        # Take the average of all replicas (rows sharing the same ID)
        now_pred = now_pred.groupby('ID').mean()
        now_pred.to_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/{set_name}_cv{cv}.csv')

In [39]:
# Calculate evaluation metrics
# Score the saved predictions of every fold and store one metrics table per fold.
# NOTE(review): the column labels assume evaluate_model returns
# (MAE, RMSE, R, MSE, R2) in this order - confirm against model_utils.
metric_columns = ['Set', 'MAE', 'RMSE', 'R', 'MSE', 'R2']
for cv in range(3):
    rows = []
    for set_name in set_list:
        now_pred = pd.read_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/{set_name}_cv{cv}.csv')
        scores = model_utils.evaluate_model(now_pred['exp'].to_list(), now_pred['pred'].to_list(), round_num=5)
        rows.append([set_name, *scores])
    metrics = pd.DataFrame(rows, columns=metric_columns)
    metrics.to_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/metrics_cv{cv}.csv', index=False)

In [29]:
# OPTIMIZE
# OPTIMIZE: choose which saved model's metrics to summarize
MODEL_TYPE = 'Fusion'
REPLICA_NUM = 60
set_list = ['Test', 'Caco2', 'MDCK', 'RRCK']

# Read the three per-fold tables; iloc[:, 1:] drops the leading 'Set' label
# column so only the numeric metric columns remain.
metrics_cv0, metrics_cv1, metrics_cv2 = [
    pd.read_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/metrics_cv{cv}.csv').iloc[:, 1:]
    for cv in range(3)
]
# Stack to (fold, set, metric) and aggregate over the fold axis.
metrics = np.array([metrics_cv0, metrics_cv1, metrics_cv2])
means = np.round(np.mean(metrics, axis=0), 3)
stds = np.round(np.std(metrics, axis=0), 3)

print(f'Model: {MODEL_TYPE}-{REPLICA_NUM}')
for i, set_name in enumerate(set_list):
    mean_row, std_row = means[i], stds[i]
    print(f' - {set_name}:')
    # Column order after dropping 'Set': 0=MAE, 1=RMSE, 2=R, 3=MSE, 4=R2
    print(f'   - MAE: {mean_row[0]} ± {std_row[0]}, MSE: {mean_row[3]} ± {std_row[3]}, R: {mean_row[2]} ± {std_row[2]}, R2: {mean_row[4]} ± {std_row[4]}')

Model: Fusion-60
 - Test:
   - MAE: 0.355 ± 0.007, MSE: 0.253 ± 0.013, R: 0.883 ± 0.003, R2: 0.772 ± 0.011
 - Caco2:
   - MAE: 1.148 ± 0.113, MSE: 1.66 ± 0.312, R: 0.209 ± 0.064, R2: -4.429 ± 1.022
 - MDCK:
   - MAE: 0.821 ± 0.009, MSE: 0.911 ± 0.012, R: 0.57 ± 0.044, R2: -0.93 ± 0.025
 - RRCK:
   - MAE: 0.678 ± 0.041, MSE: 0.652 ± 0.083, R: -0.181 ± 0.027, R2: -1.725 ± 0.346


## Baselines

In [4]:
# Load the baseline (RF / SVM) configuration.
# NOTE: this rebinds the global `config` that the CycPeptMP cells also assign,
# so re-run the CycPeptMP config cell before re-running that section.
config_path = 'config/baselines.json'
# The context manager closes the file handle deterministically.
with open(config_path, 'r') as config_file:
    config = json.load(config_file)

# Scaling permeabilities: bounds taken from the config
LOWER_LIMIT = config['data']['lower_limit']
UPPER_LIMIT = config['data']['upper_limit']

# Saved index arrays: train/validation per cross-validation fold, plus one
# array for each held-out evaluation set.
train_index_, valid_index_ = {}, {}
for cv in range(3):
    train_index_[cv] = np.load(f'data/eval_index/train_index_cv{cv}.npy')
    valid_index_[cv] = np.load(f'data/eval_index/valid_index_cv{cv}.npy')
test_index  = np.load('data/eval_index/Test_index.npy')
caco2_index = np.load('data/eval_index/Caco2_index.npy')
mdck_index  = np.load('data/eval_index/MDCK_index.npy')
rrck_index  = np.load('data/eval_index/RRCK_index.npy')

#### Remove duplicates

In [43]:
# NOTE: this entire cell is commented out, so the output shown below it comes
# from an earlier run. When enabled, it builds the de-duplicated peptide table
# and writes it to 'desc/peptide_used.csv', the file the SVM & RF section reads.
# It relies on LOWER_LIMIT, UPPER_LIMIT and `config` from the Baselines setup cell.
# NOTE(review): the literals 30 and 7337 near the end are hard-coded - presumably
# rows per peptide in the 3D descriptor file and the peptide count; confirm
# before re-enabling.
# # Include duplicates (7451 peptides)
# df = pd.read_csv('desc/peptide_2D_all.csv', low_memory=False)
# df_ = {}
# label_list = []
# for assay in ['PAMPA', 'Caco2', 'MDCK', 'RRCK']:
#     df_[assay] = df[~df[assay].isna()].reset_index(drop=True)
#     print(f'{assay}: {len(df_[assay])}', end=" -> ")

#     # Remove peptides that are included in PAMPA
#     if assay != 'PAMPA':
#         tmp = []
#         for i in range(len(df_[assay])):
#             if df_[assay]['structurally_unique_ID'].to_list()[i] not in df_['PAMPA']['structurally_unique_ID'].to_list():
#                 tmp.append(i)
#         df_[assay] = df_[assay].iloc[tmp]
#         # print(f' {len(df_[assay])}', end="")

#     # IMPORTANT: Remove duplicates, using permeability from the latest publication
#     dup = df_[assay].duplicated('SMILES', keep='last')
#     df_[assay] = df_[assay][~dup]
#     df_[assay] = df_[assay].reset_index(drop=True)
#     print(f'{len(df_[assay])}')
#     label_list.append(df_[assay][assay].to_list())

# # 7337 peptides
# df = pd.concat([df_['PAMPA'], df_['Caco2'], df_['MDCK'], df_['RRCK']], axis=0).reset_index(drop=True)
# label_list = sum(label_list, [])

# # Scaling permeabilities
# print(f"LogPexp < LOWER_LIMIT({LOWER_LIMIT}) : {len([_ for _ in label_list if _ < LOWER_LIMIT])}")
# print(f"LogPexp > UPPER_LIMIT({UPPER_LIMIT}) : {len([_ for _ in label_list if _ > UPPER_LIMIT])}")
# label_list = np.clip(label_list, LOWER_LIMIT, UPPER_LIMIT)
# df['y'] = label_list

# # Information + 2D descriptors
# df_2D = df[['ID','Set','Year','ID_org','structurally_unique_ID','SMILES','HELM','Monomer_number','Monomer_number_in_main_chain','shape',\
#             'Objective_variable','PAMPA','Caco2','MDCK','RRCK','y'] + config['SVM']['desc_2D']].copy()

# # For SVM model, only use 3D descriptors calculated from single conformer
# df_3D = pd.read_csv('desc/peptide_3D_v1.csv', low_memory=False)
# df_3D = df_3D[config['SVM']['desc_3D']].iloc[[30*_ for _ in range(7337)]].reset_index(drop=True)

# df = pd.concat([df_2D, df_3D], axis=1)
# df.to_csv('desc/peptide_used.csv', index=False)

PAMPA: 6941 -> 6889
Caco2: 649 -> 378
MDCK: 40 -> 17
RRCK: 186 -> 53
LogPexp < LOWER_LIMIT(-8) : 318
LogPexp > UPPER_LIMIT(-4) : 1


#### SVM & RF model

In [5]:
# Peptide table with targets and descriptors
df = pd.read_csv('desc/peptide_used.csv', low_memory=False)
# Regression target as a NumPy array
y = df['y'].to_numpy()

# Z-score every 2D and 3D descriptor column.
# NOTE(review): the statistics are computed over all rows of `df`, not only
# the training folds - confirm this is intended.
desc = (
    df[config['SVM']['desc_2D'] + config['SVM']['desc_3D']]
    .copy()
    .apply(zscore)
)

# Morgan fingerprint (radius 2, 2048 bits) computed from the SMILES strings
fps = calculate_descriptors.calc_fingerprint(
    df['SMILES'].to_list(),
    radius=2,
    bit_num=2048,
)

100%|██████████| 7337/7337 [00:06<00:00, 1215.39it/s]


In [7]:
# Each baseline maps to a factory returning (estimator, feature matrix).
# Factories are lambdas so an estimator is only built when its turn comes.
builders = {
    'RF': lambda: (
        RandomForestRegressor(random_state=233, n_estimators=config['RF']['n_estimators'], max_depth=config['RF']['max_depth'], n_jobs=12),
        fps,
    ),
    'SVM-2D': lambda: (
        svm.SVR(C=config['SVM']['C-2D'], gamma=config['SVM']['gamma-2D']),
        desc[config['SVM']['desc_2D']].values,
    ),
    'SVM-2D3D': lambda: (
        svm.SVR(C=config['SVM']['C-2D3D'], gamma=config['SVM']['gamma-2D3D']),
        desc.values,
    ),
}

for model_name, build in builders.items():
    model, data = build()

    for cv in range(3):
        # Refit the same estimator object on this fold's training rows
        model.fit(data[train_index_[cv]], y[train_index_[cv]])

        # Only the validation indices change per fold; the other sets are fixed.
        eval_sets = {
            'Valid': valid_index_[cv],
            'Test': test_index,
            'Caco2': caco2_index,
            'MDCK': mdck_index,
            'RRCK': rrck_index,
        }

        metrics = []
        for set_name, index_now in eval_sets.items():
            exp = y[index_now]
            pred = model.predict(data[index_now])

            # Per-sample table (ID, exp, pred), only needed when saving predictions
            now_pred = pd.DataFrame([df.iloc[index_now]['ID']], index=['ID']).T
            now_pred['exp'] = exp
            now_pred['pred'] = pred
            # Can save predicted values
            # now_pred.to_csv(f'predicted/{model_name}/{set_name}_cv{cv}.csv')

            metrics.append([set_name, *model_utils.evaluate_model(exp, pred)])

        metrics = pd.DataFrame(metrics, columns=['Set', 'MAE', 'RMSE', 'R', 'MSE', 'R2'])
        metrics.to_csv(f'predicted/{model_name}/metrics_cv{cv}.csv', index=False)

In [9]:
def summrize_metrics(model_name, n_cv=3, root='predicted'):
    """Aggregate the per-fold metric tables of one model.

    Reads ``{root}/{model_name}/metrics_cv{k}.csv`` for ``k`` in
    ``range(n_cv)``, drops the first column (the set label) and computes the
    element-wise mean and population standard deviation across folds.

    Parameters
    ----------
    model_name : str
        Sub-directory of ``root`` holding the model's metric files.
    n_cv : int, default 3
        Number of cross-validation folds to read.
    root : str, default 'predicted'
        Directory that contains the per-model sub-directories.

    Returns
    -------
    means, stds : numpy.ndarray
        Two arrays of shape (n_sets, n_metrics), rounded to 3 decimals.

    Raises
    ------
    FileNotFoundError
        If a fold's metrics file is missing.
    ValueError
        If the fold tables do not all have the same shape.
    """
    per_fold = [
        pd.read_csv(f'{root}/{model_name}/metrics_cv{cv}.csv').iloc[:, 1:].to_numpy(dtype=float)
        for cv in range(n_cv)
    ]
    # np.stack refuses ragged input, so a truncated fold file fails loudly
    # instead of silently producing an object array.
    stacked = np.stack(per_fold)
    means = np.round(stacked.mean(axis=0), 3)
    stds = np.round(stacked.std(axis=0), 3)
    return means, stds


# Report mean ± std over the folds for every baseline and evaluation set.
set_list = ['Valid', 'Test', 'Caco2', 'MDCK', 'RRCK']
for model_name in ['RF', 'SVM-2D', 'SVM-2D3D']:
    print(f'Model: {model_name}')
    means, stds = summrize_metrics(model_name)
    for i, set_name in enumerate(set_list):
        mean_row, std_row = means[i], stds[i]
        print(f' - {set_name}:')
        # Column order after dropping 'Set': 0=MAE, 1=RMSE, 2=R, 3=MSE, 4=R2
        print(f'   - MAE: {mean_row[0]} ± {std_row[0]}, MSE: {mean_row[3]} ± {std_row[3]}, R: {mean_row[2]} ± {std_row[2]}, R2: {mean_row[4]} ± {std_row[4]}')
    print()

Model: RF
 - Valid:
   - MAE: 0.41 ± 0.01, MSE: 0.328 ± 0.025, R: 0.716 ± 0.022, R2: 0.511 ± 0.033
 - Test:
   - MAE: 0.485 ± 0.003, MSE: 0.38 ± 0.004, R: 0.815 ± 0.003, R2: 0.657 ± 0.003
 - Caco2:
   - MAE: 1.124 ± 0.006, MSE: 1.562 ± 0.013, R: 0.181 ± 0.002, R2: -4.125 ± 0.043
 - MDCK:
   - MAE: 0.913 ± 0.016, MSE: 1.094 ± 0.039, R: 0.283 ± 0.021, R2: -1.318 ± 0.083
 - RRCK:
   - MAE: 0.683 ± 0.026, MSE: 0.655 ± 0.042, R: -0.044 ± 0.045, R2: -1.259 ± 0.146

Model: SVM-2D
 - Valid:
   - MAE: 0.401 ± 0.012, MSE: 0.351 ± 0.02, R: 0.7 ± 0.009, R2: 0.477 ± 0.011
 - Test:
   - MAE: 0.488 ± 0.005, MSE: 0.449 ± 0.014, R: 0.781 ± 0.007, R2: 0.595 ± 0.012
 - Caco2:
   - MAE: 0.784 ± 0.007, MSE: 0.916 ± 0.007, R: 0.279 ± 0.016, R2: -2.005 ± 0.023
 - MDCK:
   - MAE: 0.706 ± 0.022, MSE: 0.774 ± 0.044, R: 0.377 ± 0.072, R2: -0.641 ± 0.094
 - RRCK:
   - MAE: 0.662 ± 0.007, MSE: 0.612 ± 0.017, R: 0.245 ± 0.006, R2: -1.111 ± 0.06

Model: SVM-2D3D
 - Valid:
   - MAE: 0.392 ± 0.007, MSE: 0.336 ± 0.015,

#### TODO: Other DL-based models