In [16]:
%matplotlib inline
%config InlineBackend.figure_formats={'png','retina'}
import numpy as np
np.set_printoptions(precision=2, suppress=True)
import pandas as pd
import time
import csv
import os
import json
""""""
import optuna
import torch
import torch.nn as nn
# from torchinfo import summary
""""""
from model import model_utils
""""""
# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(device_name)
# Report the library versions and the selected device next to the results.
print(f'Optuna version: {optuna.__version__}')
print(f'Torch version: {torch.__version__}')
print(f'Device: {DEVICE}')

Optuna version: 3.2.0
Torch version: 2.0.0
Device: cuda


In [None]:
# Development aid: re-execute the already-imported model_utils module so that edits
# to its source are picked up without restarting the kernel.
import importlib
importlib.reload(model_utils)

In [2]:
# Lower / upper limits for scaling permeabilities.
# NOTE(review): not referenced in the visible cells; presumably bounds on a
# log-scale permeability value — confirm against model_utils.
LOWER_LIMIT, UPPER_LIMIT = -8, -4

## CycPeptMP

In [3]:
# Load the CycPeptMP configuration (JSON); its 'model' entry is read later as the
# set of determined hyperparameters.
config_path = 'config/CycPeptMP.json'
# Open through a context manager so the file handle is closed as soon as parsing
# finishes instead of being left to the garbage collector. JSON is UTF-8 by
# specification, so decode explicitly rather than relying on the locale default.
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

In [37]:
# OPTIMIZE
# Model variant and number of augmentation replicas; both are embedded in the
# weight-file and prediction paths used further down.
MODEL_TYPE = 'Fusion'
REPLICA_NUM = 60

# Auxiliary loss: on/off switch and the two loss weights handed to prediction
USE_AUXILIARY = True
gamma_layer, gamma_subout = 0.05, 0.10

# Fix the random seed for reproducibility
seed = 2024
model_utils.set_seed(seed)

# Import one dataset per evaluation set, in the order given by set_list
# WARNING: conf matrix of atom model is huge
set_list = ['Test', 'Caco2', 'MDCK', 'RRCK']
dataset_list = [model_utils.load_dataset(MODEL_TYPE, REPLICA_NUM, name) for name in set_list]
# Keep the individual names available as well
dataset_test, dataset_caco2, dataset_mdck, dataset_rrck = dataset_list

# Determined hyperparameters
best_trial = config['model']

In [38]:
# Predict permeabilities with each of the three cross-validation models and write
# one CSV of replica-averaged predictions per (evaluation set, fold).

# DataFrame.to_csv does not create missing directories, so make sure the output
# directory exists before the first write.
pred_dir = f'predicted/{MODEL_TYPE}-{REPLICA_NUM}'
os.makedirs(pred_dir, exist_ok=True)

for cv in range(3):
    # Load trained weights.
    # map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on a
    # GPU can still be restored when DEVICE has fallen back to the CPU.
    # NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
    model_path = f'weight/{MODEL_TYPE}/{MODEL_TYPE}-{REPLICA_NUM}_cv{cv}.cpt'
    checkpoint = torch.load(model_path, map_location=DEVICE)
    model = model_utils.create_model(best_trial, DEVICE, USE_AUXILIARY)
    model_state = checkpoint['model_state_dict']
    model.load_state_dict(model_state)
    model = nn.DataParallel(model)
    model.to(DEVICE)
    # NOTE(review): this cell never calls model.eval(); confirm that
    # predict_valid(..., istrain=False) puts the model in evaluation mode.

    for set_name, dataset_now in zip(set_list, dataset_list):
        # shuffle=False keeps the sample order fixed across runs
        dataloader_now = torch.utils.data.DataLoader(dataset_now, batch_size=256, shuffle=False)
        ids, exps, preds = model_utils.predict_valid(DEVICE, model, dataloader_now, None, istrain=False,
                                                     use_auxiliary=USE_AUXILIARY, gamma_layer=gamma_layer, gamma_subout=gamma_subout)
        now_pred = pd.DataFrame(preds, columns=['pred'])
        now_pred['exp'] = exps
        now_pred['ID'] = ids

        # NOTE: Can save all predicted values of all replicas
        # now_pred.to_csv(f'{pred_dir}/{set_name}_cv{cv}_allrep.csv')

        # Take the average of all replicas: rows sharing an ID are collapsed to their
        # mean, and ID becomes the index (written as the first CSV column).
        now_pred = now_pred.groupby('ID').mean()
        now_pred.to_csv(f'{pred_dir}/{set_name}_cv{cv}.csv')

In [39]:
# Calculate evaluation metrics: for every fold, score each evaluation set from its
# saved prediction CSV and write one metrics table per fold.
for cv in range(3):
    rows = []
    for set_name in set_list:
        now_pred = pd.read_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/{set_name}_cv{cv}.csv')
        experimental = now_pred['exp'].to_list()
        predicted = now_pred['pred'].to_list()
        scores = model_utils.evaluate_model(experimental, predicted, round_num=5)
        # One row per set: the set name followed by the returned scores
        rows.append([set_name, *scores])
    metrics = pd.DataFrame(rows, columns=['Set', 'MAE', 'RMSE', 'R', 'MSE', 'R2'])
    metrics.to_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/metrics_cv{cv}.csv', index=False)

In [36]:
# Summarise the three cross-validation folds as mean ± standard deviation.
# Relies on MODEL_TYPE, REPLICA_NUM and set_list defined in the setup cell.

# Read each fold's table without its first column (the set name), leaving only
# the numeric metric columns.
metrics_cv0 = pd.read_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/metrics_cv0.csv').iloc[:, 1:]
metrics_cv1 = pd.read_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/metrics_cv1.csv').iloc[:, 1:]
metrics_cv2 = pd.read_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/metrics_cv2.csv').iloc[:, 1:]

# Stack the folds along a new leading axis and aggregate across it.
# The standard deviation uses NumPy's default (population, ddof=0).
metrics = np.array([metrics_cv0, metrics_cv1, metrics_cv2])
means = metrics.mean(axis=0).round(3)
stds = metrics.std(axis=0).round(3)

print(f'Model: {MODEL_TYPE}-{REPLICA_NUM}')
# Metric columns by position: 0 = MAE, 2 = R, 3 = MSE, 4 = R2 (RMSE at 1 is not printed)
for i, set_label in enumerate(set_list):
    mean_row, std_row = means[i], stds[i]
    print(f'{set_label}:')
    print(f'MAE: {mean_row[0]} ± {std_row[0]}, MSE: {mean_row[3]} ± {std_row[3]}, R: {mean_row[2]} ± {std_row[2]}, R2: {mean_row[4]} ± {std_row[4]}')

Model: Fusion-60
Test:
MAE: 0.355 ± 0.007, MSE: 0.253 ± 0.013, R: 0.883 ± 0.003, R2: 0.772 ± 0.011
Caco2:
MAE: 1.148 ± 0.113, MSE: 1.66 ± 0.312, R: 0.209 ± 0.064, R2: -4.429 ± 1.022
MDCK:
MAE: 0.821 ± 0.009, MSE: 0.911 ± 0.012, R: 0.57 ± 0.044, R2: -0.93 ± 0.025
RRCK:
MAE: 0.678 ± 0.041, MSE: 0.652 ± 0.083, R: -0.181 ± 0.027, R2: -1.725 ± 0.346
