In [1]:
from utils import seed_everything, LoadhERGDataset
from config import SEED_NO, NUM_FEATURES, NUM_GRAPHS_PER_BATCH, NUM_TARGET, EDGE_DIM, DEVICE, PATIENCE, EPOCHS, N_SPLITS, params_vertical_gnn
from engine import EnginehERG
from model import VerticalGNN

import torch
import numpy as np
import optuna
from sklearn.model_selection import KFold
from torch_geometric.loader import DataLoader
import os 

## Tuning the Model



In [8]:
def run_tuning(train_loader, valid_loader, params):
    """Fit a freshly built VerticalGNN with early stopping and report its best validation loss.

    This is the inner loop of the hyper-parameter search: nothing is written
    to disk, only the lowest validation loss seen is handed back.

    Args:
        train_loader: Loader passed to ``EnginehERG.train`` once per epoch.
        valid_loader: Loader passed to ``EnginehERG.validate`` once per epoch.
        params: Mapping providing ``num_gin_layers``, ``num_graph_trans_layers``,
            ``hidden_size``, ``n_heads``, ``dropout`` and ``learning_rate``.

    Returns:
        The smallest validation loss observed over the run (``np.inf`` if no
        epoch ever compared lower than the initial value).
    """
    network = VerticalGNN(
        num_features=NUM_FEATURES,
        num_targets=NUM_TARGET,
        num_gin_layers=params['num_gin_layers'],
        num_graph_trans_layers=params['num_graph_trans_layers'],
        hidden_size=params['hidden_size'],
        n_heads=params['n_heads'],
        dropout=params['dropout'],
        edge_dim=EDGE_DIM,
    )
    network.to(DEVICE)
    adam = torch.optim.Adam(network.parameters(), lr=params['learning_rate'])
    trainer = EnginehERG(network, adam, device=DEVICE)

    patience = PATIENCE
    lowest_valid_loss = np.inf
    stale_epochs = 0  # consecutive epochs without a new best validation loss

    for epoch_number in range(1, EPOCHS + 1):
        epoch_train_loss = trainer.train(train_loader)
        # validate() returns a tuple; its first element is used as the loss.
        epoch_valid_loss = trainer.validate(valid_loader)[0]
        print(f'Epoch: {epoch_number}/{EPOCHS}, train loss : {epoch_train_loss}, validation loss : {epoch_valid_loss}')

        if not (epoch_valid_loss < lowest_valid_loss):
            stale_epochs += 1
        else:
            lowest_valid_loss, stale_epochs = epoch_valid_loss, 0

        # Stop once the stale streak exceeds (not merely reaches) the patience.
        if stale_epochs > patience:
            print('Early stopping...')
            break
        print(f'Early stop counter: {stale_epochs}')

    return lowest_valid_loss


In [9]:
def objective(trial):
    """Optuna objective: mean best validation loss across the K-fold splits.

    Samples one hyper-parameter configuration from ``trial``, runs
    ``run_tuning`` on every fold produced by ``KFold(n_splits=N_SPLITS)`` and
    averages the per-fold best validation losses.

    Args:
        trial: Object exposing ``suggest_categorical(name, choices)``
            (an Optuna trial).

    Returns:
        The mean of the best validation losses over all evaluated folds.
    """
    params = {
        'num_gin_layers' : trial.suggest_categorical('num_gin_layers', [1, 2, 3]),
        'num_graph_trans_layers' : trial.suggest_categorical('num_graph_trans_layers', [1, 2, 3]),
        'hidden_size' : trial.suggest_categorical('hidden_size', [64, 128, 256]),
        'n_heads' : trial.suggest_categorical('n_heads', [1, 2, 3]),
        'dropout': trial.suggest_categorical('dropout', [0.1, 0.2, 0.3, 0.4]),
        'learning_rate' : trial.suggest_categorical('learning_rate', [1e-3, 3e-3, 5e-3, 7e-3, 9e-3])
    }

    # The dataset object is only used to let KFold derive the sample indices;
    # the individual graphs are read from the processed files below.
    dataset_for_cv = LoadhERGDataset(root='./data/graph_data/data_hERG_train/', raw_filename='data_hERG_train.csv')
    processed_dir = './data/graph_data/data_hERG_train/processed'
    kf = KFold(n_splits=N_SPLITS)
    fold_losses = []

    for fold_no, (train_idx, valid_idx) in enumerate(kf.split(dataset_for_cv)):
        print(f'Fold {fold_no}')
        # NOTE(review): torch.load unpickles the stored objects, so it must only
        # be pointed at trusted files.
        train_dataset = [torch.load(f'{processed_dir}/molecule_{t_idx}.pt') for t_idx in train_idx]
        valid_dataset = [torch.load(f'{processed_dir}/molecule_{v_idx}.pt') for v_idx in valid_idx]

        train_loader = DataLoader(train_dataset, batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=NUM_GRAPHS_PER_BATCH, shuffle=False)

        fold_losses.append(run_tuning(train_loader, valid_loader, params))

    # Average over the folds that were actually evaluated. The previous
    # hard-coded divisor of 10 was only correct when N_SPLITS happened to be 10.
    return sum(fold_losses) / len(fold_losses)
if __name__ == '__main__':
    # Hyper-parameter search: 20 Optuna trials minimising the value returned
    # by `objective`.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20)

    # Report the best trial's objective value(s) and its sampled parameters.
    print('best trial:')
    trial_ = study.best_trial
    print(trial_.values)
    print('Best parameters: {}'.format(trial_.params))

## Train/validate/test model



In [2]:
def run_training(train_loader, valid_loader, params, trained_model_path):
    """Train a VerticalGNN with early stopping, checkpointing every new best epoch.

    Whenever the validation loss improves on the best seen so far, the model's
    ``state_dict`` is written to ``trained_model_path`` (overwriting the
    previous checkpoint).

    Args:
        train_loader: Loader passed to ``EnginehERG.train`` once per epoch.
        valid_loader: Loader passed to ``EnginehERG.validate`` once per epoch.
        params: Mapping providing ``num_gin_layers``, ``num_graph_trans_layers``,
            ``hidden_size``, ``n_heads``, ``dropout`` and ``learning_rate``.
        trained_model_path: Destination file for the best ``state_dict``. May
            be a bare filename; missing parent directories are created.

    Returns:
        The lowest validation loss observed during training.
    """
    model = VerticalGNN(num_features=NUM_FEATURES, num_targets=NUM_TARGET, num_gin_layers=params['num_gin_layers'], num_graph_trans_layers=params['num_graph_trans_layers'],
                            hidden_size=params['hidden_size'], n_heads=params['n_heads'], dropout=params['dropout'], edge_dim=EDGE_DIM)
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
    eng = EnginehERG(model, optimizer, device=DEVICE)

    # Prepare the checkpoint directory once, up front. os.path.dirname() is ''
    # for a bare filename and os.makedirs('') raises FileNotFoundError, so only
    # create a directory when the path actually has one.
    checkpoint_dir = os.path.dirname(trained_model_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    best_loss = np.inf
    early_stopping_iter = PATIENCE
    early_stopping_counter = 0

    for epoch in range(EPOCHS):
        train_loss = eng.train(train_loader)
        # validate() returns a tuple; its first element is used as the loss.
        valid_loss = eng.validate(valid_loader)[0]
        print(f'Epoch: {epoch+1}/{EPOCHS}, train loss : {train_loss}, validation loss : {valid_loss}')
        if valid_loss < best_loss:
            best_loss = valid_loss
            early_stopping_counter = 0  # reset counter
            print('Saving model...')
            torch.save(model.state_dict(), trained_model_path)
        else:
            early_stopping_counter += 1

        # Stop once the streak of non-improving epochs exceeds the patience.
        if early_stopping_counter > early_stopping_iter:
            print('Early stopping...')
            break
        print(f'Early stop counter: {early_stopping_counter}')

    return best_loss

def run_validation(valid_loader, params, trained_model_path):
    """Evaluate a saved VerticalGNN checkpoint on the validation loader.

    Rebuilds the model from ``params``, restores the ``state_dict`` stored at
    ``trained_model_path`` and runs ``EnginehERG.validate`` once.

    Args:
        valid_loader: Loader passed to ``EnginehERG.validate``.
        params: Mapping providing ``num_gin_layers``, ``num_graph_trans_layers``,
            ``hidden_size``, ``n_heads``, ``dropout`` and ``learning_rate``.
        trained_model_path: File containing the saved ``state_dict``.

    Returns:
        The ``(mse, r2)`` pair returned by ``EnginehERG.validate``.
    """
    model = VerticalGNN(num_features=NUM_FEATURES, num_targets=NUM_TARGET, num_gin_layers=params['num_gin_layers'], num_graph_trans_layers=params['num_graph_trans_layers'],
                            hidden_size=params['hidden_size'], n_heads=params['n_heads'], dropout=params['dropout'], edge_dim=EDGE_DIM)
    # map_location remaps the stored tensors onto DEVICE, so a checkpoint
    # written from a GPU run can still be restored when DEVICE is the CPU.
    model.load_state_dict(torch.load(trained_model_path, map_location=DEVICE))
    model.to(DEVICE)
    # The engine is constructed with an optimizer, so one is built here even
    # though this function performs no training step itself.
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
    eng = EnginehERG(model, optimizer, device=DEVICE)
    mse, r2 = eng.validate(valid_loader)
    print(f"mse :{mse}, r2:{r2}")
    return mse, r2


def run_testing(test_loader, params, trained_model_path):
    """Evaluate a saved VerticalGNN checkpoint on the test loader.

    Rebuilds the model from ``params``, restores the ``state_dict`` stored at
    ``trained_model_path`` and runs ``EnginehERG.test`` once.

    Args:
        test_loader: Loader passed to ``EnginehERG.test``.
        params: Mapping providing ``num_gin_layers``, ``num_graph_trans_layers``,
            ``hidden_size``, ``n_heads``, ``dropout`` and ``learning_rate``.
        trained_model_path: File containing the saved ``state_dict``.

    Returns:
        The ``(mse, r2)`` pair returned by ``EnginehERG.test``.
    """
    model = VerticalGNN(num_features=NUM_FEATURES, num_targets=NUM_TARGET, num_gin_layers=params['num_gin_layers'], num_graph_trans_layers=params['num_graph_trans_layers'],
                            hidden_size=params['hidden_size'], n_heads=params['n_heads'], dropout=params['dropout'], edge_dim=EDGE_DIM)
    # map_location remaps the stored tensors onto DEVICE, so a checkpoint
    # written from a GPU run can still be restored when DEVICE is the CPU.
    model.load_state_dict(torch.load(trained_model_path, map_location=DEVICE))
    model.to(DEVICE)
    # The engine is constructed with an optimizer, so one is built here even
    # though this function performs no training step itself.
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
    eng = EnginehERG(model, optimizer, device=DEVICE)

    mse, r2 = eng.test(test_loader)
    print(f"mse :{mse}, r2:{r2}")
    return mse, r2


In [None]:
# Repeated K-fold cross-validation: for every fold, train on the training
# split, then score the saved checkpoint on that fold's validation split and
# on the held-out test set. Per-repeat and overall mean±std are printed.
params = params_vertical_gnn
n_repetitions = 1
train_data_root_path = './data/graph_data/data_hERG_train/'
train_data_raw_filename = 'data_hERG_train.csv'
test_data_root_path = './data/graph_data/data_hERG_test/'
test_data_raw_filename = 'data_hERG_test.csv'
path_to_save_trained_model = './trained_models/vertical/'

# Per-molecule graph files are read from <train root>/processed/. Deriving the
# directory from train_data_root_path keeps the loader below in sync with it
# (it used to repeat the path as a separate hard-coded literal).
train_processed_dir = os.path.join(train_data_root_path, 'processed')

# Scores accumulated over every fold of every repetition.
val_mse_list = []
val_r2_list = []

mse_list = []
r2_list = []


dataset_for_cv = LoadhERGDataset(root=train_data_root_path, raw_filename=train_data_raw_filename)
test_dataset = LoadhERGDataset(root=test_data_root_path, raw_filename=test_data_raw_filename)
kf = KFold(n_splits=N_SPLITS)

# The test loader does not depend on the fold, so build it once.
test_loader = DataLoader(test_dataset, batch_size=NUM_GRAPHS_PER_BATCH, shuffle=False)

for repeat in range(n_repetitions):
    # Scores for the folds of this repetition only.
    repeat_val_r2_list = []
    repeat_val_mse_list = []

    repeat_r2_list = []
    repeat_mse_list = []

    for fold_no, (train_idx, valid_idx) in enumerate(kf.split(dataset_for_cv)):
        # NOTE(review): every fold of every repetition is seeded with the same
        # SEED_NO and KFold is built without shuffling, so repetitions beyond
        # the first look like they would re-run the same experiment - confirm
        # this is intended before raising n_repetitions.
        seed_everything(SEED_NO)
        train_dataset = [
            torch.load(os.path.join(train_processed_dir, f'molecule_{t_idx}.pt'))
            for t_idx in train_idx
        ]
        valid_dataset = [
            torch.load(os.path.join(train_processed_dir, f'molecule_{v_idx}.pt'))
            for v_idx in valid_idx
        ]

        train_loader = DataLoader(train_dataset, batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=NUM_GRAPHS_PER_BATCH, shuffle=False)

        # One checkpoint per (repeat, fold); written by run_training and read
        # back by run_validation / run_testing.
        model_path = os.path.join(path_to_save_trained_model, f'vertical_repeat_{repeat}_fold_{fold_no}.pt')

        run_training(train_loader, valid_loader, params, model_path)
        val_mse, val_r2 = run_validation(valid_loader, params, model_path)
        mse, r2 = run_testing(test_loader, params, model_path)

        repeat_val_mse_list.append(val_mse)
        repeat_val_r2_list.append(val_r2)

        repeat_mse_list.append(mse)
        repeat_r2_list.append(r2)

        val_mse_list.append(val_mse)
        val_r2_list.append(val_r2)

        mse_list.append(mse)
        r2_list.append(r2)


    print(f'Statistics for repeat {repeat}:')
    print(f'Validation - mse: {np.mean(repeat_val_mse_list):.3f}±{np.std(repeat_val_mse_list):.3f}')
    print(f'Validation - r2: {np.mean(repeat_val_r2_list):.3f}±{np.std(repeat_val_r2_list):.3f}')

    print(f'test - mse: {np.mean(repeat_mse_list):.3f}±{np.std(repeat_mse_list):.3f}')
    print(f'test - r2: {np.mean(repeat_r2_list):.3f}±{np.std(repeat_r2_list):.3f}')

# Overall statistics across all repetitions and folds.
val_mse_arr = np.array(val_mse_list)
val_mse_mean= np.mean(val_mse_arr)
val_mse_sd = np.std(val_mse_arr)
print(f'validation mse:{val_mse_mean:.3f}±{val_mse_sd:.3f}')

val_r2_arr = np.array(val_r2_list)
val_mean_r2 = np.mean(val_r2_arr)
val_sd_r2 = np.std(val_r2_arr)
print(f'validation r2:{val_mean_r2:.3f}±{val_sd_r2:.3f}')


mse_arr = np.array(mse_list)
mse_mean= np.mean(mse_arr)
mse_sd = np.std(mse_arr)
print(f'mse:{mse_mean:.3f}±{mse_sd:.3f}')

r2_arr = np.array(r2_list)
mean_r2 = np.mean(r2_arr)
sd_r2 = np.std(r2_arr)
print(f'r2:{mean_r2:.3f}±{sd_r2:.3f}')

