In [3]:
import torch
import numpy as np
import os
import glob
import pickle as pkl
import scipy.sparse as sparse
import copy
import shutil

from tensorboardX import SummaryWriter
from test_tube import Experiment
from rdkit import Chem
from rdkit.Chem import AllChem
from torch import nn
from torch.optim import Adam
from torch.utils.data import Subset

from CoordAE import CoordAE
from MSDScorer import MSDScorer
from KLDLoss import KLDLoss
from data_utils import CODDataset, BlockDataLoader
from test import test

In [5]:
# Sanity check: ask PyTorch whether a CUDA device is usable from this kernel.
torch.cuda.is_available()

True

In [4]:
# Seed the NumPy and PyTorch global RNGs from a single variable so that changing
# `seed` here actually changes both generators (previously the calls hard-coded 0).
seed = 0
np.random.seed(seed)
# Kept as the last expression so the cell still displays the returned Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x7fe85d757df0>

In [9]:
# handled in args parse for a py script version
#
# Notebook-wide configuration: model/training options, data locations and the
# output directories (checkpoints, tensorboard events) that later cells rely on.

# --- options forwarded to CoordAE / the training and evaluation code ---
n_max = 50
dim_node = 35
dim_edge = 10
nval = 3000   # number of examples held out for validation
ntst = 3000   # number of examples held out for testing
hidden_node_dim = 50
dim_f = 100
batch_size = 20
val_num_samples = 10
model_name = 'dl4chem'
savepermol = True
savepreddir = 'savepreddir'
use_val = True
mpnn_steps = 5
alignment_type = 'kabsch'
tol = 1e-5
use_X=False
use_R=True
# NOTE: this rebinds `seed`; the RNG-seeding cell uses its own value (0), while
# this one is the value handed to the CoordAE constructor.
seed=1334
refine_steps=0
refine_mom=0.99
debug = False
useFF = False
w_reg = 1e-5
log_train_steps=100


# --- data locations ---
data_dir = 'data/'
dataset = 'COD'
COD_molset_50_path = data_dir + 'COD_molset_50.p'
COD_molvec_50_path = data_dir + 'COD_molvec_50.p'

# create directories to store results

ckptdir = './checkpoints/'

# Train/valid tensorboard event directories live side by side under `eventdir`.
# The trailing '' component keeps the trailing path separator.
eventdir = './events/'
train_eventdir = os.path.join(eventdir, 'train', '')
valid_eventdir = os.path.join(eventdir, 'valid', '')

# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of `if not os.path.exists(...)`.
for result_dir in (ckptdir, train_eventdir, valid_eventdir):
    os.makedirs(result_dir, exist_ok=True)

save_path = os.path.join(ckptdir, model_name + '_model.ckpt')

molvec_fname = data_dir + dataset + '_molvec_'+str(n_max)+'.p'
molset_fname = data_dir + dataset + '_molset_'+str(n_max)+'.p'

In [12]:
# load data
#
# Each array is stored in its own pickle named
# <data_dir><dataset>_<kind>_<n_max>.p (nodes, masks, edges, dist_mats, positions).
# NOTE(security): unpickling executes arbitrary code embedded in the file, so
# only load pickles that come from a trusted source.

def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    with open(path, 'rb') as pickle_file:
        return pkl.load(pickle_file)


nodes_fname = data_dir + dataset + '_nodes_'+str(n_max)+'.p'
D1 = _load_pickle(nodes_fname)

masks_fname = data_dir + dataset + '_masks_'+str(n_max)+'.p'
D2 = _load_pickle(masks_fname)

edges_fname = data_dir + dataset + '_edges_'+str(n_max)+'.p'
D3 = _load_pickle(edges_fname)

dist_mats_fname = data_dir + dataset + '_dist_mats_'+str(n_max)+'.p'
D4 = _load_pickle(dist_mats_fname)

positions_fname = data_dir + dataset + '_positions_'+str(n_max)+'.p'
D5 = _load_pickle(positions_fname)

#[D1, D2, D3, D4, D5] = pkl.load(open(molvec_fname,'rb'))

In [13]:
# Split the loaded arrays into train / validation / test partitions.
#
# D1..D3 (nodes, masks, edges) are converted with .todense() first; presumably
# they are stored in a sparse container -- TODO confirm the concrete type.
# Layout along the first axis: [ train (ntrn) | validation (nval) | test (ntst) ].
D1 = D1.todense()
D2 = D2.todense()
D3 = D3.todense()

ntrn = len(D5)-nval-ntst
if ntrn <= 0:
    # Without this guard a too-small dataset would silently produce
    # empty or overlapping splits through negative slice bounds.
    raise ValueError(
        'dataset has {} examples, which is not more than nval + ntst = {}'.format(
            len(D5), nval + ntst))

# Close the pickle promptly instead of leaking the file handle.
# NOTE(security): only unpickle files from a trusted source.
with open(molset_fname, 'rb') as molset_file:
    [molsup, molsmi] = pkl.load(molset_file)

trn_rows = slice(None, ntrn)
val_rows = slice(ntrn, ntrn + nval)
tst_rows = slice(ntrn + nval, ntrn + nval + ntst)

# The same row ranges are applied to every array and to the molecule list so
# that the partitions stay aligned with each other.
full_arrays = (D1, D2, D3, D4, D5, molsup)
D1_trn, D2_trn, D3_trn, D4_trn, D5_trn, molsup_trn = (a[trn_rows] for a in full_arrays)
D1_val, D2_val, D3_val, D4_val, D5_val, molsup_val = (a[val_rows] for a in full_arrays)
D1_tst, D2_tst, D3_tst, D4_tst, D5_tst, molsup_tst = (a[tst_rows] for a in full_arrays)
print('::: num train samples is ')
print(D1_trn.shape, D3_trn.shape)

tm_trn, tm_val, tm_tst = None, None, None

# Drop the references to the unsplit data (including the helper tuple, which
# would otherwise keep them alive).
del full_arrays
del D1, D2, D3, D4, D5, molsup

if savepermol:
    # Per-molecule predictions go to <savepreddir>/<dataset>/_val_ or _test_.
    # Only append the sub-directory if it is not already there, so re-running
    # this cell does not nest the path deeper each time.
    pred_subdir = os.path.join(dataset, "_val_" if use_val else "_test_")
    if not os.path.normpath(savepreddir).endswith(pred_subdir):
        savepreddir = os.path.join(savepreddir, pred_subdir)
    os.makedirs(savepreddir, exist_ok=True)

::: num train samples is 
(60663, 50, 35) (60663, 50, 1)


In [14]:
# Build one CODDataset per split from that split's five aligned arrays
# (D1..D5, in the order they were loaded), constructing train, then val, then test.
train_dataset, val_dataset, test_dataset = [
    CODDataset(*split_arrays)
    for split_arrays in (
        (D1_trn, D2_trn, D3_trn, D4_trn, D5_trn),
        (D1_val, D2_val, D3_val, D4_val, D5_val),
        (D1_tst, D2_tst, D3_tst, D4_tst, D5_tst),
    )
]

In [15]:
# Restrict every split to its first 100 examples.
# Each name is rebound to a Subset wrapping the dataset it previously held.
first_hundred = range(100)
train_dataset = Subset(train_dataset, first_hundred)
val_dataset = Subset(val_dataset, first_hundred)
test_dataset = Subset(test_dataset, first_hundred)

In [16]:
# Loaders for the three splits; all use the same block size.
common_loader_kwargs = {'block_size': 40}

train_dataloader = BlockDataLoader(train_dataset, batch_size, **common_loader_kwargs)

# The validation loader uses a reduced batch size: batch_size divided by the
# number of samples per validation example. This rebinds val_num_samples to 2.
val_num_samples = 2
val_batch_size = batch_size // val_num_samples
val_dataloader = BlockDataLoader(val_dataset, val_batch_size, **common_loader_kwargs)

test_dataloader = BlockDataLoader(test_dataset, batch_size, **common_loader_kwargs)

# train_dataloader = BlockDataLoader(train_dataset, batch_size)
# val_dataloader = BlockDataLoader(val_dataset, batch_size)
# test_dataloader = BlockDataLoader(test_dataset, batch_size)

In [24]:
# Inspect the shape of the training slice of D3 (loaded from the *_edges_* pickle).
# NOTE(review): the recorded shape has a trailing dimension of 1 while dim_edge is
# configured as 10 -- possibly related to the IndexError raised on the first
# training batch; TODO confirm the expected edge-tensor layout.
D3_trn.shape

(60663, 50, 1)

In [17]:
np.set_printoptions(precision=5, suppress=True)

# training step
#
# Builds the CoordAE model and trains it for `num_epochs` epochs, evaluating on
# the validation loader after every epoch. The assignments below stand in for
# the arguments of the script version's train() function.

# NOTE(review): this rebinds save_path (set to the checkpoint file in the config
# cell) to the string 'savepreddir'; it is only used by the commented-out
# checkpointing code below. TODO confirm the intended value.
save_path = 'savepreddir'
train_event_path = None   # None -> SummaryWriter picks its default log directory
valid_event_path = None
log_train_steps=100
tm_trn=None
tm_val=None
w_reg=1e-3                # weight of the prior-vs-N(0,1) KL term in the objective
debug=False
exp=None # Experiment

model = CoordAE(n_max, dim_node, dim_edge, hidden_node_dim, dim_f, batch_size, \
                    mpnn_steps=mpnn_steps, alignment_type=alignment_type, tol=tol,\
                    use_X=use_X, use_R=use_R, seed=seed, \
                    refine_steps=refine_steps, refine_mom=refine_mom)

kldloss = KLDLoss()
optimizer = Adam(model.parameters(), lr=3e-4)
msd_scorer = MSDScorer('default')

if exp is not None:
    data_path = exp.get_data_path(exp.name, exp.version)
    save_path = os.path.join(data_path, 'checkpoints/model.ckpt')
    event_path = os.path.join(data_path, 'event/')
    print(save_path, flush=True)
    print(event_path, flush=True)

if not debug:
    train_summary_writer = SummaryWriter(train_event_path)
    valid_summary_writer = SummaryWriter(valid_event_path)

# training
print('::: start training')
num_epochs = 3
valaggr_mean = np.zeros(num_epochs)
valaggr_std = np.zeros(num_epochs)

for epoch in range(num_epochs):

    # Re-enter training mode every epoch: test() is defined elsewhere and may
    # leave the model in eval mode after the previous epoch's validation.
    model.train()

    n_batch = len(train_dataloader)
    # One row per batch: [cost_op, cost_X, cost_KLDZ, cost_KLD0].
    trnscores = np.zeros((n_batch, 4))

    for batch_idx, batch in enumerate(train_dataloader) :

        optimizer.zero_grad()

        print('Train batch number' + str(batch_idx))

        # batch to be created
        nodes, masks, edges, proximity, pos = batch
        masks = masks.unsqueeze(-1) # because dataloader squeezes the mask Tensor

        postZ_mu, postZ_lsgms, priorZ_mu, priorZ_lsgms, X_pred, PX_pred = model(nodes, masks, edges, proximity, pos)

        cost_KLDZ = torch.mean(torch.sum(kldloss.loss(masks, postZ_mu, postZ_lsgms,  priorZ_mu, priorZ_lsgms), (1, 2))) # posterior | prior
        cost_KLD0 = torch.mean(torch.sum(kldloss.loss(masks, priorZ_mu, priorZ_lsgms), (1, 2))) # prior | N(0,1)

        cost_X = torch.mean(msd_scorer.score(X_pred, pos, masks))

        cost_op = cost_X + cost_KLDZ + w_reg * cost_KLD0
        # cost_op is a cost (coordinate deviation + KL terms) and must be
        # minimised; the previous `loss = -cost_op` made the optimizer maximise it.
        # NOTE(review): assumes msd_scorer.score and kldloss.loss return
        # lower-is-better quantities -- confirm against their definitions.
        loss = cost_op

        # Plain Python floats: works for CPU and CUDA tensors alike and keeps
        # autograd history out of the logged values.
        trnresult = np.array([cost_op.item(), cost_X.item(), cost_KLDZ.item(), cost_KLD0.item()])

        if debug:
            print(batch_idx, n_batch)
            print(trnresult, flush=True)

        # log results
        curr_iter = epoch * n_batch + batch_idx

        if not debug and curr_iter % log_train_steps == 0:
            train_summary_writer.add_scalar("train/cost_op", trnresult[0], curr_iter)
            train_summary_writer.add_scalar("train/cost_X", trnresult[1], curr_iter)
            train_summary_writer.add_scalar("train/cost_KLDZ", trnresult[2], curr_iter)
            train_summary_writer.add_scalar("train/cost_KLD0", trnresult[3], curr_iter)

        # Stop before the update step if any cost term became NaN.
        assert not np.isnan(trnresult).any(), 'NaN in training costs: {}'.format(trnresult)
        trnscores[batch_idx,:] = trnresult

        loss.backward()
        optimizer.step()

    print(np.mean(trnscores,0), flush=True)

    exp_dict = {}
    if exp is not None:
        exp_dict['training epoch id'] = epoch
        exp_dict['train_score'] = np.mean(trnscores,0)

    valscores_mean, valscores_std = test(model, val_dataloader, molsup_val, val_num_samples, debug=debug)

    valaggr_mean[epoch] = valscores_mean
    valaggr_std[epoch] = valscores_std

    # Running minima over the epochs seen so far. Note that best_val_std is the
    # smallest std over epochs, not necessarily the std of the best-mean epoch.
    best_val_mean = np.min(valaggr_mean[0:epoch+1])
    best_val_std = np.min(valaggr_std[0:epoch+1])

    if not debug:
        valid_summary_writer.add_scalar("val/valscores_mean", valscores_mean, epoch)
        valid_summary_writer.add_scalar("val/min_valscores_mean", best_val_mean, epoch)
        valid_summary_writer.add_scalar("val/valscores_std", valscores_std, epoch)
        valid_summary_writer.add_scalar("val/min_valscores_std", best_val_std, epoch)

    print ('::: training epoch id {} :: --- val mean={} , std={} ; --- best val mean={} , std={} '.format(\
            epoch, valscores_mean, valscores_std, best_val_mean, best_val_std))

    if exp is not None:
        exp_dict['val mean'] = valscores_mean
        exp_dict['std'] = valscores_std
        exp_dict['best val mean'] = best_val_mean
        exp_dict['std of best val mean'] = best_val_std
        exp.log(exp_dict)
        exp.save()



#     # keep track of the best model as well in the separate checkpoint
#     # it is done by copying the checkpoint
#     if valaggr_mean[epoch] == np.min(valaggr_mean[0:epoch+1]) and not debug:
#         for ckpt_f in glob.glob(save_path + '*'):
#             model_name_split = ckpt_f.split('/')
#             model_path = '/'.join(model_name_split[:-1])
#             model_name = model_name_split[-1]
#             best_model_name = model_name.split('.')[0] + '_best.' + '.'.join(model_name.split('.')[1:])
#             full_best_model_path = os.path.join(model_path, best_model_name)
#             full_model_path = ckpt_f
#             shutil.copyfile(full_model_path, full_best_model_path)

# Flush pending events to disk and release the writers' file handles.
if not debug:
    train_summary_writer.close()
    valid_summary_writer.close()

::: start training
Train batch number0


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 3)

# NOTE(review): this looks like a leftover from the .py script version. `args` is
# not defined anywhere in the visible notebook, so the call raises NameError as
# written. Notebook-level counterparts exist for debug, savepreddir, savepermol
# and useFF (configuration cell); no `loaddir` counterpart is visible.
# TODO: confirm that CoordAE exposes a `test` method with this signature before
# wiring it up (the training cell evaluates via the imported `test` function).
model.test(D1_tst, D2_tst, D3_tst, D4_tst, D5_tst, molsup_tst, \
                    load_path=args.loaddir, tm_v=tm_tst, debug=args.debug, \
                    savepred_path=args.savepreddir, savepermol=args.savepermol, useFF=args.useFF)