In [2]:
import time
import h5py
import random
import torch
import numpy as np
import torch.nn.functional as F
#Setting path so that the local utils package can be imported
import sys
sys.path.append('../../')
from utils.utils import PDB, reform_dataset, embedding_multiprocess, save_embedding_dataset, batchify_dmatrix_computation_for_dmin

In [4]:
#Directory holding the '<entry>.pdb.gz' files. The loading code concatenates this
#string directly with the entry name, so it has to end with a path separator.
pdb_path = 'Here set the path to the directory of your pdb files'
#Entry lists for the training and validation splits.
#allow_pickle=True lets numpy unpickle object arrays: only load files you trust.
train_list, validation_list = (
    np.load(list_file, allow_pickle=True)
    for list_file in ('../data_lists/mlp_train_data.npy',
                      '../data_lists/mlp_validation_data.npy')
)

In [3]:
#Compute and save embeddings for your training dataset.
#For every entry: load the structure and its predicted water candidates, compute
#the embedding with labels, normalise it, and append it to the training HDF5 file.
radius = 4
pdb_list = train_list
prediction_dir = 'mlp_train_36_374'
#Width the progress line is padded to, so that a shorter line completely
#overwrites a longer one printed before it.
progress_width = 120
st = time.time()
#Start computing
for count, entry in enumerate(pdb_list, start=1):
    ct = time.time()
    #Load PDB files. pdb_path is concatenated as-is, so it must end with a separator.
    pdb = PDB(f'{pdb_path}{entry}.pdb.gz')
    #Load predicted water candidates.
    water_coords = np.load(f'./unet_prediction_waters/{prediction_dir}/{entry}_waters.npy')
    #Compute embedding
    pdb, emb = embedding_multiprocess(pdb, water_coords, 3000, threads=8, distance=radius, need_labels=True)
    #Normalize embedding in place: column 1 is divided by 4, columns 2-4 by pi.
    #NOTE(review): the divisor 4 presumably mirrors `radius` - confirm before changing either.
    emb[0][:,1] /= 4
    emb[0][:,2:5] /= np.pi
    #Keep pdb waters within radius
    #NOTE(review): the filtered pdb.water_coords is not among the arrays saved by this cell.
    d = batchify_dmatrix_computation_for_dmin(pdb.coords, pdb.water_coords)
    pdb.water_coords = pdb.water_coords[np.where(d<=radius)[0]]
    #Reform dataset
    emb_reformed, batch_unique_waters = reform_dataset(emb[0], 10)
    #Keep training labels for waters that have embedding
    training_labels = emb[1][batch_unique_waters]
    #Keep also the coordinates of the waters that have embedding.
    #NOTE(review): not saved by this cell either; only the validation cell stores it.
    embedded_water_coordinates = water_coords[batch_unique_waters]
    #Save_Data
    #Mode 'a' keeps whatever the file already contains.
    #NOTE(review): confirm how save_embedding_dataset treats entries that already exist on a re-run.
    with h5py.File(f'../mlp_dataset/embedding_{prediction_dir}_dataset.h5', 'a') as f:
        save_embedding_dataset(f, entry,
                               (emb_reformed, training_labels), range(2))
    progress = f'{count}/{len(pdb_list)}, {entry} finished in {round(time.time()-ct,2)} seconds. Total time {round((time.time()-st)/60,2)} minutes.'
    #The carriage return rewinds to the start of the line; padding wipes leftovers
    #of the previous message and flush makes the update visible immediately.
    print(progress.ljust(progress_width), end='\r', flush=True)
et = time.time()
#Terminate the progress line, otherwise the summary below is written on top of it
#and the two messages end up interleaved.
print()
print(f'Finished in {round(et-st,2)} seconds.')

Finished in 2907.84 seconds..25 seconds.            Total time 48.46 minutes..


In [4]:
#Compute and save embeddings for your validation dataset.
#For every entry: load the structure and its predicted water candidates, compute
#the embedding with labels, normalise it, and append it, together with the
#structure information, to the validation HDF5 file.
radius = 4
pdb_list = validation_list
prediction_dir = 'mlp_validation_36_374'
#Width the progress line is padded to, so that a shorter line completely
#overwrites a longer one printed before it.
progress_width = 120
st = time.time()
#Start computing
for count, entry in enumerate(pdb_list, start=1):
    ct = time.time()
    #Load PDB files. pdb_path is concatenated as-is, so it must end with a separator.
    pdb = PDB(f'{pdb_path}{entry}.pdb.gz')
    #Load predicted water candidates.
    water_coords = np.load(f'./unet_prediction_waters/{prediction_dir}/{entry}_waters.npy')
    #Compute embedding
    pdb, emb = embedding_multiprocess(pdb, water_coords, 3000, threads=8, distance=radius, need_labels=True)
    #Normalize embedding in place: column 1 is divided by 4, columns 2-4 by pi.
    #NOTE(review): the divisor 4 presumably mirrors `radius` - confirm before changing either.
    emb[0][:,1] /= 4
    emb[0][:,2:5] /= np.pi
    #Keep pdb waters within radius
    d = batchify_dmatrix_computation_for_dmin(pdb.coords, pdb.water_coords)
    pdb.water_coords = pdb.water_coords[np.where(d<=radius)[0]]
    #Reform dataset
    emb_reformed, batch_unique_waters = reform_dataset(emb[0], 10)
    #Keep training labels for waters that have embedding
    training_labels = emb[1][batch_unique_waters]
    #Keep also the coordinates of the waters that have embedding.
    embedded_water_coordinates = water_coords[batch_unique_waters]
    #Save_Data
    #Mode 'a' keeps whatever the file already contains.
    #NOTE(review): confirm how save_embedding_dataset treats entries that already exist on a re-run.
    with h5py.File(f'../mlp_dataset/embedding_{prediction_dir}_dataset.h5', 'a') as f:
        #Nine arrays are stored per entry, matched by position with range(9).
        save_embedding_dataset(f, entry,
                               (emb_reformed, embedded_water_coordinates, pdb.coords, pdb.atomname,
                                pdb.resname, pdb.resnum, pdb.water_coords, emb[2], training_labels), range(9))
    progress = f'{count}/{len(pdb_list)}, {entry} finished in {round(time.time()-ct,2)} seconds. Total time {round((time.time()-st)/60,2)} minutes.'
    #The carriage return rewinds to the start of the line; padding wipes leftovers
    #of the previous message and flush makes the update visible immediately.
    print(progress.ljust(progress_width), end='\r', flush=True)
et = time.time()
#Terminate the progress line, otherwise the summary below is written on top of it
#and the two messages end up interleaved.
print()
print(f'Finished in {round(et-st,2)} seconds.')

Finished in 605.28 seconds..18 seconds.            Total time 10.09 minutes.
