In [4]:
# merge datasets

import lmdb
import os
from tqdm import tqdm
import pickle
import sys
sys.path.append("/project/Pocket2mol")

def read_lmdb(lmdb_path, mode="direct"):
    """
    Read lmdb file.

    Args:
        lmdb_path (str): Path to the lmdb file.
        mode (str, optional): Read mode. "idx" to follow the idx order, "direct" to read the data directly (use when idx is not continuous).

    Returns:
        list: List of data read from the lmdb file.
    """
    env = lmdb.open(
        lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=256,
    )
    pocket_name_cnt = {}
    smi_cnt={}
    pocket_smi_pair_cnt={}
    txn = env.begin()
    keys = list(txn.cursor().iternext(values=False))
    data_all = []
    if mode == "idx":
        for idx in tqdm(range(len(keys)), desc="read lmdb {}".format(lmdb_path)):
            ky=f'{idx}'.encode()
            datapoint_pickled = txn.get(ky)
            data_piece = pickle.loads(datapoint_pickled)
            data_all.append(data_piece)
            smi=data_piece['smi']
            pocket_name=data_piece['pocket_name']
            pocket_smi_pair=(smi, pocket_name)
            smi_cnt[smi]=smi_cnt.get(smi, 0)+1
            pocket_name_cnt[pocket_name]=pocket_name_cnt.get(pocket_name, 0)+1
            pocket_smi_pair_cnt[pocket_smi_pair]=pocket_smi_pair_cnt.get(pocket_smi_pair, 0)+1
    elif mode == "direct":
        for key in tqdm(keys, desc="read lmdb {}".format(lmdb_path)):
            datapoint_pickled = txn.get(key)
            data_piece = pickle.loads(datapoint_pickled)
            data_all.append((int(key), data_piece))
    return data_all

def write_lmdb(data, lmdb_path):
    env = lmdb.open(
        lmdb_path, 
        subdir=False, 
        readonly=False, 
        lock=False, 
        readahead=False, 
        meminit=False, 
        max_readers=256, 
        map_size=int(10e9)
    )
    with env.begin(write=True, buffers=True) as txn:
        for i, d in tqdm(data):
            txn.put(
                key=f'{i:08d}'.encode(),
                value=pickle.dumps(d)
            )
    env.close()

# scripts of merge lmdbs

DUD_E_lmdb_path = "/data/DUD-E/DUD-E.lmdb"
# DUD_E_lmdb_path = "/data/lit_pcba/PCBA.lmdb"


input_train_valid_lmdb_path = "/data/CrossDocked/bfn_utils/CrossDocked_no_test.lmdb"
input_train_valid_index_path = "/data/CrossDocked/bfn_utils/id_split_files/CrossDocked-DUD_E_FLAPP_0.6_no_test.pt"
output_lmdb_path="/data/CrossDocked/bfn_utils/CrossDocked.lmdb"
output_id_split_path="/data/CrossDocked/bfn_utils/id_split_files/CrossDocked-DUD_E_FLAPP_0.6.pt"

# input_train_valid_lmdb_path = "/data/BioLip/bfn_utils/BioLip_no_test.lmdb"
# input_train_valid_index_path = "/data/BioLip/bfn_utils/id_split_files/BioLip-DUD_E_FLAPP_0.9_no_test.pt"
# output_lmdb_path="/data/BioLip/bfn_utils/BioLip.lmdb"
# output_id_split_path="/data/BioLip/bfn_utils/id_split_files/BioLip-DUD_E_FLAPP_0.9.pt"

trian_valid_data=read_lmdb(input_train_valid_lmdb_path)
DUD_E_data=read_lmdb(DUD_E_lmdb_path)

import pickle
import torch

# load index
train_valid_index=torch.load(input_train_valid_index_path)
index=train_valid_index
index['val']=index['test']
index['test']=[]

output_list=trian_valid_data
key_num=len(trian_valid_data)
for data_piece in DUD_E_data:
    output_list.append((key_num, data_piece[1]))
    index['test'].append(key_num)
    key_num+=1


torch.save(index, output_id_split_path)
write_lmdb(output_list, output_lmdb_path)


read lmdb /data/CrossDocked/bfn_utils/CrossDocked_no_test.lmdb:   1%|          | 1145/99981 [00:02<03:45, 438.17it/s]


KeyboardInterrupt: 

In [6]:
# read lmdb
path="/data/pdbbind_2020/targetdiff_utils/PDBBind.lmdb"
data=read_lmdb(path)

read lmdb /data/pdbbind_2020/targetdiff_utils/PDBBind.lmdb:   0%|          | 0/19379 [00:00<?, ?it/s]

read lmdb /data/pdbbind_2020/targetdiff_utils/PDBBind.lmdb:   6%|▌         | 1178/19379 [00:02<00:37, 482.31it/s]


KeyboardInterrupt: 

{'train': [1, 2, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 21, 23, 25, 26, 27, 28, 29, 30, 31, 34, 35, 37, 40, 43, 44, 48, 49, 51, 52, 54, 55, 56, 57, 58, 60, 63, 65, 68, 70, 71, 72, 74, 75, 76, 77, 78, 79, 81, 83, 84, 87, 89, 91, 93, 94, 97, 98, 100, 101, 102, 104, 105, 107, 108, 111, 114, 118, 121, 122, 123, 124, 125, 129, 130, 132, 134, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 155, 157, 159, 160, 161, 163, 165, 166, 167, 168, 170, 171, 173, 176, 179, 182, 183, 184, 186, 187, 190, 191, 192, 194, 195, 196, 198, 199, 200, 201, 204, 205, 208, 209, 210, 211, 212, 213, 215, 216, 223, 224, 226, 228, 229, 233, 234, 236, 240, 241, 242, 244, 245, 246, 251, 252, 253, 254, 255, 256, 258, 259, 260, 262, 264, 269, 271, 272, 273, 274, 276, 277, 279, 280, 283, 285, 288, 290, 292, 293, 294, 299, 301, 303, 309, 310, 311, 312, 313, 318, 319, 322, 323, 324, 325, 326, 327, 329, 331, 333, 334, 335, 337, 338, 339, 341, 342, 343, 344, 345, 347, 348, 350, 353, 354, 355, 359, 360, 364, 366,

  0%|          | 0/19499 [00:00<?, ?it/s]

100%|██████████| 19499/19499 [00:14<00:00, 1390.46it/s]


In [5]:
import pickle

output_id_split_path="/data/CrossDocked/bfn_utils/id_split_files/CrossDocked-DUD_E_FLAPP_0.6.pt"
index=torch.load(output_id_split_path)
print(index)

# data_new=read_lmdb(output_lmdb_path)

NameError: name 'torch' is not defined

In [13]:
data_new[19398]

(19398,
 (0,
  {'protein_element': tensor([ 7,  6,  6,  8,  6,  6,  6,  7,  6,  7,  7,  1,  1,  1,  1,  1,  1,  7,
            6,  6,  8,  6,  6,  6,  7,  6,  6,  8,  6,  6,  6,  6,  1,  7,  6,  6,
            8,  6,  6,  6,  8,  8,  1,  7,  6,  6,  8,  6,  1,  7,  6,  6,  8,  6,
            6,  6,  6,  1,  7,  6,  6,  8,  6,  6,  6,  6,  1,  7,  6,  6,  8,  6,
            6,  8,  8,  1,  7,  6,  6,  8,  6,  8,  6,  1,  1,  7,  6,  6,  8,  1,
            7,  6,  6,  8,  6,  1,  7,  6,  6,  8,  6,  6,  8,  8,  1,  7,  6,  6,
            8,  6,  6,  8,  8,  1,  7,  6,  6,  8,  6,  8,  6,  1,  1,  7,  6,  6,
            8,  6,  6,  6,  1,  7,  6,  6,  8,  6,  6,  6,  6,  1,  7,  6,  6,  8,
            6,  6,  6,  8,  8,  1,  7,  6,  6,  8,  6,  6,  6,  8,  8,  1,  7,  6,
            6,  8,  6,  6,  6,  6,  7,  1,  7,  6,  6,  8,  6,  6, 16,  6,  1,  7,
            6,  6,  8,  6,  6,  6,  6,  1,  7,  6,  6,  8,  1,  7,  6,  6,  8,  1,
            7,  6,  6,  8,  6,  6,  6,  6,  1,  7,  6, 