In [4]:
import lmdb
import os
from tqdm import tqdm
import pickle
def read_lmdb(lmdb_path, mode="direct"):
    """
    Read every pickled record stored in a single-file LMDB database.

    Args:
        lmdb_path (str): Path to the lmdb file (opened with ``subdir=False``).
        mode (str, optional): Read mode.
            "idx": fetch the records stored under the keys "0", "1", ...,
                "N-1" (N = number of keys in the database) and additionally
                count the ``smi`` / ``pocket_name`` values of each record.
            "direct": fetch the records in the order the cursor yields the
                keys (use when idx is not continuous). Nothing is counted.

    Returns:
        tuple: ``(data_all, smi_cnt, pocket_name_cnt, pocket_smi_pair_cnt)``.
            ``data_all`` is the list of unpickled records. The three dicts map
            a SMILES string, a pocket name and a ``(smi, pocket_name)`` pair
            to their number of occurrences; they are only filled in "idx"
            mode and stay empty in "direct" mode.

    Raises:
        ValueError: If ``mode`` is neither "idx" nor "direct".
        KeyError: In "idx" mode, if one of the expected integer keys is absent.
    """
    # Fail loudly instead of silently returning empty results for a typo.
    if mode not in ("idx", "direct"):
        raise ValueError(
            "unknown mode {!r}; expected 'idx' or 'direct'".format(mode)
        )

    env = lmdb.open(
        lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=256,
    )
    pocket_name_cnt = {}
    smi_cnt = {}
    pocket_smi_pair_cnt = {}
    data_all = []
    desc = "read lmdb {}".format(lmdb_path)
    try:
        # The transaction is released when the ``with`` block exits.
        with env.begin() as txn:
            keys = list(txn.cursor().iternext(values=False))
            if mode == "idx":
                for idx in tqdm(range(len(keys)), desc=desc):
                    ky = f'{idx}'.encode()
                    datapoint_pickled = txn.get(ky)
                    if datapoint_pickled is None:
                        # Without this check pickle.loads(None) would raise an
                        # opaque TypeError.
                        raise KeyError(
                            "key {!r} not found in {}; use mode='direct' when "
                            "the indices are not continuous".format(ky, lmdb_path)
                        )
                    # NOTE: pickle.loads can execute arbitrary code; only read
                    # lmdb files that come from a trusted source.
                    data_piece = pickle.loads(datapoint_pickled)
                    data_all.append(data_piece)
                    smi = data_piece['smi']
                    pocket_name = data_piece['pocket_name']
                    pocket_smi_pair = (smi, pocket_name)
                    smi_cnt[smi] = smi_cnt.get(smi, 0) + 1
                    pocket_name_cnt[pocket_name] = pocket_name_cnt.get(pocket_name, 0) + 1
                    pocket_smi_pair_cnt[pocket_smi_pair] = pocket_smi_pair_cnt.get(pocket_smi_pair, 0) + 1
            else:
                for key in tqdm(keys, desc=desc):
                    # NOTE: same pickle caveat as above (trusted files only).
                    data_all.append(pickle.loads(txn.get(key)))
    finally:
        # Always release the environment handle, even when reading fails.
        env.close()
    return data_all, smi_cnt, pocket_name_cnt, pocket_smi_pair_cnt

# Load all BioLip records; only the record list is kept, the three count
# dicts returned alongside it are discarded.
lmdb_path = "/data/BioLip/pocket2mol_utils/BioLip.lmdb"
data, *_ = read_lmdb(lmdb_path=lmdb_path, mode="direct")

read lmdb /data/BioLip/pocket2mol_utils/BioLip.lmdb: 100%|██████████| 50153/50153 [00:42<00:00, 1182.37it/s]


In [2]:
# Show which fields the first loaded record carries.
data[0].keys()

dict_keys(['protein_element', 'protein_pos', 'protein_is_backbone', 'protein_atom_name', 'protein_atom_to_aa_type', 'ligand_smiles', 'ligand_element', 'ligand_pos', 'ligand_bond_index', 'ligand_bond_type', 'ligand_center_of_mass', 'ligand_atom_feature', 'ligand_hybridization', 'ligand_nbh_list', 'protein_filename', 'ligand_filename'])

In [3]:
# Display the 'ligand_atom_feature' entry of the first loaded record.
data[0]['ligand_atom_feature']

tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0,

In [None]:
import glob
# Count the .sdf files sitting one directory level below the CrossDocked root.
l = glob.glob("/data/crossdocked_pocket10/*/*.sdf")
print(len(l))

183468


In [3]:
# check for elements
# For every distinct value in 'ligand_element', count how many records
# contain it at least once (presumably atomic numbers -- TODO confirm).
elements_cnt = {}
for item in data:
    elements = {x.item() for x in item['ligand_element']}
    for ele in elements:
        elements_cnt[ele] = elements_cnt.get(ele, 0) + 1
elements_cnt

{8: 94, 16: 25, 6: 101, 7: 96, 9: 25, 17: 14, 15: 3, 35: 1}

In [5]:
# check for elements
# Tally, per distinct 'ligand_element' value, the number of records in `data`
# in which that value occurs (presumably atomic numbers -- TODO confirm).
elements_cnt = {}
for item in data:
    elements = {x.item() for x in item['ligand_element']}
    for ele in elements:
        elements_cnt[ele] = elements_cnt.get(ele, 0) + 1
elements_cnt

{6: 50130,
 7: 45367,
 8: 47934,
 9: 4729,
 16: 10468,
 15: 19046,
 17: 4050,
 35: 932}

In [None]:
import os
import sys
from tqdm import tqdm
from numpy import source
sys.path.append("/project/ProFSA")
from scripts.benchmark.dataset import CrossDockedDataset

import shutil

FLAPP_dir = "/data/rag/FLAPP/pockets"
# Make sure the destination exists so that the copies below cannot all fail
# just because the folder is missing.
os.makedirs(FLAPP_dir, exist_ok=True)

crossdocked_dataset = CrossDockedDataset()
# (source_path, error) pairs for every file that could not be copied.
failed_copies = []
for item in tqdm(crossdocked_dataset.get_items()):
    # cp 6A pocket to FLAPP
    dest_name = item['name'] + ".pdb"
    dest_path = os.path.join(FLAPP_dir, dest_name)
    source_path = item['pocket6A_dir']
    try:
        # shutil.copy avoids building a shell command: paths containing
        # spaces or shell metacharacters are handled correctly and no
        # subprocess is spawned per file.
        shutil.copy(source_path, dest_path)
    except OSError as err:
        # Keep going (the previous `cp` call was best-effort too) but remember
        # the failure instead of silently dropping it.
        failed_copies.append((source_path, err))

if failed_copies:
    print(f"{len(failed_copies)} pocket file(s) could not be copied to {FLAPP_dir}")
    


100%|██████████| 166326/166326 [10:14<00:00, 270.83it/s]


In [1]:
import os
import sys
from tqdm import tqdm
from numpy import source
sys.path.append("/project/ProFSA")
from scripts.benchmark.dataset import CrossDockedDataset, DUDEDataset

# Instantiate the DUD-E dataset wrapper and ask it to generate its
# targetdiff index pickle (presumably written to the given path -- TODO confirm).
dataset = DUDEDataset()

dataset.generate_targetdiff_index_pkl("/data/DUD-E/DUD-E.pkl")
        

    

In [7]:
import sys

sys.path.append("/project/ProFSA")
from scripts.benchmark.dataset import BioLipDataset
from tqdm import tqdm
from rdkit import Chem

def multi_frag(ligand_dir):
    """
    Tell whether the molecule stored in a mol file has more than one fragment.

    Args:
        ligand_dir (str): Path handed to ``Chem.MolFromMolFile``.

    Returns:
        bool or None: ``None`` when RDKit returns no molecule for the file,
        otherwise ``True`` if ``Chem.GetMolFrags`` reports more than one
        fragment and ``False`` if it reports at most one.
    """
    molecule = Chem.MolFromMolFile(ligand_dir)
    # A None molecule means the file could not be turned into a Mol object.
    return None if molecule is None else len(Chem.GetMolFrags(molecule)) > 1

# cnt: ligands made of more than one fragment.
# invalid: ligands for which multi_frag returned None.
cnt = 0
invalid = 0
dataset = BioLipDataset()
for item in tqdm(dataset.get_items()):
    res = multi_frag(item['ligand_dir'])
    if res is None:
        invalid += 1
        continue
    if res:
        cnt += 1
print(cnt)

  0%|          | 0/52961 [00:00<?, ?it/s][14:23:49] Explicit valence for atom # 5 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 13 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 15 N, 4, is greater than permitted
[14:23:49] Explicit valence for atom # 20 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 13 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[14:23:49] Explicit valence for atom # 12 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 25 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 20 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 9 N, 4, is greater than permitted
[14:23:49] Explicit valence for atom # 21 N, 4, is greater than permitted
[14:23:49] Explicit valence for atom # 10 C, 5, is greater than permitted
[14:23:49] Explicit valence for atom # 1 N, 4, is greater than permitted
[

55





In [5]:
import torch
import easydict
# List the name and shape of every entry stored under the checkpoint's
# 'model' key.
ckpt_file = "/data/pocket2mol_data/ckpts/tmp.pt"
# map_location="cpu": only the shapes are inspected, so the tensors are mapped
# to CPU and the checkpoint loads even without the device it was saved from.
ckpt = torch.load(ckpt_file, map_location="cpu")['model']
for name, param in ckpt.items():
    print(name, param.size())

protein_atom_emb.emb_sca.weight torch.Size([256, 27])
protein_atom_emb.emb_sca.bias torch.Size([256])
protein_atom_emb.emb_vec.weight torch.Size([64, 1])
protein_atom_emb.emb_vec.bias torch.Size([64])
ligand_atom_emb.emb_sca.weight torch.Size([256, 13])
ligand_atom_emb.emb_sca.bias torch.Size([256])
ligand_atom_emb.emb_vec.weight torch.Size([64, 1])
ligand_atom_emb.emb_vec.bias torch.Size([64])
encoder.interactions.0.distance_expansion.offset torch.Size([60])
encoder.interactions.0.vector_expansion.nn.weight torch.Size([64, 1])
encoder.interactions.0.message_module.node_gvlinear.lin_vector.map_to_feat.weight torch.Size([64, 64])
encoder.interactions.0.message_module.node_gvlinear.lin_vector2.map_to_feat.weight torch.Size([64, 64])
encoder.interactions.0.message_module.node_gvlinear.scalar_to_vector_gates.weight torch.Size([64, 256])
encoder.interactions.0.message_module.node_gvlinear.scalar_to_vector_gates.bias torch.Size([64])
encoder.interactions.0.message_module.node_gvlinear.lin_sc

In [13]:
import torch
import torch.nn.functional as F
# One-hot encode the scalar class index 6 over 7 classes and show the shape.
a = F.one_hot(torch.tensor(6), num_classes=7)
print(a.shape)

torch.Size([7])


In [6]:
import torch
import easydict
# List the name and shape of every entry stored under the checkpoint's
# 'model' key.
ckpt_file = "/data/pocket2mol_data/ckpts/empty.pt"
# map_location="cpu": only the shapes are inspected, so the tensors are mapped
# to CPU and the checkpoint loads even without the device it was saved from.
ckpt = torch.load(ckpt_file, map_location="cpu")['model']
for name, param in ckpt.items():
    print(name, param.size())

protein_atom_emb.emb_sca.weight torch.Size([256, 27])
protein_atom_emb.emb_sca.bias torch.Size([256])
protein_atom_emb.emb_vec.weight torch.Size([64, 1])
protein_atom_emb.emb_vec.bias torch.Size([64])
ligand_atom_emb.emb_sca.weight torch.Size([256, 14])
ligand_atom_emb.emb_sca.bias torch.Size([256])
ligand_atom_emb.emb_vec.weight torch.Size([64, 1])
ligand_atom_emb.emb_vec.bias torch.Size([64])
encoder.interactions.0.distance_expansion.offset torch.Size([60])
encoder.interactions.0.vector_expansion.nn.weight torch.Size([64, 1])
encoder.interactions.0.message_module.node_gvlinear.lin_vector.map_to_feat.weight torch.Size([64, 64])
encoder.interactions.0.message_module.node_gvlinear.lin_vector2.map_to_feat.weight torch.Size([64, 64])
encoder.interactions.0.message_module.node_gvlinear.scalar_to_vector_gates.weight torch.Size([64, 256])
encoder.interactions.0.message_module.node_gvlinear.scalar_to_vector_gates.bias torch.Size([64])
encoder.interactions.0.message_module.node_gvlinear.lin_sc