In [23]:
import pandas as pd
import os
import pickle
import copy

from rdkit import Chem
from ConformationEnsemble import ConformationEnsembleLibrary
from ConfEnsembleDataset import ConfEnsembleDataset
from ccdc.conformer import ConformerGenerator
from ccdc.molecule import Molecule
from tqdm import tqdm

In [None]:
# To be able to save conformer properties:
# ask RDKit to include all properties when molecules are pickled, so that the
# properties set in this notebook (e.g. 'PDB_ID') are written by pickle.dump.
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps) 

In [2]:
# Directory in which this notebook writes its pickled outputs (relative path)
data_dir_path = 'data/'
# Directories of the PDBbind v2020 sets read by this notebook (relative paths);
# each is expected to hold one sub-directory per PDB code.
pdbbind_refined_dir_path = '../PDBBind/PDBbind_v2020_refined/refined-set/'
pdbbind_general_dir_path = '../PDBBind/PDBbind_v2020_other_PL/v2020-other-PL/'

In [3]:
# Fixed-width layout of the PDBbind protein-ligand index file: one width and
# one column name per field.
widths = [6, 6, 7, 6, 17, 9, 200]
cols = ['PDB code', 'resolution', 'release year', '-logKd/Ki', 'Kd/Ki', 'reference', 'ligand name']
index_path = os.path.join(pdbbind_refined_dir_path, 'index', 'INDEX_general_PL_data.2020')
# The first 6 lines of the file are skipped and no line is used as a header;
# the column names are supplied explicitly.
pl_data = pd.read_fwf(index_path, widths=widths, skiprows=6, header=None, names=cols)

In [4]:
# Preview the first rows of the parsed index table
pl_data.head()

Unnamed: 0,PDB code,resolution,release year,-logKd/Ki,Kd/Ki,reference,ligand name
0,3zzf,2.2,2012,0.4,Ki=400mM //,3zzf.pdf,(NLG)
1,3gww,2.46,2009,0.45,IC50=355mM //,3gwu.pdf,(SFX)
2,1w8l,1.8,2004,0.49,Ki=320mM //,1w8l.pdf,(1P3)
3,3fqa,2.35,2009,0.49,IC50=320mM //,3fq7.pdf,(GAB&PMP)
4,1zsb,2.0,1996,0.6,Kd=250mM //,1zsb.pdf,(AZM)


In [5]:
# Discard entries whose ligand name contains '-mer'
# (presumably peptide ligands such as "5-mer" -- TODO confirm).
# regex=False: '-mer' is matched as a literal substring.
# na=False: a missing ligand name counts as "no match", so the row is kept
# instead of the negation below failing on NaN.
is_mer_ligand = pl_data['ligand name'].str.contains('-mer', regex=False, na=False)
pl_data = pl_data[~is_mer_ligand]

In [6]:
# (rows, columns) of the index table at this point of the notebook
pl_data.shape

(16844, 7)

In [7]:
# PDB codes remaining in the index table; used below to select which PDBbind
# sub-directories are read.
correct_pdb_ids = pl_data['PDB code'].values
# Earlier alternative (every 4-character directory name), kept for reference only:
#pdb_ids = sorted([dir for dir in os.listdir(pdbbind_dir) if len(dir) == 4])

In [8]:
def extract_pdbbind_mols(directory_path, query_pdb_ids) :
    """Load the PDBbind ligands of the requested complexes as RDKit molecules.

    Every entry of `directory_path` whose name is one of `query_pdb_ids` is
    expected to contain the file `<pdb_id>/<pdb_id>_ligand.mol2`. A ligand is
    kept only if RDKit reads that file (hydrogens are kept) and the SMILES
    written from it can be parsed back into a molecule. The PDB code is stored
    as the 'PDB_ID' property of the conformer of each returned molecule.

    Parameters
    ----------
    directory_path : str
        Directory holding one sub-directory per PDB code.
    query_pdb_ids : iterable of str
        PDB codes to extract (list, numpy array, set, generator, ...). Codes
        without a matching entry in `directory_path` are ignored.

    Returns
    -------
    list
        Molecules read from the mol2 files, ordered by PDB code. Ligands that
        cannot be processed are reported on stdout and skipped.
    """
    # A set gives constant-time membership tests and also works for one-shot
    # iterables, instead of scanning the whole collection per directory entry.
    wanted_pdb_ids = set(query_pdb_ids)
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and anything pickled from it) reproducible between runs and machines.
    pdb_ids = sorted(entry for entry in os.listdir(directory_path) if entry in wanted_pdb_ids)

    mols = []
    for pdb_id in pdb_ids :
        mol2path = os.path.join(directory_path, pdb_id, f'{pdb_id}_ligand.mol2')
        try :
            mol = Chem.rdmolfiles.MolFromMol2File(mol2path, removeHs=False)
            if mol is None :
                # RDKit could not build a molecule from the file; report which
                # entry was dropped instead of skipping it silently.
                print('Impossible to read mol2 file for ' + pdb_id)
                continue
            # Round trip through SMILES: keep only ligands RDKit can re-parse
            rdmol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
            if rdmol is None :
                print('Not RDKit parsable: ' + pdb_id)
                continue
            #mol = PropertyMol(mol)
            mol.GetConformer().SetProp('PDB_ID', pdb_id)
            mols.append(mol)
        except Exception as error :
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit still stop the extraction.
            print(f'Impossible to read mol2 file for {pdb_id} ({error})')

    return mols

In [9]:
# Ligands of the general set, restricted to the PDB codes kept in the index table
general_mols = extract_pdbbind_mols(
    directory_path=pdbbind_general_dir_path,
    query_pdb_ids=correct_pdb_ids,
)
print(len(general_mols))

RDKit ERROR: [13:42:42] Explicit valence for atom # 2 C, 5, is greater than permitted
[13:42:42] Explicit valence for atom # 2 C, 5, is greater than permitted
RDKit ERROR: [13:42:42] Explicit valence for atom # 15 N, 4, is greater than permitted
[13:42:42] Explicit valence for atom # 15 N, 4, is greater than permitted


Not RDKit parsable


RDKit ERROR: [13:42:42] Explicit valence for atom # 8 C, 5, is greater than permitted
[13:42:42] Explicit valence for atom # 8 C, 5, is greater than permitted
RDKit ERROR: [13:42:43] Explicit valence for atom # 36 C, 5, is greater than permitted
[13:42:43] Explicit valence for atom # 36 C, 5, is greater than permitted
RDKit ERROR: [13:42:43] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5 6 8 9 10
[13:42:43] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5 6 8 9 10

RDKit ERROR: 
RDKit ERROR: [13:42:43] Can't kekulize mol.  Unkekulized atoms: 20 21 22 23 24
RDKit ERROR: 
[13:42:43] Can't kekulize mol.  Unkekulized atoms: 20 21 22 23 24

RDKit ERROR: [13:42:43] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8 9 14 15 16 17 18 19 20 21 22 23 24
RDKit ERROR: 
RDKit ERROR: [13:42:43] Explicit valence for atom # 12 C, 5, is greater than permitted
[13:42:43] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8 9 14 15 16 17 18 19 20 21 22 23 24

[13:42:43] Explicit valence for atom # 12 C, 

Not RDKit parsable


RDKit ERROR: [13:42:44] Explicit valence for atom # 1 C, 6, is greater than permitted
[13:42:44] Explicit valence for atom # 1 C, 6, is greater than permitted
RDKit ERROR: [13:42:44] Explicit valence for atom # 15 N, 4, is greater than permitted
[13:42:44] Explicit valence for atom # 15 N, 4, is greater than permitted


Not RDKit parsable


RDKit ERROR: [13:42:44] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22
RDKit ERROR: 
[13:42:44] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22

RDKit ERROR: [13:42:44] Can't kekulize mol.  Unkekulized atoms: 0 2 3 6 7 8 9 10 11
[13:42:44] Can't kekulize mol.  Unkekulized atoms: 0 2 3 6 7 8 9 10 11

RDKit ERROR: 
RDKit ERROR: [13:42:44] Can't kekulize mol.  Unkekulized atoms: 31 32 33 34 35
RDKit ERROR: 
[13:42:44] Can't kekulize mol.  Unkekulized atoms: 31 32 33 34 35

RDKit ERROR: [13:42:45] Can't kekulize mol.  Unkekulized atoms: 0 3 4 6 7 8 9 10 11
RDKit ERROR: 
[13:42:45] Can't kekulize mol.  Unkekulized atoms: 0 3 4 6 7 8 9 10 11

RDKit ERROR: [13:42:45] Explicit valence for atom # 84 C, 5, is greater than permitted
[13:42:45] Explicit valence for atom # 84 C, 5, is greater than permitted
RDKit ERROR: [13:42:45] Explicit valence for atom # 9 C, 5, is greater than permitted
[13:42:45] Explicit valence for atom # 9 C, 5, is greater than permitted
RDKit ERROR: [13:42:46] 

Not RDKit parsable


RDKit ERROR: [13:42:48] Can't kekulize mol.  Unkekulized atoms: 9 10 27 30 32
RDKit ERROR: 
[13:42:48] Can't kekulize mol.  Unkekulized atoms: 9 10 27 30 32

RDKit ERROR: [13:42:48] Can't kekulize mol.  Unkekulized atoms: 33 34 35 37 38
RDKit ERROR: 
[13:42:48] Can't kekulize mol.  Unkekulized atoms: 33 34 35 37 38

RDKit ERROR: [13:42:48] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
RDKit ERROR: 
[13:42:48] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4

RDKit ERROR: [13:42:48] Explicit valence for atom # 12 N, 4, is greater than permitted
[13:42:48] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [13:42:49] Explicit valence for atom # 11 C, 5, is greater than permitted
[13:42:49] Explicit valence for atom # 11 C, 5, is greater than permitted


Not RDKit parsable


RDKit ERROR: [13:42:49] Explicit valence for atom # 1 C, 6, is greater than permitted
[13:42:49] Explicit valence for atom # 1 C, 6, is greater than permitted
RDKit ERROR: [13:42:49] Explicit valence for atom # 18 C, 5, is greater than permitted
[13:42:49] Explicit valence for atom # 18 C, 5, is greater than permitted
RDKit ERROR: [13:42:50] Explicit valence for atom # 10 C, 5, is greater than permitted
[13:42:50] Explicit valence for atom # 10 C, 5, is greater than permitted


10634


RDKit ERROR: [13:42:50] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
RDKit ERROR: 
[13:42:50] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6



In [10]:
# Ligands of the refined set, restricted to the PDB codes kept in the index table
refined_mols = extract_pdbbind_mols(
    directory_path=pdbbind_refined_dir_path,
    query_pdb_ids=correct_pdb_ids,
)
print(len(refined_mols))

RDKit ERROR: [13:42:52] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 5
RDKit ERROR: 
[13:42:52] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 5

RDKit ERROR: [13:42:52] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5
RDKit ERROR: 
[13:42:52] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5

RDKit ERROR: [13:42:53] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 5
[13:42:53] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 5

RDKit ERROR: 
RDKit ERROR: [13:42:54] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22
RDKit ERROR: 
[13:42:54] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22



4400




In [11]:
# Save the extracted general-set ligands (hydrogens kept) as a pickle.
# The output directory is created first: nothing earlier in the notebook
# creates it, so on a fresh checkout open() would raise FileNotFoundError.
os.makedirs(data_dir_path, exist_ok=True)
general_mols_path = os.path.join(data_dir_path, 'pdbbind_general_mol_list_with_h.p')
with open(general_mols_path, 'wb') as f :
    pickle.dump(general_mols, f)

In [12]:
# with open(os.path.join(data_dir_path, 'pdbbind_general_mol_list_with_h.p'), 'rb') as f :
#     general_mols = pickle.load(f)

In [13]:
# Save the extracted refined-set ligands (hydrogens kept) as a pickle.
# The output directory is created first so this cell also works when run on
# its own on a fresh checkout, where open() would raise FileNotFoundError.
os.makedirs(data_dir_path, exist_ok=True)
refined_mols_path = os.path.join(data_dir_path, 'pdbbind_refined_mol_list_with_h.p')
with open(refined_mols_path, 'wb') as f :
    pickle.dump(refined_mols, f)

In [14]:
# with open(os.path.join(data_dir_path, 'pdbbind_refined_mol_list_with_h.p'), 'rb') as f :
#     refined_mols = pickle.load(f)

In [15]:
# Build one ConformationEnsembleLibrary per PDBbind set from the extracted
# ligand lists (the constructor shows a progress bar over the molecules).
# NOTE(review): presumably this groups the conformations of identical
# molecules into ensembles -- confirm in the ConformationEnsemble module.
general_CEL = ConformationEnsembleLibrary(general_mols)
refined_CEL = ConformationEnsembleLibrary(refined_mols)

100%|███████████████████████████████████| 10634/10634 [00:01<00:00, 8196.57it/s]
100%|█████████████████████████████████████| 4400/4400 [00:00<00:00, 7962.50it/s]


In [16]:
# Make sure the 'raw' sub-directory of the data directory exists before the
# libraries are written into it (no error if it is already there).
raw_dir_path = os.path.join(data_dir_path, 'raw')
os.makedirs(raw_dir_path, exist_ok=True)

In [17]:
# Write the general-set conformation ensemble library to <data_dir_path>/raw
general_cel_path = os.path.join(data_dir_path, 'raw', 'pdbbind_general_conf_ensemble_library_with_h.p')
with open(general_cel_path, mode='wb') as cel_file :
    pickle.dump(general_CEL, cel_file)

In [19]:
# Write the refined-set conformation ensemble library to <data_dir_path>/raw
refined_cel_path = os.path.join(data_dir_path, 'raw', 'pdbbind_refined_conf_ensemble_library_with_h.p')
with open(refined_cel_path, mode='wb') as cel_file :
    pickle.dump(refined_CEL, cel_file)

In [40]:
# Read the general-set conformation ensemble library back from <data_dir_path>/raw.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that were produced by this notebook.
general_cel_path = os.path.join(data_dir_path, 'raw', 'pdbbind_general_conf_ensemble_library_with_h.p')
with open(general_cel_path, mode='rb') as cel_file :
    general_CEL = pickle.load(cel_file)

In [41]:
# Read the refined-set conformation ensemble library back from <data_dir_path>/raw.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that were produced by this notebook.
refined_cel_path = os.path.join(data_dir_path, 'raw', 'pdbbind_refined_conf_ensemble_library_with_h.p')
with open(refined_cel_path, mode='rb') as cel_file :
    refined_CEL = pickle.load(cel_file)

In [2]:
# %%time
# general_dataset = ConfEnsembleDataset('data/')
# refined_dataset = ConfEnsembleDataset('data/', split='refined')

# Datasets are now ready to use

INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
INFO: /home/benoit/CCDC/CSD_2021/bin/mogul
CPU times: user 957 ms, sys: 1.73 s, total: 2.69 s
Wall time: 2.82 s
