In [1]:
import pandas as pd
import pickle
import json
import os

from ccdc.io import MoleculeWriter

In [2]:
with open('results/random_split_0_new_pdbbind/conf_results.p', 'rb') as f :
    results = pickle.load(f)

In [3]:
#results['NC(=O)c1cccn2c(-c3cccnc3)cnc12']

In [4]:
with open('data/raw/ccdc_generated_conf_ensemble_library.p', 'rb') as f :
    cel = pickle.load(f)

In [5]:
def get_pdb_ids(smiles, cel) :
    try :
        mol = cel.get_conf_ensemble(smiles).mol
        pdb_ids = [conf.GetProp('PDB_ID') for conf in mol.GetConformers()]
        return list(set(pdb_ids))
    except :
        return list(set())

In [6]:
test_set_pdbs = []
for smiles in results :
    pdb_ids = get_pdb_ids(smiles, cel)
    results[smiles]['PDB_ID'] = str(pdb_ids)
    test_set_pdbs.extend(pdb_ids)

In [7]:
len(set(test_set_pdbs))

1299

In [8]:
pdbbind_general_dir = '../PDBBind/PDBbind_v2020_other_PL/v2020-other-PL/'
pdbbind_refined_dir = '../PDBBind/PDBbind_v2020_refined/refined-set/'

In [9]:
widths = [6,6,7,6,17,9,200]
cols = 'PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name'.replace(', ', ',').split(',')
pl_data = pd.read_fwf(os.path.join(pdbbind_general_dir, 'index', 'INDEX_general_PL_data.2020'), widths=widths,skiprows=6,header=None)
pl_data.columns=cols

In [10]:
widths = [6,6,8,200]
cols = 'PDB code, release year, Uniprot ID, protein name'.replace(', ', ',').split(',')
pl_name = pd.read_fwf(os.path.join(pdbbind_general_dir, 'index', 'INDEX_general_PL_name.2020'), widths=widths,skiprows=6,header=None)
pl_name.columns=cols

In [11]:
pl_all = pl_data.merge(pl_name, on='PDB code')

In [12]:
pl_all.head(1)

Unnamed: 0,PDB code,resolution,release year_x,-logKd/Ki,Kd/Ki,reference,ligand name,release year_y,Uniprot ID,protein name
0,3zzf,2.2,2012,0.4,Ki=400mM //,3zzf.pdf,(NLG),2012,Q01217,ACETYLGLUTAMATE KINASE


In [14]:
jak2_actives = pl_all[(pl_all['protein name'].str.contains('JAK2'))
      & (pl_all['-logKd/Ki'] > 6) & (pl_all['resolution'].str.replace('NMR', '0').astype(float) <= 2)]

In [15]:
jak2_pdbs = jak2_actives['PDB code'].values

In [16]:
jak2_pdbs

array(['3ugc', '6bbv', '5wev', '3e64', '4iva', '5aep', '3lpb', '5usy',
       '5tq8', '4d1s', '4d0w', '4d0x', '4ytf', '4e6q', '4bbe', '4bbf',
       '5cf8', '2b7a', '4aqc', '4jia', '3krr'], dtype=object)

In [17]:
test_set_jak2 = [pdb for pdb in jak2_pdbs if pdb in test_set_pdbs]

In [18]:
test_set_jak2

['5tq8', '4bbe']

In [21]:
from ccdc.protein import Protein
from ccdc.io import MoleculeReader

In [22]:
general_pdb_ids = os.listdir(pdbbind_general_dir)
refined_pdb_ids = os.listdir(pdbbind_refined_dir)

In [23]:
with MoleculeWriter(f'jak2_ligands_aligned.mol2') as writer:
    for i, pdb_id in enumerate(jak2_pdbs) :
        if pdb_id in general_pdb_ids :
            protein_path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_protein.pdb')
            ligand_path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_ligand.mol2')
        else :
            protein_path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_protein.pdb')
            ligand_path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_ligand.mol2')
        ligand = MoleculeReader(ligand_path)[0]
        if i == 0 :
            protein1 = Protein.from_file(protein_path)
            writer.write(ligand)
        else :
            protein2 = Protein.from_file(protein_path)
            chain_superposition = Protein.ChainSuperposition()
            (rmsd, transformation) = chain_superposition.superpose(protein1.chains[0], protein2.chains[0])
            print(rmsd)
            ligand.transform(transformation)
            writer.write(ligand)

1.1286785469209808
0.4657410389947699
0.8893127011327113
0.7754867602612789
0.8356341868754722
0.8586981781926256
0.565704042127319
1.0834811907402222
0.5240279433018442
0.8501531067200915
0.8598196411902158
0.8164076057492244
0.8017148653942267
0.7027824811987311
0.6856146490720604
0.78965573778532
0.8245585801307252
0.8900726760400363
0.4746064850925215
0.9000444161003971


In [73]:
with MoleculeWriter(f'jak2_protein_aligned.pdb') as protein_writer:
    for i, pdb_id in enumerate(jak2_pdbs) :
        if pdb_id in general_pdb_ids :
            path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_protein.pdb')
        else :
            path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_protein.pdb')
        if i == 0 :
            protein1 = Protein.from_file(path)
            print("Protein has {} chains and {} residues.".format(len(protein1.chains),len(protein1.residues)))
            protein_writer.write(protein1)
        else :
            protein2 = Protein.from_file(path)
            print("Protein has {} chains and {} residues.".format(len(protein2.chains),len(protein2.residues)))
            chain_superposition = Protein.ChainSuperposition()
            (rmsd, transformation) = chain_superposition.superpose(protein1.chains[0], protein2.chains[0])
            protein_writer.write(protein2)

Protein has 1 chains and 288 residues.
Protein has 1 chains and 287 residues.
Protein has 1 chains and 286 residues.
Protein has 1 chains and 291 residues.
Protein has 1 chains and 300 residues.
Protein has 1 chains and 293 residues.


In [73]:
for i, pdb_id in enumerate(test_set_jak2) :
    if pdb_id in general_pdb_ids :
        path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_protein.pdb')
    else :
        path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_protein.pdb')
    if i == 0 :
        protein1 = Protein.from_file(path)
        print("Protein has {} chains and {} residues.".format(len(protein1.chains),len(protein1.residues)))
        with MoleculeWriter(f'{pdb_id}_aligned.pdb') as protein_writer:
            protein_writer.write(protein1)
    else :
        protein2 = Protein.from_file(path)
        print("Protein has {} chains and {} residues.".format(len(protein2.chains),len(protein2.residues)))
        chain_superposition = Protein.ChainSuperposition()
        (rmsd, transformation) = chain_superposition.superpose(protein1.chains[0], protein2.chains[0])
        #protein2.transform(transformation)
        with MoleculeWriter(f'{pdb_id}_aligned.pdb') as protein_writer:
            protein_writer.write(protein2)

Protein has 1 chains and 272 residues.
Protein has 1 chains and 277 residues.
Protein has 1 chains and 286 residues.
Protein has 1 chains and 285 residues.
Protein has 1 chains and 292 residues.


In [24]:
pl_actives = pl_all[(pl_all['protein name'].str.contains('CATHEPSIN L'))]

In [25]:
actives_pdbs = pl_actives['PDB code'].values

In [26]:
actives_pdbs

array(['3h8b', '2xu4', '2xu3', '3bc3', '5mae', '2xu5', '2yj9', '2yjb',
       '5mqy', '1mhw', '5f02', '5maj', '3h89', '2xu1', '2yjc', '3h8c',
       '4axm', '3hha', '2yj2', '2yj8', '3hwn', '3of8'], dtype=object)

In [29]:
test_set_actives = [pdb for pdb in actives_pdbs if pdb in test_set_pdbs]

In [30]:
test_set_actives

['2xu4', '2yj8']

In [31]:
from ccdc.protein import Protein
from ccdc.io import MoleculeReader

In [32]:
general_pdb_ids = os.listdir(pdbbind_general_dir)
refined_pdb_ids = os.listdir(pdbbind_refined_dir)

In [34]:
with MoleculeWriter(f'catl_ligands_aligned.mol2') as writer:
    for i, pdb_id in enumerate(actives_pdbs) :
        if pdb_id in general_pdb_ids :
            protein_path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_protein.pdb')
            ligand_path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_ligand.mol2')
        else :
            protein_path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_protein.pdb')
            ligand_path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_ligand.mol2')
        ligand = MoleculeReader(ligand_path)[0]
        if i == 0 :
            protein1 = Protein.from_file(protein_path)
            writer.write(ligand)
        else :
            protein2 = Protein.from_file(protein_path)
            chain_superposition = Protein.ChainSuperposition()
            (rmsd, transformation) = chain_superposition.superpose(protein1.chains[0], protein2.chains[0])
            print(rmsd)
            ligand.transform(transformation)
            writer.write(ligand)

0.2234632719942284
0.3315117979379281
0.20282034472911983
0.23249886414643212
0.2778819410518452
0.2352858924772618
0.36184318400148957
0.21724308927181138
0.2058595080316187
0.20769743731981166
0.22253887617906323
0.1679174354047441
0.2570041534903471
0.21796600157698573
0.2317905166272784
0.33105394710048713
0.32046430879720605
0.21027012848653753
0.20509814278396601
0.3395137384601361
0.2932169104910307


In [35]:
pl_all[pl_all['PDB code'] == '1a9u']

Unnamed: 0,PDB code,resolution,release year_x,-logKd/Ki,Kd/Ki,reference,ligand name,release year_y,Uniprot ID,protein name
13159,1a9u,2.5,1999,7.32,IC50=0.048uM //,1a9u.pdf,(SB2),1999,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38


In [38]:
pl_actives = pl_all[(pl_all['protein name'].str.contains('MITOGEN-ACTIVATED PROTEIN KINASE P38'))]

In [39]:
pl_actives

Unnamed: 0,PDB code,resolution,release year_x,-logKd/Ki,Kd/Ki,reference,ligand name,release year_y,Uniprot ID,protein name
5693,1di9,2.6,2000,5.3,IC50=5uM //,1di9.pdf,(MSQ),2000,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38
7859,1kv1,2.5,2002,5.94,Kd=1.16uM //,1kv1.pdf,(BMU),2002,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38
11185,1bl6,2.5,1999,6.8,IC50=0.16uM //,1bl6.pdf,(SB6),1999,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38
13159,1a9u,2.5,1999,7.32,IC50=0.048uM //,1a9u.pdf,(SB2),1999,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38
14204,1bmk,2.4,1999,7.6,IC50=0.025uM //,1bmk.pdf,(SB5),1999,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38
14697,1bl7,2.5,1999,7.72,IC50=0.019uM //,1bl7.pdf,(SB4),1999,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38
19034,1kv2,2.8,2002 1,0.0,Kd=0.1nM //,1kv2.pdf,(B96),2002,Q16539,MITOGEN-ACTIVATED PROTEIN KINASE P38


In [40]:
actives_pdbs = pl_actives['PDB code'].values

In [41]:
actives_pdbs

array(['1di9', '1kv1', '1bl6', '1a9u', '1bmk', '1bl7', '1kv2'],
      dtype=object)

In [42]:
test_set_actives = [pdb for pdb in actives_pdbs if pdb in test_set_pdbs]

In [43]:
test_set_actives

[]

In [44]:
from ccdc.protein import Protein
from ccdc.io import MoleculeReader

In [45]:
general_pdb_ids = os.listdir(pdbbind_general_dir)
refined_pdb_ids = os.listdir(pdbbind_refined_dir)

In [46]:
with MoleculeWriter(f'p38_ligands_aligned.mol2') as writer:
    for i, pdb_id in enumerate(actives_pdbs) :
        if pdb_id in general_pdb_ids :
            protein_path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_protein.pdb')
            ligand_path = os.path.join(pdbbind_general_dir, pdb_id, f'{pdb_id}_ligand.mol2')
        else :
            protein_path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_protein.pdb')
            ligand_path = os.path.join(pdbbind_refined_dir, pdb_id, f'{pdb_id}_ligand.mol2')
        ligand = MoleculeReader(ligand_path)[0]
        if i == 0 :
            protein1 = Protein.from_file(protein_path)
            writer.write(ligand)
        else :
            protein2 = Protein.from_file(protein_path)
            chain_superposition = Protein.ChainSuperposition()
            (rmsd, transformation) = chain_superposition.superpose(protein1.chains[0], protein2.chains[0])
            print(rmsd)
            ligand.transform(transformation)
            writer.write(ligand)

0.7638626692796266
0.5321982611919538
0.6548682706827815
0.5787289254313762
0.5620268812040935
0.8541614207748065


In [88]:
cel.library.keys()

dict_keys(['COc1ccc(-c2cn(C)c(=O)c3cc(C(=O)NC4CCS(=O)(=O)CC4)sc23)cc1OC', 'Cc1ccccc1C(=O)c1sc(Nc2ccc(S(N)(=O)=O)cc2)nc1N', 'O=C([O-])c1ccnc2cc([C@H](OCC[NH+]3CCCCC3)c3ccccc3Cl)[nH]c12', 'CCC[C@H](COc1ccc(-c2nc3ccccc3n2Cc2ccccc2)cc1)n1c(-c2ccccc2)nc2ccccc21', 'O=C([O-])c1ccc2cccc(O)c2n1', '[NH2+]=c1[nH]c(=O)c2c(CCc3ccc(C(=O)N[C@@H](CCC(=O)[O-])C(=O)[O-])cc3)c[nH]c2[nH]1', 'O=C(c1cc(Cc2n[nH]c(=O)c3ccccc23)ccc1F)N1CCN(C(=O)C2C=C2)CC1', 'Cc1cn([C@H]2C[C@H](O)[C@@H](C[NH+]3CCCCC3)O2)c(=O)[nH]c1=O', 'C/C(=C/[C@@H](O)Cc1ccc(Br)cc1)C(=O)[O-]', 'NC(=O)c1cccn2c(-c3cccnc3)cnc12', 'Nc1ncnc2[nH]cnc12', 'CC(=O)N1c2cccc(O)c2NC2=C(C(=O)CC(C)(C)C2)[C@@H]1c1ccc(OCc2ccccc2)cc1Cl', 'CCN(CC)C(=O)c1c(NC(=O)c2cccs2)sc2c1CCCC2', 'O=C1[C@@H](O)c2ccccc2N1CCc1ccccc1', 'CC[C@@H](CO)NC(=O)[C@@H]1C=C2c3cccc4c3c(cn4C)C[C@H]2[N@H+](C)C1', '[NH3+]C[C@H]1CN(c2c(Br)cnc3[nH]ncc23)CCO1', 'COc1cccc([C@@H](C)NC(=O)CN2Cc3ccc(-c4nc(NC5CCOCC5)ncc4Cl)cc3C2=O)c1', 'CC#Cc1cncc(-c2cc(Cl)c([C@]3(C)CC(=O)N(C)C(=[NH2+])N3)s2)c1', 'CC

In [89]:
smiles = 'COc1ccc(-c2cn(C)c(=O)c3cc(C(=O)NC4CCS(=O)(=O)CC4)sc23)cc1OC'
smiles = 'Cc1ccccc1C(=O)c1sc(Nc2ccc(S(N)(=O)=O)cc2)nc1N'
ce = cel.get_conf_ensemble(smiles)
mol = ce.mol

In [90]:
from ccdc_rdkit_connector import CcdcRdkitConnector

In [91]:
connector = CcdcRdkitConnector()

In [92]:
mol.GetNumConformers()

101

In [93]:
ccdc_mol_bioactive = connector.rdkit_conf_to_ccdc_mol(mol, 0)

In [94]:
ccdc_mols_generated = [connector.rdkit_conf_to_ccdc_mol(mol, conf.GetId()) for conf in mol.GetConformers() if conf.HasProp('Generator')]

In [95]:
with MoleculeWriter(f'bioactive_conf.mol2') as writer:
    writer.write(ccdc_mol_bioactive)

In [96]:
from ccdc.pharmacophore import Pharmacophore
from ccdc import io
from ccdc.utilities import Colour
Pharmacophore.read_feature_definitions()

In [97]:
feature_definitions = [
                fd for fd in Pharmacophore.feature_definitions.values()
                if fd.identifier != 'exit_vector' and
                fd.identifier != 'heavy_atom' and
                fd.identifier != 'hydrophobe'
            ]

In [98]:
native_ligand = io.CrystalReader(f'bioactive_conf.mol2')[0]

In [99]:
ring_feature_def = Pharmacophore.feature_definitions['ring']
ring_features = ring_feature_def.detect_features(native_ligand)
print(len(ring_features))

3


In [100]:
donor_proj_def = Pharmacophore.feature_definitions['donor_projected']
donor_proj_features = donor_proj_def.detect_features(native_ligand)
print(len(donor_proj_features))

6


In [101]:
query = Pharmacophore.Query(ring_features + donor_proj_features)

In [102]:
query.write('test_query.cm')

In [109]:
with MoleculeWriter(f'generated_confs.mol2') as writer:
    for ccdc_mol in ccdc_mols_generated :
        writer.write(ccdc_mol)

In [110]:
all_ligand_files = f'generated_confs.mol2'
mol2_info = Pharmacophore.FeatureDatabase.DatabaseInfo(all_ligand_files, 0, Colour(0, 255, 0, 255))

In [111]:
csdsqlx = os.path.join(all_ligand_files.replace('.mol2', '.csdsqlx'))
mol2_sdb = Pharmacophore.FeatureDatabase.Creator.StructureDatabase(mol2_info, use_crystal_symmetry=False, structure_database_path=csdsqlx)

In [112]:
creator = Pharmacophore.FeatureDatabase.Creator()

In [113]:
db = creator.create([mol2_sdb])

In [114]:
db.write('test_ligands.feat')

In [61]:
print("Protein has {} chains and {} residues.".format(len(protein1.chains),len(protein1.residues)))

Protein has 1 chains and 272 residues.


In [62]:
print("Protein has {} chains and {} residues.".format(len(protein2.chains),len(protein2.residues)))

Protein has 1 chains and 292 residues.


In [63]:
chain_superposition = Protein.ChainSuperposition()
(rmsd, transformation) = chain_superposition.superpose(protein1.chains[0], protein2.chains[0])

In [65]:
transformation

Molecule.Transformation(Rotation: ((-0.028474905115391413, -0.3230093412430103, 0.9459673066488219), (0.5505887508195584, -0.7949205257060925, -0.25485954030035496), (0.8342908409239715, 0.5135818564561121, 0.20048059624179537)) Translation: (-40.41077865500128, -3.2316097138001183, 52.73123480039596))

In [66]:
protein2.transform(transformation)

In [68]:
from ccdc.io import MoleculeWriter
with MoleculeWriter(f'{pdb_id}_aligned.pdb') as protein_writer:
    protein_writer.write(protein2)

# Pharmacophore done with python

In [154]:

pharmacophore_path = '../pharmacophore_maestro/model.cm'
catl_s3_query = Pharmacophore.Query.from_file(pharmacophore_path)

In [155]:
features = catl_s3_query.features

In [156]:
print(len(features))

8


In [157]:
print('\n'.join(str(f) for f in features))

Feature(ring)
Feature(ring)
Feature(ring_planar_projected)
Feature(ring_planar_projected)
Feature(ring_planar_projected)
Feature(ring_planar_projected)
Feature(acceptor_projected)
Feature(acceptor)


In [105]:
keys = list(Pharmacophore.feature_definitions.keys())

In [106]:
print('\n'.join(keys)) 




In [151]:
settings = Pharmacophore.Search.Settings()
settings.max_hit_structures = 20
settings.max_hits_per_structure = 1
settings.max_hit_rmsd = 2.0
searcher = Pharmacophore.Search(settings)

In [158]:
feature_db_file = 'out.feat'
feature_db = Pharmacophore.FeatureDatabase.from_file(feature_db_file)

In [159]:
hits = searcher.search(catl_s3_query, database=feature_db)
h = hits[0]

In [160]:
h.molecule

<ccdc.molecule.Molecule at 0x7fc9077346d0>

In [161]:
points = h.points

In [162]:
donor_projected_points = h.feature_points(catl_s3_query.features[1])

In [163]:
for dc, val in zip(catl_s3_query.distance_constraints, h.constraint_values()): 
    print('(%d, %d) (%d, %d) %.3f' % (dc.feature_point1[0], dc.feature_point1[1], dc.feature_point2[0], dc.feature_point2[1], val))

(1, 0) (0, 0) 2.114
(2, 0) (0, 0) 0.000
(2, 1) (0, 0) 2.800
(2, 0) (1, 0) 2.114
(2, 1) (1, 0) 3.513
(3, 0) (0, 0) 0.000
(3, 1) (0, 0) 2.800
(3, 0) (1, 0) 2.114
(3, 1) (1, 0) 3.504
(3, 0) (2, 0) 0.000
(3, 0) (2, 1) 2.800
(3, 1) (2, 0) 2.800
(3, 1) (2, 1) 5.600
(4, 0) (0, 0) 2.114
(4, 1) (0, 0) 3.512
(4, 0) (1, 0) 0.000
(4, 1) (1, 0) 2.800
(4, 0) (2, 0) 2.114
(4, 0) (2, 1) 3.513
(4, 1) (2, 0) 3.512
(4, 1) (2, 1) 2.127
(4, 0) (3, 0) 2.114
(4, 0) (3, 1) 3.504
(4, 1) (3, 0) 3.512
(4, 1) (3, 1) 5.985
(5, 0) (0, 0) 2.114
(5, 1) (0, 0) 3.505
(5, 0) (1, 0) 0.000
(5, 1) (1, 0) 2.800
(5, 0) (2, 0) 2.114
(5, 0) (2, 1) 3.513
(5, 1) (2, 0) 3.505
(5, 1) (2, 1) 5.986
(5, 0) (3, 0) 2.114
(5, 0) (3, 1) 3.504
(5, 1) (3, 0) 3.505
(5, 1) (3, 1) 2.100
(5, 0) (4, 0) 0.000
(5, 0) (4, 1) 2.800
(5, 1) (4, 0) 2.800
(5, 1) (4, 1) 5.600
(6, 0) (0, 0) 2.525
(6, 1) (0, 0) 4.656
(6, 0) (1, 0) 1.423
(6, 1) (1, 0) 4.223
(6, 0) (2, 0) 2.525
(6, 0) (2, 1) 3.774
(6, 1) (2, 0) 4.656
(6, 1) (2, 1) 5.441
(6, 0) (3, 0) 2.525
