In [123]:
import numpy as np 

### 1A. Query data directly from PDB 

**Search query**

Getting all PDB IDs where the polymer type is "Protein" and where a bound ligand exists. 

In [1]:
from rcsbapi.search import search_attributes as attrs
# To see possible attributes, do attrs.__dict__ 

In [137]:
query = (
    (attrs.entity_poly.rcsb_entity_polymer_type == "Protein") & 
    (attrs.rcsb_ligand_neighbors.ligand_entity_id.exists()) & 
    (attrs.rcsb_binding_affinity.value.exists()) 
    # (attrs.rcsb_nonpolymer_entity.pdbx_number_of_molecules > 0) &
    # (attrs.rcsb_binding_affinity.value >= 0) & 
    # (attrs.rcsb_binding_affinity.value < ...) & 
    # (attrs.rcsb_nonpolymer_entity.formula_weight > 150) 
)
pdb_ids = list(query())
print(len(pdb_ids), "protein–ligand complexes")

14866 protein–ligand complexes


**Data query**

Getting the data for these PDB entries. For now, let's only get the Ligand IDs for now.  

In [84]:
from rcsbapi.data import DataQuery

In [139]:
test_pdb_ids = pdb_ids[:2]
test_pdb_ids

['10GS', '11GS']

In [138]:
query = DataQuery(
    input_type="entries",
    input_ids=test_pdb_ids,
    return_data_list=[
        "rcsb_id",
        "nonpolymer_entities.nonpolymer_comp.chem_comp.id"
    ]
)
protein_ligand_data = query.exec()
print(len(protein_ligand_data["data"]["entries"]), "protein–ligand complexes")

2 protein–ligand complexes


In [118]:
# Example protein–ligand complex record: 
protein_ligand_data["data"]["entries"][0]

{'rcsb_id': '10GS', 'nonpolymer_entities': [{'nonpolymer_comp': {'chem_comp': {'id': 'VWW'}}}, {'nonpolymer_comp': {'chem_comp': {'id': 'MES'}}}]}

In [125]:
# Average number of ligands per protein–ligand complex: 
np.mean([
    len(record["nonpolymer_entities"]) 
    for record in protein_ligand_complexes["data"]["entries"]
])

# If most protein–ligand complexes have more than 1 ligand, that could be a problem 
# as I don't know if our approach can handle this 

2.5

Now that we have the protein–ligand complexes, we still need to fetch data on those ligands, including their SMILES strings: 

In [149]:
# List of unique ligand IDs from above protein–ligand data: 
ligand_ids = list(set(
    ligand["nonpolymer_comp"]["chem_comp"]["id"] 
    for protein_ligand in protein_ligand_data["data"]["entries"] 
    for ligand in protein_ligand["nonpolymer_entities"]
))

In [150]:
test_ligand_ids = ligand_ids[:2]
test_ligand_ids

['VWW', 'GSH']

In [135]:
# For attributes, e.g. go to https://www.rcsb.org/ligand/VWW, click "Data API", which opens a GraphiQL UI, then run it. 

query = DataQuery(
    input_type="chem_comps",
    input_ids=test_ligand_ids,
    return_data_list=[
        "chem_comp.id",
        "chem_comp.type",
        "chem_comp.name",
        "chem_comp.formula",
        "rcsb_chem_comp_descriptor.SMILES",
        # Not sure if these might also come in handy: 
        "rcsb_chem_comp_info.atom_count", 
        "rcsb_chem_comp_info.bond_count", 
        "pdbx_chem_comp_identifier",
        "rcsb_chem_comp_related",
        "drugbank.drugbank_info.drugbank_id",
        "drugbank.durgbank_target.name",
        "drugbank.durgbank_target.interaction_type",
    ]
)
ligand_data = query.exec()
print(len(ligand_data["data"]["chem_comps"]), "ligands")

2 ligands


### 1B. Use Huggingface dataset `pdb_protein_ligand_complexes`

Download from here https://huggingface.co/datasets/jglaser/pdb_protein_ligand_complexes  
and put in `data/` directory.  
I also renamed them to: `pdb_protein_ligand_train.p` and `pdb_protein_ligand_test.p`

In [152]:
import pandas as pd

In [159]:
protein_ligand_train = pd.read_pickle("data/pdb_protein_ligand_train.p")
protein_ligand_test = pd.read_pickle("data/pdb_protein_ligand_test.p")

In [160]:
protein_ligand_train.shape

(34277, 8)

In [157]:
# protein_ligand_test.iloc[:10]
protein_ligand_test[["pdb_id", "lig_id", "seq", "smiles"]].iloc[:10]

Unnamed: 0,pdb_id,lig_id,seq,smiles
0,7k38,VTY,MGIVEEAHNVKVLGTGSRFIVLAHGFGTDQSVWKHLVPHLLEEFRV...,CC1=C[C@@H](O)OC1=O
1,6prt,OWA,SNPPPPETSNPNKPKRQTNQLQYLLRVVLKTLWKHQFAWPFQQPVD...,COC(=O)C[C@H]1CC(=O)N(C)C1
2,4lxx,FNF,GHMIKICIAGKNNIAVNSLQFILKNYFEADQIVVIPNKNDKGIDSW...,Cc1cn([C@H]2C[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...
3,4lxx,FON,GHMIKICIAGKNNIAVNSLQFILKNYFEADQIVVIPNKNDKGIDSW...,Nc1nc(=O)c2c([nH]1)NC[C@@H](CNc1ccc(C(=O)N[C@@...
4,7bp1,CAQ,MLGKVALEEAFALPRHKERTRWWAGLFAIDPDKHAAEINDITEQRI...,Oc1ccccc1O
5,4ibj,1D9,PDISAKDLRNIMYDHLPGFGTAFHQLVQVICKLGKDSNSLDIIHAE...,O=C(O)c1cccc(N2C(=O)C(O)=C(C(=O)c3cccc(C(F)(F)...
6,1psa,0ZL,IGDEPLENYLDTEYFGTIGIGTPAQDFTVIFDTGSSNLWVPSVYCS...,CCOC(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=...
7,5ag7,XXL,AHAFWSTQPVPQTEDETEKIVFAGPMDEPKTVADIPEEPYPIASTF...,CCOC(=O)CN1C(=O)COc2ccccc21
8,1kdr,CAR,AIAPVITIDGPSGAGKGTLCKAMAEALQWHLLDSGAIYRVLALAAL...,Nc1ccn([C@@H]2O[C@H](COP(=O)(O)O)[C@@H](O)[C@@...
9,1v0c,KNC,DSVTLRLMTEHDLAMLYEWLNRSHIVEWWGARPTLADVQEQYLPSV...,N[C@H]1[C@H](O)[C@@H](CO)O[C@H](O[C@@H]2[C@@H]...
