In [None]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
import py3Dmol
import numpy as np
from bio_datasets import load_dataset
from bio_datasets import Features, Value, StructureFeature, ProteinStructureFeature
from bio_datasets.structure.utils import to_pdb_string


Structures in the PDB often contain coordinates for only a subset of the residues or atoms of the molecule(s) under investigation. In many cases it is useful to have an explicit representation of these missing coordinates, but this has to be inferred by comparing the coordinates that are present with the expected molecular composition stored in the metadata.

Bio datasets offers built-in support for loading structures from CIF files with missing coordinates explicitly represented. Support for missing residues can be enabled by using `StructureFeature`/`ProteinStructureFeature` with `fill_missing_residues=True`.

For more background on missing coordinates in PDB files, see [the PDB docs](https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/missing-coordinates#:~:text=Regions%20in%20a%20protein%20that,and%20end%20of%20the%20chain.).

In [None]:
# Cache directory for the downloaded dataset. None defers to the loader's default
# cache location (assuming `load_dataset` follows Hugging Face `datasets`
# semantics, that is ~/.cache/huggingface/datasets or $HF_DATASETS_CACHE).
# Set a path here to cache elsewhere, e.g. on an external drive.
CACHE_DIR = None
# Id of the entry inspected throughout the rest of the notebook.
ENTRY_ID = "1az5.bcif"

ds = load_dataset("biodatasets/pdb", cache_dir=CACHE_DIR, split="train")
# Find the row whose id matches the chosen entry, then fetch that example.
index = ds["id"].index(ENTRY_ID)
ex = ds[index]

In [None]:
# Render the example structure, as decoded with the dataset's current features,
# as a spectrum-coloured cartoon.
pdb_text = to_pdb_string(ex["structure"])

view = py3Dmol.view(width=400, height=400)
view.addModel(pdb_text, "pdb")
view.setStyle({"cartoon": {"color": "spectrum"}})
view.zoomTo()
view.show()

In [None]:
# Structure feature configured to load the assembly and to fill in missing residues.
structure_feature = ProteinStructureFeature(
    load_assembly=True,
    fill_missing_residues=True,
)
feat = Features(id=Value("string"), structure=structure_feature)

# The stored encoding is unchanged, so no cast is needed: the new features are
# assigned directly onto the dataset info. This mutates `ds` in place.
# TODO: copy ds first?
ds.info.features = feat

In [None]:
# Fetch the same row again, now that `ds.info.features` has been replaced;
# presumably the structure is decoded with `fill_missing_residues=True` this time.
ex = ds[index]

In [None]:
# Show the distinct residue ids found in the structure's atoms.
residue_ids = ex["structure"].atoms.res_id
np.unique(residue_ids)

In [None]:
# Keep only atoms whose coordinates contain no NaN before rendering
# (presumably the filled-in missing residues carry NaN coordinates).
structure = ex["structure"]
has_coords = ~np.isnan(structure.atoms.coord).any(axis=1)
resolved_pdb_text = to_pdb_string(structure.atoms[has_coords])

view = py3Dmol.view(width=400, height=400)
view.addModel(resolved_pdb_text, "pdb")
view.setStyle({"cartoon": {"color": "spectrum"}})
view.zoomTo()
view.show()

In [None]:
# Backbone coordinates restricted to the CA atoms, with length-1 axes removed.
ca_coords = ex["structure"].backbone_coords(["CA"])
ca_coords.squeeze()