Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add inline and Tutorial documentation for internal_coords; change edr… #3898

Merged
merged 35 commits into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
35766b1
Add inline and Tutorial documentation for internal_coords; change edr…
rob-miller Mar 21, 2022
447cf5a
tutorial doctest typo
rob-miller Apr 8, 2022
415c0a3
show c-alpha plot in tutorial
rob-miller Apr 8, 2022
366871e
more Tutorial polishing
rob-miller Apr 8, 2022
8b8c8a3
fix broken f-string
rob-miller Apr 18, 2022
a9faad6
tweak chain break message for linux doctest
rob-miller Apr 18, 2022
41ddc8b
tweak chain break message for linux doctest
rob-miller Apr 18, 2022
7a7fbd0
tweak chain break message for linux doctest
rob-miller Apr 18, 2022
c3c2c16
Revert "tweak chain break message for linux doctest"
rob-miller Apr 18, 2022
10080f0
Revert "tweak chain break message for linux doctest"
rob-miller Apr 18, 2022
a0ab9f1
Revert "tweak chain break message for linux doctest"
rob-miller Apr 18, 2022
2cbbb35
fix type issues found by Cython
rob-miller May 13, 2022
0bdc9ae
implement and use AtomKey.cr_class() (return covalent radii class) wi…
rob-miller May 18, 2022
05cbd77
remove references to IC_Residue.atom_coords, no longer created
rob-miller May 18, 2022
ec83325
document IC_Residue.ak_set, implement __contains__(AtomKey) for IC_Re…
rob-miller May 18, 2022
8dc1dc2
change edron attributes *dh_class (di/hedron) to *e_class (edron)
rob-miller May 19, 2022
51d3182
change covalent_radii edron attribute to string to match other classi…
rob-miller May 20, 2022
59cac03
raise numerical accuracy of .pic files to support mmCIF data
rob-miller May 20, 2022
7e53703
store chirality info in IC_Chain to match distance arrays for distanc…
rob-miller May 22, 2022
6575009
store chirality info in IC_Chain to match distance arrays for distanc…
rob-miller May 22, 2022
99540c7
more docs on rebuild test with changed accept_atoms, fix incorrect at…
rob-miller Jun 1, 2022
2872d58
check atomArray shapes in quick rebuild_test to avoid crash on 5ehiA
rob-miller Jun 4, 2022
a7536b2
respect no_altloc in more places
rob-miller Jun 6, 2022
d7686b0
put heavy atoms first in covalent radii dict
rob-miller Jul 2, 2022
47f01c7
update comments on hedra and dihedra default tables
rob-miller Jul 5, 2022
8c5899c
improve comment on ic_data_backbone table
rob-miller Jul 29, 2022
194872d
change use of three_to_one to protein_letters_3to1
rob-miller Jul 29, 2022
aa70650
fix crash on quick rebuild check of chain with no residues (e.g. DNA)
rob-miller Aug 3, 2022
f1b26da
update one_to_three to protein_letters_1to3
rob-miller Sep 7, 2022
1ca4ba0
reference code released in 1.80 in NEWS.rst
rob-miller Nov 19, 2022
6b6d094
add convenience method Edron.gen_tuple to match Edron.gen_key
rob-miller Dec 4, 2022
9dd3bc2
fix Ala missing N-Ca-Cb hedron internally if generated from defaults
rob-miller Dec 19, 2022
d3617b2
add default True option for distance_to_internal_coordinates to mark …
rob-miller Dec 20, 2022
3a6f37a
avoid numpy type deprecations as per #4204
rob-miller Dec 20, 2022
bcab729
IC_Residue.pick_length now reports hedra on rprev, fixes bug setting …
rob-miller Jan 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
66 changes: 45 additions & 21 deletions Bio/PDB/PICIO.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2019-2021 by Robert T. Miller. All rights reserved.
# Copyright 2019-2022 by Robert T. Miller. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
Expand All @@ -24,7 +24,7 @@
from Bio.PDB.parse_pdb_header import _parse_pdb_header_list
from Bio.PDB.PDBExceptions import PDBException

from Bio.PDB.Polypeptide import one_to_three
from Bio.Data.PDBData import protein_letters_1to3

from Bio.PDB.internal_coords import (
IC_Residue,
Expand Down Expand Up @@ -125,6 +125,16 @@ def read_PIC(
r"(?P<segid>[a-zA-z\s]{4})(?P<elm>.{2})"
r"(?P<chg>.{2})?\s*$"
)
pdbx_atm_re = re.compile(
r"^ATOM\s\s(?:\s*(?P<ser>\d+))\s(?P<atm>[\w\s]{4})"
r"(?P<alc>\w|\s)(?P<res>[\w]{3})\s(?P<chn>.)"
r"(?P<pos>[\s\-\d]{4})(?P<icode>[A-Za-z\s])\s\s\s"
r"(?P<x>[\s\-\d\.]{10})(?P<y>[\s\-\d\.]{10})"
r"(?P<z>[\s\-\d\.]{10})(?P<occ>[\s\d\.]{7})"
r"(?P<tfac>[\s\d\.]{6})\s{6}"
r"(?P<segid>[a-zA-z\s]{4})(?P<elm>.{2})"
r"(?P<chg>.{2})?\s*$"
)
bfac_re = re.compile(
r"^BFAC:\s([^\s]+\s+[\-\d\.]+)"
r"\s*([^\s]+\s+[\-\d\.]+)?"
Expand Down Expand Up @@ -212,43 +222,47 @@ def process_hedron(
return ek

def default_hedron(ek: Tuple, ric: IC_Residue) -> None:
"""Create Hedron based on same rdh_class hedra in ref database.
"""Create Hedron based on same re_class hedra in ref database.

Adds Hedron to current Chain.internal_coord, see ic_data for default
values and reference database source.
"""
aks = []
atomkeys = []
hkey = None

atmNdx = AtomKey.fields.atm
resNdx = AtomKey.fields.resname
resPos = AtomKey.fields.respos
aks = [ek[i].akl for i in range(3)]
atomkeys = [ek[i].akl for i in range(3)]

atpl = tuple([atomkeys[i][atmNdx] for i in range(3)])
res = atomkeys[0][resNdx]

atpl = tuple(aks[i][atmNdx] for i in range(3))
res = aks[0][resNdx]
if (
aks[0][resPos] != aks[2][resPos] # hedra crosses amide bond so not reversed
atomkeys[0][resPos]
!= atomkeys[2][resPos] # hedra crosses amide bond so not reversed
or atpl == ("N", "CA", "C") # or chain start tau
or atpl in ic_data_backbone # or found forward hedron in ic_data
or (res not in ["A", "G"] and atpl in ic_data_sidechains[res])
):
hkey = ek
rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(3)]
rhcl = [atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(3)]
try:
dflts = hedra_defaults["".join(rhcl)][0]
except KeyError:
if aks[0][resPos] == aks[1][resPos]:
rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(2)]
rhc = "".join(rhcl) + "X" + aks[2][atmNdx]
if atomkeys[0][resPos] == atomkeys[1][resPos]:
rhcl = [atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(2)]
rhc = "".join(rhcl) + "X" + atomkeys[2][atmNdx]
else:
rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(1, 3)]
rhc = "X" + aks[0][atmNdx] + "".join(rhcl)
rhcl = [
atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(1, 3)
]
rhc = "X" + atomkeys[0][atmNdx] + "".join(rhcl)
dflts = hedra_defaults[rhc][0]
else:
# must be reversed or fail
hkey = ek[::-1]
rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(2, -1, -1)]
rhcl = [atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(2, -1, -1)]
dflts = hedra_defaults["".join(rhcl)][0]

process_hedron(
Expand All @@ -264,7 +278,7 @@ def default_hedron(ek: Tuple, ric: IC_Residue) -> None:
if verbose:
print(f" default for {ek}")

def hedra_check(dk: str, ric: IC_Residue) -> None:
def hedra_check(dk: Tuple, ric: IC_Residue) -> None:
"""Confirm both hedra present for dihedron key, use default if set."""
if dk[0:3] not in sbcic.hedra and dk[2::-1] not in sbcic.hedra:
if defaults:
Expand Down Expand Up @@ -498,6 +512,8 @@ def ak_expand(eLst: List) -> List:
chkLst.append((sN, sCA, sC, sO)) # locate backbone O
if ric.lc != "G":
chkLst.append((sO, sC, sCA, sCB)) # locate CB
if ric.lc == "A":
chkLst.append((sN, sCA, sCB)) # missed for generate from seq
if ric.rprev != [] and ric.lc != "P" and proton:
chkLst.append((sC, sCA, sN, sH)) # amide proton

Expand All @@ -521,7 +537,10 @@ def ak_expand(eLst: List) -> List:
pass # ignore missing hydrogens
elif all(atm.akl[altloc_ndx] is None for atm in dk):
if defaults:
default_dihedron(dk, ric)
if len(dk) != 3:
default_dihedron(dk, ric)
else:
default_hedron(dk, ric) # add ALA N-Ca-Cb
else:
if verbose:
print(f"{ric}-{rn} missing {dk}")
Expand All @@ -530,7 +549,7 @@ def ak_expand(eLst: List) -> List:
pass # ignore missing combinatoric of altloc atoms
# need more here?

def ak_add(ek: set, ric: IC_Residue) -> None:
def ak_add(ek: Tuple, ric: IC_Residue) -> None:
"""Allocate edron key AtomKeys to current residue as appropriate.

A hedron or dihedron may span a backbone amide bond, this routine
Expand Down Expand Up @@ -692,6 +711,8 @@ def finish_chain() -> None:
return None
elif line.startswith("ATOM "):
m = pdb_atm_re.match(line)
if not m:
m = pdbx_atm_re.match(line)
if m:
if sb_res is None:
# ATOM without res spec already loaded, not a pic file
Expand Down Expand Up @@ -768,7 +789,7 @@ def finish_chain() -> None:
m["a2"],
m["a3"],
m["a4"],
float(m["dihedral"]),
m["dihedral"],
sb_res.internal_coord,
)

Expand Down Expand Up @@ -832,7 +853,9 @@ def read_PIC_seq(

ndx = 1
for r in seqRec.seq:
output += f"('{pdbid}', 0, '{chain}', (' ', {ndx}, ' ')) {one_to_three(r)}\n"
output += (
f"('{pdbid}', 0, '{chain}', (' ', {ndx}, ' ')) {protein_letters_1to3[r]}\n"
)
ndx += 1

sp = StringIO()
Expand Down Expand Up @@ -935,6 +958,7 @@ def write_PIC(
"""Write Protein Internal Coordinates (PIC) to file.

See :func:`read_PIC` for file format.
See :data:`IC_Residue.pic_accuracy` to vary numeric accuracy.
Recurses to lower entity levels (M, C, R).

:param Entity entity: Biopython PDB Entity object: S, M, C or R
Expand All @@ -959,7 +983,7 @@ def write_PIC(
* "classic", # classic_b | chi
* "hedra", # all hedra including bond lengths
* "primary", # all primary dihedra
* "secondary", # all secondary dihedra
* "secondary", # all secondary dihedra (fixed angle from primary dihedra)
* "all", # hedra | primary | secondary
* "initAtoms", # XYZ coordinates of initial Tau (N-Ca-C)
* "bFactors"
Expand Down
20 changes: 12 additions & 8 deletions Bio/PDB/ic_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2019-21 by Robert T. Miller. All rights reserved.
# Copyright 2019-22 by Robert T. Miller. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
Expand All @@ -16,7 +16,7 @@
for naming of individual atoms
"""

# Backbone hedra and dihedra - within residue, no next or prev.
# Backbone hedra and dihedra - within residue, no next or prev (no psi, phi, omg).
ic_data_backbone = (
("N", "CA", "C", "O"), # locate backbone O
("O", "C", "CA", "CB"), # locate CB
Expand Down Expand Up @@ -469,8 +469,8 @@
"Nsb": 0.70,
"Nres": 0.66,
"Ndb": 0.62,
"Hsb": 0.37,
"Ssb": 1.04,
"Hsb": 0.37,
}

# Atom classes based on Heyrovska, Raji covalent radii paper.
Expand Down Expand Up @@ -559,8 +559,10 @@
#
# hedra defaults based on residue-atom classes from reference database
#
# values in comments after each entry are std dev for each value, followed by
# total count of hedra in class
# entry format is
# [(len1, angle, len3 averages), angle_sd] # len1, len3 std dev, [total count]
#
# angle standard deviation is used by :func:`ic_data.write_PIC`
#
# N.B. these are just intended as reasonable starting values. Despite the low
# standard deviations, structures degrade quickly with any variation from true
Expand Down Expand Up @@ -1484,16 +1486,18 @@
# aromatic sidechains as required by the algorithm. These are the ones you
# really need to specify.
#
# each entry is the most common rounded int value observed over the class
# each entry is the most common rounded int value observed over the class,
# followed by standard deviation (used by :func:`ic_data.write_PIC`)
#
# numbers in comments after each entry are count of dihedra in class, average
# angle, and standard deviation.
# numbers in comments after each entry are count of dihedra in class and average
# angle
#
# N.B. these are just intended as reasonable starting values, in practice they
# generate a helical backbone with a kink at proline residues

# 134 total primary entries
dihedra_primary_defaults = {
# [most common int(value), std_dev] # count average
"ANACAACAOXT": [169, 95.25474], # 81 -165.46932
"ANACAACXN": [-42, 98.15673], # 36750 -29.47243
"CNCCACCBCSG": [-70, 81.01579], # 5998 -85.23394
Expand Down
77 changes: 54 additions & 23 deletions Bio/PDB/ic_rebuild.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2019-2021 by Robert T. Miller. All rights reserved.
# Copyright 2019-2022 by Robert T. Miller. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
Expand All @@ -7,6 +7,7 @@
"""Convert XYZ Structure to internal coordinates and back, test result."""

import re
import numpy as np

from itertools import zip_longest

Expand Down Expand Up @@ -39,13 +40,24 @@
def structure_rebuild_test(entity, verbose: bool = False, quick: bool = False) -> Dict:
"""Test rebuild PDB structure from internal coordinates.

Generates internal coordinates for entity and writes to a .pic file in
memory, then generates XYZ coordinates from the .pic file and compares the
resulting entity against the original.

See :data:`IC_Residue.pic_accuracy` to vary numeric accuracy of the
intermediate .pic file if the only issue is small differences in coordinates.

Note that with default settings, deuterated initial structures will fail
the comparison, as will structures loaded with alternate `IC_Residue.accept_atoms`
settings. Use `quick=True` and/or variations on `AtomKey.d2h` and
`IC_Residue.accept_atoms` settings.

:param Entity entity: Biopython Structure, Model or Chain.
Structure to test
:param bool verbose: default False.
print extra messages
:param bool quick: default False.
only check atomArrays are identical
in internal_to_atom_coords computation
only check the internal coords atomArrays are identical
:returns: dict
comparison dict from :func:`.compare_residues`
"""
Expand Down Expand Up @@ -169,7 +181,7 @@ def IC_duplicate(entity) -> Structure:
Calls :meth:`.Chain.atom_to_internal_coordinates` if needed.

:param Entity entity: Biopython PDB Entity (will fail for Atom)
:returns: Biopython PDBStructure, no Atom objects
:returns: Biopython PDBStructure, no Atom objects except initial coords
"""
sp = StringIO()
hasInternalCoords = False
Expand Down Expand Up @@ -354,10 +366,10 @@ def compare_residues(
Whether to print mismatch info, default False
:param bool quick: default False.
Only check atomArrays are identical, aCoordMatchCount=0 if different
:param float rtol, atol: default 1e-03, 1e-05 or round to 3 places.
:param float rtol, atol: default 1e-03, 1e-03 or round to 3 places.
Numpy allclose parameters; default is to round atom coordinates to 3
places and test equal. For 'quick' will use defaults above for
comparing ataomArrays
comparing atomArrays
:returns dict:
Result counts for Residues, Full ID match Residues, Atoms,
Full ID match atoms, and Coordinate match atoms; report string;
Expand All @@ -380,30 +392,49 @@ def compare_residues(

if quick:
if isinstance(e0, Chain):
if numpy.allclose(
e0.internal_coord.atomArray,
e1.internal_coord.atomArray,
rtol=1e-03 if rtol is None else rtol,
atol=1e-05 if atol is None else atol,
if (
e0.internal_coord.atomArray is not None
and np.shape(e0.internal_coord.atomArray)
== np.shape(e1.internal_coord.atomArray)
and numpy.allclose(
e0.internal_coord.atomArray,
e1.internal_coord.atomArray,
rtol=1e-03 if rtol is None else rtol,
atol=1e-03 if atol is None else atol,
)
):
cmpdict["pass"] = True
cmpdict["aCount"] = numpy.size(e0.internal_coord.atomArray, 0)
cmpdict["aCoordMatchCount"] = numpy.size(e0.internal_coord.atomArray, 0)
if cmpdict["aCoordMatchCount"] > 0:
cmpdict["pass"] = True
else:
cmpdict["pass"] = False
else:
cmpdict["aCount"] = (
0
if e0.internal_coord.atomArray is None
else numpy.size(e0.internal_coord.atomArray, 0)
)
cmpdict["pass"] = False
else:
cmpdict["pass"] = True
for c0, c1 in zip_longest(e0.get_chains(), e1.get_chains()):
if numpy.allclose(
c0.internal_coord.atomArray,
c1.internal_coord.atomArray,
rtol=1e-03 if rtol is None else rtol,
atol=1e-05 if atol is None else atol,
):
cmpdict["aCoordMatchCount"] += numpy.size(
c0.internal_coord.atomArray, 0
)
else:
cmpdict["pass"] = False
if c0.internal_coord.atomArray is not None:
if numpy.allclose(
c0.internal_coord.atomArray,
c1.internal_coord.atomArray,
rtol=1e-03 if rtol is None else rtol,
atol=1e-03 if atol is None else atol,
):
cmpdict["aCoordMatchCount"] += numpy.size(
c0.internal_coord.atomArray, 0
)
else:
cmpdict["pass"] = False
cmpdict["aCount"] += numpy.size(c0.internal_coord.atomArray, 0)
if cmpdict["aCoordMatchCount"] < cmpdict["aCount"]:
cmpdict["pass"] = False

else:
for r0, r1 in zip_longest(e0.get_residues(), e1.get_residues()):
if 2 == r0.is_disordered() == r1.is_disordered():
Expand Down