biopython · peterjc · Jan 24, 2023 · Mar 21, 2022 · Apr 8, 2022 · Apr 8, 2022
@@ -1,4 +1,4 @@
-# Copyright 2019-2021 by Robert T. Miller.  All rights reserved.
+# Copyright 2019-2022 by Robert T. Miller.  All rights reserved.
 # This file is part of the Biopython distribution and governed by your
 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
 # Please see the LICENSE file that should have been included as part of this
@@ -24,7 +24,7 @@
 from Bio.PDB.parse_pdb_header import _parse_pdb_header_list
 from Bio.PDB.PDBExceptions import PDBException
 
-from Bio.PDB.Polypeptide import one_to_three
+from Bio.Data.PDBData import protein_letters_1to3
 
 from Bio.PDB.internal_coords import (
     IC_Residue,
@@ -125,6 +125,16 @@ def read_PIC(
         r"(?P<segid>[a-zA-z\s]{4})(?P<elm>.{2})"
         r"(?P<chg>.{2})?\s*$"
     )
+    pdbx_atm_re = re.compile(
+        r"^ATOM\s\s(?:\s*(?P<ser>\d+))\s(?P<atm>[\w\s]{4})"
+        r"(?P<alc>\w|\s)(?P<res>[\w]{3})\s(?P<chn>.)"
+        r"(?P<pos>[\s\-\d]{4})(?P<icode>[A-Za-z\s])\s\s\s"
+        r"(?P<x>[\s\-\d\.]{10})(?P<y>[\s\-\d\.]{10})"
+        r"(?P<z>[\s\-\d\.]{10})(?P<occ>[\s\d\.]{7})"
+        r"(?P<tfac>[\s\d\.]{6})\s{6}"
+        r"(?P<segid>[a-zA-z\s]{4})(?P<elm>.{2})"
+        r"(?P<chg>.{2})?\s*$"
+    )
     bfac_re = re.compile(
         r"^BFAC:\s([^\s]+\s+[\-\d\.]+)"
         r"\s*([^\s]+\s+[\-\d\.]+)?"
@@ -212,43 +222,47 @@ def process_hedron(
         return ek
 
     def default_hedron(ek: Tuple, ric: IC_Residue) -> None:
-        """Create Hedron based on same rdh_class hedra in ref database.
+        """Create Hedron based on same re_class hedra in ref database.
 
         Adds Hedron to current Chain.internal_coord, see ic_data for default
         values and reference database source.
         """
-        aks = []
+        atomkeys = []
         hkey = None
 
         atmNdx = AtomKey.fields.atm
         resNdx = AtomKey.fields.resname
         resPos = AtomKey.fields.respos
-        aks = [ek[i].akl for i in range(3)]
+        atomkeys = [ek[i].akl for i in range(3)]
+
+        atpl = tuple([atomkeys[i][atmNdx] for i in range(3)])
+        res = atomkeys[0][resNdx]
 
-        atpl = tuple(aks[i][atmNdx] for i in range(3))
-        res = aks[0][resNdx]
         if (
-            aks[0][resPos] != aks[2][resPos]  # hedra crosses amide bond so not reversed
+            atomkeys[0][resPos]
+            != atomkeys[2][resPos]  # hedra crosses amide bond so not reversed
             or atpl == ("N", "CA", "C")  # or chain start tau
             or atpl in ic_data_backbone  # or found forward hedron in ic_data
             or (res not in ["A", "G"] and atpl in ic_data_sidechains[res])
         ):
             hkey = ek
-            rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(3)]
+            rhcl = [atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(3)]
             try:
                 dflts = hedra_defaults["".join(rhcl)][0]
             except KeyError:
-                if aks[0][resPos] == aks[1][resPos]:
-                    rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(2)]
-                    rhc = "".join(rhcl) + "X" + aks[2][atmNdx]
+                if atomkeys[0][resPos] == atomkeys[1][resPos]:
+                    rhcl = [atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(2)]
+                    rhc = "".join(rhcl) + "X" + atomkeys[2][atmNdx]
                 else:
-                    rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(1, 3)]
-                    rhc = "X" + aks[0][atmNdx] + "".join(rhcl)
+                    rhcl = [
+                        atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(1, 3)
+                    ]
+                    rhc = "X" + atomkeys[0][atmNdx] + "".join(rhcl)
                 dflts = hedra_defaults[rhc][0]
         else:
             # must be reversed or fail
             hkey = ek[::-1]
-            rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(2, -1, -1)]
+            rhcl = [atomkeys[i][resNdx] + atomkeys[i][atmNdx] for i in range(2, -1, -1)]
             dflts = hedra_defaults["".join(rhcl)][0]
 
         process_hedron(
@@ -264,7 +278,7 @@ def default_hedron(ek: Tuple, ric: IC_Residue) -> None:
         if verbose:
             print(f" default for {ek}")
 
-    def hedra_check(dk: str, ric: IC_Residue) -> None:
+    def hedra_check(dk: Tuple, ric: IC_Residue) -> None:
         """Confirm both hedra present for dihedron key, use default if set."""
         if dk[0:3] not in sbcic.hedra and dk[2::-1] not in sbcic.hedra:
             if defaults:
@@ -498,6 +512,8 @@ def ak_expand(eLst: List) -> List:
         chkLst.append((sN, sCA, sC, sO))  # locate backbone O
         if ric.lc != "G":
             chkLst.append((sO, sC, sCA, sCB))  # locate CB
+            if ric.lc == "A":
+                chkLst.append((sN, sCA, sCB))  # missed for generate from seq
         if ric.rprev != [] and ric.lc != "P" and proton:
             chkLst.append((sC, sCA, sN, sH))  # amide proton
 
@@ -521,7 +537,10 @@ def ak_expand(eLst: List) -> List:
                 pass  # ignore missing hydrogens
             elif all(atm.akl[altloc_ndx] is None for atm in dk):
                 if defaults:
-                    default_dihedron(dk, ric)
+                    if len(dk) != 3:
+                        default_dihedron(dk, ric)
+                    else:
+                        default_hedron(dk, ric)  # add ALA N-Ca-Cb
                 else:
                     if verbose:
                         print(f"{ric}-{rn} missing {dk}")
@@ -530,7 +549,7 @@ def ak_expand(eLst: List) -> List:
                 pass  # ignore missing combinatoric of altloc atoms
                 # need more here?
 
-    def ak_add(ek: set, ric: IC_Residue) -> None:
+    def ak_add(ek: Tuple, ric: IC_Residue) -> None:
         """Allocate edron key AtomKeys to current residue as appropriate.
 
         A hedron or dihedron may span a backbone amide bond, this routine
@@ -692,6 +711,8 @@ def finish_chain() -> None:
                     return None
             elif line.startswith("ATOM "):
                 m = pdb_atm_re.match(line)
+                if not m:
+                    m = pdbx_atm_re.match(line)
                 if m:
                     if sb_res is None:
                         # ATOM without res spec already loaded, not a pic file
@@ -768,7 +789,7 @@ def finish_chain() -> None:
                             m["a2"],
                             m["a3"],
                             m["a4"],
-                            float(m["dihedral"]),
+                            m["dihedral"],
                             sb_res.internal_coord,
                         )
 
@@ -832,7 +853,9 @@ def read_PIC_seq(
 
     ndx = 1
     for r in seqRec.seq:
-        output += f"('{pdbid}', 0, '{chain}', (' ', {ndx}, ' ')) {one_to_three(r)}\n"
+        output += (
+            f"('{pdbid}', 0, '{chain}', (' ', {ndx}, ' ')) {protein_letters_1to3[r]}\n"
+        )
         ndx += 1
 
     sp = StringIO()
@@ -935,6 +958,7 @@ def write_PIC(
     """Write Protein Internal Coordinates (PIC) to file.
 
     See :func:`read_PIC` for file format.
+    See :data:`IC_Residue.pic_accuracy` to vary numeric accuracy.
     Recurses to lower entity levels (M, C, R).
 
     :param Entity entity: Biopython PDB Entity object: S, M, C or R
@@ -959,7 +983,7 @@ def write_PIC(
         * "classic",    # classic_b | chi
         * "hedra",      # all hedra including bond lengths
         * "primary",    # all primary dihedra
-        * "secondary",  # all secondary dihedra
+        * "secondary",  # all secondary dihedra (fixed angle from primary dihedra)
         * "all",        # hedra | primary | secondary
         * "initAtoms",  # XYZ coordinates of initial Tau (N-Ca-C)
         * "bFactors"

@@ -1,4 +1,4 @@
-# Copyright 2019-21 by Robert T. Miller.  All rights reserved.
+# Copyright 2019-22 by Robert T. Miller.  All rights reserved.
 # This file is part of the Biopython distribution and governed by your
 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
 # Please see the LICENSE file that should have been included as part of this
@@ -16,7 +16,7 @@
 for naming of individual atoms
 """
 
-# Backbone hedra and dihedra - within residue, no next or prev.
+# Backbone hedra and dihedra - within residue, no next or prev (no psi, phi, omg).
 ic_data_backbone = (
     ("N", "CA", "C", "O"),  # locate backbone O
     ("O", "C", "CA", "CB"),  # locate CB
@@ -469,8 +469,8 @@
     "Nsb": 0.70,
     "Nres": 0.66,
     "Ndb": 0.62,
-    "Hsb": 0.37,
     "Ssb": 1.04,
+    "Hsb": 0.37,
 }
 
 # Atom classes based on Heyrovska, Raji covalent radii paper.
@@ -559,8 +559,10 @@
 #
 # hedra defaults based on residue-atom classes from reference database
 #
-# values in comments after each entry are std dev for each value, followed by
-# total count of hedra in class
+# entry format is
+# [(len1, angle, len3 averages), angle_sd]  # len1, len3 std dev, [total count]
+#
+# angle standard deviation is used by  :func:`ic_data.write_PIC`
 #
 # N.B. these are just intended as reasonable starting values.  Despite the low
 # standard deviations, structures degrade quickly with any variation from true
@@ -1484,16 +1486,18 @@
 # aromatic sidechains as required by the algorithm.  These are the ones you
 # really need to specify.
 #
-# each entry is the most common rounded int value observed over the class
+# each entry is the most common rounded int value observed over the class,
+# followed by standard deviation (used by :func:`ic_data.write_PIC`)
 #
-# numbers in comments after each entry are count of dihedra in class, average
-# angle, and standard deviation.
+# numbers in comments after each entry are count of dihedra in class and average
+# angle
 #
 # N.B. these are just intended as reasonable starting values, in practice they
 # generate a helical backbone with a kink at proline residues
 
 # 134 total primary entries
 dihedra_primary_defaults = {
+    # [most common int(value), std_dev] # count average
     "ANACAACAOXT": [169, 95.25474],  # 81 -165.46932
     "ANACAACXN": [-42, 98.15673],  # 36750 -29.47243
     "CNCCACCBCSG": [-70, 81.01579],  # 5998 -85.23394

@@ -1,4 +1,4 @@
-# Copyright 2019-2021 by Robert T. Miller.  All rights reserved.
+# Copyright 2019-2022 by Robert T. Miller.  All rights reserved.
 # This file is part of the Biopython distribution and governed by your
 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
 # Please see the LICENSE file that should have been included as part of this
@@ -7,6 +7,7 @@
 """Convert XYZ Structure to internal coordinates and back, test result."""
 
 import re
+import numpy as np
 
 from itertools import zip_longest
 
@@ -39,13 +40,24 @@
 def structure_rebuild_test(entity, verbose: bool = False, quick: bool = False) -> Dict:
     """Test rebuild PDB structure from internal coordinates.
 
+    Generates internal coordinates for entity and writes to a .pic file in
+    memory, then generates XYZ coordinates from the .pic file and compares the
+    resulting entity against the original.
+
+    See :data:`IC_Residue.pic_accuracy` to vary numeric accuracy of the
+    intermediate .pic file if the only issue is small differences in coordinates.
+
+    Note that with default settings, deuterated initial structures will fail
+    the comparison, as will structures loaded with alternate `IC_Residue.accept_atoms`
+    settings.  Use `quick=True` and/or variations on `AtomKey.d2h` and
+    `IC_Residue.accept_atoms` settings.
+
     :param Entity entity: Biopython Structure, Model or Chain.
         Structure to test
     :param bool verbose: default False.
         print extra messages
     :param bool quick: default False.
-        only check atomArrays are identical
-        in internal_to_atom_coords computation
+        only check the internal coords atomArrays are identical
     :returns: dict
         comparison dict from :func:`.compare_residues`
     """
@@ -169,7 +181,7 @@ def IC_duplicate(entity) -> Structure:
     Calls :meth:`.Chain.atom_to_internal_coordinates` if needed.
 
     :param Entity entity: Biopython PDB Entity (will fail for Atom)
-    :returns: Biopython PDBStructure, no Atom objects
+    :returns: Biopython PDBStructure, no Atom objects except initial coords
     """
     sp = StringIO()
     hasInternalCoords = False
@@ -354,10 +366,10 @@ def compare_residues(
         Whether to print mismatch info, default False
     :param bool quick: default False.
         Only check atomArrays are identical, aCoordMatchCount=0 if different
-    :param float rtol, atol: default 1e-03, 1e-05 or round to 3 places.
+    :param float rtol, atol: default 1e-03, 1e-03 or round to 3 places.
         Numpy allclose parameters; default is to round atom coordinates to 3
         places and test equal.  For 'quick' will use defaults above for
-        comparing ataomArrays
+        comparing atomArrays
     :returns dict:
         Result counts for Residues, Full ID match Residues, Atoms,
         Full ID match atoms, and Coordinate match atoms; report string;
@@ -380,30 +392,49 @@ def compare_residues(
 
     if quick:
         if isinstance(e0, Chain):
-            if numpy.allclose(
-                e0.internal_coord.atomArray,
-                e1.internal_coord.atomArray,
-                rtol=1e-03 if rtol is None else rtol,
-                atol=1e-05 if atol is None else atol,
+            if (
+                e0.internal_coord.atomArray is not None
+                and np.shape(e0.internal_coord.atomArray)
+                == np.shape(e1.internal_coord.atomArray)
+                and numpy.allclose(
+                    e0.internal_coord.atomArray,
+                    e1.internal_coord.atomArray,
+                    rtol=1e-03 if rtol is None else rtol,
+                    atol=1e-03 if atol is None else atol,
+                )
             ):
-                cmpdict["pass"] = True
+                cmpdict["aCount"] = numpy.size(e0.internal_coord.atomArray, 0)
                 cmpdict["aCoordMatchCount"] = numpy.size(e0.internal_coord.atomArray, 0)
+                if cmpdict["aCoordMatchCount"] > 0:
+                    cmpdict["pass"] = True
+                else:
+                    cmpdict["pass"] = False
             else:
+                cmpdict["aCount"] = (
+                    0
+                    if e0.internal_coord.atomArray is None
+                    else numpy.size(e0.internal_coord.atomArray, 0)
+                )
                 cmpdict["pass"] = False
         else:
             cmpdict["pass"] = True
             for c0, c1 in zip_longest(e0.get_chains(), e1.get_chains()):
-                if numpy.allclose(
-                    c0.internal_coord.atomArray,
-                    c1.internal_coord.atomArray,
-                    rtol=1e-03 if rtol is None else rtol,
-                    atol=1e-05 if atol is None else atol,
-                ):
-                    cmpdict["aCoordMatchCount"] += numpy.size(
-                        c0.internal_coord.atomArray, 0
-                    )
-                else:
-                    cmpdict["pass"] = False
+                if c0.internal_coord.atomArray is not None:
+                    if numpy.allclose(
+                        c0.internal_coord.atomArray,
+                        c1.internal_coord.atomArray,
+                        rtol=1e-03 if rtol is None else rtol,
+                        atol=1e-03 if atol is None else atol,
+                    ):
+                        cmpdict["aCoordMatchCount"] += numpy.size(
+                            c0.internal_coord.atomArray, 0
+                        )
+                    else:
+                        cmpdict["pass"] = False
+                    cmpdict["aCount"] += numpy.size(c0.internal_coord.atomArray, 0)
+            if cmpdict["aCoordMatchCount"] < cmpdict["aCount"]:
+                cmpdict["pass"] = False
+
     else:
         for r0, r1 in zip_longest(e0.get_residues(), e1.get_residues()):
             if 2 == r0.is_disordered() == r1.is_disordered():