Added ability to return GC content as a ratio. [Finish #564] (#3540)

While GC content is usually a percentage, many individuals use it strictly as a ratio. Adding the ability to return a ratio instead of a percentage will provide more options for the user and eliminate the need for extra conversions on the user's part. Deprecates GC in favor of new gc_fraction function. Co-authored-by: Cham K <chamkank@users.noreply.github.com>
biopython · Aug 26, 2022 · 8709ac7 · 8709ac7
1 parent 72211ac
commit 8709ac7
Show file tree

Hide file tree

Showing 10 changed files with 188 additions and 48 deletions.
diff --git a/Bio/Align/__init__.py b/Bio/Align/__init__.py
@@ -835,13 +835,13 @@ def sort(self, key=None, reverse=False):
         As an example using a different sort order, you could sort on the
         GC content of each sequence.
 
-        >>> from Bio.SeqUtils import GC
+        >>> from Bio.SeqUtils import gc_fraction
         >>> print(align1)
         Alignment with 3 rows and 4 columns
         ACGC Chicken
         ACGT Human
         ACGG Mouse
-        >>> align1.sort(key = lambda record: GC(record.seq))
+        >>> align1.sort(key = lambda record: gc_fraction(record.seq))
         >>> print(align1)
         Alignment with 3 rows and 4 columns
         ACGT Human

diff --git a/Bio/SeqUtils/MeltingTemp.py b/Bio/SeqUtils/MeltingTemp.py
@@ -564,7 +564,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None):
         corr = 0.368 * (len(seq) - 1) * math.log(mon)
     if method == 6:
         corr = (
-            (4.29 * SeqUtils.GC(seq) / 100 - 3.95) * 1e-5 * math.log(mon)
+            (4.29 * SeqUtils.gc_fraction(seq, "ignore") - 3.95) * 1e-5 * math.log(mon)
         ) + 9.40e-6 * math.log(mon) ** 2
     # Turn black code style off
     # fmt: off
@@ -581,7 +581,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None):
         if Mon > 0:
             R = math.sqrt(mg) / mon
             if R < 0.22:
-                corr = (4.29 * SeqUtils.GC(seq) / 100 - 3.95) * \
+                corr = (4.29 * SeqUtils.gc_fraction(seq, "ignore") - 3.95) * \
                     1e-5 * math.log(mon) + 9.40e-6 * math.log(mon) ** 2
                 return corr
             elif R < 6.0:
@@ -590,7 +590,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None):
                             - 8.03e-3 * math.log(mon) ** 2)
                 g = 8.31 * (0.486 - 0.258 * math.log(mon)
                             + 5.25e-3 * math.log(mon) ** 3)
-        corr = (a + b * math.log(mg) + (SeqUtils.GC(seq) / 100)
+        corr = (a + b * math.log(mg) + (SeqUtils.gc_fraction(seq, "ignore"))
                 * (c + d * math.log(mg)) + (1 / (2.0 * (len(seq) - 1)))
                 * (e + f * math.log(mg) + g * math.log(mg) ** 2)) * 1e-5
     # Turn black code style on
@@ -773,19 +773,19 @@ def Tm_GC(
     seq = str(seq)
     if check:
         seq = _check(seq, "Tm_GC")
-    percent_gc = SeqUtils.GC(seq)
-    # Ambiguous bases: add 0.5, 0.67 or 0.33% depending on G+C probability:
-    tmp = (
-        sum(map(seq.count, ("K", "M", "N", "R", "Y"))) * 50.0 / len(seq)
-        + sum(map(seq.count, ("B", "V"))) * 66.67 / len(seq)
-        + sum(map(seq.count, ("D", "H"))) * 33.33 / len(seq)
-    )
-    if strict and tmp:
+
+    if strict and any(x in seq for x in "KMNRYBVDH"):
         raise ValueError(
             "ambiguous bases B, D, H, K, M, N, R, V, Y not allowed when 'strict=True'"
         )
-    else:
-        percent_gc += tmp
+
+    # Ambiguous bases: add 0.5, 0.67 or 0.33% depending on G+C probability:
+    percent_gc = SeqUtils.gc_fraction(seq, "weighted") * 100
+
+    # gc_fraction counts X as 0.5
+    if mismatch:
+        percent_gc -= seq.count("X") * 50.0 / len(seq)
+
     if userset:
         A, B, C, D = userset
     else:
@@ -1012,7 +1012,7 @@ def Tm_NN(
     delta_s += nn_table["init"][d_s]
 
     # Type: Duplex with no (allA/T) or at least one (oneG/C) GC pair
-    if SeqUtils.GC(seq) == 0:
+    if SeqUtils.gc_fraction(seq, "ignore") == 0:
         delta_h += nn_table["init_allA/T"][d_h]
         delta_s += nn_table["init_allA/T"][d_s]
     else:

diff --git a/Bio/SeqUtils/__init__.py b/Bio/SeqUtils/__init__.py
@@ -11,31 +11,139 @@
 
 
 import re
+import warnings
 from math import pi, sin, cos
 
 from Bio.Seq import Seq, complement, complement_rna
 from Bio.Data import IUPACData
-
+from Bio import BiopythonDeprecationWarning
 
 ######################################
 # DNA
 ######################
-# {{{
+# {
+
+_gc_values = {
+    "G": 1.000,
+    "C": 1.000,
+    "A": 0.000,
+    "T": 0.000,
+    "S": 1.000,  # Strong interaction (3 H bonds) (G or C)
+    "W": 0.000,  # Weak interaction (2 H bonds) (A or T)
+    "M": 0.500,  # Amino (A or C)
+    "R": 0.500,  # Purine (A or G)
+    "Y": 0.500,  # Pyrimidine (T or C)
+    "K": 0.500,  # Keto (G or T)
+    "V": 2 / 3,  # Not T or U (A or C or G)
+    "B": 2 / 3,  # Not A (C or G or T)
+    "H": 1 / 3,  # Not G (A or C or T)
+    "D": 1 / 3,  # Not C (A or G or T)
+    "X": 0.500,  # Any nucleotide (A or C or G or T)
+    "N": 0.500,  # Any nucleotide (A or C or G or T)
+}
+
+
+def gc_fraction(seq, ambiguous="remove"):
+    """Calculate G+C percentage in seq (float between 0 and 1).
+
+    Copes with mixed case sequences. Ambiguous Nucleotides in this context are
+    those different from ATCGSW (S is G or C, and W is A or T).
+
+    If ambiguous equals "remove" (default), will only count GCS and will only
+    include ACTGSW when calculating the sequence length. Equivalent to removing
+    all characters in the set BDHKMNRVXY before calculating the GC content, as
+    each of these ambiguous nucleotides can either be in (A,T) or in (C,G).
+
+    If ambiguous equals "ignore", it will treat only unambiguous nucleotides (GCS)
+    as counting towards the GC percentage, but will include all ambiguous and
+    unambiguous nucleotides when calculating the sequence length.
+
+    If ambiguous equals "weighted", will use a "mean" value when counting the
+    ambiguous characters, for example, G and C will be counted as 1, N and X will
+    be counted as 0.5, D will be counted as 0.33 etc. See Bio.SeqUtils._gc_values
+    for a full list.
+
+    Will raise a ValueError for any other value of the ambiguous parameter.
+
+
+    >>> from Bio.SeqUtils import gc_fraction
+    >>> seq = "ACTG"
+    >>> print(f"GC content of {seq} : {gc_fraction(seq):.2f}")
+    GC content of ACTG : 0.50
+
+    S and W are ambiguous for the purposes of calculating the GC content.
+
+    >>> seq = "ACTGSSSS"
+    >>> gc = gc_fraction(seq, "remove")
+    >>> print(f"GC content of {seq} : {gc:.2f}")
+    GC content of ACTGSSSS : 0.75
+    >>> gc = gc_fraction(seq, "ignore")
+    >>> print(f"GC content of {seq} : {gc:.2f}")
+    GC content of ACTGSSSS : 0.75
+    >>> gc = gc_fraction(seq, "weighted")
+    >>> print(f"GC content with ambiguous counting: {gc:.2f}")
+    GC content with ambiguous counting: 0.75
+
+    Some examples with ambiguous nucleotides.
+
+    >>> seq = "ACTGN"
+    >>> gc = gc_fraction(seq, "ignore")
+    >>> print(f"GC content of {seq} : {gc:.2f}")
+    GC content of ACTGN : 0.40
+    >>> gc = gc_fraction(seq, "weighted")
+    >>> print(f"GC content with ambiguous counting: {gc:.2f}")
+    GC content with ambiguous counting: 0.50
+    >>> gc = gc_fraction(seq, "remove")
+    >>> print(f"GC content with ambiguous removing: {gc:.2f}")
+    GC content with ambiguous removing: 0.50
+
+    Ambiguous nucleotides are also removed from the length of the sequence.
+
+    >>> seq = "GDVV"
+    >>> gc = gc_fraction(seq, "ignore")
+    >>> print(f"GC content of {seq} : {gc:.2f}")
+    GC content of GDVV : 0.25
+    >>> gc = gc_fraction(seq, "weighted")
+    >>> print(f"GC content with ambiguous counting: {gc:.4f}")
+    GC content with ambiguous counting: 0.6667
+    >>> gc = gc_fraction(seq, "remove")
+    >>> print(f"GC content with ambiguous removing: {gc:.2f}")
+    GC content with ambiguous removing: 1.00
 
 
-def GC(seq):
-    """Calculate G+C content, return percentage (as float between 0 and 100).
+    Note that this will return zero for an empty sequence.
+    """
+    if ambiguous not in ("weighted", "remove", "ignore"):
+        raise ValueError(f"ambiguous value '{ambiguous}' not recognized")
 
-    Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
-    when counting the G and C content.  The percentage is calculated against
-    the full length, e.g.:
+    gc = sum(seq.count(x) for x in "CGScgs")
 
-    >>> from Bio.SeqUtils import GC
-    >>> GC("ACTGN")
-    40.0
+    if ambiguous == "remove":
+        l = gc + sum(seq.count(x) for x in "ATWatw")
+    else:
+        l = len(seq)
 
-    Note that this will return zero for an empty sequence.
+    if ambiguous == "weighted":
+        gc += sum(
+            (seq.count(x) + seq.count(x.lower())) * _gc_values[x] for x in "BDHKMNRVXY"
+        )
+
+    if l == 0:
+        return 0
+
+    return gc / l
+
+
+def GC(seq):
+    """Calculate G+C content (DEPRECATED).
+
+    Use Bio.SeqUtils.gc_fraction instead.
     """
+    warnings.warn(
+        "GC is deprecated; please use gc_fraction instead.",
+        BiopythonDeprecationWarning,
+    )
+
     gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
     try:
         return gc * 100.0 / len(seq)

diff --git a/DEPRECATED.rst b/DEPRECATED.rst
@@ -593,6 +593,9 @@ The method 'print_index' of the CodonAdaptationIndex class in
 Bio.SeqUtils.CodonUsage was deprecated in Release 1.80. Instead of
 self.print_index(), please use print(self).
 
+Function 'GC' in Bio.SeqUtils was deprecated in Release 1.80. Instead use
+function 'gc_fraction'.
+
 Bio.GFF (for accessing a MySQL database created with BioPerl, etc)
 ------------------------------------------------------------------
 The whole of the old ``Bio.GFF`` module was deprecated in Release 1.53, and

diff --git a/Doc/Tutorial/chapter_cookbook.tex b/Doc/Tutorial/chapter_cookbook.tex
@@ -1131,9 +1131,11 @@ \subsection{Plot of sequence GC\%}
 
 \begin{minted}{python}
 from Bio import SeqIO
-from Bio.SeqUtils import GC
+from Bio.SeqUtils import gc_fraction
 
-gc_values = sorted(GC(rec.seq) for rec in SeqIO.parse("ls_orchid.fasta", "fasta"))
+gc_values = sorted(
+    100 * gc_fraction(rec.seq) for rec in SeqIO.parse("ls_orchid.fasta", "fasta")
+)
 \end{minted}
 
 Having read in each sequence and calculated the GC\%, we then sorted them into ascending

diff --git a/Doc/Tutorial/chapter_seq_objects.tex b/Doc/Tutorial/chapter_seq_objects.tex
@@ -75,13 +75,13 @@ \section{Sequences act like strings}
 %doctest
 \begin{minted}{pycon}
 >>> from Bio.Seq import Seq
->>> from Bio.SeqUtils import GC
+>>> from Bio.SeqUtils import gc_fraction
 >>> my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
->>> GC(my_seq)
-46.875
+>>> gc_fraction(my_seq)
+0.46875
 \end{minted}
 
-\noindent Note that using the \verb|Bio.SeqUtils.GC()| function should automatically cope with mixed case sequences and the ambiguous nucleotide S which means G or C.
+\noindent Note that using the \verb|Bio.SeqUtils.gc_fraction()| function should automatically cope with mixed case sequences and the ambiguous nucleotide S which means G or C.
 
 Also note that just like a normal Python string, the \verb|Seq| object is in some ways ``read-only''.  If you need to edit your sequence, for example simulating a point mutation, look at the Section~\ref{sec:mutable-seq} below which talks about the \verb|MutableSeq| object.
 

diff --git a/NEWS.rst b/NEWS.rst
@@ -59,13 +59,17 @@ from which the enzyme object was created, and a `uri` property with a canonical
 Additionally, a number of small bugs and typos have been fixed with additions
 to the test suite.
 
+Add new ``gc_fraction`` function in ``SeqUtils`` and marks ``GC`` for future
+deprecation.
+
 Many thanks to the Biopython developers and community for making this release
 possible, especially the following contributors:
 
 - Andrius Merkys
 - Arup Ghosh (first contribution)
 - Aziz Khan
 - Alex Morehead
+- Caio Fontes
 - Chenghao Zhu
 - Christian Brueffer
 - Damien Goutte-Gattat

diff --git a/Scripts/xbbtools/xbb_translations.py b/Scripts/xbbtools/xbb_translations.py
@@ -15,7 +15,7 @@
 import time
 
 from Bio.Seq import Seq, reverse_complement, translate
-from Bio.SeqUtils import GC
+from Bio.SeqUtils import gc_fraction
 
 
 class xbb_translations:
@@ -93,7 +93,7 @@ def frame_nice(self, seq, frame, translation_table=1):
 
     def gc(self, seq):
         """Calculate GC content in percent (0-100)."""
-        return GC(seq)
+        return 100 * gc_fraction(seq)
 
     def gcframe(self, seq, translation_table=1, direction="both"):
         """Print a pretty print translation in several frames."""

diff --git a/Tests/test_Align_Alignment.py b/Tests/test_Align_Alignment.py
@@ -20,7 +20,7 @@
 from Bio import Align, SeqIO
 from Bio.Seq import Seq, reverse_complement
 from Bio.SeqRecord import SeqRecord
-from Bio.SeqUtils import GC
+from Bio.SeqUtils import gc_fraction
 
 
 class TestPairwiseAlignment(unittest.TestCase):
@@ -993,7 +993,7 @@ def test_sort(self):
 ACTT
 """,
         )
-        alignment.sort(key=GC)
+        alignment.sort(key=gc_fraction)
         self.assertEqual(
             str(alignment),
             """\
@@ -1002,7 +1002,7 @@ def test_sort(self):
 ACCT
 """,
         )
-        alignment.sort(key=GC, reverse=True)
+        alignment.sort(key=gc_fraction, reverse=True)
         self.assertEqual(
             str(alignment),
             """\
@@ -1613,7 +1613,7 @@ def test_sort(self):
             tuple(sequence.id for sequence in alignment.sequences),
             ("seq7", "seq6", "seq5", "seq4", "seq3", "seq2", "seq1"),
         )
-        alignment.sort(key=lambda record: GC(record.seq))
+        alignment.sort(key=lambda record: gc_fraction(record.seq))
         self.assertEqual(
             "\n".join(row for row in alignment),  # str(alignment),
             """\
@@ -1629,7 +1629,7 @@ def test_sort(self):
             tuple(sequence.id for sequence in alignment.sequences),
             ("seq3", "seq7", "seq4", "seq5", "seq1", "seq6", "seq2"),
         )
-        alignment.sort(key=lambda record: GC(record.seq), reverse=True)
+        alignment.sort(key=lambda record: gc_fraction(record.seq), reverse=True)
         self.assertEqual(
             "\n".join(row for row in alignment),  # str(alignment),
             """\