diff --git a/Bio/Align/__init__.py b/Bio/Align/__init__.py index ffd47a555d2..bc623421412 100644 --- a/Bio/Align/__init__.py +++ b/Bio/Align/__init__.py @@ -835,13 +835,13 @@ def sort(self, key=None, reverse=False): As an example using a different sort order, you could sort on the GC content of each sequence. - >>> from Bio.SeqUtils import GC + >>> from Bio.SeqUtils import gc_fraction >>> print(align1) Alignment with 3 rows and 4 columns ACGC Chicken ACGT Human ACGG Mouse - >>> align1.sort(key = lambda record: GC(record.seq)) + >>> align1.sort(key = lambda record: gc_fraction(record.seq)) >>> print(align1) Alignment with 3 rows and 4 columns ACGT Human diff --git a/Bio/SeqUtils/MeltingTemp.py b/Bio/SeqUtils/MeltingTemp.py index e6cdfc8e110..2a638ad2d7c 100644 --- a/Bio/SeqUtils/MeltingTemp.py +++ b/Bio/SeqUtils/MeltingTemp.py @@ -564,7 +564,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None): corr = 0.368 * (len(seq) - 1) * math.log(mon) if method == 6: corr = ( - (4.29 * SeqUtils.GC(seq) / 100 - 3.95) * 1e-5 * math.log(mon) + (4.29 * SeqUtils.gc_fraction(seq, "ignore") - 3.95) * 1e-5 * math.log(mon) ) + 9.40e-6 * math.log(mon) ** 2 # Turn black code style off # fmt: off @@ -581,7 +581,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None): if Mon > 0: R = math.sqrt(mg) / mon if R < 0.22: - corr = (4.29 * SeqUtils.GC(seq) / 100 - 3.95) * \ + corr = (4.29 * SeqUtils.gc_fraction(seq, "ignore") - 3.95) * \ 1e-5 * math.log(mon) + 9.40e-6 * math.log(mon) ** 2 return corr elif R < 6.0: @@ -590,7 +590,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None): - 8.03e-3 * math.log(mon) ** 2) g = 8.31 * (0.486 - 0.258 * math.log(mon) + 5.25e-3 * math.log(mon) ** 3) - corr = (a + b * math.log(mg) + (SeqUtils.GC(seq) / 100) + corr = (a + b * math.log(mg) + (SeqUtils.gc_fraction(seq, "ignore")) * (c + d * math.log(mg)) + (1 / (2.0 * (len(seq) - 1))) * (e + f * math.log(mg) + g * math.log(mg) ** 2)) * 1e-5 # Turn black code style on @@ -773,19 +773,19 @@ def Tm_GC( seq = str(seq) if check: seq = _check(seq, "Tm_GC") - percent_gc = SeqUtils.GC(seq) - # Ambiguous bases: add 0.5, 0.67 or 0.33% depending on G+C probability: - tmp = ( - sum(map(seq.count, ("K", "M", "N", "R", "Y"))) * 50.0 / len(seq) - + sum(map(seq.count, ("B", "V"))) * 66.67 / len(seq) - + sum(map(seq.count, ("D", "H"))) * 33.33 / len(seq) - ) - if strict and tmp: + + if strict and any(x in seq for x in "KMNRYBVDH"): raise ValueError( "ambiguous bases B, D, H, K, M, N, R, V, Y not allowed when 'strict=True'" ) - else: - percent_gc += tmp + + # Ambiguous bases: add 0.5, 0.67 or 0.33% depending on G+C probability: + percent_gc = SeqUtils.gc_fraction(seq, "weighted") * 100 + + # gc_fraction counts X as 0.5 + if mismatch: + percent_gc -= seq.count("X") * 50.0 / len(seq) + if userset: A, B, C, D = userset else: @@ -1012,7 +1012,7 @@ def Tm_NN( delta_s += nn_table["init"][d_s] # Type: Duplex with no (allA/T) or at least one (oneG/C) GC pair - if SeqUtils.GC(seq) == 0: + if SeqUtils.gc_fraction(seq, "ignore") == 0: delta_h += nn_table["init_allA/T"][d_h] delta_s += nn_table["init_allA/T"][d_s] else: diff --git a/Bio/SeqUtils/__init__.py b/Bio/SeqUtils/__init__.py index dd9a4b6f51c..bb653e577ea 100644 --- a/Bio/SeqUtils/__init__.py +++ b/Bio/SeqUtils/__init__.py @@ -11,31 +11,139 @@ import re +import warnings from math import pi, sin, cos from Bio.Seq import Seq, complement, complement_rna from Bio.Data import IUPACData - +from Bio import BiopythonDeprecationWarning ###################################### # DNA ###################### -# {{{ +# { + +_gc_values = { + "G": 1.000, + "C": 1.000, + "A": 0.000, + "T": 0.000, + "S": 1.000, # Strong interaction (3 H bonds) (G or C) + "W": 0.000, # Weak interaction (2 H bonds) (A or T) + "M": 0.500, # Amino (A or C) + "R": 0.500, # Purine (A or G) + "Y": 0.500, # Pyrimidine (T or C) + "K": 0.500, # Keto (G or T) + "V": 2 / 3, # Not T or U (A or C or G) + "B": 2 / 3, # Not A (C or G or T) + "H": 1 / 3, # Not G (A or C or T) + "D": 1 / 3, # Not C (A or G or T) + "X": 0.500, # Any nucleotide (A or C or G or T) + "N": 0.500, # Any nucleotide (A or C or G or T) +} + + +def gc_fraction(seq, ambiguous="remove"): + """Calculate G+C percentage in seq (float between 0 and 1). + + Copes with mixed case sequences. Ambiguous Nucleotides in this context are + those different from ATCGSW (S is G or C, and W is A or T). + + If ambiguous equals "remove" (default), will only count GCS and will only + include ACTGSW when calculating the sequence length. Equivalent to removing + all characters in the set BDHKMNRVXY before calculating the GC content, as + each of these ambiguous nucleotides can either be in (A,T) or in (C,G). + + If ambiguous equals "ignore", it will treat only unambiguous nucleotides (GCS) + as counting towards the GC percentage, but will include all ambiguous and + unambiguous nucleotides when calculating the sequence length. + + If ambiguous equals "weighted", will use a "mean" value when counting the + ambiguous characters, for example, G and C will be counted as 1, N and X will + be counted as 0.5, D will be counted as 0.33 etc. See Bio.SeqUtils._gc_values + for a full list. + + Will raise a ValueError for any other value of the ambiguous parameter. + + + >>> from Bio.SeqUtils import gc_fraction + >>> seq = "ACTG" + >>> print(f"GC content of {seq} : {gc_fraction(seq):.2f}") + GC content of ACTG : 0.50 + + S and W are ambiguous for the purposes of calculating the GC content. + + >>> seq = "ACTGSSSS" + >>> gc = gc_fraction(seq, "remove") + >>> print(f"GC content of {seq} : {gc:.2f}") + GC content of ACTGSSSS : 0.75 + >>> gc = gc_fraction(seq, "ignore") + >>> print(f"GC content of {seq} : {gc:.2f}") + GC content of ACTGSSSS : 0.75 + >>> gc = gc_fraction(seq, "weighted") + >>> print(f"GC content with ambiguous counting: {gc:.2f}") + GC content with ambiguous counting: 0.75 + + Some examples with ambiguous nucleotides. + + >>> seq = "ACTGN" + >>> gc = gc_fraction(seq, "ignore") + >>> print(f"GC content of {seq} : {gc:.2f}") + GC content of ACTGN : 0.40 + >>> gc = gc_fraction(seq, "weighted") + >>> print(f"GC content with ambiguous counting: {gc:.2f}") + GC content with ambiguous counting: 0.50 + >>> gc = gc_fraction(seq, "remove") + >>> print(f"GC content with ambiguous removing: {gc:.2f}") + GC content with ambiguous removing: 0.50 + + Ambiguous nucleotides are also removed from the length of the sequence. + + >>> seq = "GDVV" + >>> gc = gc_fraction(seq, "ignore") + >>> print(f"GC content of {seq} : {gc:.2f}") + GC content of GDVV : 0.25 + >>> gc = gc_fraction(seq, "weighted") + >>> print(f"GC content with ambiguous counting: {gc:.4f}") + GC content with ambiguous counting: 0.6667 + >>> gc = gc_fraction(seq, "remove") + >>> print(f"GC content with ambiguous removing: {gc:.2f}") + GC content with ambiguous removing: 1.00 -def GC(seq): - """Calculate G+C content, return percentage (as float between 0 and 100). + Note that this will return zero for an empty sequence. + """ + if ambiguous not in ("weighted", "remove", "ignore"): + raise ValueError(f"ambiguous value '{ambiguous}' not recognized") - Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) - when counting the G and C content. The percentage is calculated against - the full length, e.g.: + gc = sum(seq.count(x) for x in "CGScgs") - >>> from Bio.SeqUtils import GC - >>> GC("ACTGN") - 40.0 + if ambiguous == "remove": + l = gc + sum(seq.count(x) for x in "ATWatw") + else: + l = len(seq) - Note that this will return zero for an empty sequence. + if ambiguous == "weighted": + gc += sum( + (seq.count(x) + seq.count(x.lower())) * _gc_values[x] for x in "BDHKMNRVXY" + ) + + if l == 0: + return 0 + + return gc / l + + +def GC(seq): + """Calculate G+C content (DEPRECATED). + + Use Bio.SeqUtils.gc_fraction instead. """ + warnings.warn( + "GC is deprecated; please use gc_fraction instead.", + BiopythonDeprecationWarning, + ) + gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"]) try: return gc * 100.0 / len(seq) diff --git a/DEPRECATED.rst b/DEPRECATED.rst index 357722c8cd1..81746718756 100644 --- a/DEPRECATED.rst +++ b/DEPRECATED.rst @@ -593,6 +593,9 @@ The method 'print_index' of the CodonAdaptationIndex class in Bio.SeqUtils.CodonUsage was deprecated in Release 1.80. Instead of self.print_index(), please use print(self). +Function 'GC' in Bio.SeqUtils was deprecated in Release 1.80. Instead use +function 'gc_fraction'. + Bio.GFF (for accessing a MySQL database created with BioPerl, etc) ------------------------------------------------------------------ The whole of the old ``Bio.GFF`` module was deprecated in Release 1.53, and diff --git a/Doc/Tutorial/chapter_cookbook.tex b/Doc/Tutorial/chapter_cookbook.tex index 144e233de59..42a4dab4c8b 100644 --- a/Doc/Tutorial/chapter_cookbook.tex +++ b/Doc/Tutorial/chapter_cookbook.tex @@ -1131,9 +1131,11 @@ \subsection{Plot of sequence GC\%} \begin{minted}{python} from Bio import SeqIO -from Bio.SeqUtils import GC +from Bio.SeqUtils import gc_fraction -gc_values = sorted(GC(rec.seq) for rec in SeqIO.parse("ls_orchid.fasta", "fasta")) +gc_values = sorted( + 100 * gc_fraction(rec.seq) for rec in SeqIO.parse("ls_orchid.fasta", "fasta") +) \end{minted} Having read in each sequence and calculated the GC\%, we then sorted them into ascending diff --git a/Doc/Tutorial/chapter_seq_objects.tex b/Doc/Tutorial/chapter_seq_objects.tex index a1fe590a963..16b2f31e27c 100644 --- a/Doc/Tutorial/chapter_seq_objects.tex +++ b/Doc/Tutorial/chapter_seq_objects.tex @@ -75,13 +75,13 @@ \section{Sequences act like strings} %doctest \begin{minted}{pycon} >>> from Bio.Seq import Seq ->>> from Bio.SeqUtils import GC +>>> from Bio.SeqUtils import gc_fraction >>> my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC") ->>> GC(my_seq) -46.875 +>>> gc_fraction(my_seq) +0.46875 \end{minted} -\noindent Note that using the \verb|Bio.SeqUtils.GC()| function should automatically cope with mixed case sequences and the ambiguous nucleotide S which means G or C. +\noindent Note that using the \verb|Bio.SeqUtils.gc_fraction()| function should automatically cope with mixed case sequences and the ambiguous nucleotide S which means G or C. Also note that just like a normal Python string, the \verb|Seq| object is in some ways ``read-only''. If you need to edit your sequence, for example simulating a point mutation, look at the Section~\ref{sec:mutable-seq} below which talks about the \verb|MutableSeq| object. diff --git a/NEWS.rst b/NEWS.rst index 8aff40960da..7d6ad464047 100644 --- a/NEWS.rst +++ b/NEWS.rst @@ -59,6 +59,9 @@ from which the enzyme object was created, and a `uri` property with a canonical Additionally, a number of small bugs and typos have been fixed with additions to the test suite. +Add new ``gc_fraction`` function in ``SeqUtils`` and marks ``GC`` for future +deprecation. + Many thanks to the Biopython developers and community for making this release possible, especially the following contributors: @@ -66,6 +69,7 @@ possible, especially the following contributors: - Arup Ghosh (first contribution) - Aziz Khan - Alex Morehead +- Caio Fontes - Chenghao Zhu - Christian Brueffer - Damien Goutte-Gattat diff --git a/Scripts/xbbtools/xbb_translations.py b/Scripts/xbbtools/xbb_translations.py index 979a437dfd5..c348c3dd1e5 100644 --- a/Scripts/xbbtools/xbb_translations.py +++ b/Scripts/xbbtools/xbb_translations.py @@ -15,7 +15,7 @@ import time from Bio.Seq import Seq, reverse_complement, translate -from Bio.SeqUtils import GC +from Bio.SeqUtils import gc_fraction class xbb_translations: @@ -93,7 +93,7 @@ def frame_nice(self, seq, frame, translation_table=1): def gc(self, seq): """Calculate GC content in percent (0-100).""" - return GC(seq) + return 100 * gc_fraction(seq) def gcframe(self, seq, translation_table=1, direction="both"): """Print a pretty print translation in several frames.""" diff --git a/Tests/test_Align_Alignment.py b/Tests/test_Align_Alignment.py index 98eba1f5ef8..17f16dddd6a 100644 --- a/Tests/test_Align_Alignment.py +++ b/Tests/test_Align_Alignment.py @@ -20,7 +20,7 @@ from Bio import Align, SeqIO from Bio.Seq import Seq, reverse_complement from Bio.SeqRecord import SeqRecord -from Bio.SeqUtils import GC +from Bio.SeqUtils import gc_fraction class TestPairwiseAlignment(unittest.TestCase): @@ -993,7 +993,7 @@ def test_sort(self): ACTT """, ) - alignment.sort(key=GC) + alignment.sort(key=gc_fraction) self.assertEqual( str(alignment), """\ @@ -1002,7 +1002,7 @@ def test_sort(self): ACCT """, ) - alignment.sort(key=GC, reverse=True) + alignment.sort(key=gc_fraction, reverse=True) self.assertEqual( str(alignment), """\ @@ -1613,7 +1613,7 @@ def test_sort(self): tuple(sequence.id for sequence in alignment.sequences), ("seq7", "seq6", "seq5", "seq4", "seq3", "seq2", "seq1"), ) - alignment.sort(key=lambda record: GC(record.seq)) + alignment.sort(key=lambda record: gc_fraction(record.seq)) self.assertEqual( "\n".join(row for row in alignment), # str(alignment), """\ @@ -1629,7 +1629,7 @@ def test_sort(self): tuple(sequence.id for sequence in alignment.sequences), ("seq3", "seq7", "seq4", "seq5", "seq1", "seq6", "seq2"), ) - alignment.sort(key=lambda record: GC(record.seq), reverse=True) + alignment.sort(key=lambda record: gc_fraction(record.seq), reverse=True) self.assertEqual( "\n".join(row for row in alignment), # str(alignment), """\ diff --git a/Tests/test_SeqUtils.py b/Tests/test_SeqUtils.py index 2f5e8107084..937ac7313e1 100644 --- a/Tests/test_SeqUtils.py +++ b/Tests/test_SeqUtils.py @@ -11,7 +11,7 @@ from Bio.Seq import MutableSeq from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from Bio.SeqUtils import GC +from Bio.SeqUtils import gc_fraction from Bio.SeqUtils import GC_skew from Bio.SeqUtils import seq1 from Bio.SeqUtils import seq3 @@ -340,13 +340,36 @@ def test_checksum3(self): ), ) - def test_GC(self): - s = "ACGGGCTACCGTATAGGCAAGAGATGATGCCC" - seq = Seq(s) - record = SeqRecord(seq) - self.assertEqual(GC(s), 56.25) - self.assertEqual(GC(seq), 56.25) - self.assertEqual(GC(record), 56.25) + def test_gc_fraction(self): + """Tests gc_fraction function.""" + self.assertAlmostEqual(gc_fraction("", "ignore"), 0, places=3) + self.assertAlmostEqual(gc_fraction("", "weighted"), 0, places=3) + self.assertAlmostEqual(gc_fraction("", "remove"), 0, places=3) + + seq = "ACGGGCTACCGTATAGGCAAGAGATGATGCCC" + self.assertAlmostEqual(gc_fraction(seq, "ignore"), 0.5625, places=3) + self.assertAlmostEqual(gc_fraction(seq, "weighted"), 0.5625, places=3) + self.assertAlmostEqual(gc_fraction(seq, "remove"), 0.5625, places=3) + + seq = "ACTGSSSS" + self.assertAlmostEqual(gc_fraction(seq, "ignore"), 0.75, places=3) + self.assertAlmostEqual(gc_fraction(seq, "weighted"), 0.75, places=3) + self.assertAlmostEqual(gc_fraction(seq, "remove"), 0.75, places=3) + + # Test ambiguous nucleotide behaviour + + seq = "CCTGNN" + self.assertAlmostEqual(gc_fraction(seq, "ignore"), 0.5, places=3) + self.assertAlmostEqual(gc_fraction(seq, "weighted"), 0.667, places=3) + self.assertAlmostEqual(gc_fraction(seq, "remove"), 0.75, places=3) + + seq = "GDVV" + self.assertAlmostEqual(gc_fraction(seq, "ignore"), 0.25, places=3) + self.assertAlmostEqual(gc_fraction(seq, "weighted"), 0.6667, places=3) + self.assertAlmostEqual(gc_fraction(seq, "remove"), 1.00, places=3) + + with self.assertRaises(ValueError): + gc_fraction(seq, "other string") def test_GC_skew(self): s = "A" * 50