Skip to content

Commit

Permalink
Added ability to return GC content as a ratio. [Finish #564] (#3540)
Browse files Browse the repository at this point in the history
While GC content is usually a percentage, many individuals use it strictly as a ratio. Adding the ability to return a ratio instead of a percentage will provide more options for the user and eliminate the need for extra conversions on the user's part.

Deprecates GC in favor of new gc_fraction function.

Co-authored-by: Cham K <chamkank@users.noreply.github.com>
  • Loading branch information
Caiofcas and chamkank committed Aug 26, 2022
1 parent 72211ac commit 8709ac7
Show file tree
Hide file tree
Showing 10 changed files with 188 additions and 48 deletions.
4 changes: 2 additions & 2 deletions Bio/Align/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,13 +835,13 @@ def sort(self, key=None, reverse=False):
As an example using a different sort order, you could sort on the
GC content of each sequence.
>>> from Bio.SeqUtils import GC
>>> from Bio.SeqUtils import gc_fraction
>>> print(align1)
Alignment with 3 rows and 4 columns
ACGC Chicken
ACGT Human
ACGG Mouse
>>> align1.sort(key = lambda record: GC(record.seq))
>>> align1.sort(key = lambda record: gc_fraction(record.seq))
>>> print(align1)
Alignment with 3 rows and 4 columns
ACGT Human
Expand Down
28 changes: 14 additions & 14 deletions Bio/SeqUtils/MeltingTemp.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None):
corr = 0.368 * (len(seq) - 1) * math.log(mon)
if method == 6:
corr = (
(4.29 * SeqUtils.GC(seq) / 100 - 3.95) * 1e-5 * math.log(mon)
(4.29 * SeqUtils.gc_fraction(seq, "ignore") - 3.95) * 1e-5 * math.log(mon)
) + 9.40e-6 * math.log(mon) ** 2
# Turn black code style off
# fmt: off
Expand All @@ -581,7 +581,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None):
if Mon > 0:
R = math.sqrt(mg) / mon
if R < 0.22:
corr = (4.29 * SeqUtils.GC(seq) / 100 - 3.95) * \
corr = (4.29 * SeqUtils.gc_fraction(seq, "ignore") - 3.95) * \
1e-5 * math.log(mon) + 9.40e-6 * math.log(mon) ** 2
return corr
elif R < 6.0:
Expand All @@ -590,7 +590,7 @@ def salt_correction(Na=0, K=0, Tris=0, Mg=0, dNTPs=0, method=1, seq=None):
- 8.03e-3 * math.log(mon) ** 2)
g = 8.31 * (0.486 - 0.258 * math.log(mon)
+ 5.25e-3 * math.log(mon) ** 3)
corr = (a + b * math.log(mg) + (SeqUtils.GC(seq) / 100)
corr = (a + b * math.log(mg) + (SeqUtils.gc_fraction(seq, "ignore"))
* (c + d * math.log(mg)) + (1 / (2.0 * (len(seq) - 1)))
* (e + f * math.log(mg) + g * math.log(mg) ** 2)) * 1e-5
# Turn black code style on
Expand Down Expand Up @@ -773,19 +773,19 @@ def Tm_GC(
seq = str(seq)
if check:
seq = _check(seq, "Tm_GC")
percent_gc = SeqUtils.GC(seq)
# Ambiguous bases: add 0.5, 0.67 or 0.33% depending on G+C probability:
tmp = (
sum(map(seq.count, ("K", "M", "N", "R", "Y"))) * 50.0 / len(seq)
+ sum(map(seq.count, ("B", "V"))) * 66.67 / len(seq)
+ sum(map(seq.count, ("D", "H"))) * 33.33 / len(seq)
)
if strict and tmp:

if strict and any(x in seq for x in "KMNRYBVDH"):
raise ValueError(
"ambiguous bases B, D, H, K, M, N, R, V, Y not allowed when 'strict=True'"
)
else:
percent_gc += tmp

# Ambiguous bases: add 0.5, 0.67 or 0.33% depending on G+C probability:
percent_gc = SeqUtils.gc_fraction(seq, "weighted") * 100

# gc_fraction counts X as 0.5
if mismatch:
percent_gc -= seq.count("X") * 50.0 / len(seq)

if userset:
A, B, C, D = userset
else:
Expand Down Expand Up @@ -1012,7 +1012,7 @@ def Tm_NN(
delta_s += nn_table["init"][d_s]

# Type: Duplex with no (allA/T) or at least one (oneG/C) GC pair
if SeqUtils.GC(seq) == 0:
if SeqUtils.gc_fraction(seq, "ignore") == 0:
delta_h += nn_table["init_allA/T"][d_h]
delta_s += nn_table["init_allA/T"][d_s]
else:
Expand Down
130 changes: 119 additions & 11 deletions Bio/SeqUtils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,139 @@


import re
import warnings
from math import pi, sin, cos

from Bio.Seq import Seq, complement, complement_rna
from Bio.Data import IUPACData

from Bio import BiopythonDeprecationWarning

######################################
# DNA
######################
# {{{
# {

_gc_values = {
"G": 1.000,
"C": 1.000,
"A": 0.000,
"T": 0.000,
"S": 1.000, # Strong interaction (3 H bonds) (G or C)
"W": 0.000, # Weak interaction (2 H bonds) (A or T)
"M": 0.500, # Amino (A or C)
"R": 0.500, # Purine (A or G)
"Y": 0.500, # Pyrimidine (T or C)
"K": 0.500, # Keto (G or T)
"V": 2 / 3, # Not T or U (A or C or G)
"B": 2 / 3, # Not A (C or G or T)
"H": 1 / 3, # Not G (A or C or T)
"D": 1 / 3, # Not C (A or G or T)
"X": 0.500, # Any nucleotide (A or C or G or T)
"N": 0.500, # Any nucleotide (A or C or G or T)
}


def gc_fraction(seq, ambiguous="remove"):
"""Calculate G+C percentage in seq (float between 0 and 1).
Copes with mixed case sequences. Ambiguous Nucleotides in this context are
those different from ATCGSW (S is G or C, and W is A or T).
If ambiguous equals "remove" (default), will only count GCS and will only
include ACTGSW when calculating the sequence length. Equivalent to removing
all characters in the set BDHKMNRVXY before calculating the GC content, as
each of these ambiguous nucleotides can either be in (A,T) or in (C,G).
If ambiguous equals "ignore", it will treat only unambiguous nucleotides (GCS)
as counting towards the GC percentage, but will include all ambiguous and
unambiguous nucleotides when calculating the sequence length.
If ambiguous equals "weighted", will use a "mean" value when counting the
ambiguous characters, for example, G and C will be counted as 1, N and X will
be counted as 0.5, D will be counted as 0.33 etc. See Bio.SeqUtils._gc_values
for a full list.
Will raise a ValueError for any other value of the ambiguous parameter.
>>> from Bio.SeqUtils import gc_fraction
>>> seq = "ACTG"
>>> print(f"GC content of {seq} : {gc_fraction(seq):.2f}")
GC content of ACTG : 0.50
S and W are ambiguous for the purposes of calculating the GC content.
>>> seq = "ACTGSSSS"
>>> gc = gc_fraction(seq, "remove")
>>> print(f"GC content of {seq} : {gc:.2f}")
GC content of ACTGSSSS : 0.75
>>> gc = gc_fraction(seq, "ignore")
>>> print(f"GC content of {seq} : {gc:.2f}")
GC content of ACTGSSSS : 0.75
>>> gc = gc_fraction(seq, "weighted")
>>> print(f"GC content with ambiguous counting: {gc:.2f}")
GC content with ambiguous counting: 0.75
Some examples with ambiguous nucleotides.
>>> seq = "ACTGN"
>>> gc = gc_fraction(seq, "ignore")
>>> print(f"GC content of {seq} : {gc:.2f}")
GC content of ACTGN : 0.40
>>> gc = gc_fraction(seq, "weighted")
>>> print(f"GC content with ambiguous counting: {gc:.2f}")
GC content with ambiguous counting: 0.50
>>> gc = gc_fraction(seq, "remove")
>>> print(f"GC content with ambiguous removing: {gc:.2f}")
GC content with ambiguous removing: 0.50
Ambiguous nucleotides are also removed from the length of the sequence.
>>> seq = "GDVV"
>>> gc = gc_fraction(seq, "ignore")
>>> print(f"GC content of {seq} : {gc:.2f}")
GC content of GDVV : 0.25
>>> gc = gc_fraction(seq, "weighted")
>>> print(f"GC content with ambiguous counting: {gc:.4f}")
GC content with ambiguous counting: 0.6667
>>> gc = gc_fraction(seq, "remove")
>>> print(f"GC content with ambiguous removing: {gc:.2f}")
GC content with ambiguous removing: 1.00
def GC(seq):
"""Calculate G+C content, return percentage (as float between 0 and 100).
Note that this will return zero for an empty sequence.
"""
if ambiguous not in ("weighted", "remove", "ignore"):
raise ValueError(f"ambiguous value '{ambiguous}' not recognized")

Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
when counting the G and C content. The percentage is calculated against
the full length, e.g.:
gc = sum(seq.count(x) for x in "CGScgs")

>>> from Bio.SeqUtils import GC
>>> GC("ACTGN")
40.0
if ambiguous == "remove":
l = gc + sum(seq.count(x) for x in "ATWatw")
else:
l = len(seq)

Note that this will return zero for an empty sequence.
if ambiguous == "weighted":
gc += sum(
(seq.count(x) + seq.count(x.lower())) * _gc_values[x] for x in "BDHKMNRVXY"
)

if l == 0:
return 0

return gc / l


def GC(seq):
"""Calculate G+C content (DEPRECATED).
Use Bio.SeqUtils.gc_fraction instead.
"""
warnings.warn(
"GC is deprecated; please use gc_fraction instead.",
BiopythonDeprecationWarning,
)

gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
try:
return gc * 100.0 / len(seq)
Expand Down
3 changes: 3 additions & 0 deletions DEPRECATED.rst
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,9 @@ The method 'print_index' of the CodonAdaptationIndex class in
Bio.SeqUtils.CodonUsage was deprecated in Release 1.80. Instead of
self.print_index(), please use print(self).

Function 'GC' in Bio.SeqUtils was deprecated in Release 1.80. Instead use
function 'gc_fraction'.

Bio.GFF (for accessing a MySQL database created with BioPerl, etc)
------------------------------------------------------------------
The whole of the old ``Bio.GFF`` module was deprecated in Release 1.53, and
Expand Down
6 changes: 4 additions & 2 deletions Doc/Tutorial/chapter_cookbook.tex
Original file line number Diff line number Diff line change
Expand Up @@ -1131,9 +1131,11 @@ \subsection{Plot of sequence GC\%}

\begin{minted}{python}
from Bio import SeqIO
from Bio.SeqUtils import GC
from Bio.SeqUtils import gc_fraction

gc_values = sorted(GC(rec.seq) for rec in SeqIO.parse("ls_orchid.fasta", "fasta"))
gc_values = sorted(
100 * gc_fraction(rec.seq) for rec in SeqIO.parse("ls_orchid.fasta", "fasta")
)
\end{minted}

Having read in each sequence and calculated the GC\%, we then sorted them into ascending
Expand Down
8 changes: 4 additions & 4 deletions Doc/Tutorial/chapter_seq_objects.tex
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ \section{Sequences act like strings}
%doctest
\begin{minted}{pycon}
>>> from Bio.Seq import Seq
>>> from Bio.SeqUtils import GC
>>> from Bio.SeqUtils import gc_fraction
>>> my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
>>> GC(my_seq)
46.875
>>> gc_fraction(my_seq)
0.46875
\end{minted}

\noindent Note that using the \verb|Bio.SeqUtils.GC()| function should automatically cope with mixed case sequences and the ambiguous nucleotide S which means G or C.
\noindent Note that using the \verb|Bio.SeqUtils.gc_fraction()| function should automatically cope with mixed case sequences and the ambiguous nucleotide S which means G or C.

Also note that just like a normal Python string, the \verb|Seq| object is in some ways ``read-only''. If you need to edit your sequence, for example simulating a point mutation, look at the Section~\ref{sec:mutable-seq} below which talks about the \verb|MutableSeq| object.

Expand Down
4 changes: 4 additions & 0 deletions NEWS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,17 @@ from which the enzyme object was created, and a `uri` property with a canonical
Additionally, a number of small bugs and typos have been fixed with additions
to the test suite.

Add new ``gc_fraction`` function in ``SeqUtils`` and marks ``GC`` for future
deprecation.

Many thanks to the Biopython developers and community for making this release
possible, especially the following contributors:

- Andrius Merkys
- Arup Ghosh (first contribution)
- Aziz Khan
- Alex Morehead
- Caio Fontes
- Chenghao Zhu
- Christian Brueffer
- Damien Goutte-Gattat
Expand Down
4 changes: 2 additions & 2 deletions Scripts/xbbtools/xbb_translations.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import time

from Bio.Seq import Seq, reverse_complement, translate
from Bio.SeqUtils import GC
from Bio.SeqUtils import gc_fraction


class xbb_translations:
Expand Down Expand Up @@ -93,7 +93,7 @@ def frame_nice(self, seq, frame, translation_table=1):

def gc(self, seq):
"""Calculate GC content in percent (0-100)."""
return GC(seq)
return 100 * gc_fraction(seq)

def gcframe(self, seq, translation_table=1, direction="both"):
"""Print a pretty print translation in several frames."""
Expand Down
10 changes: 5 additions & 5 deletions Tests/test_Align_Alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from Bio import Align, SeqIO
from Bio.Seq import Seq, reverse_complement
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import GC
from Bio.SeqUtils import gc_fraction


class TestPairwiseAlignment(unittest.TestCase):
Expand Down Expand Up @@ -993,7 +993,7 @@ def test_sort(self):
ACTT
""",
)
alignment.sort(key=GC)
alignment.sort(key=gc_fraction)
self.assertEqual(
str(alignment),
"""\
Expand All @@ -1002,7 +1002,7 @@ def test_sort(self):
ACCT
""",
)
alignment.sort(key=GC, reverse=True)
alignment.sort(key=gc_fraction, reverse=True)
self.assertEqual(
str(alignment),
"""\
Expand Down Expand Up @@ -1613,7 +1613,7 @@ def test_sort(self):
tuple(sequence.id for sequence in alignment.sequences),
("seq7", "seq6", "seq5", "seq4", "seq3", "seq2", "seq1"),
)
alignment.sort(key=lambda record: GC(record.seq))
alignment.sort(key=lambda record: gc_fraction(record.seq))
self.assertEqual(
"\n".join(row for row in alignment), # str(alignment),
"""\
Expand All @@ -1629,7 +1629,7 @@ def test_sort(self):
tuple(sequence.id for sequence in alignment.sequences),
("seq3", "seq7", "seq4", "seq5", "seq1", "seq6", "seq2"),
)
alignment.sort(key=lambda record: GC(record.seq), reverse=True)
alignment.sort(key=lambda record: gc_fraction(record.seq), reverse=True)
self.assertEqual(
"\n".join(row for row in alignment), # str(alignment),
"""\
Expand Down
Loading

0 comments on commit 8709ac7

Please sign in to comment.