Skip to content

Commit

Permalink
Add a reverse_complement method to the Alignment class (#4331)
Browse files Browse the repository at this point in the history
* update

* update

* update

* update

---------

Co-authored-by: Michiel de Hoon <mdehoon@tkx249.genome.gsc.riken.jp>
  • Loading branch information
mdehoon and Michiel de Hoon committed Jun 15, 2023
1 parent cf49e3d commit be9f39d
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 2 deletions.
48 changes: 48 additions & 0 deletions Bio/Align/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3132,6 +3132,54 @@ def counts(self):
mismatches += 1
return AlignmentCounts(gaps, identities, mismatches)

def reverse_complement(self):
"""Reverse-complement the alignment and return it.
>>> sequences = ["ATCG", "AAG", "ATC"]
>>> coordinates = np.array([[0, 2, 3, 4], [0, 2, 2, 3], [0, 2, 3, 3]])
>>> alignment = Alignment(sequences, coordinates)
>>> print(alignment)
0 ATCG 4
0 AA-G 3
0 ATC- 3
<BLANKLINE>
>>> rc_alignment = alignment.reverse_complement()
>>> print(rc_alignment)
0 CGAT 4
0 C-TT 3
0 -GAT 3
<BLANKLINE>
The attribute `column_annotations`, if present, is associated with the
reverse-complemented alignment, with its values in reverse order.
>>> alignment.column_annotations = {"score": [3, 2, 2, 2]}
>>> rc_alignment = alignment.reverse_complement()
>>> print(rc_alignment.column_annotations)
{'score': [2, 2, 2, 3]}
"""
sequences = [reverse_complement(sequence) for sequence in self.sequences]
coordinates = np.array(
[
len(sequence) - row[::-1]
for sequence, row in zip(sequences, self.coordinates)
]
)
alignment = Alignment(sequences, coordinates)
try:
column_annotations = self.column_annotations
except AttributeError:
pass
else:
alignment.column_annotations = {}
for key, value in column_annotations.items():
if isinstance(value, np.ndarray):
value = value[::-1].copy()
else:
value = value[::-1]
alignment.column_annotations[key] = value
return alignment


class AlignmentsAbstractBaseClass(ABC):
"""Abstract base class for sequence alignments.
Expand Down
33 changes: 33 additions & 0 deletions Doc/Tutorial/chapter_align.tex
Original file line number Diff line number Diff line change
Expand Up @@ -2404,6 +2404,39 @@ \subsection{Alignment objects}
<BLANKLINE>
\end{minted}

\paragraph*{Reverse-complementing the alignment}

Reverse_complementing an alignment will take the reverse complement of each sequence, and recalculate the coordinates:
%cont-doctest
\begin{minted}{pycon}
>>> print(alignment.sequences)
['GAACT', 'GAT']
>>> rc_alignment = alignment.reverse_complement()
>>> print(rc_alignment.sequences)
['AGTTC', 'ATC']
>>> print(rc_alignment)
target 0 AGTTC 5
0 |-|-| 5
query 0 A-T-C 3
<BLANKLINE>
>>> print(alignment[:, :4].sequences)
['GAACT', 'GAT']
>>> print(alignment[:, :4])
target 0 GAAC 4
0 |-|- 4
query 0 G-A- 2
<BLANKLINE>
>>> rc_alignment = alignment[:, :4].reverse_complement()
>>> print(rc_alignment[:, :4].sequences)
['AGTTC', 'ATC']
>>> print(rc_alignment[:, :4])
target 1 GTTC 5
0 -|-| 4
query 1 -T-C 3
<BLANKLINE>
\end{minted}
Reverse-complementing an alignment preserves its column annotations (in reverse order), but discards all other annotations.

\paragraph*{Exporting alignments}

Use the \verb+format+ method to create a string representation of the alignment in various file formats. This method takes an argument \verb+fmt+ specifying the file format, and may take additional keyword arguments depending on file type. The following values for \verb+fmt+ are supported:
Expand Down
40 changes: 38 additions & 2 deletions Tests/test_Align_Alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1292,6 +1292,44 @@ def test_target_query_properties(self):
self.assertEqual(alignment.target, target)
self.assertEqual(alignment.query, query)

def test_reverse_complement(self):
target = SeqRecord(Seq(self.target), id="seqA")
query = SeqRecord(Seq(self.query), id="seqB")
sequences = [target, query]
coordinates = self.forward_coordinates
alignment = Align.Alignment(sequences, coordinates)
alignment.column_annotations = {
"score": [2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1],
"letter": "ABCDEFGHIJKL",
}
self.assertEqual(
str(alignment),
"""\
seqA 0 AACCGGGA-CCG 11
0 |-|-||-|-|-- 12
seqB 0 A-C-GG-AAC-- 7
""",
)
self.assertEqual(
alignment.column_annotations["score"], [2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1]
)
self.assertEqual(alignment.column_annotations["letter"], "ABCDEFGHIJKL")
rc_alignment = alignment.reverse_complement()
self.assertEqual(
str(rc_alignment),
"""\
<unknown 0 CGG-TCCCGGTT 11
0 --|-|-||-|-| 12
<unknown 0 --GTT-CC-G-T 7
""",
)
self.assertEqual(len(rc_alignment.column_annotations), 2)
self.assertEqual(
rc_alignment.column_annotations["score"],
[1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2],
)
self.assertEqual(rc_alignment.column_annotations["letter"], "LKJIHGFEDCBA")


class TestMultipleAlignment(unittest.TestCase):
def setUp(self):
Expand Down Expand Up @@ -2607,7 +2645,6 @@ def check(self, fmt, alignments, ids=("", ""), descriptions=("", "")):


class TestAlign_out_of_order(unittest.TestCase):

seq1 = "AACCCAAAACCAAAAATTTAAATTTTAAA"
seq2 = "TGTTTTTCCCCC"
coordinates = numpy.array(
Expand Down Expand Up @@ -3051,7 +3088,6 @@ def test_str(self):


class TestAlign_nucleotide_protein_str(unittest.TestCase):

s1 = "ATGCGGAGCTTTCGAGCGACGTTTGGCTTTGACGACGGA" * 6
s2 = "ATGCGGAGCCGAGCGACGTTTACGGCTTTGACGACGGA" * 6
t1 = translate(s1)
Expand Down

0 comments on commit be9f39d

Please sign in to comment.