-
Notifications
You must be signed in to change notification settings - Fork 88
/
app3.py
254 lines (224 loc) · 8.25 KB
/
app3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
__name__ = "biotite.application.muscle"
__author__ = "Patrick Kunzmann"
__all__ = ["MuscleApp"]
import re
import numbers
import warnings
import subprocess
from tempfile import NamedTemporaryFile
from ..localapp import cleanup_tempfile
from ..msaapp import MSAApp
from ..application import AppState, VersionError, requires_state
from ...sequence.sequence import Sequence
from ...sequence.seqtypes import NucleotideSequence, ProteinSequence
from ...sequence.align.matrix import SubstitutionMatrix
from ...sequence.align.alignment import Alignment
from ...sequence.phylo.tree import Tree
class MuscleApp(MSAApp):
"""
Perform a multiple sequence alignment using MUSCLE version 3.
Parameters
----------
sequences : list of Sequence
The sequences to be aligned.
bin_path : str, optional
Path of the MUSCLE binary.
matrix : SubstitutionMatrix, optional
A custom substitution matrix.
See also
--------
Muscle5App
Examples
--------
>>> seq1 = ProteinSequence("BIQTITE")
>>> seq2 = ProteinSequence("TITANITE")
>>> seq3 = ProteinSequence("BISMITE")
>>> seq4 = ProteinSequence("IQLITE")
>>> app = MuscleApp([seq1, seq2, seq3, seq4])
>>> app.start()
>>> app.join()
>>> alignment = app.get_alignment()
>>> print(alignment)
BIQT-ITE
TITANITE
BISM-ITE
-IQL-ITE
"""
def __init__(self, sequences, bin_path="muscle", matrix=None):
major_version = get_version(bin_path)[0]
if major_version != 3:
raise VersionError(
f"Muscle 3 is required, got version {major_version}"
)
super().__init__(sequences, bin_path, matrix)
self._gap_open = None
self._gap_ext = None
self._terminal_penalty = None
self._tree1 = None
self._tree2 = None
self._out_tree1_file = NamedTemporaryFile(
"r", suffix=".tree", delete=False
)
self._out_tree2_file = NamedTemporaryFile(
"r", suffix=".tree", delete=False
)
def run(self):
args = [
"-quiet",
"-in", self.get_input_file_path(),
"-out", self.get_output_file_path(),
"-tree1", self._out_tree1_file.name,
"-tree2", self._out_tree2_file.name,
]
if self.get_seqtype() == "protein":
args += ["-seqtype", "protein"]
else:
args += ["-seqtype", "dna"]
if self.get_matrix_file_path() is not None:
args += ["-matrix", self.get_matrix_file_path()]
if self._gap_open is not None and self._gap_ext is not None:
args += ["-gapopen", f"{self._gap_open:.1f}"]
args += ["-gapextend", f"{self._gap_ext:.1f}"]
# When the gap penalty is set,
# use the penalty also for hydrophobic regions
args += ["-hydrofactor", "1.0"]
# Use the recommendation of the documentation
args += ["-center", "0.0"]
self.set_arguments(args)
super().run()
def evaluate(self):
super().evaluate()
newick = self._out_tree1_file.read().replace("\n", "")
if len(newick) > 0:
self._tree1 = Tree.from_newick(newick)
else:
warnings.warn(
"MUSCLE did not write a tree file from the first iteration"
)
newick = self._out_tree2_file.read().replace("\n", "")
if len(newick) > 0:
self._tree2 = Tree.from_newick(newick)
else:
warnings.warn(
"MUSCLE did not write a tree file from the second iteration"
)
def clean_up(self):
super().clean_up()
cleanup_tempfile(self._out_tree1_file)
cleanup_tempfile(self._out_tree2_file)
@requires_state(AppState.CREATED)
def set_gap_penalty(self, gap_penalty):
"""
Set the gap penalty for the alignment.
Parameters
----------
gap_penalty : float or (tuple, dtype=int)
If a float is provided, the value will be interpreted as
general gap penalty.
If a tuple is provided, an affine gap penalty is used.
The first value in the tuple is the gap opening penalty,
the second value is the gap extension penalty.
The values need to be negative.
"""
# Check if gap penalty is general or affine
if isinstance(gap_penalty, numbers.Real):
if gap_penalty > 0:
raise ValueError("Gap penalty must be negative")
self._gap_open = gap_penalty
self._gap_ext= gap_penalty
elif type(gap_penalty) == tuple:
if gap_penalty[0] > 0 or gap_penalty[1] > 0:
raise ValueError("Gap penalty must be negative")
self._gap_open = gap_penalty[0]
self._gap_ext = gap_penalty[1]
else:
raise TypeError("Gap penalty must be either float or tuple")
@requires_state(AppState.JOINED)
def get_guide_tree(self, iteration="identity"):
"""
Get the guide tree created for the progressive alignment.
Parameters
----------
iteration : {'kmer', 'identity'}
If 'kmer', the first iteration tree is returned.
This tree uses the sequences common *k*-mers as distance
measure.
If 'identity' the second iteration tree is returned.
This tree uses distances based on the pairwise sequence
identity after the first progressive alignment iteration.
Returns
-------
tree : Tree
The guide tree.
"""
if iteration == "kmer":
return self._tree1
elif iteration == "identity":
return self._tree2
else:
raise ValueError("Iteration must be 'kmer' or 'identity'")
@staticmethod
def supports_nucleotide():
return True
@staticmethod
def supports_protein():
return True
@staticmethod
def supports_custom_nucleotide_matrix():
return False
@staticmethod
def supports_custom_protein_matrix():
return True
@classmethod
def align(cls, sequences, bin_path=None, matrix=None,
gap_penalty=None):
"""
Perform a multiple sequence alignment.
This is a convenience function, that wraps the :class:`MuscleApp`
execution.
Parameters
----------
sequences : iterable object of Sequence
The sequences to be aligned
bin_path : str, optional
Path of the MSA software binary. By default, the default path
will be used.
matrix : SubstitutionMatrix, optional
A custom substitution matrix.
gap_penalty : float or (tuple, dtype=int), optional
If a float is provided, the value will be interpreted as
general gap penalty.
If a tuple is provided, an affine gap penalty is used.
The first value in the tuple is the gap opening penalty,
the second value is the gap extension penalty.
The values need to be negative.
Returns
-------
alignment : Alignment
The global multiple sequence alignment.
"""
if bin_path is None:
app = cls(sequences, matrix=matrix)
else:
app = cls(sequences, bin_path, matrix=matrix)
if gap_penalty is not None:
app.set_gap_penalty(gap_penalty)
app.start()
app.join()
return app.get_alignment()
def get_version(bin_path="muscle"):
output = subprocess.run(
[bin_path, "-version"], capture_output=True, text=True
)
# Find matches for version string containing major and minor version
match = re.search("\d+\.\d+", output.stdout)
if match is None:
raise subprocess.SubprocessError(
"Could not determine Muscle version"
)
version_string = match.group(0)
splitted = version_string.split(".")
return int(splitted[0]), int(splitted[1])