-
Notifications
You must be signed in to change notification settings - Fork 90
/
application.py
288 lines (256 loc) · 10.9 KB
/
application.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
"""
Outsourcing - The Application subpackage
========================================
.. currentmodule:: biotite.application
Although you can achieve a lot with *Biotite*, there are still a lot of
things which are not implemented in this *Python* package.
But wait, this is what the :mod:`biotite.application` package is for:
It contains interfaces for popular external software.
This ranges from locally installed software to external tools running on
web servers.
The usage of these interfaces is seamless: Rather than writing input
files and reading output files, you simply put in your *Python*
objects (e.g. instances of :class:`Sequence` or :class:`AtomArray`) and
the interface returns *Python* objects (e.g. an :class:`Alignment`
object).
The base class for all interfaces is the :class:`Application` class.
Each :class:`Application` instance has a life cycle, starting with its
creation and ending with the result extraction.
Each state in this life cycle is described by the value of the
*enum* :class:`AppState`, that each :class:`Application` contains:
Directly after its instantiation the app is in the ``CREATED`` state.
In this state further parameters can be set for the application run.
After the user calls the :func:`Application.start()` method, the app
state is set to ``RUNNING`` and the app performs the calculations.
When the application finishes the AppState
changes to ``FINISHED``.
The user can now call the :func:`Application.join()` method, concluding
the application in the ``JOINED`` state and making the results of the
application accessible.
Furthermore, this may trigger cleanup actions in some applications.
:func:`Application.join()` can even be called in the ``RUNNING`` state:
This will constantly check if the application has finished and will
directly go into the ``JOINED`` state as soon as the application reaches
the ``FINISHED`` state.
Calling the :func:`Application.cancel()` method while the application is
``RUNNING`` or ``FINISHED`` leaves the application in the ``CANCELLED``
state.
This triggers cleanup, too, but there are no accessible results.
If a method is called in an unsuitable app state, an
:class:`AppStateError` is called.
At each state in the life cycle, :class:`Application` type specific
methods are called, as shown in the following diagram.
.. image:: /static/assets/figures/app_lifecycle_path.svg
The following sample code shows how an :class:`Application` is generally
executed. Pay attention to the space between the
:func:`Application.start()` and :func:`Application.join()` method:
Since these are separate functions, you can do some other stuff, while
the :class:`Application` runs in the background.
Thus, an :class:`Application` behaves effectively like an additional
thread.
"""
from biotite.application import Application
# Create a dummy Application subclass
class MyApplication(Application):
def __init__(self, param): super().__init__()
def run(self): pass
def is_finished(self): return True
def wait_interval(self): return 0.1
def evaluate(self): pass
def clean_up(self): pass
def set_some_other_input(self, param): pass
def get_some_data(self): return "some data"
param = "foo"
param2 = "bar"
app = MyApplication(param)
app.set_some_other_input(param2)
app.start()
# Time to do other stuff
app.join()
results = app.get_some_data()
########################################################################
# The following subsections will dive into the available
# :class:`Application` classes in depth.
#
# Finding homologous sequences with BLAST
# ---------------------------------------
#
# .. currentmodule:: biotite.application.blast
#
# the :mod:`biotite.application.blast` subpackage provides an
# interface to NCBI BLAST: the :class:`BlastWebApp` class.
# Let's dive directly into the code, we try to find
# homologous sequences to the miniprotein *TC5b*:
import biotite.application.blast as blast
import biotite.sequence as seq
tc5b_seq = seq.ProteinSequence("NLYIQWLKDGGPSSGRPPPS")
app = blast.BlastWebApp("blastp", tc5b_seq)
app.start()
app.join()
alignments = app.get_alignments()
best_ali = alignments[0]
print(best_ali)
print()
print("HSP position in query: ", best_ali.query_interval)
print("HSP position in hit: ", best_ali.hit_interval)
print("Score: ", best_ali.score)
print("E-value: ", best_ali.e_value)
print("Hit UID: ", best_ali.hit_id)
print("Hit name: ", best_ali.hit_definition)
########################################################################
# This was too simple for BLAST:
# As best hit it just found the query sequence itself in the PDB.
# However, it gives a good impression about how this
# :class:`Application` works.
# Besides some optional parameters, the :class:`BlastWebApp` requires
# the BLAST program and the query sequence. After the app has finished,
# you get a list of alignments with descending score.
# An alignment is an instance of :class:`BlastAlignment`, a subclass of
# :class:`biotite.sequence.align.Alignment`.
# It contains some additional information as shown above.
# The hit UID can be used to obtain the complete hit sequence via
# :mod:`biotite.database.entrez`.
#
# The next alignment should be a bit more challenging.
# We take a random part of the *E. coli* BL21 genome and distort it a
# little bit.
# Since we still expect a high similarity to the original sequence,
# we decrease the E-value threshold.
import biotite.application.blast as blast
import biotite.sequence as seq
distorted_bl21_excerpt = seq.NucleotideSequence(
"CGGAAGCGCTCGGTCTCCTGGCCTTATCAGCCACTGCGCGACGATATGCTCGTCCGTTTCGAAGA"
)
app = blast.BlastWebApp("blastn", distorted_bl21_excerpt, obey_rules=False)
app.set_max_expect_value(0.1)
app.start()
app.join()
alignments = app.get_alignments()
best_ali = alignments[0]
print(best_ali)
print()
print("HSP position in query: ", best_ali.query_interval)
print("HSP position in hit: ", best_ali.hit_interval)
print("Score: ", best_ali.score)
print("E-value: ", best_ali.e_value)
print("Hit UID: ", best_ali.hit_id)
print("Hit name: ", best_ali.hit_definition)
########################################################################
# In this snippet we have set :obj:`obey_rules` to false in the
# :class:`BlastWebApp` constructor, if we omitted this parameter and
# we started the last two code snippets in quick succession, a
# :class:`RuleViolationError` would be raised.
# This is because normally the :class:`BlastWebApp` respects NCBI's code
# of conduct and prevents you from submitting two queries within one
# minute. If you want to be rude to the NCBI server, create the
# instance with :obj:`obey_rules` set to false.
#
# Multiple sequence alignments
# ----------------------------
#
# .. currentmodule:: biotite.application.muscle
#
# For *multiple sequence alignments* (MSAs) :mod:`biotite.application`
# offers several interfaces to MSA software.
# For our example we choose the software MUSCLE:
# The subpackage :mod:`biotite.application.muscle` contains the class
# :class:`MuscleApp` that does the job.
# You simply input the sequences you want to have aligned, run the
# application and get the resulting :class:`Alignment` object.
import biotite.application.muscle as muscle
import biotite.sequence as seq
seq1 = seq.ProteinSequence("BIQTITE")
seq2 = seq.ProteinSequence("TITANITE")
seq3 = seq.ProteinSequence("BISMITE")
seq4 = seq.ProteinSequence("IQLITE")
app = muscle.MuscleApp([seq1, seq2, seq3, seq4])
app.start()
app.join()
alignment = app.get_alignment()
print(alignment)
########################################################################
# In most MSA software even more information than the mere alignment can
# be extracted.
# For instance the guide tree, that was used for the alignment, can be
# obtained from the MUSCLE output.
import matplotlib.pyplot as plt
import biotite.sequence.graphics as graphics
tree = app.get_guide_tree()
fig, ax = plt.subplots()
graphics.plot_dendrogram(
ax, tree, labels=[str(sequence) for sequence in [seq1, seq2, seq3, seq4]]
)
ax.set_xlabel("Distance")
fig.tight_layout()
########################################################################
# For the lazy people there is also a convenience method,
# that handles the :class:`Application` execution internally.
# However, this shortcut returns only the :class:`Alignment`.
alignment = muscle.MuscleApp.align([seq1, seq2, seq3, seq4])
########################################################################
# The alternatives to MUSCLE are Clustal-Omega and MAFFT.
# To use them, simply replace :class:`MuscleApp` with
# :class:`ClustalOmegaApp` or :class:`MafftApp` and you are done.
import biotite.application.clustalo as clustalo
alignment = clustalo.ClustalOmegaApp.align([seq1, seq2, seq3, seq4])
print(alignment)
########################################################################
# As shown in the output, the alignment with Clustal-Omega slightly
# differs from the one performed with MUSCLE.
#
# If the MSA software supports protein sequence alignment AND
# custom substitution matrices, e.g. MUSCLE and MAFFT, almost any type
# of sequence can be aligned:
# Internally the sequences and the matrix are converted into protein
# sequences/matrix.
# Then the masquerading sequences are aligned via the software and
# finally the sequences are mapped back into the original sequence type.
# Let's show this on the example of a nonsense alphabet.
import numpy as np
import biotite.application.mafft as mafft
import biotite.sequence.align as align
alphabet = seq.Alphabet(("foo", "bar", 42))
sequences = [seq.GeneralSequence(alphabet, sequence) for sequence in [
["foo", "bar", 42, "foo", "foo", 42, 42],
["foo", 42, "foo", "bar", "foo", 42, 42],
]]
matrix = align.SubstitutionMatrix(
alphabet, alphabet, np.array([
[ 100, -100, -100],
[-100, 100, -100],
[-100, -100, 100]
])
)
alignment = mafft.MafftApp.align(sequences, matrix=matrix)
# As the alphabet do not has characters as symbols
# the alignment cannot be directly printed
# However, we can print the trace
print(alignment.trace)
########################################################################
# Secondary structure annotation
# ------------------------------
#
# .. currentmodule:: biotite.application.dssp
#
# Althogh :mod:`biotite.structure` offers the function
# :func:`annotate_sse()` to assign secondary structure elements based on
# the P-SEA algorithm, DSSP can also be used via the
# :mod:`biotite.application.dssp` subpackage (provided that DSSP is
# installed).
# Let us demonstrate this on the example of the good old miniprotein
# *TC5b*.
from tempfile import gettempdir
import biotite.database.rcsb as rcsb
import biotite.application.dssp as dssp
import biotite.structure.io as strucio
file_path = rcsb.fetch("1l2y", "mmtf", gettempdir())
stack = strucio.load_structure(file_path)
array = stack[0]
app = dssp.DsspApp(array)
app.start()
app.join()
sse = app.get_sse()
print(sse)
########################################################################
# Similar to the MSA examples, :class:`DsspApp` has the convenience
# method :func:`DsspApp.annotate_sse()` as shortcut.