forked from Edinburgh-Genome-Foundry/DnaFeaturesViewer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
biotools.py
163 lines (127 loc) · 4.85 KB
/
biotools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import textwrap
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.PDB.Polypeptide import aa1, aa3
from Bio import SeqIO
try:
from BCBio import GFF
except ImportError:
class GFF:
def parse(*a):
"""Not available. Please install bcbio-gff."""
raise ImportError("Please install the bcbio-gff library to parse GFF data")
def complement(dna_sequence):
"""Return the complement of the DNA sequence.
For instance ``complement("ATGCCG")`` returns ``"TACGGC"``.
Uses BioPython for speed.
"""
return str(Seq(dna_sequence).complement())
def reverse_complement(sequence):
"""Return the reverse-complement of the DNA sequence.
For instance ``complement("ATGCCG")`` returns ``"GCCGTA"``.
Uses BioPython for speed.
"""
return complement(sequence)[::-1]
aa_short_to_long_form_dict = {
_aa1: _aa3[0] + _aa3[1:].lower() for (_aa1, _aa3) in zip(aa1 + "*", aa3 + ["*"])
}
def translate(dna_sequence, long_form=False):
"""Translate the DNA sequence into an amino-acids sequence MLKYQT...
If long_form is true, a list of 3-letter amino acid representations
is returned instead (['Ala', 'Ser', ...]).
"""
result = str(Seq(dna_sequence).translate())
if long_form:
result = [aa_short_to_long_form_dict[aa] for aa in result]
return result
def extract_graphical_translation(sequence, location, long_form=False):
"""Return a string of the "graphical" translation of a sequence's subsegment.
Here "graphical" means that the amino acid sequence is always given
left-to-right, as it will appear under the sequence in the plot. This matters
when the location is on the -1 strand. In this case, the amino-acids are
determined by (part of) the reverse-complement of the sequence, however
the sequence returned will be the mirror of the translated sequence, as
this is the left-to-right order in which the codons corresponding to the
amino-acids appear in the sequence.
Parameters
----------
sequence
An "ATGC" string.
location
Either (start, end) or (start, end, strand), with strand in (0, 1, -1).
long_form
if True, a list of 3-letter amino acid representations is returned instead
(['Ala', 'Ser', ...]).
"""
if len(location) == 3:
start, end, strand = location
else:
start, end = location
strand = 1
subsequence = sequence[start:end]
if strand == -1:
subsequence = reverse_complement(subsequence)
translation = translate(subsequence, long_form=long_form)
if strand == -1:
translation = translation[::-1]
return translation
def load_record(path):
"""Load a Genbank file."""
if isinstance(path, str):
# Input is a file path
if path.lower().endswith(".gff"):
return list(GFF.parse(path))[0]
else:
return SeqIO.read(path, "genbank")
else:
# Input is a file-like object
try:
return SeqIO.read(path, "genbank")
except:
path.seek(0)
return list(GFF.parse(path))[0]
def annotate_biopython_record(
seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers
):
"""Add a feature to a Biopython SeqRecord.
Parameters
----------
seqrecord
The biopython seqrecord to be annotated.
location
Either (start, end) or (start, end, strand). (strand defaults to +1)
feature_type
The type associated with the feature.
margin
Number of extra bases added on each side of the given location.
qualifiers
Dictionary that will be the Biopython feature's `qualifiers` attribute.
"""
if location == "full":
location = (margin, len(seqrecord) - margin)
strand = location[2] if len(location) == 3 else 1
seqrecord.features.append(
SeqFeature(
FeatureLocation(location[0], location[1], strand),
qualifiers=qualifiers,
type=feature_type,
)
)
def find_narrowest_text_wrap(text, max_line_length):
"""Wrap the text into a multi-line text minimizing the longest line length.
This is done by first wrapping the text using max_line_length, then
attempt new wraps by iteratively decreasing the line_length, as long as the
number of lines stays the same as with max_line_length.
"""
narrowest_wrap = textwrap.wrap(text, max_line_length)
narrowest_width = max([len(l) for l in narrowest_wrap])
for line_length in range(max_line_length - 1, 0, -1):
wrap = textwrap.wrap(text, line_length)
if len(wrap) <= len(narrowest_wrap):
width = max([len(l) for l in wrap])
if width < narrowest_width:
narrowest_wrap = wrap
narrowest_width = width
else:
break
return "\n".join(narrowest_wrap)