/
convert.py
133 lines (122 loc) · 4.59 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.
# Override the module name with the dotted path of the subpackage
# (presumably so that the functions defined here are presented as
# members of the subpackage instead of this file -- TODO confirm)
__name__ = "biotite.sequence.io.gff"
__author__ = "Patrick Kunzmann"
# Names that are exported by a star import of this module
__all__ = ["get_annotation", "set_annotation"]
from ...annotation import Location, Feature, Annotation
def get_annotation(gff_file):
    """
    Build an :class:`Annotation` from the entries of a GFF3 file.

    The *type* column becomes the :attr:`Feature.key` attribute,
    each location (``loc``) is created from the *start*, *end* and
    *strand* columns and the parsed *attributes* column becomes the
    :attr:`Feature.qual` attribute.

    Consecutive entries sharing the same ``ID`` attribute are
    interpreted as a single feature:
    The *type* and *attributes* are taken from the first of these
    entries, while every entry contributes one location.
    Entries without an ``ID`` attribute always form a feature of
    their own.

    Parameters
    ----------
    gff_file : GFFFile
        The file to extract the :class:`Annotation` object from.

    Returns
    -------
    annotation : Annotation
        The extracted annotation.
    """
    annotation = Annotation()

    # State of the feature that is currently being assembled;
    # the key stays None until the first entry has been read
    pending_key = None
    pending_locs = None
    pending_qual = None
    previous_id = None

    for _, _, feature_type, start, end, _, strand, _, attrib in gff_file:
        entry_id = attrib.get("ID")
        if entry_id is not None and entry_id == previous_id:
            # Same ID as the preceding entry
            # -> the entry is just another location of the pending feature
            pending_locs.append(Location(start, end, strand))
        else:
            # The entry starts a new feature
            # -> store the pending feature, if there is one
            if pending_key is not None:
                annotation.add_feature(
                    Feature(pending_key, pending_locs, pending_qual)
                )
            pending_key = feature_type
            pending_locs = [Location(start, end, strand)]
            pending_qual = attrib
        previous_id = entry_id

    # The feature assembled last has not been stored yet
    if pending_key is not None:
        annotation.add_feature(
            Feature(pending_key, pending_locs, pending_qual)
        )
    return annotation
def set_annotation(gff_file, annotation,
                   seqid=None, source=None, is_stranded=True):
    """
    Write an :class:`Annotation` object into a GFF3 file.

    Each feature will get one entry for each location it has.
    :class:`Feature` objects with multiple locations require the ``ID``
    qualifier in its :attr:`Feature.qual` attribute.
    For features with the key ``CDS`` the *phase* column is computed
    from the lengths of the locations written before; all other
    features get no phase.

    Parameters
    ----------
    gff_file : GFFFile
        The GFF3 file to write into.
    annotation : Annotation
        The annotation which is written to the GFF3 file.
    seqid : str, optional
        The content for the *seqid* column.
        Must not contain any whitespace character.
    source : str, optional
        The content for the *source* column.
    is_stranded : bool, optional
        If true, the strand of each feature is taken into account.
        Otherwise the *strand* column is filled with '``.``'.

    Raises
    ------
    ValueError
        If `seqid` contains whitespace (raised before anything is
        written) or if a feature with multiple locations has no
        ``ID`` qualifier (entries of the features preceding the
        offending one have already been appended at that point).
    """
    # The seqid is identical for all entries, hence it is validated
    # only once and before any entry is appended.
    # Any whitespace character is rejected, not only the space
    # character, as GFF3 columns are separated by tabs
    if seqid is not None and any(char.isspace() for char in seqid):
        raise ValueError("The 'seqid' must not contain whitespaces")

    for feature in sorted(annotation):
        if len(feature.locs) > 1 and "ID" not in feature.qual:
            raise ValueError(
                "The 'ID' qualifier is required "
                "for features with multiple locations"
            )

        # The following properties are shared by all entries
        # for this feature
        key = feature.key
        score = None
        attributes = feature.qual
        # Expect same strandedness for all locations
        feature_strand = (
            list(feature.locs)[0].strand if is_stranded else None
        )

        # Locations are written in ascending order of their first
        # position, or in descending order for reverse strand features,
        # and the phase is accumulated in that order
        reverse_order = feature_strand == Location.Strand.REVERSE
        next_phase = 0
        # The following loop handles properties that change with each
        # location
        for loc in sorted(
            feature.locs, key=lambda loc: loc.first, reverse=reverse_order
        ):
            loc_strand = loc.strand if is_stranded else None
            if key == "CDS":
                phase = next_phase
                # Subtract the length of the location
                next_phase = (next_phase - (loc.last - loc.first + 1)) % 3
            else:
                phase = None
            gff_file.append(
                seqid, source, key, loc.first, loc.last,
                score, loc_strand, phase, attributes
            )