Skip to content

Commit

Permalink
Explicitly record EMBL/GenBank molecule type
Browse files Browse the repository at this point in the history
  • Loading branch information
peterjc committed Nov 24, 2016
1 parent eec4a79 commit 2b06528
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 15 deletions.
22 changes: 19 additions & 3 deletions Bio/GenBank/Scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,10 @@ def _feed_first_line(self, consumer, line):
raise ValueError('Did not recognise the ID line layout:\n' + line)

def _feed_first_line_patents(self, consumer, line):
# Old style EMBL patent records where ID line ended SQ
# Not 100% sure that PRT here is really molecule type and
# not the data file division...
#
# Either Non-Redundant Level 1 database records,
# ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size L1>
# e.g. ID NRP_AX000635; PRT; NR1; 15 SQ
Expand All @@ -647,7 +651,7 @@ def _feed_first_line_patents(self, consumer, line):
fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip()[:-3].split(";")]
assert len(fields) == 4
consumer.locus(fields[0])
consumer.residue_type(fields[1])
consumer.residue_type(fields[1]) # semi-redundant
consumer.data_file_division(fields[2])
# TODO - Record cluster size?

Expand All @@ -673,6 +677,7 @@ def _feed_first_line_patents_kipo(self, consumer, line):
3. Sequence length (e.g. '111 AA.')
"""
consumer.locus(fields[0]) # Should we also call the accession consumer?
# consumer.molecule_type(fields[2])
self._feed_seq_length(consumer, fields[3])

def _feed_first_line_old(self, consumer, line):
Expand All @@ -698,8 +703,12 @@ def _feed_first_line_old(self, consumer, line):
consumer.residue_type(fields[2])
if "circular" in fields[2]:
consumer.topology("circular")
consumer.molecule_type(fields[2].replace("circular", "").strip())
elif "linear" in fields[2]:
consumer.topology("linear")
consumer.molecule_type(fields[2].replace("linear", "").strip())
else:
consumer.molecule_type(fields[2].strip())
consumer.data_file_division(fields[3])
self._feed_seq_length(consumer, fields[4])

Expand Down Expand Up @@ -737,9 +746,10 @@ def _feed_first_line_new(self, consumer, line):
consumer.version_suffix(version_parts[1])

# Based on how the old GenBank parser worked, merge these two:
consumer.residue_type(" ".join(fields[2:4])) # TODO - Store as two fields?
consumer.residue_type(" ".join(fields[2:4])) # Semi-obsolete

consumer.topology(fields[2])
consumer.molecule_type(fields[3])

# consumer.xxx(fields[4]) # TODO - What should we do with the data class?

Expand Down Expand Up @@ -968,8 +978,12 @@ def _feed_first_line(self, consumer, line):
consumer.residue_type(fields[3])
if "circular" in fields[3]:
consumer.topology("circular")
consumer.molecule_type(fields[3].replace("circular", "").strip())
elif "linear" in fields[3]:
consumer.topology("linear")
consumer.molecule_type(fields[3].replace("linear", "").strip())
else:
consumer.molecule_type(fields[3].strip())
consumer.data_file_division(fields[4])
self._feed_seq_length(consumer, fields[5])

Expand Down Expand Up @@ -1151,7 +1165,7 @@ def _feed_first_line(self, consumer, line):
# ??:?? space
# ??:29 Length of sequence, right-justified
# 29:33 space, bp, space
# 33:41 strand type
# 33:41 strand type / molecule type, e.g. DNA
# 41:42 space
# 42:51 Blank (implies linear), linear or circular
# 51:52 space
Expand Down Expand Up @@ -1206,6 +1220,7 @@ def _feed_first_line(self, consumer, line):
else:
consumer.residue_type(line[33:51].strip())

consumer.molecule_type(line[33:41].strip())
consumer.topology(line[42:51].strip())
consumer.data_file_division(line[52:55])
if line[62:73].strip():
Expand Down Expand Up @@ -1277,6 +1292,7 @@ def _feed_first_line(self, consumer, line):
else:
consumer.residue_type(line[44:63].strip())

consumer.molecule_type(line[44:54].strip())
consumer.topology(line[55:63].strip())
consumer.data_file_division(line[64:67])
if line[68:79].strip():
Expand Down
12 changes: 11 additions & 1 deletion Bio/GenBank/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,14 +659,24 @@ def residue_type(self, type):
This reflects the fact that the topology (linear/circular) and
molecule type (e.g. DNA vs RNA) were a single field in early
GenBank files. Current GenBank/EMBL files have two fields.
files. Current GenBank/EMBL files have two fields.
"""
self._seq_type = type.strip()

def topology(self, topology):
"""Record the topology (linear or circular as strings)."""
if topology:
if topology not in ['linear', 'circular']:
raise ParserFailureError("Unexpected topology %r should be linear or circular" % topology)
self.data.annotations['topology'] = topology

def molecule_type(self, mol_type):
"""Record the molecule type (for round-trip etc)."""
if mol_type:
if "circular" in mol_type or 'linear' in mol_type:
raise ParserFailureError("Molecule type %r should not include topology" % mol_type)
self.data.annotations['molecule_type'] = mol_type

def data_file_division(self, division):
self.data.annotations['data_file_division'] = division

Expand Down
22 changes: 20 additions & 2 deletions Bio/SeqIO/InsdcIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,8 +608,20 @@ def _write_the_first_line(self, record):
raise ValueError("Need a Nucleotide or Protein alphabet")

# Get the molecule type
# TODO - record this explicitly in the parser?
if isinstance(a, Alphabet.ProteinAlphabet):
mol_type = self._get_annotation_str(record, "molecule_type", default=None)
if mol_type and len(mol_type) > 7:
# Deal with common cases from EMBL to GenBank
mol_type = mol_type.replace("unassigned ", "").replace("genomic ", "")
if len(mol_type) > 7:
warnings.warn("Molecule type %r too long" % mol_type,
BiopythonWarning)
mol_type = None
if mol_type == "protein":
mol_type = ""

if mol_type:
pass
elif isinstance(a, Alphabet.ProteinAlphabet):
mol_type = ""
elif isinstance(a, Alphabet.DNAAlphabet):
mol_type = "DNA"
Expand Down Expand Up @@ -993,6 +1005,12 @@ def _write_the_first_lines(self, record):
# Must be something like NucleotideAlphabet
raise ValueError("Need a DNA, RNA or Protein alphabet")

if record.annotations.get("molecule_type", None):
# Note often get RNA vs DNA discrepancy in real EMBL/NCBI files
mol_type = record.annotations["molecule_type"]
if mol_type in ["protein"]:
mol_type = "PROTEIN"

# Get the taxonomy division
division = self._get_data_division(record)

Expand Down
2 changes: 1 addition & 1 deletion Doc/Tutorial/chapter_seq_annot.tex
Original file line number Diff line number Diff line change
Expand Up @@ -1028,7 +1028,7 @@ \section{Reverse-complementing SeqRecord objects}
>>> from Bio import SeqIO
>>> record = SeqIO.read("NC_005816.gb", "genbank")
>>> print("%s %i %i %i %i" % (record.id, len(record), len(record.features), len(record.dbxrefs), len(record.annotations)))
NC_005816.1 9609 41 1 12
NC_005816.1 9609 41 1 13
\end{verbatim}

Here we take the reverse complement and specify a new identifier -- but notice
Expand Down
44 changes: 44 additions & 0 deletions Tests/output/test_GenBank
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ Key: gi
Value: 5453633
Key: keywords
Value: ['']
Key: molecule_type
Value: mRNA
Key: organism
Value: Homo sapiens
Key: sequence_version
Expand Down Expand Up @@ -73,6 +75,8 @@ Key: gi
Value: 16229
Key: keywords
Value: ['antifreeze protein homology', 'cold-regulated gene', 'cor6.6 gene', 'KIN1 homology']
Key: molecule_type
Value: mRNA
Key: organism
Value: Arabidopsis thaliana
References*
Expand Down Expand Up @@ -138,6 +142,8 @@ Key: gi
Value: 16353
Key: keywords
Value: ['kin2 gene']
Key: molecule_type
Value: DNA
Key: organism
Value: Arabidopsis thaliana
References*
Expand Down Expand Up @@ -262,6 +268,8 @@ Key: gi
Value: 167145
Key: keywords
Value: ['']
Key: molecule_type
Value: mRNA
Key: organism
Value: Brassica napus
References*
Expand Down Expand Up @@ -338,6 +346,8 @@ Key: gi
Value: 4538892
Key: keywords
Value: ['cold shock protein', 'csp14 gene']
Key: molecule_type
Value: DNA
Key: organism
Value: Armoracia rusticana
References*
Expand Down Expand Up @@ -426,6 +436,8 @@ Key: gi
Value: 1209261
Key: keywords
Value: ['']
Key: molecule_type
Value: mRNA
Key: organism
Value: Brassica rapa
References*
Expand Down Expand Up @@ -482,6 +494,8 @@ Key: gi
Value: 10121868
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Brassica napus
References*
Expand Down Expand Up @@ -568,6 +582,8 @@ Key: gi
Value: 5731880
Key: keywords
Value: ['FLI_CDNA']
Key: molecule_type
Value: mRNA
Key: organism
Value: Homo sapiens
References*
Expand Down Expand Up @@ -644,6 +660,8 @@ Key: gi
Value: 452475
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Homo sapiens
References*
Expand Down Expand Up @@ -722,6 +740,8 @@ Key: gi
Value: 6587720
Key: keywords
Value: ['HTG']
Key: molecule_type
Value: DNA
Key: organism
Value: Arabidopsis thaliana
References*
Expand Down Expand Up @@ -1104,6 +1124,8 @@ Key: gi
Value: 6946668
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Drosophila melanogaster
References*
Expand Down Expand Up @@ -1333,6 +1355,8 @@ Key: gi
Value: 885676
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Homo sapiens
References*
Expand Down Expand Up @@ -1432,6 +1456,8 @@ Key: gi
Value: 16156830
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Homo sapiens
References*
Expand Down Expand Up @@ -1502,6 +1528,8 @@ Key: gi
Value: 13470324
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Mesorhizobium loti
References*
Expand Down Expand Up @@ -1742,6 +1770,8 @@ Key: gi
Value: 1769753
Key: keywords
Value: ['nonstructural protein 1']
Key: molecule_type
Value: DNA
Key: organism
Value: Feline panleukopenia virus
References*
Expand Down Expand Up @@ -1805,6 +1835,8 @@ Key: gi
Value: 1769755
Key: keywords
Value: ['nonstructural protein 1']
Key: molecule_type
Value: DNA
Key: organism
Value: Feline panleukopenia virus
References*
Expand Down Expand Up @@ -1868,6 +1900,8 @@ Key: gi
Value: 1769757
Key: keywords
Value: ['capsid protein 2']
Key: molecule_type
Value: DNA
Key: organism
Value: Feline panleukopenia virus
References*
Expand Down Expand Up @@ -1935,6 +1969,8 @@ Key: gi
Value: 45478711
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Yersinia pestis biovar Microtus str. 91001
References*
Expand Down Expand Up @@ -2331,6 +2367,8 @@ Key: gi
Value: 15823953
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Streptomyces avermitilis
Key: sequence_version
Expand Down Expand Up @@ -2366,6 +2404,8 @@ Key: gi
Value: 15823953
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Streptomyces avermitilis
Key: sequence_version
Expand Down Expand Up @@ -2436,6 +2476,8 @@ Key: gi
Value: 15823953
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Streptomyces avermitilis
Key: sequence_version
Expand Down Expand Up @@ -2472,6 +2514,8 @@ Key: gi
Value: 15823953
Key: keywords
Value: ['']
Key: molecule_type
Value: DNA
Key: organism
Value: Streptomyces avermitilis
Key: sequence_version
Expand Down
4 changes: 4 additions & 0 deletions Tests/test_BioSQL_sqlite3.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ def test_backwards_compatibility(self):
biosql_records = [db.lookup(name=rec.name)
for rec in original_records]
# And check they agree
# Note the old parser used to create BioSQL/cor6_6.db
# did not record the molecule_type, so remove it here:
for r in original_records:
del r.annotations["molecule_type"]
self.assertTrue(compare_records(original_records, biosql_records))
server.close()

Expand Down
Loading

0 comments on commit 2b06528

Please sign in to comment.