Explicitly record EMBL/GenBank molecule type

biopython · Nov 24, 2016 · 2b06528 · 2b06528
1 parent eec4a79
commit 2b06528
Show file tree

Hide file tree

Showing 8 changed files with 123 additions and 15 deletions.
diff --git a/Bio/GenBank/Scanner.py b/Bio/GenBank/Scanner.py
@@ -636,6 +636,10 @@ def _feed_first_line(self, consumer, line):
             raise ValueError('Did not recognise the ID line layout:\n' + line)
 
     def _feed_first_line_patents(self, consumer, line):
+        # Old style EMBL patent records where ID line ended SQ
+        # Not 100% sure that PRT here is really molecule type and
+        # not the data file division...
+        #
         # Either Non-Redundant Level 1 database records,
         # ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size L1>
         # e.g. ID   NRP_AX000635; PRT; NR1; 15 SQ
@@ -647,7 +651,7 @@ def _feed_first_line_patents(self, consumer, line):
         fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip()[:-3].split(";")]
         assert len(fields) == 4
         consumer.locus(fields[0])
-        consumer.residue_type(fields[1])
+        consumer.residue_type(fields[1])  # semi-redundant
         consumer.data_file_division(fields[2])
         # TODO - Record cluster size?
 
@@ -673,6 +677,7 @@ def _feed_first_line_patents_kipo(self, consumer, line):
            3. Sequence length (e.g. '111 AA.')
         """
         consumer.locus(fields[0])  # Should we also call the accession consumer?
+        # consumer.molecule_type(fields[2])
         self._feed_seq_length(consumer, fields[3])
 
     def _feed_first_line_old(self, consumer, line):
@@ -698,8 +703,12 @@ def _feed_first_line_old(self, consumer, line):
         consumer.residue_type(fields[2])
         if "circular" in fields[2]:
             consumer.topology("circular")
+            consumer.molecule_type(fields[2].replace("circular", "").strip())
         elif "linear" in fields[2]:
             consumer.topology("linear")
+            consumer.molecule_type(fields[2].replace("linear", "").strip())
+        else:
+            consumer.molecule_type(fields[2].strip())
         consumer.data_file_division(fields[3])
         self._feed_seq_length(consumer, fields[4])
 
@@ -737,9 +746,10 @@ def _feed_first_line_new(self, consumer, line):
             consumer.version_suffix(version_parts[1])
 
         # Based on how the old GenBank parser worked, merge these two:
-        consumer.residue_type(" ".join(fields[2:4]))  # TODO - Store as two fields?
+        consumer.residue_type(" ".join(fields[2:4]))  # Semi-obsolete
 
         consumer.topology(fields[2])
+        consumer.molecule_type(fields[3])
 
         # consumer.xxx(fields[4]) # TODO - What should we do with the data class?
 
@@ -968,8 +978,12 @@ def _feed_first_line(self, consumer, line):
         consumer.residue_type(fields[3])
         if "circular" in fields[3]:
             consumer.topology("circular")
+            consumer.molecule_type(fields[3].replace("circular", "").strip())
         elif "linear" in fields[3]:
             consumer.topology("linear")
+            consumer.molecule_type(fields[3].replace("linear", "").strip())
+        else:
+            consumer.molecule_type(fields[3].strip())
         consumer.data_file_division(fields[4])
         self._feed_seq_length(consumer, fields[5])
 
@@ -1151,7 +1165,7 @@ def _feed_first_line(self, consumer, line):
             #    ??:??      space
             #    ??:29      Length of sequence, right-justified
             #    29:33      space, bp, space
-            #    33:41      strand type
+            #    33:41      strand type / molecule type, e.g. DNA
             #    41:42      space
             #    42:51      Blank (implies linear), linear or circular
             #    51:52      space
@@ -1206,6 +1220,7 @@ def _feed_first_line(self, consumer, line):
             else:
                 consumer.residue_type(line[33:51].strip())
 
+            consumer.molecule_type(line[33:41].strip())
             consumer.topology(line[42:51].strip())
             consumer.data_file_division(line[52:55])
             if line[62:73].strip():
@@ -1277,6 +1292,7 @@ def _feed_first_line(self, consumer, line):
             else:
                 consumer.residue_type(line[44:63].strip())
 
+            consumer.molecule_type(line[44:54].strip())
             consumer.topology(line[55:63].strip())
             consumer.data_file_division(line[64:67])
             if line[68:79].strip():

diff --git a/Bio/GenBank/__init__.py b/Bio/GenBank/__init__.py
@@ -659,14 +659,24 @@ def residue_type(self, type):
 
         This reflects the fact that the topology (linear/circular) and
         molecule type (e.g. DNA vs RNA) were a single field in early
-        GenBank files. Current GenBank/EMBL files have two fields.
+        files. Current GenBank/EMBL files have two fields.
         """
         self._seq_type = type.strip()
 
     def topology(self, topology):
+        """Record the topology (linear or circular as strings)."""
         if topology:
+            if topology not in ['linear', 'circular']:
+                raise ParserFailureError("Unexpected topology %r should be linear or circular" % topology)
             self.data.annotations['topology'] = topology
 
+    def molecule_type(self, mol_type):
+        """Record the molecule type (for round-trip etc)."""
+        if mol_type:
+            if "circular" in mol_type or 'linear' in mol_type:
+                raise ParserFailureError("Molecule type %r should not include topology" % mol_type)
+            self.data.annotations['molecule_type'] = mol_type
+
     def data_file_division(self, division):
         self.data.annotations['data_file_division'] = division
 

diff --git a/Bio/SeqIO/InsdcIO.py b/Bio/SeqIO/InsdcIO.py
@@ -608,8 +608,20 @@ def _write_the_first_line(self, record):
             raise ValueError("Need a Nucleotide or Protein alphabet")
 
         # Get the molecule type
-        # TODO - record this explicitly in the parser?
-        if isinstance(a, Alphabet.ProteinAlphabet):
+        mol_type = self._get_annotation_str(record, "molecule_type", default=None)
+        if mol_type and len(mol_type) > 7:
+            # Deal with common cases from EMBL to GenBank
+            mol_type = mol_type.replace("unassigned ", "").replace("genomic ", "")
+            if len(mol_type) > 7:
+                warnings.warn("Molecule type %r too long" % mol_type,
+                              BiopythonWarning)
+                mol_type = None
+        if mol_type == "protein":
+            mol_type = ""
+
+        if mol_type:
+            pass
+        elif isinstance(a, Alphabet.ProteinAlphabet):
             mol_type = ""
         elif isinstance(a, Alphabet.DNAAlphabet):
             mol_type = "DNA"
@@ -993,6 +1005,12 @@ def _write_the_first_lines(self, record):
             # Must be something like NucleotideAlphabet
             raise ValueError("Need a DNA, RNA or Protein alphabet")
 
+        if record.annotations.get("molecule_type", None):
+            # Note often get RNA vs DNA discrepancy in real EMBL/NCBI files
+            mol_type = record.annotations["molecule_type"]
+            if mol_type in ["protein"]:
+                mol_type = "PROTEIN"
+
         # Get the taxonomy division
         division = self._get_data_division(record)
 

diff --git a/Doc/Tutorial/chapter_seq_annot.tex b/Doc/Tutorial/chapter_seq_annot.tex
@@ -1028,7 +1028,7 @@ \section{Reverse-complementing SeqRecord objects}
 >>> from Bio import SeqIO
 >>> record = SeqIO.read("NC_005816.gb", "genbank")
 >>> print("%s %i %i %i %i" % (record.id, len(record), len(record.features), len(record.dbxrefs), len(record.annotations)))
-NC_005816.1 9609 41 1 12
+NC_005816.1 9609 41 1 13
 \end{verbatim}
 
 Here we take the reverse complement and specify a new identifier -- but notice

diff --git a/Tests/output/test_GenBank b/Tests/output/test_GenBank
@@ -19,6 +19,8 @@ Key: gi
 Value: 5453633
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: mRNA
 Key: organism
 Value: Homo sapiens
 Key: sequence_version
@@ -73,6 +75,8 @@ Key: gi
 Value: 16229
 Key: keywords
 Value: ['antifreeze protein homology', 'cold-regulated gene', 'cor6.6 gene', 'KIN1 homology']
+Key: molecule_type
+Value: mRNA
 Key: organism
 Value: Arabidopsis thaliana
 References*
@@ -138,6 +142,8 @@ Key: gi
 Value: 16353
 Key: keywords
 Value: ['kin2 gene']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Arabidopsis thaliana
 References*
@@ -262,6 +268,8 @@ Key: gi
 Value: 167145
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: mRNA
 Key: organism
 Value: Brassica napus
 References*
@@ -338,6 +346,8 @@ Key: gi
 Value: 4538892
 Key: keywords
 Value: ['cold shock protein', 'csp14 gene']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Armoracia rusticana
 References*
@@ -426,6 +436,8 @@ Key: gi
 Value: 1209261
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: mRNA
 Key: organism
 Value: Brassica rapa
 References*
@@ -482,6 +494,8 @@ Key: gi
 Value: 10121868
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Brassica napus
 References*
@@ -568,6 +582,8 @@ Key: gi
 Value: 5731880
 Key: keywords
 Value: ['FLI_CDNA']
+Key: molecule_type
+Value: mRNA
 Key: organism
 Value: Homo sapiens
 References*
@@ -644,6 +660,8 @@ Key: gi
 Value: 452475
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Homo sapiens
 References*
@@ -722,6 +740,8 @@ Key: gi
 Value: 6587720
 Key: keywords
 Value: ['HTG']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Arabidopsis thaliana
 References*
@@ -1104,6 +1124,8 @@ Key: gi
 Value: 6946668
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Drosophila melanogaster
 References*
@@ -1333,6 +1355,8 @@ Key: gi
 Value: 885676
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Homo sapiens
 References*
@@ -1432,6 +1456,8 @@ Key: gi
 Value: 16156830
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Homo sapiens
 References*
@@ -1502,6 +1528,8 @@ Key: gi
 Value: 13470324
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Mesorhizobium loti
 References*
@@ -1742,6 +1770,8 @@ Key: gi
 Value: 1769753
 Key: keywords
 Value: ['nonstructural protein 1']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Feline panleukopenia virus
 References*
@@ -1805,6 +1835,8 @@ Key: gi
 Value: 1769755
 Key: keywords
 Value: ['nonstructural protein 1']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Feline panleukopenia virus
 References*
@@ -1868,6 +1900,8 @@ Key: gi
 Value: 1769757
 Key: keywords
 Value: ['capsid protein 2']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Feline panleukopenia virus
 References*
@@ -1935,6 +1969,8 @@ Key: gi
 Value: 45478711
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Yersinia pestis biovar Microtus str. 91001
 References*
@@ -2331,6 +2367,8 @@ Key: gi
 Value: 15823953
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Streptomyces avermitilis
 Key: sequence_version
@@ -2366,6 +2404,8 @@ Key: gi
 Value: 15823953
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Streptomyces avermitilis
 Key: sequence_version
@@ -2436,6 +2476,8 @@ Key: gi
 Value: 15823953
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Streptomyces avermitilis
 Key: sequence_version
@@ -2472,6 +2514,8 @@ Key: gi
 Value: 15823953
 Key: keywords
 Value: ['']
+Key: molecule_type
+Value: DNA
 Key: organism
 Value: Streptomyces avermitilis
 Key: sequence_version

diff --git a/Tests/test_BioSQL_sqlite3.py b/Tests/test_BioSQL_sqlite3.py
@@ -63,6 +63,10 @@ def test_backwards_compatibility(self):
         biosql_records = [db.lookup(name=rec.name)
                           for rec in original_records]
         # And check they agree
+        # Note the old parser used to create BioSQL/cor6_6.db
+        # did not record the molecule_type, so remove it here:
+        for r in original_records:
+            del r.annotations["molecule_type"]
         self.assertTrue(compare_records(original_records, biosql_records))
         server.close()