From c4babec9f54a125c1f93b73008c10a62f1cd71ee Mon Sep 17 00:00:00 2001 From: Thomas Schmitt Date: Mon, 4 Jul 2011 12:08:26 +0100 Subject: [PATCH] Adds support for SeqXML version 0.4 Fixes some problems in the SeqXML examples --- Bio/SeqIO/SeqXmlIO.py | 53 +++++++++++++++++++++++-------- Tests/SeqXML/corrupt_example1.xml | 2 +- Tests/SeqXML/corrupt_example2.xml | 2 +- Tests/SeqXML/dna_example.xml | 4 +-- Tests/SeqXML/protein_example.xml | 6 ++-- Tests/test_SeqIO_SeqXML.py | 20 ++++++++++-- 6 files changed, 63 insertions(+), 24 deletions(-) diff --git a/Bio/SeqIO/SeqXmlIO.py b/Bio/SeqIO/SeqXmlIO.py index e047d5820e6..8a0b084cf3e 100644 --- a/Bio/SeqIO/SeqXmlIO.py +++ b/Bio/SeqIO/SeqXmlIO.py @@ -101,6 +101,8 @@ def __init__(self,handle): self._source = None self._source_version = None self._version = None + self._speciesName = None + self._ncbiTaxId = None def _attr_seqXML(self,attr_dict,record): """Parse the document metadata.""" @@ -111,6 +113,10 @@ def _attr_seqXML(self,attr_dict,record): self._source_version = attr_dict["sourceVersion"] if "version" in attr_dict: self._version = attr_dict["seqXMLversion"] + if "ncbiTaxID" in attr_dict: + self._ncbiTaxId = attr_dict["ncbiTaxID"] + if "speciesName" in attr_dict: + self._speciesName = attr_dict["speciesName"] def _attr_property(self,attr_dict,record): """Parse key value pair properties and store them as annotations.""" @@ -149,7 +155,15 @@ def _attr_entry(self,attr_dict,record): record.annotations["source"] = attr_dict["source"] elif self._source != None: record.annotations["source"] = self._source - + + #initialize entry with global species definition + #the keywords for the species annotation are taken from SwissIO + if self._ncbiTaxId != None: + record.annotations["ncbi_taxid"] = self._ncbiTaxId + if self._speciesName != None: + record.annotations["organism"] = self._speciesName + + def _elem_DNAseq(self,node,record): """Parse DNA sequence.""" @@ -188,7 +202,6 @@ def _attr_DBRef(self,attr_dict,record): if "source" not in attr_dict or "id" not in attr_dict: raise ValueError("Invalid DB cross reference.") - #TODO add type if "%s:%s" % (attr_dict["source"],attr_dict["id"]) not in record.dbxrefs: record.dbxrefs.append("%s:%s" % (attr_dict["source"],attr_dict["id"]) ) @@ -202,29 +215,39 @@ class SeqXmlWriter(SequentialSequenceWriter): Bio.Alphapet.DNAAlphabet or Bio.Alphapet.ProteinAlphabet. """ - def __init__(self, handle,seqXML_version=None,source=None,source_version=None): + def __init__(self, handle,source=None,source_version=None,species=None,ncbiTaxId=None): """Create Object and start the xml generator.""" SequentialSequenceWriter.__init__(self, handle) self.xml_generator = XMLGenerator(handle, "utf-8") self.xml_generator.startDocument() - self.seqXML_version = seqXML_version self.source = source self.source_version = source_version + self.species = species + self.ncbiTaxId = ncbiTaxId def write_header(self): """Write root node with document metadata.""" SequentialSequenceWriter.write_header(self) attrs = {"xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance", - "xsi:noNamespaceSchemaLocation":"http://www.seqxml.org/0.3/seqxml.xsd"} - if self.seqXML_version != None: - attrs["seqXMLversion"] = self.seqXML_version + "xsi:noNamespaceSchemaLocation":"http://www.seqxml.org/0.4/seqxml.xsd", + "seqXMLversion":"0.4"} + if self.source != None: attrs["source"] = self.source if self.source_version != None: attrs["sourceVersion"] = self.source_ersion + if self.species != None: + if not isinstance(species,basestring): + raise TypeError("species should be of type string") + attrs["speciesName"] = self.species + if self.ncbiTaxId != None: + if not isinstance(self.ncbiTaxId,(basestring,int)): + raise TypeError("ncbiTaxID should be of type string or int") + attrs["ncbiTaxID"] = self.ncbiTaxId + self.xml_generator.startElement("seqXML", AttributesImpl(attrs)) @@ -271,10 +294,12 @@ def _write_species(self,record): if not isinstance(record.annotations["ncbi_taxid"],(basestring,int)): raise TypeError("ncbiTaxID should be of type string or int") + #The local species definition is only written if it differs from the global species definition + if record.annotations["organism"] != self.species or record.annotations["ncbi_taxid"] != self.ncbiTaxId: - attr = { "name" : record.annotations["organism"], "ncbiTaxID" :record.annotations["ncbi_taxid"] } - self.xml_generator.startElement("species",AttributesImpl(attr)) - self.xml_generator.endElement("species") + attr = { "name" : record.annotations["organism"], "ncbiTaxID" :record.annotations["ncbi_taxid"] } + self.xml_generator.startElement("species",AttributesImpl(attr)) + self.xml_generator.endElement("species") def _write_description(self,record): @@ -330,16 +355,14 @@ def _write_dbxrefs(self,record): for dbxref in record.dbxrefs: - #TODO add type if not isinstance(dbxref,basestring): - raise TypeError("dbxrefs should of type list of string") + raise TypeError("dbxrefs should be of type list of string") if dbxref.find(':') < 1: raise ValueError("dbxrefs should be in the form ['source:id', 'source:id' ]") dbsource,dbid = dbxref.split(':',1) - - attr = { "type" : "unknown", "source" : dbsource, "id" : dbid } + attr = { "source" : dbsource, "id" : dbid } self.xml_generator.startElement("DBRef",AttributesImpl(attr)) self.xml_generator.endElement("DBRef") @@ -385,8 +408,10 @@ def _write_properties(self,record): SeqIO.write(records,stringHandle,"seqxml") SeqIO.write(records,sys.stdout,"seqxml") + print stringHandle.seek(0) records = list(SeqIO.parse(stringHandle,"seqxml")) SeqIO.write(records,sys.stdout,"seqxml") + print diff --git a/Tests/SeqXML/corrupt_example1.xml b/Tests/SeqXML/corrupt_example1.xml index 852215e7952..0171503e696 100644 --- a/Tests/SeqXML/corrupt_example1.xml +++ b/Tests/SeqXML/corrupt_example1.xml @@ -1,5 +1,5 @@ - + /NCBI Tax id is missing diff --git a/Tests/SeqXML/corrupt_example2.xml b/Tests/SeqXML/corrupt_example2.xml index 475a933cfe9..a1f92e0a16c 100644 --- a/Tests/SeqXML/corrupt_example2.xml +++ b/Tests/SeqXML/corrupt_example2.xml @@ -1,5 +1,5 @@ - + id is missing diff --git a/Tests/SeqXML/dna_example.xml b/Tests/SeqXML/dna_example.xml index e874e375e69..85b2287630f 100644 --- a/Tests/SeqXML/dna_example.xml +++ b/Tests/SeqXML/dna_example.xml @@ -1,10 +1,10 @@ - + argininosuccinate synthetase 1 Gene [Source:MGI (curated);Acc:Ass1-001] CTTTGATTCACCATTTACTGGGAGCCCACGGCGATCTGGGCAGGTGCCAGGCCAGGACGTTAAAATGGCATCCCCAGCCATCCTTGGCCAACTCGTTTGCATTTTCTTGTGAACCCTTAGCTCTGGCTTGTGGGCTAGCATCAGGGAAAGCAAGAGCCTCTGAGGGGCAGATGCTTCAGTCTTTAAATCGAGTGGGGCTACACAGCCTCTGAACCTAAGCAATTTGCATCTTCACTCCCGCCTTGCAGCGCTTTCAAGGGTTCTTGCAAGACAGCTGCGATCAGCCTTGCTAGTCACGCGGGTTCCAACCTCTGCCGCCCTCCCAGGCGCCACCAGCGGGTCAGGATCAGGGGACCTAAATTGCAAGAGCCCACGCCCGTGCGCCCCCGGGAGGTGGTGCGGGCGCCAAGGGAAGAGGGAGGGGCCAGCGGGAGAGAGCGCGGGGCCACGCCCACCCGCAGGTGGCCTCGAGCCCTGAGAGGCTCGGAGCGGGGGCCGGGCCCGAGGGCGGGGCCTTTCTCGCGCCCGTCCCCACGTGTCCCAGGTTCCTGCCCCCCCCAGGCCCTGTGCTTATAACCCTGGATGCGCGCCTCTCTCAGCCCTGCTCCGCCGTCTGCCACTGCCGCCTGGGCTCACTGGTAAGAAGTTCCGGGCCCTTCTTGGTCCCCACTCTCTATCCAGGGTCTCATGACCAAGGAATCCCAGAGGGAGCGAGCAGCTGCAGGGGGTGGGCTCAGGGCAGCCCCCGAGTGACCGCGCAGTTAGTGGAAGGAAGGAGGCTGATGTCCGGGGCGCTGTGACGCACACGTTCCTCCTGCCTAGGGAACCCATGTAGGAGAACGGGTCCCAAGGTCTCCGGAGCAGGAGGACTATGGAGGTCCCCCTCTGGTTTTTCACGATCCTCTGTGTGCTGCGAAGGCTCCAAGTAGAGTCCCGGAGAGTCCAGAGGGTTGGGGGCTTCCGAATCCCTCTCCCTGGATCAGATTCGCCGATCTATGTTCAGGCTTGTTTCTCAAAGTAAGCAAAAGGTCCAGCCCCTCTTTGCCCTTAGGGGCTTGGGCGTGTGCCTTTCTCCTGAGTGTGACAGTACCCTGCTCTGGTGGACACGCGAACCCTGCCTACCCGTCCCGGGTCCTTCCTAGGCTGTGACTGGTTTCTCTTCTCCGTAGGGTGCAATGCCAGGCTCAGTCTGGGTGGGGCTTCTAGACAGGTTTCCTGCTAGCTGGCTGGCTCTTTGGGAGGAGCCCCTTCGCAGGAAATGTCTGTTTCTGGCTTGTTAACGGCTTGGGCCCTTCTCTGGAAGGCAGGCTAACCGAGAAATCGAGGGAAATCTTCTCCAGGAGAACTAACCGCGCTCTCAGTATTCCCCTCTGTGAAGTGGGTCTGTCAGGCTGTCTGCAAGTGCAGCTGGCAGGAGCAGACCTGCCCTTTCCCCAGGGGTAGAGTAGGAAAACCCTTCCTTTGATCCCTCGCTTTGCGGCTCCTCCACCTGGCAGGTGACCCCTGACGTCAGCCTCCCAGCTCCAGGGCTGCCAGCCTTGTGCCTGGACTCAGAACTCTTGCTGTCCAAGAGTTCAAGTTCAGTGAGCTGTTTTGAGAACTGTGGCCAGGGTCTGCCCACTTGCTCACTGGGTCTGGGATGAAGACCTGAAAAGACAGGGTCCCGTCCCCCCCCCCCCCCCGCAACCCTCCCCAGCCACCTACTCCCCCACTCCTCCACCCCCTCCCCCGCCCCAGACAAGGCTTTCAGAAGGAAGAAGACAGTGGAGAAACCTAAGAGCTGGGCTGGGCAGAGACTGGAATAGCCTCCATTTCAAACTCCCCCTACCCTCTCCCCCTCCCACCATAAACCAGCCAAGTCTGGGAGCATTGGAAGGGCCTCAGGGCTGCCCAAGGAATTCTGACTTTACCCAGTGCCCATGGAGTCCATAAATGAAGGGCAACTTCTGTCCACTACCCACCTCCTCCACGCACATAGTAGGACAGAACAGCTCTTAGCACCCTTGGTCAAGTGTGCCCCTCACCCCAGCTGCCTGCCTGCTCTGAGCTGAGGAAGTCAGACACTCCAGGAGCCTTGGGGTGGAGCCACGCCCCGGCCCACCTGCTGCATTTTCTCAGGGCCCATCTCATTGACCTTGGACGCCCCTAGGTCAGCCAGATCCCTCCACCAGGCTGTGAACACACTGCTTCTGGAGTGAGTGCCTTCCTTGCGGGCAGCCCTGCAGGAAGAGATACCTTGCTGAACTCTGGGTCTGAGGCATCGGTTTTCACTGAAGGTTTTGGGCAGAGTTCACTTCCCAGGACCTGGCTCCCTGTTCTTTTGAGGCTGGTTCCTCCCTAGACCTGGAAAATGGCTGGGCACCCAGAGCCTCAGTTGGTAGCATTAATGAGTGGACTGGTGTGTTCTTCTAGTGTTACTGGAGCTGTTAGCATCAGAGCACGGGCTCTGGCTGTCCCTGA - + diff --git a/Tests/SeqXML/protein_example.xml b/Tests/SeqXML/protein_example.xml index e95e86d09d8..f842a9e1287 100644 --- a/Tests/SeqXML/protein_example.xml +++ b/Tests/SeqXML/protein_example.xml @@ -1,5 +1,5 @@ - + argininosuccinate synthetase 1 Gene [Source:MGI (curated);Acc:Ass1-001] @@ -18,11 +18,11 @@ duplicated property/alternativeID GAKKVFIEDVSKEFVEEFIWPAVQSSALYE + + - - S diff --git a/Tests/test_SeqIO_SeqXML.py b/Tests/test_SeqIO_SeqXML.py index 80471f58dc5..e6d3828b06c 100644 --- a/Tests/test_SeqIO_SeqXML.py +++ b/Tests/test_SeqIO_SeqXML.py @@ -13,6 +13,7 @@ "dna" : ["SeqXML/dna_example.xml",4,None], "rna" : ["SeqXML/rna_example.xml",5,None], "protein" : ["SeqXML/protein_example.xml",5,None], + "globalSpecies" : ["SeqXML/global_species_example.xml",2,None], } corrupt_files = {"corrupt1" : ["SeqXML/corrupt_example1.xml",None], @@ -92,8 +93,8 @@ def test_duplicated_property(self): self.assertEqual(self.records["protein"][2].annotations["test"],[u"1",u"2",u"3"]) - def test_duplicated_alternativeID(self): - """Read multiple alternative identifier form single source""" + def test_duplicated_dbxref(self): + """Read multiple cross references to a single source""" self.assertEqual(self.records["protein"][2].dbxrefs,[u"someDB:G001",u"someDB:G002"]) @@ -107,7 +108,7 @@ def test_read_minial_required(self): self.assertEqual(self.records["rna"][3].dbxrefs,minimalRecord.dbxrefs) self.assertEqual(self.records["protein"][3].description,minimalRecord.description) - def test_species(self): + def test_local_species(self): self.assertEqual(self.records["rna"][1].annotations["organism"],"Mus musculus") self.assertEqual(self.records["rna"][1].annotations["ncbi_taxid"],"10090") @@ -115,6 +116,14 @@ def test_species(self): self.assertEqual(self.records["rna"][0].annotations["organism"],"Gallus gallus") self.assertEqual(self.records["rna"][0].annotations["ncbi_taxid"],"9031") + def test_global_species(self): + + self.assertEqual(self.records["globalSpecies"][0].annotations["organism"],"Mus musculus") + self.assertEqual(self.records["globalSpecies"][0].annotations["ncbi_taxid"],"10090") + + self.assertEqual(self.records["globalSpecies"][1].annotations["organism"],"Homo sapiens") + self.assertEqual(self.records["globalSpecies"][1].annotations["ncbi_taxid"],"9606") + def test_local_source_definition(self): @@ -148,6 +157,11 @@ def test_read_write_protein(self): read1_records = list(SeqIO.parse(test_files["protein"][2],"seqxml")) self._write_parse_and_compare(read1_records) + def test_read_write_globalSpecies(self): + + read1_records = list(SeqIO.parse(test_files["globalSpecies"][2],"seqxml")) + self._write_parse_and_compare(read1_records) + def _write_parse_and_compare(self,read1_records):