Skip to content

Commit

Permalink
Adds support for SeqXML version 0.4 Fixes some problems in the SeqXML…
Browse files Browse the repository at this point in the history
… examples
  • Loading branch information
wurstbonbon authored and peterjc committed Jul 14, 2011
1 parent e30fb6c commit c4babec
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 24 deletions.
53 changes: 39 additions & 14 deletions Bio/SeqIO/SeqXmlIO.py
Expand Up @@ -101,6 +101,8 @@ def __init__(self,handle):
self._source = None
self._source_version = None
self._version = None
self._speciesName = None
self._ncbiTaxId = None

def _attr_seqXML(self,attr_dict,record):
"""Parse the document metadata."""
Expand All @@ -111,6 +113,10 @@ def _attr_seqXML(self,attr_dict,record):
self._source_version = attr_dict["sourceVersion"]
if "version" in attr_dict:
self._version = attr_dict["seqXMLversion"]
if "ncbiTaxID" in attr_dict:
self._ncbiTaxId = attr_dict["ncbiTaxID"]
if "speciesName" in attr_dict:
self._speciesName = attr_dict["speciesName"]

def _attr_property(self,attr_dict,record):
"""Parse key value pair properties and store them as annotations."""
Expand Down Expand Up @@ -149,7 +155,15 @@ def _attr_entry(self,attr_dict,record):
record.annotations["source"] = attr_dict["source"]
elif self._source != None:
record.annotations["source"] = self._source


#initialize entry with global species definition
#the keywords for the species annotation are taken from SwissIO
if self._ncbiTaxId != None:
record.annotations["ncbi_taxid"] = self._ncbiTaxId
if self._speciesName != None:
record.annotations["organism"] = self._speciesName


def _elem_DNAseq(self,node,record):
"""Parse DNA sequence."""

Expand Down Expand Up @@ -188,7 +202,6 @@ def _attr_DBRef(self,attr_dict,record):
if "source" not in attr_dict or "id" not in attr_dict:
raise ValueError("Invalid DB cross reference.")

#TODO add type
if "%s:%s" % (attr_dict["source"],attr_dict["id"]) not in record.dbxrefs:
record.dbxrefs.append("%s:%s" % (attr_dict["source"],attr_dict["id"]) )

Expand All @@ -202,29 +215,39 @@ class SeqXmlWriter(SequentialSequenceWriter):
Bio.Alphabet.DNAAlphabet or Bio.Alphabet.ProteinAlphabet.
"""

def __init__(self, handle,seqXML_version=None,source=None,source_version=None):
def __init__(self, handle,source=None,source_version=None,species=None,ncbiTaxId=None):
"""Create Object and start the xml generator."""

SequentialSequenceWriter.__init__(self, handle)

self.xml_generator = XMLGenerator(handle, "utf-8")
self.xml_generator.startDocument()
self.seqXML_version = seqXML_version
self.source = source
self.source_version = source_version
self.species = species
self.ncbiTaxId = ncbiTaxId

def write_header(self):
    """Write the root ``seqXML`` element with the document metadata.

    The schema location and ``seqXMLversion`` are fixed to SeqXML 0.4.
    The optional attributes ``source``, ``sourceVersion``, ``speciesName``
    and ``ncbiTaxID`` are only written when the corresponding writer
    attribute is not None.

    Raises:
        TypeError: if ``self.species`` is not a string, or if
            ``self.ncbiTaxId`` is neither a string nor an int.
    """
    SequentialSequenceWriter.write_header(self)

    attrs = {
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:noNamespaceSchemaLocation": "http://www.seqxml.org/0.4/seqxml.xsd",
        "seqXMLversion": "0.4",
    }

    if self.source is not None:
        attrs["source"] = self.source

    if self.source_version is not None:
        # The attribute is ``source_version``; a misspelt name here raises
        # AttributeError as soon as a source version is supplied.
        attrs["sourceVersion"] = self.source_version

    if self.species is not None:
        # Validate the instance attribute; there is no local ``species``.
        if not isinstance(self.species, basestring):
            raise TypeError("species should be of type string")
        attrs["speciesName"] = self.species

    if self.ncbiTaxId is not None:
        if not isinstance(self.ncbiTaxId, (basestring, int)):
            raise TypeError("ncbiTaxID should be of type string or int")
        tax_id = self.ncbiTaxId
        if isinstance(tax_id, int):
            # XMLGenerator escapes attribute values with string methods,
            # so an int accepted by the check above must be converted.
            tax_id = str(tax_id)
        attrs["ncbiTaxID"] = tax_id

    self.xml_generator.startElement("seqXML", AttributesImpl(attrs))


Expand Down Expand Up @@ -271,10 +294,12 @@ def _write_species(self,record):
if not isinstance(record.annotations["ncbi_taxid"],(basestring,int)):
raise TypeError("ncbiTaxID should be of type string or int")

#The local species definition is only written if it differs from the global species definition
if record.annotations["organism"] != self.species or record.annotations["ncbi_taxid"] != self.ncbiTaxId:

attr = { "name" : record.annotations["organism"], "ncbiTaxID" :record.annotations["ncbi_taxid"] }
self.xml_generator.startElement("species",AttributesImpl(attr))
self.xml_generator.endElement("species")
attr = { "name" : record.annotations["organism"], "ncbiTaxID" :record.annotations["ncbi_taxid"] }
self.xml_generator.startElement("species",AttributesImpl(attr))
self.xml_generator.endElement("species")


def _write_description(self,record):
Expand Down Expand Up @@ -330,16 +355,14 @@ def _write_dbxrefs(self,record):

for dbxref in record.dbxrefs:

#TODO add type
if not isinstance(dbxref,basestring):
raise TypeError("dbxrefs should of type list of string")
raise TypeError("dbxrefs should be of type list of string")
if dbxref.find(':') < 1:
raise ValueError("dbxrefs should be in the form ['source:id', 'source:id' ]")

dbsource,dbid = dbxref.split(':',1)


attr = { "type" : "unknown", "source" : dbsource, "id" : dbid }
attr = { "source" : dbsource, "id" : dbid }
self.xml_generator.startElement("DBRef",AttributesImpl(attr))
self.xml_generator.endElement("DBRef")

Expand Down Expand Up @@ -385,8 +408,10 @@ def _write_properties(self,record):

SeqIO.write(records,stringHandle,"seqxml")
SeqIO.write(records,sys.stdout,"seqxml")
print

stringHandle.seek(0)
records = list(SeqIO.parse(stringHandle,"seqxml"))

SeqIO.write(records,sys.stdout,"seqxml")
print
2 changes: 1 addition & 1 deletion Tests/SeqXML/corrupt_example1.xml
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.3/seqxml.xsd">
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.4/seqxml.xsd">
<entry id="ENSMUSG00000076441">
<species name="Mus musculus"/>
<description>/NCBI Tax id is missing</description>
Expand Down
2 changes: 1 addition & 1 deletion Tests/SeqXML/corrupt_example2.xml
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.2/seqxml.xsd">
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.4/seqxml.xsd">
<entry>
<species name="Homo sapiens" ncbiTaxID="9606"/>
<description>id is missing</description>
Expand Down
4 changes: 2 additions & 2 deletions Tests/SeqXML/dna_example.xml
@@ -1,10 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.3/seqxml.xsd">
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.4/seqxml.xsd">
<entry id="ENSMUSG00000076441">
<species name="Mus musculus" ncbiTaxID="10090"/>
<description>argininosuccinate synthetase 1 Gene [Source:MGI (curated);Acc:Ass1-001]</description>
<DNAseq>CTTTGATTCACCATTTACTGGGAGCCCACGGCGATCTGGGCAGGTGCCAGGCCAGGACGTTAAAATGGCATCCCCAGCCATCCTTGGCCAACTCGTTTGCATTTTCTTGTGAACCCTTAGCTCTGGCTTGTGGGCTAGCATCAGGGAAAGCAAGAGCCTCTGAGGGGCAGATGCTTCAGTCTTTAAATCGAGTGGGGCTACACAGCCTCTGAACCTAAGCAATTTGCATCTTCACTCCCGCCTTGCAGCGCTTTCAAGGGTTCTTGCAAGACAGCTGCGATCAGCCTTGCTAGTCACGCGGGTTCCAACCTCTGCCGCCCTCCCAGGCGCCACCAGCGGGTCAGGATCAGGGGACCTAAATTGCAAGAGCCCACGCCCGTGCGCCCCCGGGAGGTGGTGCGGGCGCCAAGGGAAGAGGGAGGGGCCAGCGGGAGAGAGCGCGGGGCCACGCCCACCCGCAGGTGGCCTCGAGCCCTGAGAGGCTCGGAGCGGGGGCCGGGCCCGAGGGCGGGGCCTTTCTCGCGCCCGTCCCCACGTGTCCCAGGTTCCTGCCCCCCCCAGGCCCTGTGCTTATAACCCTGGATGCGCGCCTCTCTCAGCCCTGCTCCGCCGTCTGCCACTGCCGCCTGGGCTCACTGGTAAGAAGTTCCGGGCCCTTCTTGGTCCCCACTCTCTATCCAGGGTCTCATGACCAAGGAATCCCAGAGGGAGCGAGCAGCTGCAGGGGGTGGGCTCAGGGCAGCCCCCGAGTGACCGCGCAGTTAGTGGAAGGAAGGAGGCTGATGTCCGGGGCGCTGTGACGCACACGTTCCTCCTGCCTAGGGAACCCATGTAGGAGAACGGGTCCCAAGGTCTCCGGAGCAGGAGGACTATGGAGGTCCCCCTCTGGTTTTTCACGATCCTCTGTGTGCTGCGAAGGCTCCAAGTAGAGTCCCGGAGAGTCCAGAGGGTTGGGGGCTTCCGAATCCCTCTCCCTGGATCAGATTCGCCGATCTATGTTCAGGCTTGTTTCTCAAAGTAAGCAAAAGGTCCAGCCCCTCTTTGCCCTTAGGGGCTTGGGCGTGTGCCTTTCTCCTGAGTGTGACAGTACCCTGCTCTGGTGGACACGCGAACCCTGCCTACCCGTCCCGGGTCCTTCCTAGGCTGTGACTGGTTTCTCTTCTCCGTAGGGTGCAATGCCAGGCTCAGTCTGGGTGGGGCTTCTAGACAGGTTTCCTGCTAGCTGGCTGGCTCTTTGGGAGGAGCCCCTTCGCAGGAAATGTCTGTTTCTGGCTTGTTAACGGCTTGGGCCCTTCTCTGGAAGGCAGGCTAACCGAGAAATCGAGGGAAATCTTCTCCAGGAGAACTAACCGCGCTCTCAGTATTCCCCTCTGTGAAGTGGGTCTGTCAGGCTGTCTGCAAGTGCAGCTGGCAGGAGCAGACCTGCCCTTTCCCCAGGGGTAGAGTAGGAAAACCCTTCCTTTGATCCCTCGCTTTGCGGCTCCTCCACCTGGCAGGTGACCCCTGACGTCAGCCTCCCAGCTCCAGGGCTGCCAGCCTTGTGCCTGGACTCAGAACTCTTGCTGTCCAAGAGTTCAAGTTCAGTGAGCTGTTTTGAGAACTGTGGCCAGGGTCTGCCCACTTGCTCACTGGGTCTGGGATGAAGACCTGAAAAGACAGGGTCCCGTCCCCCCCCCCCCCCCGCAACCCTCCCCAGCCACCTACTCCCCCACTCCTCCACCCCCTCCCCCGCCCCAGACAAGGCTTTCAGAAGGAAGAAGACAGTGGAGAAACCTAAGAGCTGGGCTGGGCAGAGACTGGAATAGCCTCCATTTCAAACTCCCCCTACCCTCTCCCCCTCCCACCATAAACCAGCCAAGTCTGGGAGCATTGGAAGGGCCTCAGGGCTGCCCAAGGAATTCTGACTTTACCCAGTGCCCATGGAGTCCATAAATGAAGGGCAACTTCTGTCCACTACCCACCTCCTCCACGCACATAGTAGGACAGAACAGC
TCTTAGCACCCTTGGTCAAGTGTGCCCCTCACCCCAGCTGCCTGCCTGCTCTGAGCTGAGGAAGTCAGACACTCCAGGAGCCTTGGGGTGGAGCCACGCCCCGGCCCACCTGCTGCATTTTCTCAGGGCCCATCTCATTGACCTTGGACGCCCCTAGGTCAGCCAGATCCCTCCACCAGGCTGTGAACACACTGCTTCTGGAGTGAGTGCCTTCCTTGCGGGCAGCCCTGCAGGAAGAGATACCTTGCTGAACTCTGGGTCTGAGGCATCGGTTTTCACTGAAGGTTTTGGGCAGAGTTCACTTCCCAGGACCTGGCTCCCTGTTCTTTTGAGGCTGGTTCCTCCCTAGACCTGGAAAATGGCTGGGCACCCAGAGCCTCAGTTGGTAGCATTAATGAGTGGACTGGTGTGTTCTTCTAGTGTTACTGGAGCTGTTAGCATCAGAGCACGGGCTCTGGCTGTCCCTGA</DNAseq>
<alternativeID source="UCSC" id="uc008jdu.1"/>
<DBRef source="UCSC" id="uc008jdu.1"/>
</entry>
<entry id="fake1">
<species name="Mus musculus" ncbiTaxID="10090"/>
Expand Down
6 changes: 3 additions & 3 deletions Tests/SeqXML/protein_example.xml
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.3/seqxml.xsd">
<seqXML source="Ensembl" sourceVersion="56" seqXMLversion="0.4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.seqxml.org/0.4/seqxml.xsd">
<entry id="ENSMUSP00000099904">
<species name="Mus musculus" ncbiTaxID="10090"/>
<description>argininosuccinate synthetase 1 Gene [Source:MGI (curated);Acc:Ass1-001]</description>
Expand All @@ -18,11 +18,11 @@
<species name="Homo sapiens" ncbiTaxID="9606"/>
<description>duplicated property/alternativeID</description>
<AAseq>GAKKVFIEDVSKEFVEEFIWPAVQSSALYE</AAseq>
<DBRef source="someDB" id="G001"/>
<DBRef source="someDB" id="G002"/>
<property name="test" value="1"/>
<property name="test" value="2"/>
<property name="test" value="3"/>
<DBRef source="someDB" id="G001"/>
<DBRef source="someDB" id="G002"/>
</entry>
<entry id="minimal">
<AAseq>S</AAseq>
Expand Down
20 changes: 17 additions & 3 deletions Tests/test_SeqIO_SeqXML.py
Expand Up @@ -13,6 +13,7 @@
"dna" : ["SeqXML/dna_example.xml",4,None],
"rna" : ["SeqXML/rna_example.xml",5,None],
"protein" : ["SeqXML/protein_example.xml",5,None],
"globalSpecies" : ["SeqXML/global_species_example.xml",2,None],
}

corrupt_files = {"corrupt1" : ["SeqXML/corrupt_example1.xml",None],
Expand Down Expand Up @@ -92,8 +93,8 @@ def test_duplicated_property(self):

self.assertEqual(self.records["protein"][2].annotations["test"],[u"1",u"2",u"3"])

def test_duplicated_alternativeID(self):
"""Read multiple alternative identifier form single source"""
def test_duplicated_dbxref(self):
"""Read multiple cross references to a single source"""

self.assertEqual(self.records["protein"][2].dbxrefs,[u"someDB:G001",u"someDB:G002"])

Expand All @@ -107,14 +108,22 @@ def test_read_minial_required(self):
self.assertEqual(self.records["rna"][3].dbxrefs,minimalRecord.dbxrefs)
self.assertEqual(self.records["protein"][3].description,minimalRecord.description)

def test_species(self):
def test_local_species(self):

self.assertEqual(self.records["rna"][1].annotations["organism"],"Mus musculus")
self.assertEqual(self.records["rna"][1].annotations["ncbi_taxid"],"10090")

self.assertEqual(self.records["rna"][0].annotations["organism"],"Gallus gallus")
self.assertEqual(self.records["rna"][0].annotations["ncbi_taxid"],"9031")

def test_global_species(self):
# Check the species annotations on the first two records of the
# "globalSpecies" record set: record 0 must be "Mus musculus" with
# ncbi_taxid "10090", record 1 "Homo sapiens" with ncbi_taxid "9606".
# NOTE(review): presumably one record inherits the document-level species
# and the other overrides it with a local <species> element -- confirm
# against SeqXML/global_species_example.xml.

self.assertEqual(self.records["globalSpecies"][0].annotations["organism"],"Mus musculus")
self.assertEqual(self.records["globalSpecies"][0].annotations["ncbi_taxid"],"10090")

self.assertEqual(self.records["globalSpecies"][1].annotations["organism"],"Homo sapiens")
self.assertEqual(self.records["globalSpecies"][1].annotations["ncbi_taxid"],"9606")


def test_local_source_definition(self):

Expand Down Expand Up @@ -148,6 +157,11 @@ def test_read_write_protein(self):
read1_records = list(SeqIO.parse(test_files["protein"][2],"seqxml"))
self._write_parse_and_compare(read1_records)

def test_read_write_globalSpecies(self):
# Parse the "globalSpecies" test input as seqxml and hand the records to
# _write_parse_and_compare.
# NOTE(review): index 2 of the test_files entry is initialised to None in
# the dict literal; presumably set-up code replaces it with an open
# handle before this test runs -- confirm.

read1_records = list(SeqIO.parse(test_files["globalSpecies"][2],"seqxml"))
self._write_parse_and_compare(read1_records)


def _write_parse_and_compare(self,read1_records):

Expand Down

0 comments on commit c4babec

Please sign in to comment.