Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

353 lines (324 sloc) 20.892 kB
#!/usr/bin/env python
# Copyright 2010 by Andrea Pierleoni
# Revisions copyright 2010-2013 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Test for the Uniprot parser on Uniprot XML files.
"""
import os
import unittest
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# Bio.SeqIO.UniprotIO leaves its ElementTree attribute as None when its own
# import of an ElementTree implementation fails; in that case abort here by
# raising MissingPythonDependencyError.
if SeqIO.UniprotIO.ElementTree is None:
    from Bio import MissingPythonDependencyError
    raise MissingPythonDependencyError(
        "No ElementTree module was found. "
        "Use Python 2.5+, lxml or elementtree if you "
        "want to use Bio.SeqIO.UniprotIO.")
from seq_tests_common import compare_reference, compare_record
class TestUniprot(unittest.TestCase):
    """Tests for reading UniProt XML files through Bio.SeqIO ("uniprot-xml").

    Example files are opened relative to the current working directory,
    from the ``SwissProt`` folder.  Some tests parse a single XML record and
    check selected fields; others cross-check the XML records against the
    plain text "swiss" (and FASTA) versions of the same entries.
    """

    def test_uni001(self):
        "Parsing Uniprot file uni001"
        filename = 'uni001'
        # test the record parser
        datafile = os.path.join('SwissProt', filename)
        with open(datafile) as test_handle:
            seq_record = SeqIO.read(test_handle, "uniprot-xml")
        self.assertTrue(isinstance(seq_record, SeqRecord))
        # test a couple of things on the record -- this is not exhaustive
        self.assertEqual(seq_record.id, "Q91G55")
        self.assertEqual(seq_record.name, "043L_IIV6")
        self.assertEqual(seq_record.description, "Uncharacterized protein 043L")
        self.assertEqual(repr(seq_record.seq), "Seq('MDLINNKLNIEIQKFCLDLEKKYNINYNNLIDLWFNKESTERLIKCEVNLENKI...IPI', ProteinAlphabet())")
        # self.assertEqual(seq_record.accessions, ['Q91G55']) #seq_record.accessions does not exist
        # self.assertEqual(seq_record.organism_classification, ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Mammalia', 'Eutheria', 'Primates', 'Catarrhini', 'Hominidae', 'Homo'])
        # self.assertEqual(record.seqinfo, (348, 39676, '75818910'))
        self.assertEqual(len(seq_record.features), 1)
        self.assertEqual(repr(seq_record.features[0]), "SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(116)), type='chain', id='PRO_0000377969')")
        self.assertEqual(len(seq_record.annotations['references']), 2)
        self.assertEqual(seq_record.annotations['references'][0].authors, 'Jakob N.J., Mueller K., Bahr U., Darai G.')
        self.assertEqual(seq_record.annotations['references'][0].title, 'Analysis of the first complete DNA sequence of an invertebrate iridovirus: coding strategy of the genome of Chilo iridescent virus.')
        self.assertEqual(seq_record.annotations['references'][0].journal, 'Virology 286:182-196(2001)')
        self.assertEqual(seq_record.annotations['references'][0].comment, 'journal article | 2001 | Scope: NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] | ')
        self.assertEqual(len(seq_record.dbxrefs), 11)
        self.assertEqual(seq_record.dbxrefs[0], 'DOI:10.1006/viro.2001.0963')
        self.assertEqual(seq_record.annotations['sequence_length'], 116)
        self.assertEqual(seq_record.annotations['sequence_checksum'], '4A29B35FB716523C')
        self.assertEqual(seq_record.annotations['modified'], '2009-07-07')
        self.assertEqual(seq_record.annotations['accessions'], ['Q91G55'])
        self.assertEqual(seq_record.annotations['taxonomy'], ['Viruses', 'dsDNA viruses, no RNA stage', 'Iridoviridae', 'Iridovirus'])
        self.assertEqual(seq_record.annotations['sequence_mass'], 13673)
        self.assertEqual(seq_record.annotations['dataset'], 'Swiss-Prot')
        self.assertEqual(seq_record.annotations['gene_name_ORF'], ['IIV6-043L'])
        self.assertEqual(seq_record.annotations['version'], 21)
        self.assertEqual(seq_record.annotations['sequence_modified'], '2001-12-01')
        self.assertEqual(seq_record.annotations['keywords'], ['Complete proteome', 'Virus reference strain'])
        self.assertEqual(seq_record.annotations['organism_host'], ['Acheta domesticus', 'House cricket', 'Chilo suppressalis', 'striped riceborer', 'Gryllus bimaculatus', 'Two-spotted cricket', 'Gryllus campestris', 'Spodoptera frugiperda', 'Fall armyworm'])
        self.assertEqual(seq_record.annotations['created'], '2009-06-16')
        self.assertEqual(seq_record.annotations['organism_name'], ['Chilo iridescent virus'])
        self.assertEqual(seq_record.annotations['organism'], 'Invertebrate iridescent virus 6 (IIV-6)')
        self.assertEqual(seq_record.annotations['recommendedName_fullName'], ['Uncharacterized protein 043L'])
        self.assertEqual(seq_record.annotations['sequence_version'], 1)
        self.assertEqual(seq_record.annotations['proteinExistence'], ['Predicted'])

    def test_uni003(self):
        "Parsing Uniprot file uni003"
        filename = 'uni003'
        # test the record parser
        datafile = os.path.join('SwissProt', filename)
        # Use a context manager (as test_uni001 does) so the handle is
        # closed even if the parser raises.
        with open(datafile) as test_handle:
            seq_record = SeqIO.read(test_handle, "uniprot-xml")
        self.assertTrue(isinstance(seq_record, SeqRecord))
        # test general record entries
        self.assertEqual(seq_record.id, "O44185")
        self.assertEqual(seq_record.name, "FLP13_CAEEL")
        self.assertEqual(seq_record.description,
                         "FMRFamide-like neuropeptides 13")
        self.assertEqual(repr(seq_record.seq),
                         "Seq('MMTSLLTISMFVVAIQAFDSSEIRMLDEQYDTKNPFFQFLENSKRSDRPTRAMD...GRK', ProteinAlphabet())")
        self.assertEqual(len(seq_record.annotations['references']), 7)
        self.assertEqual(seq_record.annotations['references'][5].authors,
                         'Kim K., Li C.')
        self.assertEqual(seq_record.annotations['references'][5].title,
                         'Expression and regulation of an FMRFamide-related '
                         'neuropeptide gene family in Caenorhabditis elegans.')
        self.assertEqual(seq_record.annotations['references'][5].journal,
                         'J. Comp. Neurol. 475:540-550(2004)')
        self.assertEqual(seq_record.annotations['references'][5].comment,
                         'journal article | 2004 | Scope: TISSUE SPECIFICITY, '
                         'DEVELOPMENTAL STAGE | ')
        self.assertEqual(seq_record.annotations["accessions"], ['O44185'])
        self.assertEqual(seq_record.annotations["created"], "2004-05-10")
        self.assertEqual(seq_record.annotations["dataset"], "Swiss-Prot")
        self.assertEqual(seq_record.annotations["gene_name_ORF"], ['F33D4.3'])
        self.assertEqual(seq_record.annotations["gene_name_primary"], "flp-13")
        self.assertEqual(seq_record.annotations["keywords"],
                         ['Amidation', 'Cleavage on pair of basic residues',
                          'Complete proteome', 'Direct protein sequencing',
                          'Neuropeptide', 'Reference proteome', 'Repeat',
                          'Secreted', 'Signal'])
        self.assertEqual(seq_record.annotations["modified"], "2012-11-28")
        self.assertEqual(seq_record.annotations["organism"],
                         "Caenorhabditis elegans")
        self.assertEqual(seq_record.annotations["proteinExistence"],
                         ['evidence at protein level'])
        self.assertEqual(seq_record.annotations["recommendedName_fullName"],
                         ['FMRFamide-like neuropeptides 13'])
        self.assertEqual(seq_record.annotations["sequence_length"], 160)
        self.assertEqual(seq_record.annotations["sequence_checksum"],
                         "BE4C24E9B85FCD11")
        self.assertEqual(seq_record.annotations["sequence_mass"], 17736)
        self.assertEqual(seq_record.annotations["sequence_modified"], "1998-06-01")
        self.assertEqual(seq_record.annotations["sequence_precursor"], "true")
        self.assertEqual(seq_record.annotations["sequence_version"], 1)
        self.assertEqual(seq_record.annotations["taxonomy"],
                         ['Eukaryota', 'Metazoa', 'Ecdysozoa', 'Nematoda',
                          'Chromadorea', 'Rhabditida', 'Rhabditoidea', 'Rhabditidae',
                          'Peloderinae', 'Caenorhabditis'])
        self.assertEqual(seq_record.annotations["type"],
                         ['ECO:0000006', 'ECO:0000001'])
        self.assertEqual(seq_record.annotations["version"], 74)
        # test comment entries
        self.assertEqual(seq_record.annotations["comment_allergen"],
                         ['Causes an allergic reaction in human.'])
        self.assertEqual(seq_record.annotations["comment_alternativeproducts_isoform"],
                         ['Q8W1X2-1', 'Q8W1X2-2'])
        self.assertEqual(seq_record.annotations["comment_biotechnology"],
                         ['Green fluorescent protein has been engineered to produce a '
                          'vast number of variously colored mutants, fusion proteins, '
                          'and biosensors. Fluorescent proteins and its mutated allelic '
                          'forms, blue, cyan and yellow have become a useful and '
                          'ubiquitous tool for making chimeric proteins, where they '
                          'function as a fluorescent protein tag. Typically they '
                          'tolerate N- and C-terminal fusion to a broad variety of '
                          'proteins. They have been expressed in most known cell types '
                          'and are used as a noninvasive fluorescent marker in living '
                          'cells and organisms. They enable a wide range of applications '
                          'where they have functioned as a cell lineage tracer, reporter '
                          'of gene expression, or as a measure of protein-protein '
                          'interactions.', 'Can also be used as a molecular thermometer, '
                          'allowing accurate temperature measurements in fluids. The '
                          'measurement process relies on the detection of the blinking '
                          'of GFP using fluorescence correlation spectroscopy.'])
        self.assertEqual(seq_record.annotations["comment_catalyticactivity"],
                         ['ATP + acetyl-CoA + HCO(3)(-) = ADP + phosphate + malonyl-CoA.',
                          'ATP + biotin-[carboxyl-carrier-protein] + CO(2) = ADP + '
                          'phosphate + carboxy-biotin-[carboxyl-carrier-protein].'])
        self.assertEqual(seq_record.annotations["comment_caution"],
                         ['Could be the product of a pseudogene. The existence of a '
                          'transcript at this locus is supported by only one sequence '
                          'submission (PubMed:2174397).'])
        self.assertEqual(seq_record.annotations["comment_cofactor"],
                         ['Biotin (By similarity).', 'Binds 2 manganese ions per '
                          'subunit (By similarity).'])
        self.assertEqual(seq_record.annotations["comment_developmentalstage"],
                         ['Expressed from the comma stage of embryogenesis, during all '
                          'larval stages, and in low levels in adults.'])
        self.assertEqual(seq_record.annotations["comment_disease"],
                         ['Defects in MC2R are the cause of glucocorticoid deficiency '
                          'type 1 (GCCD1) [MIM:202200]; also known as familial '
                          'glucocorticoid deficiency type 1 (FGD1). GCCD1 is an '
                          'autosomal recessive disorder due to congenital '
                          'insensitivity or resistance to adrenocorticotropin (ACTH). '
                          'It is characterized by progressive primary adrenal '
                          'insufficiency, without mineralocorticoid deficiency.'])
        self.assertEqual(seq_record.annotations["comment_disruptionphenotype"],
                         ['Mice display impaired B-cell development which does not '
                          'progress pass the progenitor stage.'])
        self.assertEqual(seq_record.annotations["comment_domain"],
                         ['Two regions, an N-terminal (aa 96-107) and a C-terminal '
                          '(aa 274-311) are required for binding FGF2.'])
        self.assertEqual(seq_record.annotations["comment_enzymeregulation"],
                         ['By phosphorylation. The catalytic activity is inhibited by '
                          'soraphen A, a polyketide isolated from the myxobacterium '
                          'Sorangium cellulosum and a potent inhibitor of fungal growth.'])
        self.assertEqual(seq_record.annotations["comment_function"],
                         ['FMRFamides and FMRFamide-like peptides are neuropeptides. '
                          'AADGAPLIRF-amide and APEASPFIRF-amide inhibit muscle tension '
                          'in somatic muscle. APEASPFIRF-amide is a potent inhibitor of '
                          'the activity of dissected pharyngeal myogenic muscle system.'])
        self.assertEqual(seq_record.annotations["comment_induction"],
                         ['Repressed in presence of fatty acids. Repressed 3-fold by '
                          'lipid precursors, inositol and choline, and also controlled '
                          'by regulatory factors INO2, INO4 and OPI1.'])
        self.assertEqual(seq_record.annotations["comment_interaction_intactId"],
                         ['EBI-356720', 'EBI-746969', 'EBI-720116'])
        self.assertEqual(seq_record.annotations["comment_massspectrometry"],
                         ['88..98:1032|MALDI', '100..110:1133.7|MALDI'])
        self.assertEqual(seq_record.annotations["comment_miscellaneous"],
                         ['Present with 20200 molecules/cell in log phase SD medium.'])
        self.assertEqual(seq_record.annotations["comment_onlineinformation"],
                         ['NIEHS-SNPs@http://egp.gs.washington.edu/data/api5/'])
        self.assertEqual(seq_record.annotations["comment_pathway"],
                         ['Lipid metabolism; malonyl-CoA biosynthesis; malonyl-CoA '
                          'from acetyl-CoA: step 1/1.'])
        self.assertEqual(seq_record.annotations["comment_RNAediting"],
                         ['Partially edited. RNA editing generates receptor isoforms '
                          'that differ in their ability to interact with the '
                          'phospholipase C signaling cascade in a transfected cell '
                          'line, suggesting that this RNA processing event may '
                          'contribute to the modulation of serotonergic '
                          'neurotransmission in the central nervous system.'])
        self.assertEqual(seq_record.annotations["comment_PTM"],
                         ['Acetylation at Lys-251 impairs antiapoptotic function.'])
        self.assertEqual(seq_record.annotations["comment_pharmaceutical"],
                         ['Could be used as a possible therapeutic agent for treating '
                          'rheumatoid arthritis.'])
        self.assertEqual(seq_record.annotations["comment_polymorphism"],
                         ['Position 23 is polymorphic; the frequencies in unrelated '
                          'Caucasians are 0.87 for Cys and 0.13 for Ser.'])
        self.assertEqual(seq_record.annotations["comment_similarity"],
                         ['Belongs to the FARP (FMRFamide related peptide) family.'])
        self.assertEqual(seq_record.annotations["comment_subcellularlocation_location"],
                         ['Secreted'])
        self.assertEqual(seq_record.annotations["comment_subunit"],
                         ['Homodimer.'])
        self.assertEqual(seq_record.annotations["comment_tissuespecificity"],
                         ['Each flp gene is expressed in a distinct set of neurons. '
                          'Flp-13 is expressed in the ASE sensory neurons, the DD motor '
                          'neurons, the 15, M3 and M5 cholinergic pharyngeal '
                          'motoneurons, and the ASG, ASK and BAG neurons.'])
        self.assertEqual(seq_record.annotations["comment_toxicdose"],
                         ['LD(50) is 50 ug/kg in mouse by intracerebroventricular '
                          'injection and 600 ng/g in Blatella germanica.'])

    def compare_txt_xml(self, old, new):
        """Check that two parsed versions of the same entry agree.

        The callers in this class pass ``old`` as the record parsed from the
        plain text "swiss" file and ``new`` as the record parsed from the
        "uniprot-xml" file.

        The id, name, length and sequence must match exactly.  Only
        annotation keys present in BOTH records are compared:

        - "references" must have equal length and are compared pairwise with
          compare_reference after normalising title, journal, medline_id and
          comment.  NOTE: this normalisation modifies the reference objects
          of both records in place.
        - "date" is allowed to differ.
        - "organism" may differ if the old value starts with the new value
          followed by a space.
        - lists may differ in order only.

        Finally the records must have the same number of features, and the
        repr of each feature location must match pairwise.

        Raises TypeError if an annotation has different types in the two
        records, ValueError if an annotation differs in value, and the usual
        unittest assertion failure for the other mismatches.
        """
        self.assertEqual(old.id, new.id)
        self.assertEqual(old.name, new.name)
        self.assertEqual(len(old), len(new))
        self.assertEqual(str(old.seq), str(new.seq))
        # Sorted so that the first mismatch reported is the same on every run
        # rather than depending on set ordering (assumes the annotation keys
        # are mutually orderable; every key used in this file is a string).
        shared_keys = sorted(set(old.annotations).intersection(new.annotations))
        for key in shared_keys:
            old_value = old.annotations[key]
            new_value = new.annotations[key]
            if key == "references":
                self.assertEqual(len(old_value), len(new_value))
                for r1, r2 in zip(old_value, new_value):
                    # Tweak for line breaks in plain text SwissProt
                    r1.title = r1.title.replace("- ", "-")
                    r2.title = r2.title.replace("- ", "-")
                    r1.journal = r1.journal.rstrip(".")  # Should parser do this?
                    r1.medline_id = ""  # Missing in UniPort XML? TODO - check
                    # Lots of extra comments in UniProt XML
                    r1.comment = ""
                    r2.comment = ""
                    if not r2.journal:
                        r1.journal = ""
                    compare_reference(r1, r2)
            elif old_value == new_value:
                continue
            elif key == "date":
                # TODO - Why is this a list vs str?
                continue
            elif type(old_value) is not type(new_value):
                raise TypeError("%s gives %s vs %s" %
                                (key, old_value, new_value))
            elif key == "organism":
                # Equal values were already accepted above, so the only
                # remaining acceptable case is the old value being the new
                # value plus a space-separated suffix.
                if not old_value.startswith(new_value + " "):
                    raise ValueError(key)
            elif isinstance(old_value, list) \
                    and sorted(old_value) == sorted(new_value):
                continue
            else:
                raise ValueError("%s gives %s vs %s" %
                                 (key, old_value, new_value))
        self.assertEqual(len(old.features), len(new.features),
                         "Features in %s, %i vs %i" %
                         (old.id, len(old.features), len(new.features)))
        for f1, f2 in zip(old.features, new.features):
            # Locations are compared via their repr (the older
            # nofuzzy_start / nofuzzy_end assertions were disabled).
            self.assertEqual(repr(f1.location), repr(f2.location),
                             "%s %s vs %s %s" %
                             (f1.location, f1.type, f2.location, f2.type))

    def test_Q13639(self):
        """Compare SwissProt text and uniprot XML versions of Q13639."""
        old = SeqIO.read("SwissProt/Q13639.txt", "swiss")
        new = SeqIO.read("SwissProt/Q13639.xml", "uniprot-xml")
        self.compare_txt_xml(old, new)

    def test_multi_ex(self):
        """Compare SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        fas_list = list(SeqIO.parse("SwissProt/multi_ex.fasta", "fasta"))
        with open("SwissProt/multi_ex.list") as handle:
            ids = [x.strip() for x in handle]
        self.assertEqual(len(txt_list), len(ids))
        self.assertEqual(len(txt_list), len(fas_list))
        self.assertEqual(len(txt_list), len(xml_list))
        # "accession" rather than "id" to avoid shadowing the builtin id().
        for txt, xml, fas, accession in zip(txt_list, xml_list, fas_list, ids):
            self.assertEqual(txt.id, accession)
            self.assertTrue(txt.id in fas.id.split("|"))
            self.assertEqual(str(txt.seq), str(fas.seq))
            self.compare_txt_xml(txt, xml)

    def test_multi_ex_index(self):
        """Index SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        with open("SwissProt/multi_ex.list") as handle:
            ids = [x.strip() for x in handle]
        # The try/finally blocks guarantee both indexes are closed even when
        # an assertion fails part way through.
        txt_index = SeqIO.index("SwissProt/multi_ex.txt", "swiss")
        try:
            xml_index = SeqIO.index("SwissProt/multi_ex.xml", "uniprot-xml")
            try:
                self.assertEqual(sorted(txt_index), sorted(ids))
                self.assertEqual(sorted(xml_index), sorted(ids))
                # Check SeqIO.parse() versus SeqIO.index() for plain text "swiss"
                for old in txt_list:
                    new = txt_index[old.id]
                    compare_record(old, new)
                # Check SeqIO.parse() versus SeqIO.index() for XML "uniprot-xml"
                for old in xml_list:
                    new = xml_index[old.id]
                    compare_record(old, new)
            finally:
                xml_index.close()
        finally:
            txt_index.close()
# When executed as a script, run every test in this module with verbose
# (level 2) text output.
if __name__ == "__main__":
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))
Jump to Line
Something went wrong with that request. Please try again.