/
msigdb.py
153 lines (121 loc) · 4.87 KB
/
msigdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
"""Parsers for MSig."""
import logging
from typing import Iterable, Optional
from xml.etree import ElementTree
import click
from more_click import verbose_option
from tqdm.auto import tqdm
from ..struct import Obo, Reference, Term, has_part
from ..utils.path import ensure_path
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Names exported by a star import of this module.
__all__ = [
"MSigDBGetter",
]
# Prefix used for the ontology itself and for references to its terms.
PREFIX = "msigdb"
# Root of the release tree; a version-specific XML path is appended to it
# when the data file is downloaded.
BASE_URL = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release"
class MSigDBGetter(Obo):
    """An ontology representation of MSigDB's gene set nomenclature."""

    # The ontology prefix and the bioversions lookup key share the
    # module-level PREFIX.
    ontology = PREFIX
    bioversions_key = PREFIX
    typedefs = [has_part]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over the terms for this instance's version.

        :param force: forwarded unchanged to the module-level ``iter_terms``.
        :returns: the iterable produced by the module-level ``iter_terms``.
        """
        # Inside a method body the bare name resolves to the module-level
        # function, not to this method.
        release = self._version_or_raise
        return iter_terms(version=release, force=force)
def get_obo(force: bool = False) -> Obo:
    """Get MSigDB as an Obo object.

    :param force: passed through to the ``MSigDBGetter`` constructor.
    :returns: a newly constructed ``MSigDBGetter``.
    """
    getter = MSigDBGetter(force=force)
    return getter
# Maps the organism name found in an entry's ORGANISM attribute to the
# taxonomy identifier that is set as the term's species.
_SPECIES = {
    "Homo sapiens": "9606",
    "Mus musculus": "10090",
    "Rattus norvegicus": "10116",
    "Macaca mulatta": "9544",
    "Danio rerio": "7955",
}

# URL prefixes of external detail pages. The GO and KEGG prefixes are sliced
# off an entry's EXTERNAL_DETAILS_URL to recover an identifier, so they must
# match the URLs in the data character for character.
REACTOME_URL_PREFIX = "https://www.reactome.org/content/detail/"
GO_URL_PREFIX = "http://amigo.geneontology.org/amigo/term/GO:"
KEGG_URL_PREFIX = "http://www.genome.jp/kegg/pathway/hsa/"
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get MSigDB terms.

    Downloads (via ``ensure_path``) and parses the MSigDB XML file for the
    given release, then yields one term per child element of the XML root.

    :param version: release version; interpolated into the download URL and
        forwarded to ``ensure_path``.
    :param force: forwarded unchanged to ``ensure_path``.
    :yields: a ``Term`` for each entry, carrying its definition, provenance,
        selected properties, species, an optional cross-reference to the
        contributing resource, and ``has_part`` relationships to its member
        genes.
    :raises KeyError: if an entry lacks one of the attributes read below or
        names an organism that is not in ``_SPECIES``.
    """
    xml_url = f"{BASE_URL}/{version}/msigdb_v{version}.xml"
    path = ensure_path(prefix=PREFIX, url=xml_url, version=version, force=force)
    tree = ElementTree.parse(path)

    for entry in tqdm(tree.getroot(), desc=f"{PREFIX} v{version}", unit_scale=True):
        attrib = dict(entry.attrib)
        tax_id = _SPECIES[attrib["ORGANISM"]]

        # The PMID attribute holds either a PubMed identifier or a GSE
        # accession; a blank value means there is no provenance.
        reference_id = attrib["PMID"].strip()
        if not reference_id:
            reference = None
        elif reference_id.startswith("GSE"):
            reference = Reference("gse", reference_id)
        else:
            reference = Reference("pubmed", reference_id)

        # NOTE: no entries carry a HISTORICAL_NAME attribute, so it is not parsed.

        identifier = attrib["SYSTEMATIC_NAME"]
        name = attrib["STANDARD_NAME"]
        is_obsolete = attrib["CATEGORY_CODE"] == "ARCHIVED"

        term = Term(
            reference=Reference(PREFIX, identifier, name),
            definition=_get_definition(attrib),
            provenance=[] if reference is None else [reference],
            is_obsolete=is_obsolete,
        )

        # Copy the non-blank metadata attributes over as lower-cased properties.
        for key in [
            "CATEGORY_CODE",
            "SUB_CATEGORY_CODE",
            "CONTRIBUTOR",
            "EXACT_SOURCE",
            "EXTERNAL_DETAILS_URL",
        ]:
            value = attrib[key].strip()
            if value:
                term.append_property(key.lower(), value)

        term.set_species(tax_id)

        contributor = attrib["CONTRIBUTOR"]
        external_id = attrib["EXACT_SOURCE"]
        external_details = attrib["EXTERNAL_DETAILS_URL"]

        # Choose the xref prefix for the contributing resource. For GO and
        # KEGG, fall back to cutting the identifier out of the details URL
        # when EXACT_SOURCE is empty.
        xref_prefix = None
        if contributor == "WikiPathways":
            xref_prefix = "wikipathways"
        elif contributor == "Reactome":
            xref_prefix = "reactome"
        elif contributor == "Gene Ontology":
            xref_prefix = "go"
            if not external_id:
                external_id = external_details[len(GO_URL_PREFIX):]
        elif contributor == "KEGG":
            xref_prefix = "kegg.pathway"
            if not external_id:
                # The end bound must be negative so that it trims the trailing
                # ".html"; a positive bound would always give an empty slice.
                external_id = external_details[len(KEGG_URL_PREFIX):-len(".html")]

        if xref_prefix is not None:
            if external_id:
                term.append_xref(Reference(xref_prefix, external_id))
            else:
                # Warn and skip rather than attach an xref with an empty identifier.
                logger.warning(
                    "missing %s source: msigdb:%s (%s)", contributor, identifier, external_details
                )

        # MEMBERS_EZID is a comma-separated list of gene identifiers; empty
        # fragments (e.g. from a blank attribute) are ignored.
        for ncbigene_id in attrib["MEMBERS_EZID"].strip().split(","):
            if ncbigene_id:
                term.append_relationship(
                    has_part, Reference(prefix="ncbigene", identifier=ncbigene_id)
                )
        yield term
def _get_definition(attrib) -> Optional[str]:
    """Pick and clean the description of a gene set entry.

    The full description is preferred; the brief one is consulted only when
    the full one is blank after stripping.

    :param attrib: mapping whose ``DESCRIPTION_FULL`` and
        ``DESCRIPTION_BRIEF`` values are strings.
    :returns: the stripped description with every literal backslash-``d`` and
        backslash-``s`` sequence removed, or ``None`` when both descriptions
        are blank.
    :raises KeyError: if a consulted key is absent from ``attrib``.
    """
    description = attrib["DESCRIPTION_FULL"].strip() or attrib["DESCRIPTION_BRIEF"].strip()
    if not description:
        return None
    # Raw strings spell out the two-character sequences explicitly; the
    # non-raw forms are invalid escape sequences that newer Python versions
    # warn about.
    return description.replace(r"\d", "").replace(r"\s", "")
@click.command()
@verbose_option
def _main():
    # Build the ontology object and write its default outputs with force=True.
    # (A comment rather than a docstring, so the command's help text is unchanged.)
    ontology = get_obo()
    ontology.write_default(force=True)


if __name__ == "__main__":
    _main()