/
hgncgenefamily.py
109 lines (84 loc) · 3.61 KB
/
hgncgenefamily.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
"""Converter for HGNC Gene Families."""
from collections import defaultdict
from typing import Iterable, List, Mapping
import pandas as pd
from tqdm.auto import tqdm
from ..struct import Obo, Reference, Synonym, SynonymTypeDef, Term, from_species
from ..utils.path import ensure_path
# Names exported by this module.
__all__ = [
    "HGNCGroupGetter",
]
# Prefix used for the ontology, for the references built in this module, and as
# the first argument to ensure_path when fetching the source tables.
PREFIX = "hgnc.genegroup"
# Location of the gene family table that _get_terms_helper downloads and parses.
FAMILIES_URL = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/csv/genefamily_db_tables/family.csv"
# TODO use family_alias.csv
# Location of the parent/child family table that get_hierarchy downloads and parses.
HIERARCHY_URL = (
    "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/csv/genefamily_db_tables/hierarchy.csv"
)
# Synonym type attached to the synonym built from each family's abbreviation.
symbol_type = SynonymTypeDef(id="symbol", name="symbol")
class HGNCGroupGetter(Obo):
    """An ontology representation of HGNC's gene group nomenclature.

    The terms themselves are produced by the module-level ``get_terms``
    function. The class attributes below are presumably consumed by the
    ``Obo`` base class, which is defined outside this module.
    """

    # Same prefix that is used for every term reference built in this module.
    ontology = PREFIX
    # NOTE(review): presumably signals to ``Obo`` that the version is not fixed
    # in this class -- confirm against the ``Obo`` implementation.
    dynamic_version = True
    # Synonym type used for the abbreviation synonym added to each term.
    synonym_typedefs = [symbol_type]
    # ``from_species`` is imported from ``..struct``.
    typedefs = [from_species]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Forwarded unchanged to ``get_terms``.
        :returns: The iterable of terms returned by ``get_terms``.
        """
        return get_terms(force=force)
def get_obo(force: bool = False) -> Obo:
    """Build the OBO object for the HGNC gene groups.

    :param force: Passed through to the :class:`HGNCGroupGetter` constructor.
    :returns: A newly constructed :class:`HGNCGroupGetter`.
    """
    getter = HGNCGroupGetter(force=force)
    return getter
def get_hierarchy(force: bool = False) -> Mapping[str, List[str]]:
    """Load the HGNC gene family hierarchy as a child-to-parents dictionary.

    :param force: Passed to ``ensure_path`` when obtaining the hierarchy table.
    :returns: A dictionary mapping each child family identifier to the list of
        its parent family identifiers, in the order the rows appear in the file.
    """
    hierarchy_path = ensure_path(PREFIX, url=HIERARCHY_URL, force=force)
    table = pd.read_csv(
        hierarchy_path,
        dtype={"parent_fam_id": str, "child_fam_id": str},
    )
    child_to_parents = {}
    # Every row is unpacked as a (parent, child) pair, in that column order.
    for row in table.values:
        parent, child = row
        child_to_parents.setdefault(child, []).append(parent)
    return child_to_parents
# Columns of the family table used by _get_terms_helper. The order matters:
# each row is unpacked positionally in exactly this sequence.
COLUMNS = ["id", "abbreviation", "name", "pubmed_ids", "desc_comment", "desc_go"]
def get_terms(force: bool = False) -> Iterable[Term]:
    """Yield the HGNC gene group terms, preceded by their shared root term.

    All family terms are built first, then linked to their parents using the
    hierarchy table. The first item yielded is the ``SO:0005855`` term; every
    family that has no parent after linking is attached to it before the
    family terms are yielded.

    :param force: Forwarded to ``_get_terms_helper`` and ``get_hierarchy``.
    :returns: An iterable of the root term followed by one term per family.
    :raises KeyError: If the hierarchy mentions a family identifier for which
        no term was built.
    """
    group_terms = list(_get_terms_helper(force=force))
    child_to_parents = get_hierarchy(force=force)
    term_by_id = {t.reference.identifier: t for t in group_terms}

    # Attach the parent relations recorded in the hierarchy table.
    for child_id, parent_ids in child_to_parents.items():
        child_term: Term = term_by_id[child_id]
        for parent_id in parent_ids:
            parent_term: Term = term_by_id[parent_id]
            parent_reference = Reference(
                prefix=PREFIX,
                identifier=parent_id,
                name=parent_term.name,
            )
            child_term.append_parent(parent_reference)

    root = Reference.auto("SO", "0005855")
    yield Term(reference=root)

    # Families still lacking a parent are placed directly under the root.
    for t in group_terms:
        if t.parents:
            continue
        t.append_parent(root)
    yield from group_terms
def _get_terms_helper(force: bool = False) -> Iterable[Term]:
    """Yield one term per row of the HGNC gene family table.

    Each term carries the family identifier and name, the description as its
    definition (when present), PubMed references as provenance, a GO
    cross-reference derived from the ``desc_go`` URI, the abbreviation as a
    synonym of type ``symbol_type``, and the species set to Homo sapiens.

    :param force: Passed to ``ensure_path`` when obtaining the family table.
    :returns: An iterable of terms without any parent relations set.
    """
    path = ensure_path(PREFIX, url=FAMILIES_URL, force=force)
    # Read the PubMed column as text too: without an explicit dtype pandas
    # infers a numeric column whenever no row holds a comma-separated list,
    # and ``.split`` below would then fail on a float.
    df = pd.read_csv(path, dtype={"id": str, "pubmed_ids": str})
    go_uri_prefix = "http://purl.uniprot.org/go/"
    it = tqdm(df[COLUMNS].values, desc=f"Mapping {PREFIX}")
    for gene_group_id, symbol, name, pubmed_ids, definition, desc_go in it:
        # Missing values arrive as NaN (which is truthy), hence the explicit check.
        if not definition or pd.isna(definition):
            definition = None
        term = Term(
            reference=Reference(prefix=PREFIX, identifier=gene_group_id, name=name),
            definition=definition,
        )
        if pubmed_ids and pd.notna(pubmed_ids):
            for s in pubmed_ids.split(","):
                pubmed_id = s.strip()
                # Stray or trailing commas leave blank tokens; do not turn
                # them into references with an empty identifier.
                if pubmed_id:
                    term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))
        if desc_go and pd.notna(desc_go):
            if desc_go.startswith(go_uri_prefix):
                go_id = desc_go[len(go_uri_prefix) :]
            else:
                # Unexpected URI form: take the last path segment instead of
                # blindly cutting a fixed number of characters.
                go_id = desc_go.rsplit("/", 1)[-1]
            if go_id:
                term.append_xref(Reference(prefix="GO", identifier=go_id))
        if symbol and pd.notna(symbol):
            term.append_synonym(Synonym(name=symbol, type=symbol_type))
        term.set_species(identifier="9606", name="Homo sapiens")
        yield term
if __name__ == "__main__":
    # Script entry point: build the ontology, then write its default outputs
    # with the OBO and OWL exports switched on.
    obo = get_obo()
    obo.write_default(force=True, write_obo=True, write_owl=True)