/
rgd.py
170 lines (147 loc) · 5.29 KB
/
rgd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# -*- coding: utf-8 -*-
"""Converter for RGD."""
import logging
from typing import Iterable, Optional
import pandas as pd
from tqdm.auto import tqdm
from pyobo.struct import (
Obo,
Reference,
Synonym,
SynonymTypeDef,
Term,
from_species,
has_gene_product,
transcribes_to,
)
from pyobo.utils.path import ensure_df
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prefix for this resource; used for term references, the data download, and log messages.
PREFIX = "rgd"

# Synonym types attached to the entries of the OLD_SYMBOL and OLD_NAME columns, respectively.
old_symbol_type = SynonymTypeDef(id="old_symbol", name="old symbol")
old_name_type = SynonymTypeDef(id="old_name", name="old name")

# NOTE: the UniGene ID was discontinued in the January 18th, 2021 dump.
# Location of the rat genes table (read below as a tab-separated file).
GENES_URL = "https://download.rgd.mcw.edu/data_release/GENES.RAT.txt"
# Column names for the RGD genes table.
# NOTE(review): this list is not referenced by the code visible in this file;
# the table is read with ``header=0``, so column names come from the file's own
# header row. Presumably kept as documentation of the expected layout - confirm.
GENES_HEADER = [
    "GENE_RGD_ID",
    "SYMBOL",
    "NAME",
    "GENE_DESC",
    "CHROMOSOME_CELERA",
    "CHROMOSOME_[oldAssembly#] chromosome for the old reference assembly",
    "CHROMOSOME_[newAssembly#] chromosome for the current reference assembly",
    "FISH_BAND",
    "START_POS_CELERA",
    "STOP_POS_CELERA",
    "STRAND_CELERA",
    "START_POS_[oldAssembly#]",
    "STOP_POS_[oldAssembly#]",
    "STRAND_[oldAssembly#]",
    "START_POS_[newAssembly#]",
    "STOP_POS_[newAssembly#]",
    "STRAND_[newAssembly#]",
    "CURATED_REF_RGD_ID",
    "CURATED_REF_PUBMED_ID",
    "UNCURATED_PUBMED_ID",
    "NCBI_GENE_ID",
    "UNIPROT_ID",
    "UNCURATED_REF_MEDLINE_ID",
    "GENBANK_NUCLEOTIDE",
    "TIGR_ID",
    "GENBANK_PROTEIN",
    "SSLP_RGD_ID",
    "SSLP_SYMBOL",
    "OLD_SYMBOL",
    "OLD_NAME",
    "QTL_RGD_ID",
    "QTL_SYMBOL",
    "NOMENCLATURE_STATUS",
    "SPLICE_RGD_ID",
    "SPLICE_SYMBOL",
    "GENE_TYPE",
    "ENSEMBL_ID",
]
class RGDGetter(Obo):
    """Expose RGD's rat gene nomenclature as an :class:`Obo` ontology."""

    # Both the bioversions lookup key and the ontology prefix are the RGD prefix.
    bioversions_key = PREFIX
    ontology = PREFIX

    # Relationship and synonym type definitions used by the generated terms.
    typedefs = [from_species, transcribes_to, has_gene_product]
    synonym_typedefs = [old_name_type, old_symbol_type]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Produce the RGD terms for this ontology's version.

        :param force: Forwarded unchanged to :func:`get_terms`.
        :returns: The iterable of terms built by :func:`get_terms`.
        """
        version = self._version_or_raise
        return get_terms(force=force, version=version)
def get_obo(force: bool = False) -> Obo:
    """Construct the RGD ontology object.

    :param force: Forwarded unchanged to the :class:`RGDGetter` constructor.
    :returns: A newly created :class:`RGDGetter` instance.
    """
    getter = RGDGetter(force=force)
    return getter
# Pairs of (reference prefix, column name in the genes table).
# ``get_terms`` walks these pairs in the order listed, splits each cell on ";",
# and attaches every entry to the term as an xref or a relationship.
namespace_to_column = [
    ("ensembl", "ENSEMBL_ID"),
    ("uniprot", "UNIPROT_ID"),
    ("ncbigene", "NCBI_GENE_ID"),
]
# Ensembl stable-ID prefixes, grouped by feature type. The mouse (ENSMUS*)
# prefixes are retained from the original implementation; the rat (ENSRNO*)
# transcript/protein prefixes are added because this converter processes the
# rat genes table.
_ENSEMBL_GENE_PREFIXES = ("ENSMUSG", "ENSRNOG")
_ENSEMBL_TRANSCRIPT_PREFIXES = ("ENSMUST", "ENSRNOT")
_ENSEMBL_PROTEIN_PREFIXES = ("ENSMUSP", "ENSRNOP")


def _iter_values(cell) -> Iterable[str]:
    """Yield the non-empty entries of a semicolon-delimited table cell.

    :param cell: A raw cell value; either a string or a missing value
        (``NaN``/``None``).
    :yields: Each ``;``-separated entry with surrounding whitespace removed.
        Missing cells and empty entries (e.g. from a trailing ``;``) yield
        nothing.
    """
    if pd.isna(cell):
        return
    for part in str(cell).split(";"):
        part = part.strip()
        if part:
            yield part


def _append_ensembl(term: Term, identifier: str) -> None:
    """Attach an Ensembl identifier to *term* according to its feature type.

    Gene identifiers become xrefs, transcript identifiers become
    ``transcribes_to`` relationships, and protein identifiers become
    ``has_gene_product`` relationships. Anything else is logged and skipped.

    :param term: The term being built.
    :param identifier: An Ensembl stable identifier.
    """
    if identifier.startswith(_ENSEMBL_GENE_PREFIXES):
        term.append_xref(Reference(prefix="ensembl", identifier=identifier))
    elif identifier.startswith(_ENSEMBL_TRANSCRIPT_PREFIXES):
        term.append_relationship(
            transcribes_to, Reference(prefix="ensembl", identifier=identifier)
        )
    elif identifier.startswith(_ENSEMBL_PROTEIN_PREFIXES):
        term.append_relationship(
            has_gene_product, Reference(prefix="ensembl", identifier=identifier)
        )
    else:
        logger.warning("[%s] unhandled xref ensembl:%s", PREFIX, identifier)


def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Term]:
    """Get RGD terms.

    Loads the genes table from ``GENES_URL`` through ``ensure_df`` and yields
    one term per row, carrying old names/symbols as synonyms, Ensembl, UniProt
    and NCBI Gene identifiers as xrefs or relationships, curated PubMed
    identifiers as provenance, and the species *Rattus norvegicus*.

    :param force: Forwarded to ``ensure_df``.
    :param version: Forwarded to ``ensure_df``.
    :yields: One :class:`Term` for each row of the table.
    """
    df = ensure_df(
        PREFIX,
        url=GENES_URL,
        sep="\t",
        header=0,
        comment="#",
        dtype=str,
        force=force,
        version=version,
        quoting=3,  # same value as csv.QUOTE_NONE
        # ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in
        # pandas 2.0; this is its replacement (requires pandas >= 1.3).
        # NOTE(review): assumes ensure_df forwards extra keyword arguments to
        # pandas.read_csv, as the other reader options here imply - confirm.
        on_bad_lines="skip",
    )
    for _, row in tqdm(
        df.iterrows(), total=len(df.index), desc=f"Mapping {PREFIX}", unit_scale=True
    ):
        # Prefer the gene name as the definition and fall back to the description.
        if pd.notna(row["NAME"]):
            definition = row["NAME"]
        elif pd.notna(row["GENE_DESC"]):
            definition = row["GENE_DESC"]
        else:
            definition = None

        term = Term(
            reference=Reference(prefix=PREFIX, identifier=row["GENE_RGD_ID"], name=row["SYMBOL"]),
            definition=definition,
        )

        for old_name in _iter_values(row["OLD_NAME"]):
            term.append_synonym(Synonym(name=old_name, type=old_name_type))
        for old_symbol in _iter_values(row["OLD_SYMBOL"]):
            term.append_synonym(Synonym(name=old_symbol, type=old_symbol_type))

        for prefix, key in namespace_to_column:
            # Missing cells are filtered on the raw value, so no "nan" string
            # workaround is needed after conversion to text.
            for xref_id in _iter_values(row[key]):
                if prefix == "uniprot":
                    term.append_relationship(
                        has_gene_product, Reference.auto(prefix=prefix, identifier=xref_id)
                    )
                elif prefix == "ensembl":
                    _append_ensembl(term, xref_id)
                else:
                    term.append_xref(Reference(prefix=prefix, identifier=xref_id))

        for pubmed_id in _iter_values(row["CURATED_REF_PUBMED_ID"]):
            term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))

        term.set_species(identifier="10116", name="Rattus norvegicus")
        yield term
# When executed as a script, run the getter's ``cli`` entry point.
# NOTE(review): ``cli`` is not defined in this file; presumably inherited from ``Obo`` - confirm.
if __name__ == "__main__":
    RGDGetter.cli()