diff --git a/src/bio2bel_hgnc/enrich.py b/src/bio2bel_hgnc/enrich.py index 4966807..59f845f 100644 --- a/src/bio2bel_hgnc/enrich.py +++ b/src/bio2bel_hgnc/enrich.py @@ -16,7 +16,7 @@ def get_node(graph, node, manager=None): - """Gets a node from the database, whether it has a HGNC, RGD, MGI, or EG identifier. + """Get a node from the database, whether it has a HGNC, RGD, MGI, or EG identifier. :param pybel.BELGraph graph: A BEL graph :param tuple node: A PyBEL node tuple @@ -25,11 +25,11 @@ def get_node(graph, node, manager=None): """ if manager is None: manager = Manager() - return manager.get_node(graph, node) + return manager.get_node(graph._node[node]) def add_node_orthologies(graph, node, manager=None, add_leaves=False): - """Given a node that's HGNC, add orthology relationships + """Given a node that's HGNC, add orthology relationships. :param pybel.BELGraph graph: A BEL graph :param tuple node: A PyBEL node tuple diff --git a/src/bio2bel_hgnc/manager.py b/src/bio2bel_hgnc/manager.py index 4c622be..029a279 100644 --- a/src/bio2bel_hgnc/manager.py +++ b/src/bio2bel_hgnc/manager.py @@ -7,24 +7,21 @@ from typing import Dict, Iterable, Optional, Tuple import click -import networkx as nx +from tqdm import tqdm + from bio2bel import AbstractManager from bio2bel.manager.bel_manager import BELManagerMixin from bio2bel.manager.flask_manager import FlaskMixin from bio2bel.manager.namespace_manager import BELNamespaceManagerMixin from pybel import BELGraph from pybel.constants import FUNCTION, GENE, IDENTIFIER, MIRNA, NAME, NAMESPACE, PROTEIN, RNA, VARIANTS -from pybel.dsl import gene as gene_dsl, mirna as mirna_dsl, protein as protein_dsl, rna as rna_dsl -from pybel.dsl.nodes import BaseEntity +from pybel.dsl import BaseEntity, FUNC_TO_DSL, rna as rna_dsl from pybel.manager.models import NamespaceEntry -from tqdm import tqdm - +from pybel.struct.utils import relabel_inplace from .constants import MODULE_NAME, encodings from .gfam_manager import Manager as GfamManager -from .model_utils import ( - add_central_dogma, family_to_bel, gene_to_bel, uniprot_to_bel, -) -from .models import Base, GeneFamily, HumanGene, MouseGene, RatGene, UniProt, AliasName, AliasSymbol +from .model_utils import add_central_dogma, family_to_bel, gene_to_bel, uniprot_to_bel +from .models import AliasName, AliasSymbol, Base, GeneFamily, HumanGene, MouseGene, RatGene, UniProt from .wrapper import BaseManager log = logging.getLogger(__name__) @@ -33,12 +30,7 @@ 'Manager', ] -_func_to_dsl = { - GENE: gene_dsl, - RNA: rna_dsl, - PROTEIN: protein_dsl, - MIRNA: mirna_dsl, -} +UNIPROT_RE = r'^([A-N,R-Z][0-9]([A-Z][A-Z, 0-9][A-Z, 0-9][0-9]){1,2})|([O,P,Q][0-9][A-Z, 0-9][A-Z, 0-9][A-Z, 0-9][0-9])(\.\d+)?$' def _deal_with_nonsense(results): @@ -221,25 +213,18 @@ def get_gene_by_rgd_id(self, rgd_id): return human_genes[0] - def get_node(self, graph, node) -> Optional[HumanGene]: + def get_node(self, node: BaseEntity) -> Optional[HumanGene]: """Get a node from the database, whether it has a HGNC, RGD, MGI, or EG identifier. - :param pybel.BELGraph graph: A BEL graph - :param node: A PyBEL node tuple - :type node: tuple or BaseEntity + :param node: The node to look for :raises: KeyError """ - if isinstance(node, BaseEntity): - data = node - else: - data = graph.node[node] - - if NAMESPACE not in data: + if NAMESPACE not in node: return - namespace = data[NAMESPACE] - identifer = data.get(IDENTIFIER) - name = data.get(NAME) + namespace = node[NAMESPACE] + identifer = node.get(IDENTIFIER) + name = node.get(NAME) if namespace.lower() == 'hgnc': if identifer is not None: @@ -279,39 +264,42 @@ def get_node(self, graph, node) -> Optional[HumanGene]: raise KeyError return self.get_gene_by_rgd_id(name) - def add_namespace_to_graph(self, graph): - """Add this manager's namespace to the graph. - - :param pybel.BELGraph graph: - """ + def add_namespace_to_graph(self, graph: BELGraph): + """Add this manager's namespace to the graph.""" namespace = self.upload_bel_namespace() graph.namespace_url[namespace.keyword] = namespace.url gfam_manager = GfamManager(engine=self.engine, session=self.session) gfam_manager.add_namespace_to_graph(graph) - def _iter_genes(self, graph) -> Iterable[Tuple[BaseEntity, HumanGene]]: - for node_tuple, node_data in graph.nodes(data=True): - human_gene = self.get_node(graph, node_tuple) + def iter_genes(self, graph: BELGraph) -> Iterable[Tuple[BaseEntity, HumanGene]]: + """Iterate over pairs of BEL nodes and HGNC genes.""" + for _, node in list(graph.nodes(data=True)): + human_gene = self.get_node(node) if human_gene is not None: - yield node_data, human_gene + yield node, human_gene - def normalize_genes(self, graph) -> None: - """Add identifiers to all HGNC genes. + def normalize_genes(self, graph: BELGraph) -> None: + """Add identifiers to all HGNC genes.""" + + for _, data in graph.nodes(data=True): + assert isinstance(data, BaseEntity), f'start issue {data}' - :param pybel.BELGraph graph: The BEL graph to enrich - """ mapping = {} - for node_data, human_gene in self._iter_genes(graph): + for node_data, human_gene in self.iter_genes(graph): + new_node_data = gene_to_bel(human_gene, func=node_data.function, variants=node_data.get(VARIANTS)) + node_tuple = node_data.as_tuple() - dsl = gene_to_bel(human_gene, func=node_data[FUNCTION], variants=node_data.get(VARIANTS)) - graph.node[node_tuple] = dsl - mapping[node_tuple] = dsl.as_tuple() + graph._node[node_tuple] = new_node_data + mapping[node_tuple] = new_node_data.as_tuple() # FIXME what about when an HGNC node appears in a fusion, complex, or composite? - nx.relabel_nodes(graph, mapping, copy=False) + relabel_inplace(graph, mapping) + + for _, data in graph.nodes(data=True): + assert isinstance(data, BaseEntity), f'end issue {data}' def enrich_genes_with_equivalences(self, graph: BELGraph) -> None: """Enrich genes with their corresponding UniProt.""" @@ -321,11 +309,11 @@ def enrich_genes_with_equivalences(self, graph: BELGraph) -> None: graph.namespace_pattern[ 'uniprot'] = '^([A-N,R-Z][0-9]([A-Z][A-Z, 0-9][A-Z, 0-9][0-9]){1,2})|([O,P,Q][0-9][A-Z, 0-9][A-Z, 0-9][A-Z, 0-9][0-9])(\.\d+)?$' - for node_data, human_gene in self._iter_genes(graph): + for node_data, human_gene in self.iter_genes(graph): func = node_data[FUNCTION] if human_gene.entrez: - graph.add_equivalence(node_data, _func_to_dsl[func]( + graph.add_equivalence(node_data, FUNC_TO_DSL[func]( namespace='ncbigene', name=human_gene.symbol, identifier=str(human_gene.entrez) @@ -346,7 +334,7 @@ def enrich_genes_with_equivalences(self, graph: BELGraph) -> None: def enrich_genes_with_families(self, graph: BELGraph) -> None: """Enrich genes in the BEL graph with their families.""" self.add_namespace_to_graph(graph) - for node_data, human_gene in self._iter_genes(graph): + for node_data, human_gene in self.iter_genes(graph): for family in human_gene.gene_families: graph.add_is_a(node_data, family_to_bel(family, node_data[FUNCTION])) @@ -366,16 +354,7 @@ def get_family_by_name(self, family_name: str) -> Optional[GeneFamily]: results = self.gene_family(family_name=family_name) return _deal_with_nonsense(results) - def _enrich_hgnc_with_entrez_equivalences(self, graph, node): - """ - - :param pybel.BELGraph graph: - :param node: - :return: the hash of the edge added - :rtype: str - """ - data = graph.node[node] - + def _enrich_hgnc_with_entrez_equivalences(self, graph: BELGraph, data: BaseEntity): namespace = data.get(NAMESPACE) if namespace.lower() != 'hgnc': @@ -385,45 +364,39 @@ def _enrich_hgnc_with_entrez_equivalences(self, graph, node): name = data[NAME] entrez = self.hgnc_symbol_entrez_id_mapping[name] - return graph.add_equivalence(node, _func_to_dsl[func]( + return graph.add_equivalence(data, FUNC_TO_DSL[func]( namespace='ncbigene', name=name, - identifier=str(entrez) + identifier=str(entrez), )) - def enrich_hgnc_with_entrez_equivalences(self, graph): - """Add equivalent Entrez nodes for all HGNC genes. - - :param pybel.BELGraph graph: The BEL graph to enrich - """ + def enrich_hgnc_with_entrez_equivalences(self, graph: BELGraph): + """Add equivalent Entrez nodes for all HGNC genes.""" self.add_namespace_to_graph(graph) - for node in graph.nodes(): - self._enrich_hgnc_with_entrez_equivalences(graph, node) - - def enrich_families_with_genes(self, graph): - """Enrich gene families in the BEL graph with their member genes. + for _, data in graph.nodes(data=True): + self._enrich_hgnc_with_entrez_equivalences(graph, data) - :param pybel.BELGraph graph: The BEL graph to enrich - """ + def enrich_families_with_genes(self, graph: BELGraph): + """Enrich gene families in the BEL graph with their member genes.""" self.add_namespace_to_graph(graph) - for gene_family_node, data in graph.nodes(data=True): - if data[FUNCTION] != GENE: + for _, gene_family_node in graph.nodes(data=True): + if gene_family_node[FUNCTION] != GENE: continue - if data.get(NAMESPACE).lower() not in {'gfam', 'hgnc.family', 'hgnc.genefamily'}: + if gene_family_node.get(NAMESPACE).lower() not in {'gfam', 'hgnc.family', 'hgnc.genefamily'}: continue - if IDENTIFIER in data: - gene_family_model = self.get_family_by_id(data[IDENTIFIER]) - elif NAME in data: - gene_family_model = self.get_family_by_name(data[NAME]) + if IDENTIFIER in gene_family_node: + gene_family_model = self.get_family_by_id(gene_family_node[IDENTIFIER]) + elif NAME in gene_family_node: + gene_family_model = self.get_family_by_name(gene_family_node[NAME]) else: raise ValueError if gene_family_model is None: - log.info('family not found: %s', data) + log.info('family not found: %s', gene_family_node) continue for human_gene in gene_family_model.hgncs: @@ -431,6 +404,9 @@ def enrich_families_with_genes(self, graph): """ Mapping dictionaries""" + def _get_name(self, human_gene: HumanGene) -> str: + return str(human_gene.symbol) + def _get_identifier(self, human_gene: HumanGene) -> str: """Get the identifier from a human gene SQLAlchemy model. @@ -609,17 +585,16 @@ def get_all_hgnc_symbols_family(self): for human_gene in family.hgncs } - def add_central_dogma(self, graph, node): + def add_central_dogma(self, graph: BELGraph, node: BaseEntity): """Add the central dogma of biology. :param graph: :param node: """ - data = graph.node[node] - if VARIANTS in data: + if VARIANTS in node: return - human_gene = self.get_node(graph, node) + human_gene = self.get_node(node) encoding = encodings.get(human_gene.locus_type, 'GRP') if 'M' in encoding: diff --git a/src/bio2bel_hgnc/model_utils.py b/src/bio2bel_hgnc/model_utils.py index 18c448a..ba324e2 100644 --- a/src/bio2bel_hgnc/model_utils.py +++ b/src/bio2bel_hgnc/model_utils.py @@ -2,12 +2,13 @@ """Bio2BEL HGNC Model utilities.""" -from pybel.constants import GENE, MIRNA, PROTEIN, RNA -from pybel.dsl import gene as gene_dsl, mirna as mirna_dsl, protein as protein_dsl, rna as rna_dsl -from pybel.dsl.nodes import CentralDogma +from typing import List, Optional +from pybel import BELGraph +from pybel.constants import MIRNA, PROTEIN, RNA +from pybel.dsl import CentralDogma, FUNC_TO_DSL, Variant, gene as gene_dsl, protein as protein_dsl from .constants import encodings -from .models import HumanGene +from .models import GeneFamily, HumanGene, UniProt __all__ = [ 'gene_to_bel', @@ -16,67 +17,52 @@ ] -def gene_to_bel(human_gene: HumanGene, func=None, variants=None) -> CentralDogma: +def gene_to_bel(human_gene: HumanGene, func: Optional[str] = None, + variants: Optional[List[Variant]] = None) -> CentralDogma: """Convert a Gene to a PyBEL gene. - :param bio2bel_hgnc.models.HumanGene human_gene: A Gene model + :param human_gene: A Gene model :rtype: pybel.dsl.gene """ - if func == PROTEIN: - dsl = protein_dsl - elif func == RNA: - dsl = rna_dsl - elif func == MIRNA: - dsl = mirna_dsl - elif func == GENE: - dsl = gene_dsl - else: - raise ValueError + dsl = FUNC_TO_DSL[func] if func else gene_dsl - if variants is not None: - # FIXME handle variants - variants = [ - v - for v in variants - ] - - return dsl( + rv = dsl( namespace='hgnc', name=str(human_gene.symbol), identifier=str(human_gene.identifier), - variants=variants, ) + if variants is not None: + return rv.with_variants(variants) + + return rv + -def family_to_bel(family, func=None): - """Converts a Gene Family model to a PyBEL gene +def family_to_bel(family: GeneFamily, func: Optional[str] = None, + variants: Optional[List[Variant]] = None) -> CentralDogma: + """Convert a Gene Family model to a PyBEL gene. - :param bio2bel_hgnc.models.GeneFamily family: A Gene Family model + :param family: A Gene Family model :rtype: pybel.dsl.gene """ - if func == PROTEIN: - dsl = protein_dsl - elif func == RNA: - dsl = rna_dsl - elif func == MIRNA: - dsl = mirna_dsl - elif func == GENE: - dsl = gene_dsl - else: - raise ValueError - - return dsl( + dsl = FUNC_TO_DSL[func] if func else gene_dsl + + rv = dsl( namespace='hgnc.genefamily', identifier=str(family.family_identifier), name=str(family.family_name) ) + if variants is not None: + return rv.with_variants(variants) -def uniprot_to_bel(uniprot) -> protein_dsl: - """ + return rv + + +def uniprot_to_bel(uniprot: UniProt) -> protein_dsl: + """Convert the uniprot model to BEL. :param bio2bel_hgnc.models.UniProt uniprot: - :return: """ return protein_dsl( namespace='uniprot', @@ -85,8 +71,8 @@ def uniprot_to_bel(uniprot) -> protein_dsl: ) -def add_central_dogma(graph, human_gene): - """Add the corresponding protein and""" +def add_central_dogma(graph: BELGraph, human_gene: HumanGene): + """Add the corresponding protein and/or RNA.""" encoding = encodings.get(human_gene.locus_type, 'GRP') if 'M' in encoding: