Skip to content

Commit

Permalink
style: flake8 fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
newgene committed Jul 18, 2022
1 parent 6fe119e commit 63766dd
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 33 deletions.
42 changes: 18 additions & 24 deletions src/hub/databuild/builder.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import biothings.utils.mongo as mongo
import biothings.hub.databuild.builder as builder
from biothings.utils.common import loadobj
from biothings.hub import BUILDER_CATEGORY

from hub.dataload.sources.entrez.gene_upload import EntrezGeneUploader
from hub.dataload.sources.ensembl.gene_upload import EnsemblGeneUploader

class MyGeneDataBuilder(builder.DataBuilder):
"""
Expand All @@ -19,29 +16,26 @@ def no_other_merge_job_for_ensembl_gene(job_manager):
which are not handled by mongo as upserts and produce duplicated
errors. For this datasource in particular, we allow only one merge job
"""
return len([j for j in job_manager.jobs.values() if \
j["category"] == BUILDER_CATEGORY \
and j["step"] == "ensembl_gene"]) == 0
return len([j for j in job_manager.jobs.values() if j["category"] == BUILDER_CATEGORY and j["step"] == "ensembl_gene"]) == 0
preds = super().get_predicates()
preds.append(no_other_merge_job_for_ensembl_gene)
return preds


def generate_document_query(self, src_name):
    """
    Build the MongoDB query used to select root documents for `src_name`.

    Root documents are created according to the species list in the build
    configuration: only sources listed as root-document sources get a
    custom taxid filter; every other source gets None (no filtering).

    Returns a query dict, or None when no filtering applies.
    """
    _query = None
    if src_name in self.get_root_document_sources():
        # "species" (whitelist) takes precedence over "species_to_exclude"
        # (blacklist); taxids are coerced to int for the $in/$nin match.
        # No "else" branch needed: _query already defaults to None.
        if "species" in self.build_config:
            _query = {'taxid': {'$in': list(map(int, self.build_config['species']))}}
        elif "species_to_exclude" in self.build_config:
            _query = {'taxid': {'$nin': list(map(int, self.build_config['species_to_exclude']))}}
    if _query:
        self.logger.debug("Source '%s' requires custom query: '%s'" % (src_name, _query))
    return _query

def document_cleaner(self,src_name,*args,**kwargs):
def document_cleaner(self, src_name, *args, **kwargs):
# only root sources document can keep their taxid
if src_name in self.get_root_document_sources():
return None
Expand All @@ -52,12 +46,12 @@ def post_merge(self, source_names, batch_size, job_manager):
tgt = mongo.get_target_db()[self.target_name]
# background=true or it'll lock the whole database...
self.logger.info("Indexing 'taxid'")
tgt.create_index("taxid",background=True)
tgt.create_index("taxid", background=True)
self.logger.info("Indexing 'entrezgene'")
tgt.create_index("entrezgene",background=True)
tgt.create_index("entrezgene", background=True)

def get_stats(self,sources,job_manager):
self.stats = super(MyGeneDataBuilder,self).get_stats(sources,job_manager)
def get_stats(self, sources, job_manager):
self.stats = super(MyGeneDataBuilder, self).get_stats(sources, job_manager)
# enrich with some specific mygene counts, specially regarding ensembl vs. entrez
tgt = mongo.get_target_db()[self.target_name]
self.stats["total_genes"] = tgt.estimated_document_count()
Expand All @@ -77,16 +71,16 @@ def get_stats(self,sources,job_manager):
# Queries are gonna use colscan strategy...
self.logger.debug("Counting 'total_ensembl_genes'")
res = tgt.aggregate([
{"$match" : {"ensembl.0" : {"$exists" : True}}},
{"$project" : {"num_gene" : {"$size" : "$ensembl"}}},
{"$group" : {"_id" : None, "sum" : {"$sum": "$num_gene"}}}
])
{"$match": {"ensembl.0": {"$exists": True}}},
{"$project": {"num_gene": {"$size": "$ensembl"}}},
{"$group": {"_id": None, "sum": {"$sum": "$num_gene"}}}
])
try:
list_count = next(res)["sum"]
except StopIteration:
list_count = 0
object_count = tgt.count_documents({"ensembl": {"$type": "object"}})
orphan_count = tgt.count_documents({"_id": {"$regex": '''\\w'''},"ensembl": {"$exists": 0}})
orphan_count = tgt.count_documents({"_id": {"$regex": '''\\w'''}, "ensembl": {"$exists": 0}})
total_ensembl_genes = list_count + object_count + orphan_count
self.stats["total_ensembl_genes"] = total_ensembl_genes
# this one can't be computed from merged collection, and is only valid when build
Expand All @@ -95,10 +89,10 @@ def get_stats(self,sources,job_manager):
# this one is similar to total_ensembl_genes except we cross with entrezgene (ie. so they're mapped)
try:
list_count = next(tgt.aggregate([
{"$match" : {"$and" : [{"ensembl.0" : {"$exists" : True}},{"entrezgene":{"$exists":1}}]}},
{"$project" : {"num_gene" : {"$size" : "$ensembl"}}},
{"$group" : {"_id" : None, "sum" : {"$sum": "$num_gene"}}}
]))["sum"]
{"$match": {"$and": [{"ensembl.0": {"$exists": True}}, {"entrezgene": {"$exists": 1}}]}},
{"$project": {"num_gene": {"$size": "$ensembl"}}},
{"$group": {"_id": None, "sum": {"$sum": "$num_gene"}}}
]))["sum"]
except StopIteration:
list_count = 0
object_count = tgt.count_documents({"$and": [{"ensembl": {"$type": "object"}}, {"entrezgene": {"$exists": 1}}]})
Expand Down
19 changes: 10 additions & 9 deletions src/hub/databuild/mapper.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from biothings.utils.common import loadobj
import biothings.hub.databuild.mapper as mapper


class EntrezRetired2Current(mapper.IDBaseMapper):

def __init__(self, db_provider, *args, **kwargs):
    """
    Retired-to-current entrez gene id mapper.

    db_provider: callable returning a DB handle; used later by load()
        to fetch the geneid mapping object from GridFS.
    Remaining args/kwargs are forwarded to mapper.IDBaseMapper.
    """
    super(EntrezRetired2Current, self).__init__(*args, **kwargs)
    self.db_provider = db_provider

def load(self):
Expand All @@ -14,7 +15,7 @@ def load(self):
# but it's a way to know the entrez perimeter (what entrez _ids exist and should be considered
self.map = loadobj(("entrez_gene__geneid_d.pyobj", self.db_provider()), mode='gridfs')

def process(self, *args, **kwargs):
    """
    Not supported: this mapper is used as a lookup table (via its `map`
    attribute populated by load()), never as a document processor.

    Raises UserWarning unconditionally to flag misuse.
    """
    raise UserWarning("Don't call me, please")


Expand All @@ -24,7 +25,7 @@ class Ensembl2Entrez(mapper.IDBaseMapper):
"""

def __init__(self, db_provider, retired2current, *args, **kwargs):
    """
    Ensembl-to-entrez gene id mapper (registered under "ensembl2entrez").

    db_provider: callable returning a DB handle; used by load() to fetch
        the ensembl->entrez mapping list from GridFS.
    retired2current: EntrezRetired2Current mapper used to resolve retired
        entrez gene ids to their current ones while building the map.
    """
    super(Ensembl2Entrez, self).__init__("ensembl2entrez", *args, **kwargs)
    self.db_provider = db_provider
    self.retired2current = retired2current

Expand All @@ -34,7 +35,7 @@ def load(self):
self.map = {}
ensembl2entrez_li = loadobj(("ensembl_gene__2entrezgene_list.pyobj", self.db_provider()), mode='gridfs')
#filter out those deprecated entrez gene ids
for ensembl_id,entrez_id in ensembl2entrez_li:
for ensembl_id, entrez_id in ensembl2entrez_li:
entrez_id = int(entrez_id)
if entrez_id in self.retired2current:
self.map[ensembl_id] = self.retired2current.translate(entrez_id)
Expand All @@ -47,7 +48,7 @@ class Ensembl2EntrezRoot(mapper.IDBaseMapper):
"""

def __init__(self, ensembl2entrez, *args, **kwargs):
    """
    Root-document variant of the ensembl mapper (registered as "ensembl").

    ensembl2entrez: an Ensembl2Entrez mapper instance; its map is reused
        by load() rather than rebuilt.
    """
    super(Ensembl2EntrezRoot, self).__init__("ensembl", *args, **kwargs)
    self.ensembl2entrez = ensembl2entrez

def load(self):
Expand All @@ -56,7 +57,7 @@ def load(self):
self.ensembl2entrez.load()
self.map = self.ensembl2entrez.map

def process(self, docs, key_to_convert="_id", **kwargs):
    """
    Yield only the docs whose `key_to_convert` value has NO
    ensembl->entrez translation.

    We want to force translation, not defaulting to ensembl if no match:
    if there's a match, the ensembl doc can be converted, which means it
    will be merged with an existing entrez_gene document. So we filter
    out the convertible docs and keep only the pure ensembl docs.
    """
    for doc in docs:
        # transparent=False: translate() returns a falsy value on a miss
        # instead of echoing the original id back
        if not self.translate(doc[key_to_convert], transparent=False):
            yield doc  # no entrez counterpart: keep original ensembl doc
        # otherwise skip: doc already exists as an entrez_gene document

0 comments on commit 63766dd

Please sign in to comment.