Skip to content

Commit

Permalink
style: flake8 fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
newgene committed Jul 18, 2022
1 parent 6fe119e commit 63766dd
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 33 deletions.
42 changes: 18 additions & 24 deletions src/hub/databuild/builder.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import biothings.utils.mongo as mongo
import biothings.hub.databuild.builder as builder
from biothings.utils.common import loadobj
from biothings.hub import BUILDER_CATEGORY

from hub.dataload.sources.entrez.gene_upload import EntrezGeneUploader
from hub.dataload.sources.ensembl.gene_upload import EnsemblGeneUploader

class MyGeneDataBuilder(builder.DataBuilder):
"""
Expand All @@ -19,29 +16,26 @@ def no_other_merge_job_for_ensembl_gene(job_manager):
which are not handled by mongo as upserts and produce duplicated
errors. For this datasource in particular, we allow only one merge job
"""
return len([j for j in job_manager.jobs.values() if \
j["category"] == BUILDER_CATEGORY \
and j["step"] == "ensembl_gene"]) == 0
return len([j for j in job_manager.jobs.values() if j["category"] == BUILDER_CATEGORY and j["step"] == "ensembl_gene"]) == 0
preds = super().get_predicates()
preds.append(no_other_merge_job_for_ensembl_gene)
return preds


def generate_document_query(self, src_name):
    """
    Build the MongoDB query used to select root documents for `src_name`.

    Root documents are created according to the species list in the build
    configuration: only sources listed as root-document sources get a
    custom taxid filter; every other source gets None (no filtering).

    Returns a query dict, or None when no filtering applies.
    """
    _query = None
    if src_name in self.get_root_document_sources():
        # "species" (whitelist) takes precedence over "species_to_exclude"
        # (blacklist); taxids are coerced to int for the $in/$nin match.
        # No "else" branch needed: _query already defaults to None.
        if "species" in self.build_config:
            _query = {'taxid': {'$in': list(map(int, self.build_config['species']))}}
        elif "species_to_exclude" in self.build_config:
            _query = {'taxid': {'$nin': list(map(int, self.build_config['species_to_exclude']))}}
    if _query:
        self.logger.debug("Source '%s' requires custom query: '%s'" % (src_name, _query))
    return _query

def document_cleaner(self,src_name,*args,**kwargs):
def document_cleaner(self, src_name, *args, **kwargs):
# only root sources document can keep their taxid
if src_name in self.get_root_document_sources():
return None
Expand All @@ -52,12 +46,12 @@ def post_merge(self, source_names, batch_size, job_manager):
tgt = mongo.get_target_db()[self.target_name]
# background=true or it'll lock the whole database...
self.logger.info("Indexing 'taxid'")
tgt.create_index("taxid",background=True)
tgt.create_index("taxid", background=True)
self.logger.info("Indexing 'entrezgene'")
tgt.create_index("entrezgene",background=True)
tgt.create_index("entrezgene", background=True)

def get_stats(self,sources,job_manager):
self.stats = super(MyGeneDataBuilder,self).get_stats(sources,job_manager)
def get_stats(self, sources, job_manager):
self.stats = super(MyGeneDataBuilder, self).get_stats(sources, job_manager)
# enrich with some specific mygene counts, specially regarding ensembl vs. entrez
tgt = mongo.get_target_db()[self.target_name]
self.stats["total_genes"] = tgt.estimated_document_count()
Expand All @@ -77,16 +71,16 @@ def get_stats(self,sources,job_manager):
# Queries are gonna use colscan strategy...
self.logger.debug("Counting 'total_ensembl_genes'")
res = tgt.aggregate([
{"$match" : {"ensembl.0" : {"$exists" : True}}},
{"$project" : {"num_gene" : {"$size" : "$ensembl"}}},
{"$group" : {"_id" : None, "sum" : {"$sum": "$num_gene"}}}
])
{"$match": {"ensembl.0": {"$exists": True}}},
{"$project": {"num_gene": {"$size": "$ensembl"}}},
{"$group": {"_id": None, "sum": {"$sum": "$num_gene"}}}
])
try:
list_count = next(res)["sum"]
except StopIteration:
list_count = 0
object_count = tgt.count_documents({"ensembl": {"$type": "object"}})
orphan_count = tgt.count_documents({"_id": {"$regex": '''\\w'''},"ensembl": {"$exists": 0}})
orphan_count = tgt.count_documents({"_id": {"$regex": '''\\w'''}, "ensembl": {"$exists": 0}})
total_ensembl_genes = list_count + object_count + orphan_count
self.stats["total_ensembl_genes"] = total_ensembl_genes
# this one can't be computed from merged collection, and is only valid when build
Expand All @@ -95,10 +89,10 @@ def get_stats(self,sources,job_manager):
# this one is similar to total_ensembl_genes except we cross with entrezgene (ie. so they're mapped)
try:
list_count = next(tgt.aggregate([
{"$match" : {"$and" : [{"ensembl.0" : {"$exists" : True}},{"entrezgene":{"$exists":1}}]}},
{"$project" : {"num_gene" : {"$size" : "$ensembl"}}},
{"$group" : {"_id" : None, "sum" : {"$sum": "$num_gene"}}}
]))["sum"]
{"$match": {"$and": [{"ensembl.0": {"$exists": True}}, {"entrezgene": {"$exists": 1}}]}},
{"$project": {"num_gene": {"$size": "$ensembl"}}},
{"$group": {"_id": None, "sum": {"$sum": "$num_gene"}}}
]))["sum"]
except StopIteration:
list_count = 0
object_count = tgt.count_documents({"$and": [{"ensembl": {"$type": "object"}}, {"entrezgene": {"$exists": 1}}]})
Expand Down
19 changes: 10 additions & 9 deletions src/hub/databuild/mapper.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from biothings.utils.common import loadobj
import biothings.hub.databuild.mapper as mapper


class EntrezRetired2Current(mapper.IDBaseMapper):

def __init__(self, db_provider, *args, **kwargs):
    """
    Retired-to-current entrez gene id mapper.

    db_provider: callable returning a DB handle; used later by load()
        to fetch the geneid mapping object from GridFS.
    Remaining args/kwargs are forwarded to mapper.IDBaseMapper.
    """
    super(EntrezRetired2Current, self).__init__(*args, **kwargs)
    self.db_provider = db_provider

def load(self):
Expand All @@ -14,7 +15,7 @@ def load(self):
# but it's a way to know the entrez perimeter (what entrez _ids exist and should be considered
self.map = loadobj(("entrez_gene__geneid_d.pyobj", self.db_provider()), mode='gridfs')

def process(self, *args, **kwargs):
    """
    Not supported: this mapper is used as a lookup table (via its `map`
    attribute populated by load()), never as a document processor.

    Raises UserWarning unconditionally to flag misuse.
    """
    raise UserWarning("Don't call me, please")


Expand All @@ -24,7 +25,7 @@ class Ensembl2Entrez(mapper.IDBaseMapper):
"""

def __init__(self, db_provider, retired2current, *args, **kwargs):
    """
    Ensembl-to-entrez gene id mapper (registered under "ensembl2entrez").

    db_provider: callable returning a DB handle; used by load() to fetch
        the ensembl->entrez mapping list from GridFS.
    retired2current: EntrezRetired2Current mapper used to resolve retired
        entrez gene ids to their current ones while building the map.
    """
    super(Ensembl2Entrez, self).__init__("ensembl2entrez", *args, **kwargs)
    self.db_provider = db_provider
    self.retired2current = retired2current

Expand All @@ -34,7 +35,7 @@ def load(self):
self.map = {}
ensembl2entrez_li = loadobj(("ensembl_gene__2entrezgene_list.pyobj", self.db_provider()), mode='gridfs')
#filter out those deprecated entrez gene ids
for ensembl_id,entrez_id in ensembl2entrez_li:
for ensembl_id, entrez_id in ensembl2entrez_li:
entrez_id = int(entrez_id)
if entrez_id in self.retired2current:
self.map[ensembl_id] = self.retired2current.translate(entrez_id)
Expand All @@ -47,7 +48,7 @@ class Ensembl2EntrezRoot(mapper.IDBaseMapper):
"""

def __init__(self, ensembl2entrez, *args, **kwargs):
    """
    Root-document variant of the ensembl mapper (registered as "ensembl").

    ensembl2entrez: an Ensembl2Entrez mapper instance; its map is reused
        by load() rather than rebuilt.
    """
    super(Ensembl2EntrezRoot, self).__init__("ensembl", *args, **kwargs)
    self.ensembl2entrez = ensembl2entrez

def load(self):
Expand All @@ -56,7 +57,7 @@ def load(self):
self.ensembl2entrez.load()
self.map = self.ensembl2entrez.map

def process(self, docs, key_to_convert="_id", **kwargs):
    """
    Yield only the docs whose `key_to_convert` value has NO
    ensembl->entrez translation.

    We want to force translation, not defaulting to ensembl if no match:
    if there's a match, the ensembl doc can be converted, which means it
    will be merged with an existing entrez_gene document. So we filter
    out the convertible docs and keep only the pure ensembl docs.
    """
    for doc in docs:
        # transparent=False: translate() returns a falsy value on a miss
        # instead of echoing the original id back
        if not self.translate(doc[key_to_convert], transparent=False):
            yield doc  # no entrez counterpart: keep original ensembl doc
        # otherwise skip: doc already exists as an entrez_gene document

0 comments on commit 63766dd

Please sign in to comment.