Skip to content

Commit

Permalink
some code cleanup and style fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
newgene committed Dec 21, 2021
1 parent d32b152 commit c8e8d64
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 30 deletions.
11 changes: 8 additions & 3 deletions src/hub/dataload/sources/generif/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# or other ImportError
from hub.dataload.sources.entrez.parser import EntrezParserBase


class Gene2GeneRifParser(EntrezParserBase):
'''
'''
Expand All @@ -23,10 +24,14 @@ def _cvt_pubmed(self, pubmed_str):
def load(self):
cnt = 0
for datadict in tab2dict_iter(self.datafile, (1, 2, 4), 0, alwayslist=1):
datadict = dict_convert(datadict, valuefn=lambda v: {
'generif': [dict(pubmed=self._cvt_pubmed(x[0]), text=x[1]) for x in v]})
datadict = dict_convert(
datadict,
valuefn=lambda v: {
'generif': [dict(pubmed=self._cvt_pubmed(x[0]), text=x[1]) for x in v]
}
)

for id,doc in datadict.items():
for id, doc in datadict.items():
cnt += 1
doc['_id'] = id
yield doc
20 changes: 10 additions & 10 deletions src/hub/dataload/sources/homologene/dump.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import os
import os.path
import sys
import time

import biothings, config
import biothings
import config
biothings.config_for_app(config)

from config import DATA_ARCHIVE_ROOT, logger as logging
from config import DATA_ARCHIVE_ROOT
from biothings.hub.dataload.dumper import FTPDumper


Expand All @@ -21,14 +20,16 @@ class HomologeneDumper(FTPDumper):

def get_newest_info(self):
rel = None

def setrel(line):
nonlocal rel
rel = line
self.client.retrlines("RETR RELEASE_NUMBER",setrel)

self.client.retrlines("RETR RELEASE_NUMBER", setrel)
self.release = rel

def new_release_available(self):
current_release = self.src_doc.get("download",{}).get("release")
current_release = self.src_doc.get("download", {}).get("release")
if not current_release or self.release > current_release:
self.logger.info("New release '%s' found" % self.release)
return True
Expand All @@ -39,7 +40,6 @@ def new_release_available(self):
def create_todump_list(self, force=False):
self.get_newest_info()
remote_file = "homologene.data"
local_file = os.path.join(self.new_data_folder,remote_file)
if force or not os.path.exists(local_file) or self.remote_is_better(remote_file,local_file) or self.new_release_available():
self.to_dump.append({"remote": remote_file, "local":local_file})

local_file = os.path.join(self.new_data_folder, remote_file)
if force or not os.path.exists(local_file) or self.remote_is_better(remote_file, local_file) or self.new_release_available():
self.to_dump.append({"remote": remote_file, "local": local_file})
20 changes: 12 additions & 8 deletions src/hub/dataload/sources/homologene/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os.path
from config import TAXONOMY
from biothings.utils.common import file_newer, loadobj, dump
from biothings.utils.dataload import tab2dict

try:
Expand All @@ -10,6 +9,7 @@
# or other ImportError
from hub.dataload.sources.entrez.parser import EntrezParserBase, get_geneid_d


class HomologeneParser(EntrezParserBase):
'''Parser for NCBI homologene.data file.'''
DATAFILE = 'homologene.data'
Expand All @@ -31,21 +31,26 @@ def load(self, aslist=False):
adding "homologene" field in gene doc
'''
from biothings.utils.hub_db import get_src_dump
homo_d = tab2dict(self.datafile,(2,1),0,header=0)
entrez_doc = get_src_dump().find_one({"_id":"entrez"}) or {}
entrez_dir = entrez_doc.get("download",{}).get("data_folder")
homo_d = tab2dict(self.datafile, (2, 1), 0, header=0)
entrez_doc = get_src_dump().find_one({"_id": "entrez"}) or {}
entrez_dir = entrez_doc.get("download", {}).get("data_folder")
assert entrez_dir, "Can't find Entrez data directory"
DATAFILE = os.path.join(entrez_dir, 'gene_history.gz')
assert os.path.exists(DATAFILE), "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir
retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,includefn=lambda ld: ld[1] != '-')
retired2gene = tab2dict(
DATAFILE,
(1, 2), 1, alwayslist=0,
includefn=lambda ld: ld[1] != '-'
)
for id in list(homo_d.keys()):
homo_d[retired2gene.get(id,id)] = homo_d[id]
homo_d[retired2gene.get(id, id)] = homo_d[id]

with open(self.datafile) as df:
homologene_d = {}
doc_li = []
print()
geneid_d = get_geneid_d(entrez_dir, self.species_li,load_cache=False,save_cache=False,only_for=homo_d)
geneid_d = get_geneid_d(entrez_dir, self.species_li, load_cache=False,
save_cache=False, only_for=homo_d)

for line in df:
ld = line.strip().split('\t')
Expand Down Expand Up @@ -74,4 +79,3 @@ def load(self, aslist=False):
else:
gene_d = dict([(d['_id'], d) for d in doc_li])
return gene_d

18 changes: 9 additions & 9 deletions src/hub/dataload/sources/refseq/parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
import os.path
import datetime
from biothings.utils.common import file_newer, loadobj, dump
from biothings.utils.dataload import tab2dict, tab2list, value_convert, \
normalized_value, dict_convert, dict_to_list, \
tab2dict_iter
from biothings.utils.dataload import dict_convert, tab2dict_iter

try:
from ..entrez.parser import EntrezParserBase
Expand All @@ -12,6 +7,7 @@
# or other ImportError
from hub.dataload.sources.entrez.parser import EntrezParserBase


class GeneSummaryParser(EntrezParserBase):
'''Parser for gene2summary_all.txt, adding "summary" field in gene doc'''
DATAFILE = 'gene2summary_all.txt'
Expand Down Expand Up @@ -81,10 +77,14 @@ def _cvt_pubmed(self, pubmed_str):
def load(self):
cnt = 0
for datadict in tab2dict_iter(self.datafile, (1, 2, 4), 0, alwayslist=1, encoding="latin1"):
datadict = dict_convert(datadict, valuefn=lambda v: {
'generif': [dict(pubmed=self._cvt_pubmed(x[0]), text=x[1]) for x in v]})
datadict = dict_convert(
datadict,
valuefn=lambda v: {
'generif': [dict(pubmed=self._cvt_pubmed(x[0]), text=x[1]) for x in v]
}
)

for id,doc in datadict.items():
for id, doc in datadict.items():
cnt += 1
doc['_id'] = id
yield doc
Expand Down

0 comments on commit c8e8d64

Please sign in to comment.