Skip to content

Commit

Permalink
Merge pull request #131 from biothings/gene_summary_fix
Browse files (browse the repository at this point in the history)
temporary fix to refseq gene summary
  • Loading branch information
newgene committed Sep 13, 2022
2 parents 257a361 + 77f80f7 commit b8148a3
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions src/hub/dataload/sources/refseq/parser.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
from biothings.utils.dataload import dict_convert, tab2dict_iter
from biothings.utils.common import open_anyfile

try:
from ..entrez.parser import EntrezParserBase
Expand All @@ -13,9 +14,18 @@ class GeneSummaryParser(EntrezParserBase):
DATAFILE = 'gene2summary_all.txt'

def load(self, aslist=False):
geneid_set = set()
doc_li = []

with open(self.datafile) as df:
geneid_set = set()
doc_li = []
for line in df:
geneid, summary = line.strip().split('\t')
if geneid not in geneid_set:
doc_li.append(dict(_id=geneid, summary=str(summary)))
geneid_set.add(geneid)

# temporary fix to add in missing human gene summaries
with open_anyfile('/opt/mygene-hub/datasources/ncbi_gene/gene2summary_human.txt.xz') as df:
for line in df:
geneid, summary = line.strip().split('\t')
if geneid not in geneid_set:
Expand Down

0 comments on commit b8148a3

Please sign in to comment.