Skip to content

Commit

Permalink
source for testing
Browse files Browse the repository at this point in the history
  • Loading branch information
jal347 committed Oct 6, 2022
1 parent b52ebd5 commit d5ca036
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/hub/dataload/sources/ncbi_gene/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .dump import NcbiGeneDumper
from .upload import NCBIGeneSummaryUploader
48 changes: 48 additions & 0 deletions src/hub/dataload/sources/ncbi_gene/dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import os.path
import sys
import time
from datetime import datetime

# NOTE(review): config_for_app() is called before the `from config import ...`
# and `biothings.hub` imports that follow — presumably those imports rely on
# biothings having been configured first; confirm before reordering.
import biothings, config
biothings.config_for_app(config)

from config import DATA_ARCHIVE_ROOT, logger as logging
from biothings.hub.dataload.dumper import FTPDumper


class NcbiGeneDumper(FTPDumper):
    """FTP dumper for the "ncbi_gene" source.

    Downloads ASN.1 binary gene archives from ``CWD_DIR`` on ``FTP_HOST`` and,
    once downloaded, extracts a compressed "gene id <TAB> summary" file from
    them (see ``post_dump``).
    """

    SRC_NAME = "ncbi_gene"
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
    FTP_HOST = 'ftp.ncbi.nih.gov'
    CWD_DIR = '/gene/DATA/ASN_BINARY/Mammalia'

    # cron-style schedule string
    SCHEDULE = "0 22 * * 6"

    def get_newest_info(self):
        """Set ``self.release`` from the remote modification time.

        Sends an FTP ``MDTM`` command for ``All_Mammalia.ags.gz`` and stores
        the date part of the reply (``YYYYMMDD``) as the release.
        """
        res = self.client.sendcmd("MDTM All_Mammalia.ags.gz")
        # The reply is expected to be "<status code> <YYYYMMDDHHMMSS>".
        _status, remote_lastmodified = res.split()
        self.release = datetime.strptime(
            remote_lastmodified, '%Y%m%d%H%M%S').strftime("%Y%m%d")

    def new_release_available(self):
        """Return True when ``self.release`` is newer than the recorded one.

        The recorded release is read from ``self.src_doc["download"]["release"]``;
        when it is missing, a new release is always reported. Releases are
        ``YYYYMMDD`` strings, so plain string comparison orders them by date.
        """
        current_release = self.src_doc.get("download", {}).get("release")
        if not current_release or self.release > current_release:
            self.logger.info("New release '%s' found", self.release)
            return True
        self.logger.debug("No new release found")
        return False

    def create_todump_list(self, force=False):
        """Populate ``self.to_dump`` with the remote files to download.

        A file is scheduled when ``force`` is set, when it is missing locally,
        when ``remote_is_better`` says so, or when a new release is available.
        """
        self.get_newest_info()
        # TODO change to all, using sus_scrofa for testing
        for fn in ['Sus_scrofa.ags.gz']:
            local_file = os.path.join(self.new_data_folder, fn)
            if (force
                    or not os.path.exists(local_file)
                    or self.remote_is_better(fn, local_file)
                    or self.new_release_available()):
                self.to_dump.append({"remote": fn, "local": local_file})

    def post_dump(self, *args, **kwargs):
        """Extract gene summaries from the downloaded archive.

        Runs a shell pipeline inside ``self.new_data_folder`` that writes
        ``gene2summary_sus.txt.xz``. The pipeline calls ``../gene2xml`` and
        ``../xtract`` relative to that folder.

        Raises:
            subprocess.CalledProcessError: if the shell reports a non-zero
                exit status for the pipeline.
        """
        # Imported locally so the fix does not depend on the module's import list.
        import subprocess

        self.logger.info("Extracting Gene Summary Data in %s", self.new_data_folder)
        cmd = 'time gunzip -c Sus_scrofa.ags.gz |../gene2xml -i stdin -b T | ../xtract -pattern Entrezgene -element Gene-track_geneid,Entrezgene_summary | awk -F "\t" \'length($2)\' | xz -9 --stdout > gene2summary_sus.txt.xz'
        # Use cwd= instead of os.chdir() so the working directory of the whole
        # process is left untouched, and check=True so a failed extraction is
        # reported instead of being silently ignored.
        # NOTE: without `pipefail` the shell only reports the status of the
        # last command in the pipeline (xz); earlier failures are not detected.
        subprocess.run(cmd, shell=True, cwd=self.new_data_folder, check=True)

33 changes: 33 additions & 0 deletions src/hub/dataload/sources/ncbi_gene/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
from biothings.utils.dataload import dict_convert, tab2dict_iter
from biothings.utils.common import open_anyfile

try:
from ..entrez.parser import EntrezParserBase
except (ValueError, ImportError):
# capture "ValueError: Attempted relative import beyond top-level package"
# or other ImportError
from hub.dataload.sources.entrez.parser import EntrezParserBase


class GeneSummaryParser(EntrezParserBase):
    '''Parser for the gene summary file, adding "summary" field in gene doc.

    Each non-blank line of the data file is expected to be
    "<gene id><TAB><summary>".
    '''

    # TODO testing only need to change file name
    DATAFILE = 'gene2summary_sus.txt.xz'

    def load(self, aslist=False):
        '''Read ``self.datafile`` and build one document per line.

        Args:
            aslist: when true, return the documents as a list; otherwise
                return a dict mapping each ``_id`` to its document.

        Returns:
            list or dict of ``{"_id": <gene id>, "summary": <summary>}`` docs.

        Raises:
            ValueError: if a non-blank line contains no tab separator.
        '''
        doc_li = []
        with open_anyfile(self.datafile) as df:
            for line in df:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # Split on the first tab only, so a summary that itself
                # contains a tab is kept whole instead of breaking the unpack.
                geneid, summary = line.split('\t', 1)
                doc_li.append({'_id': geneid, 'summary': summary})

        if aslist:
            return doc_li
        return {d['_id']: d for d in doc_li}
20 changes: 20 additions & 0 deletions src/hub/dataload/sources/ncbi_gene/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .parser import GeneSummaryParser
import biothings.hub.dataload.uploader as uploader


class NCBIGeneSummaryUploader(uploader.MergerSourceUploader):
    """Uploader for the "ncbi_gene" source (gene summaries)."""

    name = "ncbi_gene"

    def load_data(self, data_folder):
        """Parse the summary file in ``data_folder``.

        Returns whatever ``GeneSummaryParser(data_folder).load()`` returns.
        """
        return GeneSummaryParser(data_folder).load()

    @classmethod
    def get_mapping(klass):
        """Return the index mapping for the fields this source provides."""
        mapping = {
            "summary": {
                "type": "text",
                "copy_to": "all"
            },
        }
        # The mapping was previously built but never returned, so callers
        # always received None.
        return mapping

0 comments on commit d5ca036

Please sign in to comment.