-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
103 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .dump import NcbiGeneDumper | ||
from .upload import NCBIGeneSummaryUploader |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import os | ||
import os.path | ||
import sys | ||
import time | ||
from datetime import datetime | ||
|
||
import biothings, config | ||
biothings.config_for_app(config) | ||
|
||
from config import DATA_ARCHIVE_ROOT, logger as logging | ||
from biothings.hub.dataload.dumper import FTPDumper | ||
|
||
|
||
class NcbiGeneDumper(FTPDumper):
    """FTP dumper for NCBI Gene ASN.1 binary archives.

    Downloads the configured ``.ags.gz`` archive(s) from ``FTP_HOST`` /
    ``CWD_DIR`` and, once the dump is done, extracts a tab-separated
    "gene id / summary" file (xz-compressed) from the downloaded archive.
    """

    SRC_NAME = "ncbi_gene"
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
    FTP_HOST = 'ftp.ncbi.nih.gov'
    CWD_DIR = '/gene/DATA/ASN_BINARY/Mammalia'

    # Cron-style schedule string; presumably interpreted by the hub
    # scheduler (verify against the FTPDumper base class).
    SCHEDULE = "0 22 * * 6"

    def get_newest_info(self):
        """Set ``self.release`` (``YYYYMMDD``) from the remote modification time.

        The release is derived from the MDTM timestamp of
        ``All_Mammalia.ags.gz`` on the FTP server.
        """
        res = self.client.sendcmd("MDTM All_Mammalia.ags.gz")
        # The reply is "<status code> <YYYYMMDDHHMMSS>"; only the timestamp
        # is needed.
        _code, remote_lastmodified = res.split()
        self.release = datetime.strptime(
            remote_lastmodified, '%Y%m%d%H%M%S').strftime("%Y%m%d")

    def new_release_available(self):
        """Return True when ``self.release`` is newer than the stored release.

        Also returns True when no release has been recorded yet. Releases
        are ``YYYYMMDD`` strings, so string comparison orders them by date.
        """
        current_release = self.src_doc.get("download", {}).get("release")
        if not current_release or self.release > current_release:
            self.logger.info("New release '%s' found", self.release)
            return True
        self.logger.debug("No new release found")
        return False

    def create_todump_list(self, force=False):
        """Populate ``self.to_dump`` with the remote files that must be fetched.

        A file is scheduled when ``force`` is set, when it is missing
        locally, when the remote copy is better than the local one, or when
        a new release is available.
        """
        self.get_newest_info()
        for fn in ['Sus_scrofa.ags.gz']:  # TODO change to all, using sus_scrofa for testing
            local_file = os.path.join(self.new_data_folder, fn)
            if (force
                    or not os.path.exists(local_file)
                    or self.remote_is_better(fn, local_file)
                    or self.new_release_available()):
                self.to_dump.append({"remote": fn, "local": local_file})

    def post_dump(self, *args, **kwargs):
        """Extract gene summaries from the downloaded archive.

        Runs a shell pipeline inside ``self.new_data_folder`` that writes
        ``gene2summary_sus.txt.xz``. The ``gene2xml`` and ``xtract`` tools
        are referenced relative to that folder (``../``).
        """
        # Local import keeps this block self-contained.
        import subprocess

        self.logger.info("Extracting Gene Summary Data in %s", self.new_data_folder)
        cmd = 'time gunzip -c Sus_scrofa.ags.gz |../gene2xml -i stdin -b T | ../xtract -pattern Entrezgene -element Gene-track_geneid,Entrezgene_summary | awk -F "\t" \'length($2)\' | xz -9 --stdout > gene2summary_sus.txt.xz'
        # Run the pipeline with cwd= instead of os.chdir(): chdir changes the
        # working directory of the whole process, which would leak into any
        # other code running in the same interpreter.
        returncode = subprocess.call(cmd, shell=True, cwd=self.new_data_folder)
        if returncode != 0:
            # Surface the failure instead of silently ignoring it; the shell
            # reports the status of the last command in the pipeline.
            self.logger.error(
                "Gene summary extraction exited with status %s in %s",
                returncode, self.new_data_folder)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
from biothings.utils.dataload import dict_convert, tab2dict_iter | ||
from biothings.utils.common import open_anyfile | ||
|
||
try: | ||
from ..entrez.parser import EntrezParserBase | ||
except (ValueError, ImportError): | ||
# capture "ValueError: Attempted relative import beyond top-level package" | ||
# or other ImportError | ||
from hub.dataload.sources.entrez.parser import EntrezParserBase | ||
|
||
|
||
class GeneSummaryParser(EntrezParserBase):
    '''Parser for the gene summary file, adding "summary" field in gene doc.

    The input is a tab-separated text file with one "<gene id>\\t<summary>"
    record per line (currently the test file named in DATAFILE; the full
    file is expected to be gene2summary_all.txt.xz).
    '''

    # TODO testing only need to change file name
    DATAFILE = 'gene2summary_sus.txt.xz'

    def load(self, aslist=False):
        '''Load gene summaries from ``self.datafile``.

        ``self.datafile`` is presumably set by EntrezParserBase from the
        data folder and DATAFILE (verify against the base class).

        Args:
            aslist: when true, return a list of docs; otherwise return a
                dict keyed by gene id (a later duplicate id overrides an
                earlier one).

        Returns:
            A list of ``{"_id": geneid, "summary": summary}`` dicts, or a
            dict mapping gene id to such a doc.

        Raises:
            ValueError: if a non-blank line has no tab separator.
        '''
        doc_li = []
        with open_anyfile(self.datafile) as df:
            for line in df:
                record = line.strip()
                if not record:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # Split on the first tab only so a summary that itself
                # contains a tab is kept intact instead of failing to unpack.
                geneid, summary = record.split('\t', 1)
                doc_li.append(dict(_id=geneid, summary=str(summary)))

        if aslist:
            return doc_li
        return {d['_id']: d for d in doc_li}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from .parser import GeneSummaryParser | ||
import biothings.hub.dataload.uploader as uploader | ||
|
||
|
||
class NCBIGeneSummaryUploader(uploader.MergerSourceUploader):
    """Uploader for gene summaries parsed from the ``ncbi_gene`` dump."""

    name = "ncbi_gene"

    def load_data(self, data_folder):
        """Parse the summary file found in ``data_folder``.

        Returns whatever ``GeneSummaryParser(data_folder).load()`` returns
        with its default arguments (a dict keyed by gene id).
        """
        gene2summary = GeneSummaryParser(data_folder).load()
        return gene2summary

    @classmethod
    def get_mapping(klass):
        """Return the index mapping for the ``summary`` field."""
        mapping = {
            "summary": {
                "type": "text",
                "copy_to": "all"
            },
        }
        # The original built the mapping but never returned it, so callers
        # received None.
        return mapping