Skip to content
ETL configuration for BMEG
Branch: master
Clone or download
Fetching latest commit…
Cannot retrieve the latest commit at this time.
Permalink
Type Name Latest commit message Commit time
Failed to load latest commit information.
.dvc
scripts
source
src/bmeg
tests
tools
transform
.coveragerc
.dockerignore
.flake8
.gitignore
.gitmodules
.travis.yml
Dockerfile
README.md
dev-requirements.txt
dvc_commands.txt
outputs.allele.dvc
outputs.bmeg_manifest.dvc
outputs.ccle.drug_response.dvc
outputs.ccle.dvc
outputs.ccle.expression.dvc
outputs.ccle.expression_tatlow.dvc
outputs.ccle.maf.dvc
outputs.compound.normalized.dvc
outputs.ctrp.drugresponse.dvc
outputs.ensembl.dvc
outputs.ensembl.proteins.dvc
outputs.g2p.dvc
outputs.gdc.cases.dvc
outputs.gdc.projects.dvc
outputs.gdsc.dvc
outputs.go.dvc
outputs.go.gaf2schema.dvc
outputs.gtex.dvc
outputs.gtex.expression.dvc
outputs.mc3.dvc
outputs.meta.commands.dvc
outputs.pfam.dvc
outputs.pfam.toproteins.dvc
outputs.phenotype.dvc
outputs.publication.dvc
outputs.pubmed.dvc
outputs.tcga.IlluminaHumanMethylation450.dvc
outputs.tcga.expression.dvc
outputs.tcga.gistic2cna.dvc
setup.py
source.ccle.CCLE_DepMap_18q3_RNAseq_RPKM_20180718.gct.dvc
source.ccle.CCLE_DepMap_18q3_maf_20180718.txt.dvc
source.ccle.CCLE_NP24.2009_Drug_data_2015.02.24.csv.dvc
source.ccle.CCLE_depMap_18Q4_TPM_v2.dvc
source.ccle.CCLE_tpm.tsv.gz.dvc
source.ccle.DepMap-2018q4-celllines.csv.dvc
source.ccle.mafs.dvc
source.ccle.vcfs.dvc
source.ctrp.dvc
source.ensembl.Homo_sapiens.GRCh37.85.uniprot.tsv.gz.dvc
source.ensembl.Homo_sapiens.GRCh37.87.chr_patch_hapl_scaff.gff3.gz.dvc
source.ensembl.Homo_sapiens.GRCh37.87.chr_patch_hapl_scaff.trans_gene.tsv.dvc
source.g2p.all.dvc
source.gdsc.GDSC_AUC.csv.dvc
source.gene_enricher.hgnc_complete_set.json.dvc
source.go.HUMAN_9606_idmapping.dat.gz.dvc
source.go.obo.dvc
source.goa_human.gaf.gz.dvc
source.gtex.GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct.dvc
source.gtex.GTEx_v7_Annotations_SampleAttributesDS.txt.dvc
source.gtex.GTEx_v7_Annotations_SubjectPhenotypesDS.txt.dvc
source.mc3.v0.2.8.PUBLIC.maf.gz.dvc
source.myvariant.info.harvested.json.gz.dvc
source.myvariant.info.metadata.fields.json.dvc
source.myvariant.info.metadata.json.dvc
source.myvariants.biothings_current_old_hg19.json.gz.dvc
source.pfam.clans.tsv.dvc
source.pfam.homo_sapiens.json.dvc
source.pfam.id_list.txt.dvc
source.pfam.tar.gz.dvc
source.pubmed.baseline.dvc
source.tcga.TCGA_ID_MAP.csv.dvc
source.tcga.TCGA_expression_tpm.tsv.gz.dvc
source.tcga.gistic2-firehose.gistic2cna.dvc
source.tcga.methylation.IlluminaHumanMethylation450.tar.gz.dvc
source.tcga.tcga-genomics.zip.dvc
source.vep.vep_supporting_files.tar.gz.dvc
styleguide.md

README.md

License: MIT Build Status

BMEG-ETL

BMEG-ETL is a project that defines the transformers and data models for BMEG.

DVC

See:

Note: All dvc files are maintained in the zip file dvc-graph.zip. This is due to:

  • each transformer is its own command, executed with CWD at the project root
  • dvc run captures state in a separate *.dvc file
  • therefore, there are quite a few files at the root level of the project

Setup

# until DNS is set up, make sure minio.compbio.ohsu.edu is resolvable
sudo sh -c "echo 10.50.50.118 minio.compbio.ohsu.edu >> /etc/hosts"

# see minio install for credentials
# cat /mnt/minio/.minio.sys/config/config.json  | jq .credential

# install and configure aws
$ sudo apt  install awscli
$ pip install awscli
$ aws configure
AWS Access Key ID [None]: KKKKKKKKKKK
AWS Secret Access Key [None]: SSSSSSS
Default region name [None]: us-east-1
Default output format [None]:
# test
aws --endpoint-url https://minio.compbio.ohsu.edu s3 ls
2018-10-29 22:28:14 bmeg


# Setting up minio client
# linux
# install in home directory
cd
wget https://dl.minio.io/client/mc/release/linux-amd64/mc
chmod +x mc
alias mc=~/mc
mc version
# update your config ... vi ~/.mc/config.json
# test
$ mc ls -r  bmeg/bmeg/dvc | head -5
[2018-10-31 00:19:19 UTC] 1.2MiB 07/b930da26e4a06dcf8c9a0faff57be1
[2018-10-31 17:53:28 UTC] 2.6GiB 08/ec6eb40ad76b48210aa0e939ae7aa1
[2018-10-31 00:03:17 UTC] 222MiB 10/a14a8b317a34784e5b3e62c3fa387a
[2018-10-30 23:59:22 UTC]  38MiB 15/525092ad0e0598b95b874c9660bf6c
[2018-10-30 20:23:28 UTC] 809MiB 1c/4711bb30e668d5f387e1819bae99ef

# macOS see brew install

# dvc already installed and initialized
# add our remote
dvc remote add -d minio s3://bmeg/dvc
dvc remote modify minio endpointurl https://minio.compbio.ohsu.edu

Example

# retrieve data from foreign source
dvc run \
  -o source/gene_enricher/hgnc_complete_set.json \
  --file source.gene_enricher.hgnc_complete_set.json.dvc \
  --yes \
  curl --verbose --progress-bar --ipv4 --connect-timeout 8 --max-time 120 --retry 128 --ftp-ssl --disable-epsv --ftp-pasv ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json --output source/gene_enricher/hgnc_complete_set.json

# commit DVC's record to github
git add .gitignore source.gene_enricher.hgnc_complete_set.json.dvc
git commit -m "add hgnc_complete_set to DVC"

# commit the data to DVC's remote
$ dvc push
Preparing to push data to s3://bmeg/dvc
[##############################] 100% Collecting information
[##############################] 100% source/gene_enricher/hgnc_complete_set.json

# view the remote repository
$ mc ls -r  bmeg/bmeg/dvc
[2018-10-30 18:32:15 UTC]  29MiB f4/843dade6933b9879654417c6d93c1b

# note that the file name ~ the md5 hash
$ cat source.gene_enricher.hgnc_complete_set.json.dvc
cmd: curl --verbose --progress-bar --ipv4 --connect-timeout 8 --max-time 120 --retry
  128 --ftp-ssl --disable-epsv --ftp-pasv ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json
  --output source/gene_enricher/hgnc_complete_set.json
md5: 361717476a2235fe752e4c52c774caa1
outs:
- cache: true
  md5: 8300e43e6513e8e0a696a952ad28b1f5
  path: source/gene_enricher/hgnc_complete_set.json

Provenance

To generate bmeg_file_manifest.txt, run python scripts/generate_bmeg_file_manifest.py.

The dvc files were created using the commands in dvc_commands.txt. To recreate this file, run bash scripts/generate_dvc_commands.sh.

You can’t perform that action at this time.