diff --git a/examples/example_cht_workflow.sh b/examples/example_cht_workflow.sh index 05c844c..19a1abb 100644 --- a/examples/example_cht_workflow.sh +++ b/examples/example_cht_workflow.sh @@ -1,31 +1,43 @@ + +# Set these environment vars to point to +# your local installation of WASP +WASP=$HOME/proj/WASP +DATA_DIR=$WASP/examples/example_data + # # Convert SNP files to HDF5 format. This program can be used -# on output from impute2 or on VCF files +# on output from impute2 or on VCF files. Note that +# you do not need to repeat this step here if it was already +# done in the mapping pipeline. # -./snp2h5/snp2h5 --chrom example_data/chromInfo.hg19.txt \ +$WASP/snp2h5/snp2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \ --format impute \ - --geno_prob example_data/geno_probs.h5 \ - --snp_index example_data/snp_index.h5 \ - --snp_tab example_data/snp_tab.h5 \ - --haplotype example_data/haps.h5 \ - example_data/genotypes/chr*.hg19.impute2.gz \ - example_data/genotypes/chr*.hg19.impute2_haps.gz + --geno_prob $DATA_DIR/geno_probs.h5 \ + --snp_index $DATA_DIR/snp_index.h5 \ + --snp_tab $DATA_DIR/snp_tab.h5 \ + --haplotype $DATA_DIR/haps.h5 \ + $DATA_DIR/genotypes/chr*.hg19.impute2.gz \ + $DATA_DIR/genotypes/chr*.hg19.impute2_haps.gz # # Convert FASTA files to HDF5 format. # Note the HDF5 sequence files are only used for GC content -# correction part of CHT. This step can be ommitted if +# correction part of CHT. This step can be omitted if # GC-content correction is not used. # -./snp2h5/fasta2h5 --chrom example_data/chromInfo.hg19.txt \ - --seq example_data/seq.h5 \ +# The chr*.fa.gz files are the fasta files for the reference genome +# and can be downloaded from a genome browser (e.g.: +# http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/) +# +$WASP/snp2h5/fasta2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \ + --seq $DATA_DIR/seq.h5 \ /data/external_public/reference_genomes/hg19/chr*.fa.gz # loop over all individuals in samples file -H3K27AC_SAMPLES_FILE=example_data/H3K27ac/samples.txt -ALL_SAMPLES_FILE=example_data/genotypes/YRI_samples.txt +H3K27AC_SAMPLES_FILE=$DATA_DIR/H3K27ac/samples.txt +ALL_SAMPLES_FILE=$DATA_DIR/genotypes/YRI_samples.txt for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE) do @@ -35,17 +47,17 @@ do # read BAM files for this individual and write read counts to # HDF5 files # - python CHT/bam2h5.py --chrom example_data/chromInfo.hg19.txt \ - --snp_index example_data/snp_index.h5 \ - --snp_tab example_data/snp_tab.h5 \ - --haplotype example_data/haps.h5 \ + python $WASP/CHT/bam2h5.py --chrom $DATA_DIR/chromInfo.hg19.txt \ + --snp_index $DATA_DIR/snp_index.h5 \ + --snp_tab $DATA_DIR/snp_tab.h5 \ + --haplotype $DATA_DIR/haps.h5 \ --samples $ALL_SAMPLES_FILE \ --individual $INDIVIDUAL \ - --ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \ - --alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \ - --other_as_counts example_data/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \ - --read_counts example_data/H3K27ac/read_counts.$INDIVIDUAL.h5 \ - example_data/H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam + --ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \ + --alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \ + --other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \ + --read_counts $DATA_DIR/H3K27ac/read_counts.$INDIVIDUAL.h5 \ + $DATA_DIR/H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam done @@ -59,20 +71,20 @@ done # also be generated by the user (for example if a specific set of # target regions and test SNPs are to be tested). # -python CHT/get_target_regions.py \ +python $WASP/CHT/get_target_regions.py \ --target_region_size 2000 \ --min_as_count 10 \ --min_read_count 100 \ --min_het_count 1 \ --min_minor_allele_count 1\ - --chrom example_data/chromInfo.hg19.txt \ - --read_count_dir example_data/H3K27ac \ + --chrom $DATA_DIR/chromInfo.hg19.txt \ + --read_count_dir $DATA_DIR/H3K27ac \ --individuals $H3K27AC_SAMPLES_FILE \ --samples $ALL_SAMPLES_FILE \ - --snp_tab example_data/snp_tab.h5 \ - --snp_index example_data/snp_index.h5 \ - --haplotype example_data/haps.h5 \ - --output_file example_data/H3K27ac/chr22.peaks.txt.gz + --snp_tab $DATA_DIR/snp_tab.h5 \ + --snp_index $DATA_DIR/snp_index.h5 \ + --haplotype $DATA_DIR/haps.h5 \ + --output_file $DATA_DIR/H3K27ac/chr22.peaks.txt.gz @@ -81,20 +93,20 @@ for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE) # # create CHT input file for this individual # - python CHT/extract_haplotype_read_counts.py \ - --chrom example_data/chromInfo.hg19.txt \ - --snp_index example_data/snp_index.h5 \ - --snp_tab example_data/snp_tab.h5 \ - --geno_prob example_data/geno_probs.h5 \ - --haplotype example_data/haps.h5 \ + python $WASP/CHT/extract_haplotype_read_counts.py \ + --chrom $DATA_DIR/chromInfo.hg19.txt \ + --snp_index $DATA_DIR/snp_index.h5 \ + --snp_tab $DATA_DIR/snp_tab.h5 \ + --geno_prob $DATA_DIR/geno_probs.h5 \ + --haplotype $DATA_DIR/haps.h5 \ --samples $ALL_SAMPLES_FILE \ --individual $INDIVIDUAL \ - --ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \ - --alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \ - --other_as_counts example_data/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \ - --read_counts example_data/H3K27ac/read_counts.$INDIVIDUAL.h5 \ - example_data/H3K27ac/chr22.peaks.txt.gz \ - | gzip > example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.txt.gz + --ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \ + --alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \ + --other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \ + --read_counts $DATA_DIT/H3K27ac/read_counts.$INDIVIDUAL.h5 \ + $DATA_DIR/H3K27ac/chr22.peaks.txt.gz \ + | gzip > $DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.txt.gz done @@ -105,12 +117,12 @@ done # in each sample. # (first make files containing lists of input and output files) # -IN_FILE=example_data/H3K27ac/input_files.txt -OUT_FILE=example_data/H3K27ac/output_files.txt -ls example_data/H3K27ac/haplotype_read_counts* | grep -v adjusted > $IN_FILE +IN_FILE=$DATA_DIR/H3K27ac/input_files.txt +OUT_FILE=$DATA_DIR/H3K27ac/output_files.txt +ls $DATA_DIR/H3K27ac/haplotype_read_counts* | grep -v adjusted > $IN_FILE cat $IN_FILE | sed 's/.txt/.adjusted.txt/' > $OUT_FILE -python CHT/update_total_depth.py --seq example_data/seq.h5 $IN_FILE $OUT_FILE +python $WASP/CHT/update_total_depth.py --seq $DATA_DIR/seq.h5 $IN_FILE $OUT_FILE # @@ -123,38 +135,38 @@ python CHT/update_total_depth.py --seq example_data/seq.h5 $IN_FILE $OUT_FILE # for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE) do - IN_FILE=example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.txt.gz - OUT_FILE=example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.hetp.txt.gz + IN_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.txt.gz + OUT_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.hetp.txt.gz - python CHT/update_het_probs.py \ - --ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \ - --alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \ + python $WASP/CHT/update_het_probs.py \ + --ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \ + --alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \ $IN_FILE $OUT_FILE done -CHT_IN_FILE=example_data/H3K27ac/cht_input_file.txt -ls example_data/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE +CHT_IN_FILE=$DATA_DIR/H3K27ac/cht_input_file.txt +ls $DATA_DIR/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE # # Estimate overdispersion parameters for allele-specific test (beta binomial) # -OUT_FILE=example_data/H3K27ac/cht_as_coef.txt -python CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE +OUT_FILE=$DATA_DIR/H3K27ac/cht_as_coef.txt +python $WASP/CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE # # Estimate overdispersion parameters for association test (beta-negative binomial) # -OUT_FILE=example_data/H3K27ac/cht_bnb_coef.txt -python CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE +OUT_FILE=$DATA_DIR/H3K27ac/cht_bnb_coef.txt +python $WASP/CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE # # run combined haplotype test # -OUT_FILE=example_data/H3K27ac/cht_results.txt -python CHT/combined_test.py --min_as_counts 10 \ +OUT_FILE=$DATA_DIR/H3K27ac/cht_results.txt +python $WASP/CHT/combined_test.py --min_as_counts 10 \ --bnb_disp example_data/H3K27ac/cht_bnb_coef.txt \ --as_disp example_data/H3K27ac/cht_as_coef.txt \ $CHT_IN_FILE $OUT_FILE