update example_cht_workflow paths and descriptions

bmvdgeijn · Jul 3, 2018 · ca6052f · ca6052f
1 parent d72bcd4
commit ca6052f
Showing 1 changed file with 70 additions and 58 deletions.
diff --git a/examples/example_cht_workflow.sh b/examples/example_cht_workflow.sh
@@ -1,31 +1,43 @@
+
+# Set these environment vars to point to
+# your local installation of WASP
+WASP=$HOME/proj/WASP
+DATA_DIR=$WASP/examples/example_data
+
 #
 # Convert SNP files to HDF5 format. This program can be used
-# on output from impute2 or on VCF files
+# on output from impute2 or on VCF files. Note that
+# you do not need to repeat this step here if it was already
+# done in the mapping pipeline.
 #
-./snp2h5/snp2h5 --chrom example_data/chromInfo.hg19.txt \
+$WASP/snp2h5/snp2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \
 	      --format impute \
-	      --geno_prob example_data/geno_probs.h5 \
-	      --snp_index example_data/snp_index.h5 \
-	      --snp_tab example_data/snp_tab.h5 \
-	      --haplotype example_data/haps.h5 \
-	      example_data/genotypes/chr*.hg19.impute2.gz \
-	      example_data/genotypes/chr*.hg19.impute2_haps.gz
+	      --geno_prob $DATA_DIR/geno_probs.h5 \
+	      --snp_index $DATA_DIR/snp_index.h5 \
+	      --snp_tab $DATA_DIR/snp_tab.h5 \
+	      --haplotype $DATA_DIR/haps.h5 \
+	      $DATA_DIR/genotypes/chr*.hg19.impute2.gz \
+	      $DATA_DIR/genotypes/chr*.hg19.impute2_haps.gz
 
 #
 # Convert FASTA files to HDF5 format.
 # Note the HDF5 sequence files are only used for GC content
-# correction part of CHT. This step can be ommitted if
+# correction part of CHT. This step can be omitted if
 # GC-content correction is not used.
 #
-./snp2h5/fasta2h5 --chrom example_data/chromInfo.hg19.txt \
-	--seq example_data/seq.h5 \
+# The chr*.fa.gz files are the fasta files for the reference genome
+# and can be downloaded from a genome browser (e.g.:
+# http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/)
+#
+$WASP/snp2h5/fasta2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \
+	--seq $DATA_DIR/seq.h5 \
 	/data/external_public/reference_genomes/hg19/chr*.fa.gz
 
 
 
 # loop over all individuals in samples file
-H3K27AC_SAMPLES_FILE=example_data/H3K27ac/samples.txt
-ALL_SAMPLES_FILE=example_data/genotypes/YRI_samples.txt
+H3K27AC_SAMPLES_FILE=$DATA_DIR/H3K27ac/samples.txt
+ALL_SAMPLES_FILE=$DATA_DIR/genotypes/YRI_samples.txt
 
 for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
 do
@@ -35,17 +47,17 @@ do
     # read BAM files for this individual and write read counts to
     # HDF5 files
     #
-    python CHT/bam2h5.py --chrom example_data/chromInfo.hg19.txt \
-	      --snp_index example_data/snp_index.h5 \
-	      --snp_tab example_data/snp_tab.h5 \
-	      --haplotype example_data/haps.h5 \
+    python $WASP/CHT/bam2h5.py --chrom $DATA_DIR/chromInfo.hg19.txt \
+	      --snp_index $DATA_DIR/snp_index.h5 \
+	      --snp_tab $DATA_DIR/snp_tab.h5 \
+	      --haplotype $DATA_DIR/haps.h5 \
 	      --samples $ALL_SAMPLES_FILE \
 	      --individual $INDIVIDUAL \
-	      --ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
-	      --alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
-	      --other_as_counts example_data/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
-	      --read_counts example_data/H3K27ac/read_counts.$INDIVIDUAL.h5 \
-	      example_data/H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam
+	      --ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
+	      --alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
+	      --other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
+	      --read_counts $DATA_DIR/H3K27ac/read_counts.$INDIVIDUAL.h5 \
+	      $DATA_DIR/H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam
 done
 
 
@@ -59,20 +71,20 @@ done
 # also be generated by the user (for example if a specific set of
 # target regions and test SNPs are to be tested).
 #
-python CHT/get_target_regions.py \
+python $WASP/CHT/get_target_regions.py \
        --target_region_size 2000 \
        --min_as_count 10 \
        --min_read_count 100 \
        --min_het_count 1 \
        --min_minor_allele_count 1\
-       --chrom example_data/chromInfo.hg19.txt \
-       --read_count_dir example_data/H3K27ac \
+       --chrom $DATA_DIR/chromInfo.hg19.txt \
+       --read_count_dir $DATA_DIR/H3K27ac \
        --individuals $H3K27AC_SAMPLES_FILE \
        --samples $ALL_SAMPLES_FILE \
-       --snp_tab example_data/snp_tab.h5 \
-       --snp_index example_data/snp_index.h5 \
-       --haplotype example_data/haps.h5 \
-       --output_file example_data/H3K27ac/chr22.peaks.txt.gz
+       --snp_tab $DATA_DIR/snp_tab.h5 \
+       --snp_index $DATA_DIR/snp_index.h5 \
+       --haplotype $DATA_DIR/haps.h5 \
+       --output_file $DATA_DIR/H3K27ac/chr22.peaks.txt.gz
 
 
 
@@ -81,20 +93,20 @@ for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
     #
     # create CHT input file for this individual
     #
-    python CHT/extract_haplotype_read_counts.py \
-       --chrom example_data/chromInfo.hg19.txt \
-       --snp_index example_data/snp_index.h5 \
-       --snp_tab example_data/snp_tab.h5 \
-       --geno_prob example_data/geno_probs.h5 \
-       --haplotype example_data/haps.h5 \
+    python $WASP/CHT/extract_haplotype_read_counts.py \
+       --chrom $DATA_DIR/chromInfo.hg19.txt \
+       --snp_index $DATA_DIR/snp_index.h5 \
+       --snp_tab $DATA_DIR/snp_tab.h5 \
+       --geno_prob $DATA_DIR/geno_probs.h5 \
+       --haplotype $DATA_DIR/haps.h5 \
        --samples $ALL_SAMPLES_FILE \
        --individual $INDIVIDUAL \
-       --ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
-       --alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
-       --other_as_counts example_data/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
-       --read_counts example_data/H3K27ac/read_counts.$INDIVIDUAL.h5 \
-       example_data/H3K27ac/chr22.peaks.txt.gz \
-       | gzip > example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.txt.gz
+       --ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
+       --alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
+       --other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
+       --read_counts $DATA_DIT/H3K27ac/read_counts.$INDIVIDUAL.h5 \
+       $DATA_DIR/H3K27ac/chr22.peaks.txt.gz \
+       | gzip > $DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.txt.gz
 
 done
 
@@ -105,12 +117,12 @@ done
 # in each sample.
 # (first make files containing lists of input and output files)
 #
-IN_FILE=example_data/H3K27ac/input_files.txt
-OUT_FILE=example_data/H3K27ac/output_files.txt
-ls example_data/H3K27ac/haplotype_read_counts* | grep -v adjusted > $IN_FILE
+IN_FILE=$DATA_DIR/H3K27ac/input_files.txt
+OUT_FILE=$DATA_DIR/H3K27ac/output_files.txt
+ls $DATA_DIR/H3K27ac/haplotype_read_counts* | grep -v adjusted > $IN_FILE
 cat $IN_FILE | sed 's/.txt/.adjusted.txt/' >  $OUT_FILE
 
-python CHT/update_total_depth.py --seq example_data/seq.h5 $IN_FILE $OUT_FILE
+python $WASP/CHT/update_total_depth.py --seq $DATA_DIR/seq.h5 $IN_FILE $OUT_FILE
 
 
 #
@@ -123,38 +135,38 @@ python CHT/update_total_depth.py --seq example_data/seq.h5 $IN_FILE $OUT_FILE
 #
 for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
 do
-    IN_FILE=example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.txt.gz
-    OUT_FILE=example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.hetp.txt.gz
+    IN_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.txt.gz
+    OUT_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.hetp.txt.gz
 
-    python CHT/update_het_probs.py \
-	   --ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5  \
-	   --alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
+    python $WASP/CHT/update_het_probs.py \
+	   --ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5  \
+	   --alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
 	   $IN_FILE $OUT_FILE    
 done
 
 
-CHT_IN_FILE=example_data/H3K27ac/cht_input_file.txt
-ls example_data/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE
+CHT_IN_FILE=$DATA_DIR/H3K27ac/cht_input_file.txt
+ls $DATA_DIR/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE
 
 #
 # Estimate overdispersion parameters for allele-specific test (beta binomial)
 #
-OUT_FILE=example_data/H3K27ac/cht_as_coef.txt
-python CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE
+OUT_FILE=$DATA_DIR/H3K27ac/cht_as_coef.txt
+python $WASP/CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE
 
 
 #
 # Estimate overdispersion parameters for association test (beta-negative binomial)
 #
-OUT_FILE=example_data/H3K27ac/cht_bnb_coef.txt
-python CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE
+OUT_FILE=$DATA_DIR/H3K27ac/cht_bnb_coef.txt
+python $WASP/CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE
 
 
 #
 # run combined haplotype test
 #
-OUT_FILE=example_data/H3K27ac/cht_results.txt
-python CHT/combined_test.py --min_as_counts 10 \
+OUT_FILE=$DATA_DIR/H3K27ac/cht_results.txt
+python $WASP/CHT/combined_test.py --min_as_counts 10 \
        --bnb_disp example_data/H3K27ac/cht_bnb_coef.txt \
        --as_disp example_data/H3K27ac/cht_as_coef.txt \
        $CHT_IN_FILE $OUT_FILE