Skip to content

Commit

Permalink
update example_cht_workflow paths and descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
gmcvicker committed Jul 3, 2018
1 parent d72bcd4 commit ca6052f
Showing 1 changed file with 70 additions and 58 deletions.
128 changes: 70 additions & 58 deletions examples/example_cht_workflow.sh
@@ -1,31 +1,43 @@

# Set these environment vars to point to
# your local installation of WASP
WASP=$HOME/proj/WASP
DATA_DIR=$WASP/examples/example_data

#
# Convert SNP files to HDF5 format. This program can be used
# on output from impute2 or on VCF files
# on output from impute2 or on VCF files. Note that
# you do not need to repeat this step here if it was already
# done in the mapping pipeline.
#
./snp2h5/snp2h5 --chrom example_data/chromInfo.hg19.txt \
$WASP/snp2h5/snp2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \
--format impute \
--geno_prob example_data/geno_probs.h5 \
--snp_index example_data/snp_index.h5 \
--snp_tab example_data/snp_tab.h5 \
--haplotype example_data/haps.h5 \
example_data/genotypes/chr*.hg19.impute2.gz \
example_data/genotypes/chr*.hg19.impute2_haps.gz
--geno_prob $DATA_DIR/geno_probs.h5 \
--snp_index $DATA_DIR/snp_index.h5 \
--snp_tab $DATA_DIR/snp_tab.h5 \
--haplotype $DATA_DIR/haps.h5 \
$DATA_DIR/genotypes/chr*.hg19.impute2.gz \
$DATA_DIR/genotypes/chr*.hg19.impute2_haps.gz

#
# Convert FASTA files to HDF5 format.
# Note the HDF5 sequence files are only used for GC content
# correction part of CHT. This step can be ommitted if
# correction part of CHT. This step can be omitted if
# GC-content correction is not used.
#
./snp2h5/fasta2h5 --chrom example_data/chromInfo.hg19.txt \
--seq example_data/seq.h5 \
# The chr*.fa.gz files are the fasta files for the reference genome
# and can be downloaded from a genome browser (e.g.:
# http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/)
#
$WASP/snp2h5/fasta2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \
--seq $DATA_DIR/seq.h5 \
/data/external_public/reference_genomes/hg19/chr*.fa.gz



# loop over all individuals in samples file
H3K27AC_SAMPLES_FILE=example_data/H3K27ac/samples.txt
ALL_SAMPLES_FILE=example_data/genotypes/YRI_samples.txt
H3K27AC_SAMPLES_FILE=$DATA_DIR/H3K27ac/samples.txt
ALL_SAMPLES_FILE=$DATA_DIR/genotypes/YRI_samples.txt

for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
do
Expand All @@ -35,17 +47,17 @@ do
# read BAM files for this individual and write read counts to
# HDF5 files
#
python CHT/bam2h5.py --chrom example_data/chromInfo.hg19.txt \
--snp_index example_data/snp_index.h5 \
--snp_tab example_data/snp_tab.h5 \
--haplotype example_data/haps.h5 \
python $WASP/CHT/bam2h5.py --chrom $DATA_DIR/chromInfo.hg19.txt \
--snp_index $DATA_DIR/snp_index.h5 \
--snp_tab $DATA_DIR/snp_tab.h5 \
--haplotype $DATA_DIR/haps.h5 \
--samples $ALL_SAMPLES_FILE \
--individual $INDIVIDUAL \
--ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
--other_as_counts example_data/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
--read_counts example_data/H3K27ac/read_counts.$INDIVIDUAL.h5 \
example_data/H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam
--ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
--other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
--read_counts $DATA_DIR/H3K27ac/read_counts.$INDIVIDUAL.h5 \
$DATA_DIR/H3K27ac/$INDIVIDUAL.chr*.keep.rmdup.bam
done


Expand All @@ -59,20 +71,20 @@ done
# also be generated by the user (for example if a specific set of
# target regions and test SNPs are to be tested).
#
python CHT/get_target_regions.py \
python $WASP/CHT/get_target_regions.py \
--target_region_size 2000 \
--min_as_count 10 \
--min_read_count 100 \
--min_het_count 1 \
--min_minor_allele_count 1\
--chrom example_data/chromInfo.hg19.txt \
--read_count_dir example_data/H3K27ac \
--chrom $DATA_DIR/chromInfo.hg19.txt \
--read_count_dir $DATA_DIR/H3K27ac \
--individuals $H3K27AC_SAMPLES_FILE \
--samples $ALL_SAMPLES_FILE \
--snp_tab example_data/snp_tab.h5 \
--snp_index example_data/snp_index.h5 \
--haplotype example_data/haps.h5 \
--output_file example_data/H3K27ac/chr22.peaks.txt.gz
--snp_tab $DATA_DIR/snp_tab.h5 \
--snp_index $DATA_DIR/snp_index.h5 \
--haplotype $DATA_DIR/haps.h5 \
--output_file $DATA_DIR/H3K27ac/chr22.peaks.txt.gz



Expand All @@ -81,20 +93,20 @@ for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
#
# create CHT input file for this individual
#
python CHT/extract_haplotype_read_counts.py \
--chrom example_data/chromInfo.hg19.txt \
--snp_index example_data/snp_index.h5 \
--snp_tab example_data/snp_tab.h5 \
--geno_prob example_data/geno_probs.h5 \
--haplotype example_data/haps.h5 \
python $WASP/CHT/extract_haplotype_read_counts.py \
--chrom $DATA_DIR/chromInfo.hg19.txt \
--snp_index $DATA_DIR/snp_index.h5 \
--snp_tab $DATA_DIR/snp_tab.h5 \
--geno_prob $DATA_DIR/geno_probs.h5 \
--haplotype $DATA_DIR/haps.h5 \
--samples $ALL_SAMPLES_FILE \
--individual $INDIVIDUAL \
--ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
--other_as_counts example_data/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
--read_counts example_data/H3K27ac/read_counts.$INDIVIDUAL.h5 \
example_data/H3K27ac/chr22.peaks.txt.gz \
| gzip > example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.txt.gz
--ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
--other_as_counts $DATA_DIR/H3K27ac/other_as_counts.$INDIVIDUAL.h5 \
--read_counts $DATA_DIT/H3K27ac/read_counts.$INDIVIDUAL.h5 \
$DATA_DIR/H3K27ac/chr22.peaks.txt.gz \
| gzip > $DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.txt.gz

done

Expand All @@ -105,12 +117,12 @@ done
# in each sample.
# (first make files containing lists of input and output files)
#
IN_FILE=example_data/H3K27ac/input_files.txt
OUT_FILE=example_data/H3K27ac/output_files.txt
ls example_data/H3K27ac/haplotype_read_counts* | grep -v adjusted > $IN_FILE
IN_FILE=$DATA_DIR/H3K27ac/input_files.txt
OUT_FILE=$DATA_DIR/H3K27ac/output_files.txt
ls $DATA_DIR/H3K27ac/haplotype_read_counts* | grep -v adjusted > $IN_FILE
cat $IN_FILE | sed 's/.txt/.adjusted.txt/' > $OUT_FILE

python CHT/update_total_depth.py --seq example_data/seq.h5 $IN_FILE $OUT_FILE
python $WASP/CHT/update_total_depth.py --seq $DATA_DIR/seq.h5 $IN_FILE $OUT_FILE


#
Expand All @@ -123,38 +135,38 @@ python CHT/update_total_depth.py --seq example_data/seq.h5 $IN_FILE $OUT_FILE
#
for INDIVIDUAL in $(cat $H3K27AC_SAMPLES_FILE)
do
IN_FILE=example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.txt.gz
OUT_FILE=example_data/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.hetp.txt.gz
IN_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.txt.gz
OUT_FILE=$DATA_DIR/H3K27ac/haplotype_read_counts.$INDIVIDUAL.adjusted.hetp.txt.gz

python CHT/update_het_probs.py \
--ref_as_counts example_data/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts example_data/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
python $WASP/CHT/update_het_probs.py \
--ref_as_counts $DATA_DIR/H3K27ac/ref_as_counts.$INDIVIDUAL.h5 \
--alt_as_counts $DATA_DIR/H3K27ac/alt_as_counts.$INDIVIDUAL.h5 \
$IN_FILE $OUT_FILE
done


CHT_IN_FILE=example_data/H3K27ac/cht_input_file.txt
ls example_data/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE
CHT_IN_FILE=$DATA_DIR/H3K27ac/cht_input_file.txt
ls $DATA_DIR/H3K27ac/haplotype_read_counts*.adjusted.hetp.txt.gz > $CHT_IN_FILE

#
# Estimate overdispersion parameters for allele-specific test (beta binomial)
#
OUT_FILE=example_data/H3K27ac/cht_as_coef.txt
python CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE
OUT_FILE=$DATA_DIR/H3K27ac/cht_as_coef.txt
python $WASP/CHT/fit_as_coefficients.py $CHT_IN_FILE $OUT_FILE


#
# Estimate overdispersion parameters for association test (beta-negative binomial)
#
OUT_FILE=example_data/H3K27ac/cht_bnb_coef.txt
python CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE
OUT_FILE=$DATA_DIR/H3K27ac/cht_bnb_coef.txt
python $WASP/CHT/fit_bnb_coefficients.py --min_counts 50 --min_as_counts 10 $CHT_IN_FILE $OUT_FILE


#
# run combined haplotype test
#
OUT_FILE=example_data/H3K27ac/cht_results.txt
python CHT/combined_test.py --min_as_counts 10 \
OUT_FILE=$DATA_DIR/H3K27ac/cht_results.txt
python $WASP/CHT/combined_test.py --min_as_counts 10 \
--bnb_disp example_data/H3K27ac/cht_bnb_coef.txt \
--as_disp example_data/H3K27ac/cht_as_coef.txt \
$CHT_IN_FILE $OUT_FILE
Expand Down

0 comments on commit ca6052f

Please sign in to comment.