In [None]:
Step 4. Scoring ASB and control events with cluster-buster

In this step, we evaluate the impact of variants on transcription factor motif binding by comparing the cluster-buster scores of ASCAVs with those of non-ASCAVs. We perform this analysis peak-wise rather than variant-wise. Thus, we start by filtering out peaks that contain discordant ASCAVs (i.e. the peak has multiple ASCAVs with different winning haplotypes). The remaining peaks are then lifted over to HAP1 or HAP2 (depending on the winning haplotype), and their sequences are extracted from the corresponding haplotype FASTA files. Next, we select control peaks whose variants are not predicted as ASCAVs in the BaalChIP output, again filtering out peaks with discordant variants. Their positions are lifted over to HAP1 or HAP2, and their sequences are extracted as in the previous step. For each peak, the sequence from the reference genome is extracted as well. Finally, all sequences are scored with cluster-buster using a collection of 22k motifs.

In [None]:
# load necessary modules
# The modules are loaded one at a time, in this order, exactly as listed.
for required_module in \
    Kent_tools/20181218-linux.x86_64 \
    BEDTools/2.27.1-GCCcore-6.4.0 \
    Cluster-Buster/20180705-foss-2018a \
    Python/3.7.0-foss-2018a
do
    module load "${required_module}"
done


In [None]:
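# sample and directories
# NOTE(review): nothing is defined in this cell, but the cells below expand
# ${sample} (chain files and haplotype fastas under ../) and ${wgs_folder}
# (motif name list for the scoring step). Both must be set here before the
# remaining cells are run.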


In [None]:
###### extract test SNVs
# Builds peak-level FASTA sets for the variants flagged TRUE (column 7) in
# Bayes_report.txt:
#   motif_analysis/ASB_MUT_sorted.fa  - peak sequences from the haplotype fastas
#   motif_analysis/ASB_REF_sorted.fa  - the same peaks from the hg38 reference
# and motif_analysis/ASB_peaks_and_muts_ALL_peaks.filt.bed (the peaks that made
# it into the FASTA sets).
#
# Requires ${sample} to be set: chain files and haplotype fastas are read from
# ../${sample}.*
# Columns of ../m_min6.txt relied on below: 1 = variant id, 12-14 = peak
# chrom/start/end, 15 = peak name, 24 = outcome label (HAPx_WINS / HAPx_LOSES),
# 10 = "1|0" or "0|1" (presumably the phased genotype -- TODO confirm).

# Lift a peak BED file from hg38 to one haplotype. When liftOver reports
# unmapped records, the overlapping peaks are removed from the input and
# liftOver is run once more.
#   $1  haplotype label (HAP1 / HAP2); used in messages and the scratch file name
#   $2  chain file
#   $3  input BED (hg38 coordinates)
#   $4  output BED; unmapped records are written to "$4_unmapped"
lift_peaks_to_hap () {
    local hap="$1" chain="$2" in_bed="$3" out_bed="$4"
    local unmapped="${out_bed}_unmapped"
    local skip_bed=motif_analysis/to_skip
    local retry_bed="motif_analysis/tmp.${hap}.bed"

    liftOver "${in_bed}" "${chain}" "${out_bed}" "${unmapped}"

    # -gt compares numerically ('>' inside [[ ]] is a string comparison)
    if [[ $(wc -l < "${unmapped}") -gt 0 ]] ; then
        echo $(grep -v '^#' "${unmapped}" | wc -l) peak unmapped to ${hap}, exclude and redo liftover
        grep -v '^#' "${unmapped}" | cut -f 1-3 | sort -u > "${skip_bed}"
        subtractBed -a "${in_bed}" -b "${skip_bed}" -A -wa > "${retry_bed}"
        liftOver "${retry_bed}" "${chain}" "${out_bed}" "${unmapped}"
        echo $(grep -v '^#' "${unmapped}" | wc -l) peak unmapped to ${hap}
        rm "${skip_bed}" "${retry_bed}"
    fi
}

mkdir -p motif_analysis

grep -w -f <(awk '{ if ($7=="TRUE") { print $1 } }' Bayes_report.txt) ../m_min6.txt > motif_analysis/ASB_peaks_and_muts

echo $(cut -f 1 motif_analysis/ASB_peaks_and_muts | sort -u | wc -l) test variants in $(cut -f 15 motif_analysis/ASB_peaks_and_muts | sort -u | wc -l) peaks

### Extract fasta for peaks with ASB associated SNVs
# First look for peaks that have multiple hits
# For the first edition, exclude peaks that have significant alleles from both haplotypes
# in the second edition, check if they are all "winning" genotypes - if both are winning exclude the peak, if one is winning and the other is losing
# then exclude the peak twice - once with each haplotype

# Start every run from empty result files. The .bed file is read by subtractBed
# below even when no multi-hit peak exists, so it must always be present.
# (Assumes bedtools accepts an empty -b file -- TODO confirm for 2.27.1.)
: > motif_analysis/double_hit_peaks
: > motif_analysis/inconsistent_double_hit_peaks
: > motif_analysis/inconsistent_double_hit_peaks.bed

# Candidate peaks: (a) peaks carrying more than one distinct column-10 value,
# (b) peaks that occur on more than one row.
# NOTE(review): (b) has no `sort -u` before `uniq -d`, so it lists every peak
# with several rows; a peak whose rows all share one label is then classified
# as inconsistent below. Behaviour kept as found -- confirm it is intended.
# The combined list is de-duplicated so the counts below are not inflated.
{
    cut -f 10,15 motif_analysis/ASB_peaks_and_muts | sort -u | cut -f 2 | sort | uniq -d
    awk '{ print $24"\t"$15 }' motif_analysis/ASB_peaks_and_muts | cut -f 2 | sort | uniq -d
} | sort -u > motif_analysis/tmp_peaks_with_multiple_hits

if [[ -s motif_analysis/tmp_peaks_with_multiple_hits ]] ; then
    echo finding double hit peaks
    echo there are $(wc -l < motif_analysis/tmp_peaks_with_multiple_hits) peaks with multiple hits

    # One line per candidate peak: "<peak> <sorted unique labels joined by |>"
    while read -r peak ; do
        labels=$(awk -v p="${peak}" '$15 == (p "")' motif_analysis/ASB_peaks_and_muts | cut -f 24 | sort -u | tr '\n' '|')
        printf '%s %s\n' "${peak}" "${labels}"
    done < motif_analysis/tmp_peaks_with_multiple_hits > motif_analysis/tmp_peak_labels

    # consistent: one haplotype wins and the other loses
    awk '$2=="HAP1_LOSES|HAP2_WINS|" || $2=="HAP1_WINS|HAP2_LOSES|" { print $1 }' \
        motif_analysis/tmp_peak_labels > motif_analysis/double_hit_peaks

    # everything else is inconsistent
    awk '$2!="HAP1_LOSES|HAP2_WINS|" && $2!="HAP1_WINS|HAP2_LOSES|" { print $1 }' \
        motif_analysis/tmp_peak_labels > motif_analysis/inconsistent_double_hit_peaks

    # BED records of the inconsistent peaks; the peak name is matched exactly on
    # column 15, the same way the classification above selects rows
    while read -r peak ; do
        awk -v p="${peak}" '$15 == (p "")' motif_analysis/ASB_peaks_and_muts | cut -f 12-15 | sort -u
    done < motif_analysis/inconsistent_double_hit_peaks > motif_analysis/inconsistent_double_hit_peaks.bed

    echo $(wc -l < motif_analysis/double_hit_peaks) peaks have consistent hits
else
    echo there are no peaks with inconsistent hits
fi

# extract peak positions for HAP1 & HAP2 specific peaks (excluding inconsistent peaks)
# HAP1 takes the "1|0" rows, HAP2 the "0|1" rows.
awk -v 'OFS=\t' '{ if ($10=="1|0") { print $12, $13, $14, $15"|"$24 } }' motif_analysis/ASB_peaks_and_muts \
  | sort -u \
  | subtractBed -a stdin -b motif_analysis/inconsistent_double_hit_peaks.bed -A -wa \
  > motif_analysis/ASB_peaks_and_muts_HAP1_peaks.bed

awk -v 'OFS=\t' '{ if ($10=="0|1") { print $12, $13, $14, $15"|"$24 } }' motif_analysis/ASB_peaks_and_muts \
  | sort -u \
  | subtractBed -a stdin -b motif_analysis/inconsistent_double_hit_peaks.bed -A -wa \
  > motif_analysis/ASB_peaks_and_muts_HAP2_peaks.bed
#  cat motif_analysis/ASB_peaks_and_muts_HAP1_peaks.bed motif_analysis/ASB_peaks_and_muts_HAP2_peaks.bed | cut -f 4 | tr '|' '\t' | cut -f 1 | sort -u | wc -l
# this number + number of inconsistent peaks give the total number of initial peaks (as expected)

# hg38 peaks to hap1
lift_peaks_to_hap HAP1 "../${sample}.refTOhap1.chain" \
    motif_analysis/ASB_peaks_and_muts_HAP1_peaks.bed \
    motif_analysis/ASB_peaks_and_muts_HAP1_peaks.HAP1.bed

# hg38 peaks to hap2
lift_peaks_to_hap HAP2 "../${sample}.refTOhap2.chain" \
    motif_analysis/ASB_peaks_and_muts_HAP2_peaks.bed \
    motif_analysis/ASB_peaks_and_muts_HAP2_peaks.HAP2.bed

# extract fasta files from hap1 and hap2
mkdir -p motif_analysis/ASB_mut_fasta

# hap1
fastaFromBed \
    -fi "../${sample}.hap1.fa" \
    -bed motif_analysis/ASB_peaks_and_muts_HAP1_peaks.HAP1.bed \
    -fo motif_analysis/ASB_mut_fasta/ASB_peaks_and_muts_HAP1_peaks.HAP1.fa \
    -name

# hap2
fastaFromBed \
    -fi "../${sample}.hap2.fa" \
    -bed motif_analysis/ASB_peaks_and_muts_HAP2_peaks.HAP2.bed \
    -fo motif_analysis/ASB_mut_fasta/ASB_peaks_and_muts_HAP2_peaks.HAP2.fa \
    -name

# Join header+sequence pairs on one line, sort by header, split again.
# The two fasta files are named explicitly so stray files in the directory
# cannot end up in the merged set.
cat motif_analysis/ASB_mut_fasta/ASB_peaks_and_muts_HAP1_peaks.HAP1.fa \
    motif_analysis/ASB_mut_fasta/ASB_peaks_and_muts_HAP2_peaks.HAP2.fa \
  | paste - - | sort -k 1,1 -t " " | tr '\t' '\n' \
  > motif_analysis/ASB_MUT_sorted.fa

# extract fasta files from REF

# filter out any unmapped peak from previous set: keep only the hg38 peaks whose
# name (BED column 4) appears as a header in the haplotype fastas
awk 'FNR==NR{seen[$1]=1;next} ($4) in seen' \
    <(cat motif_analysis/ASB_mut_fasta/ASB_peaks_and_muts_HAP1_peaks.HAP1.fa \
          motif_analysis/ASB_mut_fasta/ASB_peaks_and_muts_HAP2_peaks.HAP2.fa \
        | grep '>' | tr -d '>') \
    <(cat motif_analysis/ASB_peaks_and_muts_HAP1_peaks.bed motif_analysis/ASB_peaks_and_muts_HAP2_peaks.bed) \
    > motif_analysis/ASB_peaks_and_muts_ALL_peaks.filt.bed

mkdir -p motif_analysis/ASB_ref_fasta

fastaFromBed \
    -fi /staging/leuven/stg_00002/lcb/resources/human/hg38/hg38.fa \
    -bed motif_analysis/ASB_peaks_and_muts_ALL_peaks.filt.bed \
    -fo motif_analysis/ASB_ref_fasta/ASB_peaks_and_muts_ALL_peaks.fa \
    -name

cat motif_analysis/ASB_ref_fasta/ASB_peaks_and_muts_ALL_peaks.fa | paste - - | sort -k 1,1 -t " " | tr '\t' '\n' > motif_analysis/ASB_REF_sorted.fa

echo Generated test fasta for $(grep '^>' motif_analysis/ASB_MUT_sorted.fa | wc -l) peaks with mutation and $(grep '^>' motif_analysis/ASB_REF_sorted.fa | wc -l) w/o mutation

# NOTE(review): the original listed inconsistent_double_hit_peaks.bed twice here;
# possibly motif_analysis/inconsistent_double_hit_peaks was meant as well. The
# set of removed result files is left unchanged; only the scratch label table
# created above is added.
rm -f motif_analysis/double_hit_peaks motif_analysis/inconsistent_double_hit_peaks.bed motif_analysis/tmp_peak_labels



In [None]:
# control variants
# Builds the control FASTA sets from the variants flagged FALSE (column 7) in
# Bayes_report.txt:
#   motif_analysis/control_MUT_sorted.fa  - peak sequences from the haplotype fastas
#   motif_analysis/control_REF_sorted.fa  - the same peaks from the hg38 reference
#
# Requires ${sample} to be set, and motif_analysis/ASB_peaks_and_muts_ALL_peaks.filt.bed
# to exist already (it is subtracted from the control peaks below).
# Columns of ../m_min6.txt relied on below: 1 = variant id, 12-14 = peak
# chrom/start/end, 15 = peak name, 24 = outcome label (HAPx_WINS / HAPx_LOSES),
# 10 = "1|0" or "0|1" (presumably the phased genotype -- TODO confirm).

# Lift a peak BED file from hg38 to one haplotype. When liftOver reports
# unmapped records, the overlapping peaks are removed from the input and
# liftOver is run once more.
#   $1  haplotype label (HAP1 / HAP2); used in messages and the scratch file name
#   $2  chain file
#   $3  input BED (hg38 coordinates)
#   $4  output BED; unmapped records are written to "$4_unmapped"
lift_peaks_to_hap () {
    local hap="$1" chain="$2" in_bed="$3" out_bed="$4"
    local unmapped="${out_bed}_unmapped"
    local skip_bed=motif_analysis/to_skip
    local retry_bed="motif_analysis/tmp.${hap}.bed"

    liftOver "${in_bed}" "${chain}" "${out_bed}" "${unmapped}"

    # -gt compares numerically ('>' inside [[ ]] is a string comparison)
    if [[ $(wc -l < "${unmapped}") -gt 0 ]] ; then
        echo $(grep -v '^#' "${unmapped}" | wc -l) peak unmapped to ${hap}, exclude and redo liftover
        grep -v '^#' "${unmapped}" | cut -f 1-3 | sort -u > "${skip_bed}"
        subtractBed -a "${in_bed}" -b "${skip_bed}" -A -wa > "${retry_bed}"
        liftOver "${retry_bed}" "${chain}" "${out_bed}" "${unmapped}"
        echo $(grep -v '^#' "${unmapped}" | wc -l) peak unmapped to ${hap}
        rm "${skip_bed}" "${retry_bed}"
    fi
}

grep -w -f <(cat Bayes_report.txt | awk '$7=="FALSE"' | cut -f1) ../m_min6.txt > motif_analysis/control_peaks_and_muts

# filter out SNVs without a winning haplotype
# NOTE(review): no command in this cell performs that filter explicitly; rows
# are only selected later by their column-10 value. Confirm nothing is missing.

echo $(cut -f 1 motif_analysis/control_peaks_and_muts | sort -u | wc -l) control variants in $(cut -f 15 motif_analysis/control_peaks_and_muts | sort -u | wc -l) peaks

# Start from empty result files. The .bed file is read by subtractBed below even
# when no double-hit peak exists, and the test-set cell removes it when it
# finishes, so it has to be recreated here.
# (Assumes bedtools accepts an empty -b file -- TODO confirm for 2.27.1.)
: > motif_analysis/double_hit_peaks
: > motif_analysis/inconsistent_double_hit_peaks
: > motif_analysis/inconsistent_double_hit_peaks.bed

# peaks carrying more than one distinct column-10 value
cut -f 10,15 motif_analysis/control_peaks_and_muts | sort -u | cut -f 2 | sort | uniq -d > motif_analysis/tmp_peaks_with_multiple_hits

if [[ -s motif_analysis/tmp_peaks_with_multiple_hits ]] ; then
    echo finding double hit peaks
    echo there are $(wc -l < motif_analysis/tmp_peaks_with_multiple_hits) peaks with multiple hits

    # check for inconsistent peaks
    # One line per candidate peak: "<peak> <sorted unique labels joined by |>"
    while read -r peak ; do
        labels=$(awk -v p="${peak}" '$15 == (p "")' motif_analysis/control_peaks_and_muts | cut -f 24 | sort -u | tr '\n' '|')
        printf '%s %s\n' "${peak}" "${labels}"
    done < motif_analysis/tmp_peaks_with_multiple_hits > motif_analysis/tmp_peak_labels

    # consistent: one haplotype wins and the other loses
    awk '$2=="HAP1_LOSES|HAP2_WINS|" || $2=="HAP1_WINS|HAP2_LOSES|" { print $1 }' \
        motif_analysis/tmp_peak_labels > motif_analysis/double_hit_peaks

    # everything else is inconsistent
    awk '$2!="HAP1_LOSES|HAP2_WINS|" && $2!="HAP1_WINS|HAP2_LOSES|" { print $1 }' \
        motif_analysis/tmp_peak_labels > motif_analysis/inconsistent_double_hit_peaks

    # BED records of the inconsistent peaks; the peak name is matched exactly on
    # column 15, the same way the classification above selects rows
    while read -r peak ; do
        awk -v p="${peak}" '$15 == (p "")' motif_analysis/control_peaks_and_muts | cut -f 12-15 | sort -u
    done < motif_analysis/inconsistent_double_hit_peaks > motif_analysis/inconsistent_double_hit_peaks.bed

    rm -f motif_analysis/tmp_peak_labels

    echo $(wc -l < motif_analysis/double_hit_peaks) peaks have consistent hits
else
    echo there are no peaks with inconsistent hits
fi

# create HAP1 specific and HAP2 specific bed files
# filter out inconsistent peaks
# filter out peaks in test set
awk -v 'OFS=\t' '{ if ($10=="1|0") { print $12, $13, $14, $15"|"$24 } }' motif_analysis/control_peaks_and_muts \
  | sort -u \
  | subtractBed -a stdin -b motif_analysis/inconsistent_double_hit_peaks.bed -A -wa \
  | subtractBed -a stdin -b motif_analysis/ASB_peaks_and_muts_ALL_peaks.filt.bed -A -wa \
  > motif_analysis/control_peaks_and_muts_HAP1_peaks.bed

awk -v 'OFS=\t' '{ if ($10=="0|1") { print $12, $13, $14, $15"|"$24 } }' motif_analysis/control_peaks_and_muts \
  | sort -u \
  | subtractBed -a stdin -b motif_analysis/inconsistent_double_hit_peaks.bed -A -wa \
  | subtractBed -a stdin -b motif_analysis/ASB_peaks_and_muts_ALL_peaks.filt.bed -A -wa \
  > motif_analysis/control_peaks_and_muts_HAP2_peaks.bed

cat motif_analysis/control_peaks_and_muts_HAP1_peaks.bed motif_analysis/control_peaks_and_muts_HAP2_peaks.bed > motif_analysis/control_peaks_and_muts.all_peaks.bed

# hg38 peaks to hap1
lift_peaks_to_hap HAP1 "../${sample}.refTOhap1.chain" \
    motif_analysis/control_peaks_and_muts_HAP1_peaks.bed \
    motif_analysis/control_peaks_and_muts_HAP1_peaks.bed.HAP1.bed

# hg38 peaks to hap2
lift_peaks_to_hap HAP2 "../${sample}.refTOhap2.chain" \
    motif_analysis/control_peaks_and_muts_HAP2_peaks.bed \
    motif_analysis/control_peaks_and_muts_HAP2_peaks.bed.HAP2.bed

# haplotype sequences of the lifted control peaks
fastaFromBed \
    -fi "../${sample}.hap1.fa" \
    -bed motif_analysis/control_peaks_and_muts_HAP1_peaks.bed.HAP1.bed \
    -fo motif_analysis/control_peaks_and_muts_HAP1_peaks.HAP1.fa \
    -name

fastaFromBed \
    -fi "../${sample}.hap2.fa" \
    -bed motif_analysis/control_peaks_and_muts_HAP2_peaks.bed.HAP2.bed \
    -fo motif_analysis/control_peaks_and_muts_HAP2_peaks.HAP2.fa \
    -name

# join header+sequence pairs on one line, drop exact duplicates, sort by header
cat motif_analysis/control_peaks_and_muts_HAP1_peaks.HAP1.fa motif_analysis/control_peaks_and_muts_HAP2_peaks.HAP2.fa \
  | paste - - | sort -k 1 | uniq | sort -k 1,1 -t " " | tr '\t' '\n' \
  > motif_analysis/control_MUT_sorted.fa

# filter out any unmapped peak from previous set: keep only the hg38 peaks whose
# name (BED column 4) appears as a header in the haplotype fasta
awk 'FNR==NR{seen[$1]=1;next} ($4) in seen' \
    <(grep '>' motif_analysis/control_MUT_sorted.fa | tr -d '>') \
    motif_analysis/control_peaks_and_muts.all_peaks.bed \
    > motif_analysis/control_peaks_and_muts.all_peaks.filt.bed

fastaFromBed \
    -fi /staging/leuven/stg_00002/lcb/resources/human/hg38/hg38.fa \
    -bed motif_analysis/control_peaks_and_muts.all_peaks.filt.bed \
    -fo motif_analysis/control_REF.fa \
    -name

cat motif_analysis/control_REF.fa | paste - - | sort -k 1,1 -t " " | tr '\t' '\n' \
  > motif_analysis/control_REF_sorted.fa

echo Generated control fasta for $(grep '^>' motif_analysis/control_MUT_sorted.fa | wc -l) peaks with mutation and $(grep '^>' motif_analysis/control_REF_sorted.fa | wc -l) w/o mutation


In [None]:
# motif scoring
# Runs make_feature_table.py once per FASTA set, in the order
# ASB_MUT, ASB_REF, CTRL_MUT, CTRL_REF. Each run reads
# motif_analysis/<set>_sorted.fa and writes
# motif_analysis/<set>_sorted.new_cbust_result.tsv.
# ${wgs_folder} must be set; the motif name list is read from the MM057 folder
# for every set.
for fasta_set in ASB_MUT ASB_REF control_MUT control_REF ; do
    /staging/leuven/stg_00002/software/primescore/src/make_feature_table.py \
        -O tsv \
        -f motif_analysis/${fasta_set}_sorted.fa \
        -M /ddn1/vol1/staging/leuven/stg_00002/lcb/icistarget/data/motifCollection/v8/singletons \
        -m ${wgs_folder}/MM057/alleleseq/motif_analysis/singletons_names.txt \
        -o motif_analysis/${fasta_set}_sorted.new_cbust_result.tsv \
        -t 30
done