### Step 2. Mapping functional genomics data to personalized genomes

ATAC-seq reads are mapped to HAP1, HAP2 and the reference genome (hg38) individually. BAM files are further processed to remove duplicate reads and reads with low mapping quality (MAPQ < 2). Peak calling is performed on each alignment file separately with MACS2. Peak summit locations from HAP1 and HAP2 are lifted over to hg38 coordinates and filtered for blacklisted regions. Then peak summits are extended by 250 bp in each direction, and the peak scores within each peak set are normalized (by dividing each peak score by the total peak score). Then the three peak sets (HAP1, HAP2 and hg38) are consolidated: all peaks are combined and ranked by their normalized score. Starting with the highest-scoring peak, we filter out any peak that overlaps with it. Then we move on to the next highest-scoring remaining peak, and repeat this process until we have a non-overlapping peak set. This step ensures we obtain an accurate peak set considering all three alignments.

In [None]:
# Per-sample inputs for this step: edit these three values before running
# the cells below.
#   wgs_folder - root folder holding one sub-folder per sample
#   sample     - sample name (used in folder and file names)
#   atac_fastq - ATAC-seq FASTQ file to be mapped
wgs_folder='/staging/leuven/stg_00002/lcb/zkalender/melanoma_WGS'
sample='MM031'
atac_fastq='/staging/leuven/stg_00002/lcb/zkalender/melanoma_ATAC/00.FASTQ/MM031_BL/OmniATAC__MM031__untreated_S3_R2_001.fastq.gz'

In [None]:
# Create the output folders, one per alignment target (REF = hg38, HAP1,
# HAP2). `mkdir -p` also creates the parent AS_ATAC folder and does nothing
# for folders that already exist. The REF folder is created here as well
# because the reference-genome mapping writes its BAM files directly into it.
atac_dir="${wgs_folder}/${sample}/AS_ATAC"
mkdir -p \
    "${atac_dir}/REF" \
    "${atac_dir}/HAP1_ind_map" \
    "${atac_dir}/HAP2_ind_map"

# work from the sample's ATAC folder
cd "${atac_dir}"

In [None]:
# Map to the reference genome
# Requires: wgs_folder, sample, atac_fastq, resources_folder, VSC_SCRATCH.
ref_dir="${wgs_folder}/${sample}/AS_ATAC/REF"
ref_prefix="${ref_dir}/${sample}.ATAC.REF"
ref_peaks="${ref_dir}/q005_peaks/${sample}.ATAC.REF.q_005.peaks"

# Create the REF output folder (and its peak sub-folder) before anything is
# written into it; otherwise the very first `samtools sort -o` fails.
mkdir -p "${ref_dir}/q005_peaks"

# map (single-end input via -U), convert to BAM and sort
bowtie2 -p 20 --very-sensitive \
    -x "${resources_folder}/refdata-GRCh38-2.1.0/fasta/genome" \
    -U "${atac_fastq}" \
  | samtools view -b - \
  | samtools sort -@ 20 -l 1 -O bam \
        -o "${ref_prefix}.sorted.bam"

# mapping statistics of the raw alignment
samtools flagstat "${ref_prefix}.sorted.bam" \
  > "${ref_prefix}.sorted.bam.flagstat"

# duplicate removal (duplicates are dropped, not only flagged)
picard MarkDuplicates \
    I="${ref_prefix}.sorted.bam" \
    O="${ref_prefix}.sorted.dedup.bam" \
    CREATE_INDEX=true \
    VALIDATION_STRINGENCY=SILENT \
    REMOVE_DUPLICATES=TRUE \
    M="${ref_prefix}.dedup.metrics"

# quality filter: keep alignments with MAPQ >= 2
samtools view -@ 10 -q 2 -b "${ref_prefix}.sorted.dedup.bam" \
  | samtools sort - -@ 10 -O bam -l 9 -T "${VSC_SCRATCH}/tmp" \
        -o "${ref_prefix}.sorted.dedup.Q2.bam"

samtools flagstat "${ref_prefix}.sorted.dedup.Q2.bam" \
  > "${ref_prefix}.sorted.dedup.Q2.flagstat"

# peak calling
# --keep-dup all: duplicates were already removed by picard above.
# The -n value is a full path, so MACS2 writes its output next to it.
macs2 callpeak \
    -t "${ref_prefix}.sorted.dedup.Q2.bam" \
    -q 0.05 \
    -n "${ref_peaks}" \
    --nomodel --shift -75 --extsize 150 --keep-dup all --call-summits \
    -f BAM

# keep only summits overlapping the regions listed in
# hg38.canonical_chromosomes.bed, drop duplicated lines and sort
intersectBed \
    -a "${ref_peaks}_summits.bed" \
    -b "${resources_folder}/hg38/hg38.canonical_chromosomes.bed" \
    -wa -nonamecheck \
  | sort -u | sortBed -i stdin \
  > "${ref_peaks}_summits_filt.bed"



In [None]:
# map to HAP1
# Requires: wgs_folder, sample, atac_fastq, VSC_SCRATCH; the HAP1_ind_map
# folder must already exist.
hap1_dir="${wgs_folder}/${sample}/AS_ATAC/HAP1_ind_map"
hap1_prefix="${hap1_dir}/${sample}.ATAC.HAP1_ind_map"
hap1_peaks="${hap1_dir}/q005_peaks/${sample}.ATAC.HAP1_ind_map.q_005.peaks"

# map (single-end input via -U) against the sample's HAP1 bowtie2 index,
# convert to BAM and sort
bowtie2 -p 20 --very-sensitive \
    -x "${wgs_folder}/${sample}/liftover_proc/bowtie2_index/${sample}.hap1.fa" \
    -U "${atac_fastq}" \
  | samtools view -b - \
  | samtools sort -@ 20 -l 1 -O bam \
        -o "${hap1_prefix}.sorted.bam"

# mapping statistics of the raw alignment
samtools flagstat "${hap1_prefix}.sorted.bam" \
  > "${hap1_prefix}.sorted.bam.flagstat"

# duplicate removal (duplicates are dropped, not only flagged)
picard MarkDuplicates \
    I="${hap1_prefix}.sorted.bam" \
    O="${hap1_prefix}.sorted.dedup.bam" \
    CREATE_INDEX=true \
    VALIDATION_STRINGENCY=SILENT \
    REMOVE_DUPLICATES=TRUE \
    M="${hap1_prefix}.dedup.metrics"

# quality filter: keep alignments with MAPQ >= 2
samtools view -@ 10 -q 2 -b "${hap1_prefix}.sorted.dedup.bam" \
  | samtools sort - -@ 10 -O bam -l 9 -T "${VSC_SCRATCH}/tmp" \
        -o "${hap1_prefix}.sorted.dedup.Q2.bam"

samtools flagstat "${hap1_prefix}.sorted.dedup.Q2.bam" \
  > "${hap1_prefix}.sorted.dedup.Q2.flagstat"

# call peaks on reads mapped to HAP1
# --keep-dup all: duplicates were already removed by picard above.
mkdir -p "${hap1_dir}/q005_peaks"
macs2 callpeak \
    -t "${hap1_prefix}.sorted.dedup.Q2.bam" \
    -q 0.05 \
    -n "${hap1_peaks}" \
    --nomodel --shift -75 --extsize 150 --keep-dup all --call-summits \
    -f BAM

In [None]:
# map to HAP2
# Requires: wgs_folder, sample, atac_fastq, VSC_SCRATCH; the HAP2_ind_map
# folder must already exist.
hap2_dir="${wgs_folder}/${sample}/AS_ATAC/HAP2_ind_map"
hap2_prefix="${hap2_dir}/${sample}.ATAC.HAP2_ind_map"
hap2_peaks="${hap2_dir}/q005_peaks/${sample}.ATAC.HAP2_ind_map.q_005.peaks"

# map (single-end input via -U) against the sample's HAP2 bowtie2 index,
# convert to BAM and sort
bowtie2 -p 20 --very-sensitive \
    -x "${wgs_folder}/${sample}/liftover_proc/bowtie2_index/${sample}.hap2.fa" \
    -U "${atac_fastq}" \
  | samtools view -b - \
  | samtools sort -@ 20 -l 1 -O bam \
        -o "${hap2_prefix}.sorted.bam"

# mapping statistics of the raw alignment
samtools flagstat "${hap2_prefix}.sorted.bam" \
  > "${hap2_prefix}.sorted.bam.flagstat"

# duplicate removal (duplicates are dropped, not only flagged)
picard MarkDuplicates \
    I="${hap2_prefix}.sorted.bam" \
    O="${hap2_prefix}.sorted.dedup.bam" \
    CREATE_INDEX=true \
    VALIDATION_STRINGENCY=SILENT \
    REMOVE_DUPLICATES=TRUE \
    M="${hap2_prefix}.dedup.metrics"

# quality filter: keep alignments with MAPQ >= 2
samtools view -@ 10 -q 2 -b "${hap2_prefix}.sorted.dedup.bam" \
  | samtools sort - -@ 10 -O bam -l 9 -T "${VSC_SCRATCH}/tmp" \
        -o "${hap2_prefix}.sorted.dedup.Q2.bam"

samtools flagstat "${hap2_prefix}.sorted.dedup.Q2.bam" \
  > "${hap2_prefix}.sorted.dedup.Q2.flagstat"

# call peaks on reads mapped to HAP2
# --keep-dup all: duplicates were already removed by picard above.
mkdir -p "${hap2_dir}/q005_peaks"
macs2 callpeak \
    -t "${hap2_prefix}.sorted.dedup.Q2.bam" \
    -q 0.05 \
    -n "${hap2_peaks}" \
    --nomodel --shift -75 --extsize 150 --keep-dup all --call-summits \
    -f BAM

In [None]:
## Consolidate peaks
# Requires: wgs_folder, sample, resources_folder, and the summit files
# produced by the REF / HAP1 / HAP2 peak-calling cells.

# --- locations -------------------------------------------------------------
atac_dir="${wgs_folder}/${sample}/AS_ATAC"
hap1_peak_dir="${atac_dir}/HAP1_ind_map/q005_peaks"
hap2_peak_dir="${atac_dir}/HAP2_ind_map/q005_peaks"
# common prefixes of the summit files; the suffixes are appended below
hap1_summits="${hap1_peak_dir}/${sample}.ATAC.HAP1_ind_map.q_005.peaks_summits"
hap2_summits="${hap2_peak_dir}/${sample}.ATAC.HAP2_ind_map.q_005.peaks_summits"
ref_summits="${atac_dir}/REF/q005_peaks/${sample}.ATAC.REF.q_005.peaks_summits"

chrom_sizes=/staging/leuven/stg_00002/lcb/resources/human/hg38/hg38.canonical.chrom.sizes
blacklist="${resources_folder}/hg38/peakPass60Perc_sorted.bed"
# left unquoted on purpose so that the leading ~ is expanded
normalize_script=~/lcb/zkalender/software/src_zkalender/CDK7_Menin/normalize_fixed_width_peaks.R

# --- helpers ---------------------------------------------------------------
# Extend every summit by 250 bp on both sides and remove each extended peak
# that overlaps a blacklisted region (-A drops the whole peak).
#   $1 - summit BED file in hg38 coordinates
#   $2 - output BED file
extend_and_filter() {
  slopBed -i "$1" -b 250 -g "${chrom_sizes}" \
    | subtractBed -a stdin -b "${blacklist}" -A -wa \
    > "$2"
}

# Read a 5-column summit BED on stdin and write chrom, start, end, name and
# score to stdout, with the directory part of the peak name removed.
# The peak names carry the full path passed to `macs2 -n`; stripping
# everything up to the last '/' works for any folder depth, whereas picking
# fixed columns after splitting on '/' only works for one specific depth.
strip_peak_name_path() {
  awk 'BEGIN { FS = OFS = "\t" } { sub(/.*\//, "", $4); print $1, $2, $3, $4, $5 }'
}

# --- lift HAP1 / HAP2 summits over to hg38 coordinates -----------------------
# liftOver arguments: input, chain file, lifted output, unmapped records
liftOver "${hap1_summits}.bed" \
    "${wgs_folder}/${sample}/liftover_proc/${sample}.hap1_to_hg38.over.chain" \
    "${hap1_summits}.hg38.bed" \
    "${hap1_peak_dir}/${sample}.ATAC.HAP1_ind_map.unmapp"

liftOver "${hap2_summits}.bed" \
    "${wgs_folder}/${sample}/liftover_proc/${sample}.hap2_to_hg38.over.chain" \
    "${hap2_summits}.hg38.bed" \
    "${hap2_peak_dir}/${sample}.ATAC.HAP2_ind_map.unmapp"

# --- extend summits by 250 bp per side & filter for blacklisted regions ------
extend_and_filter "${hap1_summits}.hg38.bed" "${hap1_summits}.hg38.extended.bed"
extend_and_filter "${hap2_summits}.hg38.bed" "${hap2_summits}.hg38.extended.bed"
extend_and_filter "${ref_summits}_filt.bed"  "${ref_summits}.extended.bed"

# --- shorten peak names --------------------------------------------------------
strip_peak_name_path \
  < "${hap1_summits}.hg38.extended.bed" \
  > "${hap1_summits}.hg38.extended.renamed.bed"

strip_peak_name_path \
  < "${hap2_summits}.hg38.extended.bed" \
  > "${hap2_summits}.hg38.extended.renamed.bed"

strip_peak_name_path \
  < "${ref_summits}.extended.bed" \
  > "${ref_summits}.extended.renamed.bed"

# --- normalize peak scores (script arguments: input BED, output BED) ----------
module load R/3.4.1-foss-2014a-noX
Rscript "${normalize_script}" \
    "${hap1_summits}.hg38.extended.renamed.bed" \
    "${hap1_summits}.hg38.extended_normalized.bed"

Rscript "${normalize_script}" \
    "${hap2_summits}.hg38.extended.renamed.bed" \
    "${hap2_summits}.hg38.extended_normalized.bed"

Rscript "${normalize_script}" \
    "${ref_summits}.extended.renamed.bed" \
    "${ref_summits}.extended_normalized.bed"

# --- combine the three peak sets and keep a non-overlapping selection ---------
module load gawk/4.1.3
module load bedtools/20181008-foss-2014a

# the filtering script reads the concatenated peaks from stdin ("-")
cat "${hap1_summits}.hg38.extended_normalized.bed" \
    "${hap2_summits}.hg38.extended_normalized.bed" \
    "${ref_summits}.extended_normalized.bed" \
  | /staging/leuven/stg_00002/lcb/ghuls/software/iterative_peak_filtering/iterative_peak_filtering.sh \
        - \
        "${atac_dir}/${sample}_REF_HAP1_HAP2_peaks.merged_and_filtered.bed" \
        "${chrom_sizes}"