### Step 1. Generation of personalized genomes and chain files

Here, we create a diploid personalized genome per sample by inserting phased variants called from whole-genome sequencing (single-nucleotide variants, small insertions and deletions, and structural variants) into the reference genome (hg38).

#### Load the required modules

In [None]:
# Load the environment modules for this step, one at a time and in a fixed
# order (the generic Java module first, then the pinned JDK version).
for module_name in \
    pigz \
    Java \
    Java/jdk1.8.0_151 \
    SAMtools/1.8-foss-2014a \
    vt/20180405-foss-2014a
do
    module load "${module_name}"
done

SnpSift


In [None]:
# Per-sample setting: change sampleID to process a different sample.
sampleID='MM031'

# Folder locations used by all later cells.
#   resources_folder - reference data (the genome fasta is read from fasta/genome.fa)
#   input_folder     - parent folder holding one sub-folder per sample
#   output_folder    - per-sample output folder (derived from sampleID)
resources_folder='/staging/leuven/stg_00002/lcb/resources/longranger/refdata-GRCh38-2.1.0'
input_folder='/staging/leuven/stg_00002/lcb/zkalender/Runs/MWGS_FINAL_SEQ_DATA/analysis/'
output_folder="/staging/leuven/stg_00002/lcb/zkalender/melanoma_WGS/${sampleID}"

In [None]:
# Create the output folders used by the following steps.
# -p: parents are created as needed and existing folders are not an error.
# The paths are quoted so that a folder name containing whitespace is not
# split into several arguments.
mkdir -p \
    "${output_folder}/crossstich" \
    "${output_folder}/snpeff" \
    "${resources_folder}/chrFiles/split"

### Prepare input files for crossstitch pipeline

#### a. re-call SVs using sniffles

In [None]:
# First add MD tags to the phased BAM (samtools calmd; -b writes BAM output,
# -@ 30 requests 30 additional threads).
# The result is written into the crossstich/ sub-folder, because that is where
# the sniffles and crossstitch steps of this notebook read the MD-tagged BAM.
samtools calmd \
    -@ 30 \
    -b \
    "${input_folder}/${sampleID}/outs/phased_possorted_bam.bam" \
    "${resources_folder}/fasta/genome.fa" \
  > "${output_folder}/crossstich/${sampleID}_phased_possorted_bam_MDtag.bam"

In [None]:
# Run sniffles on the MD-tagged BAM to re-call structural variants.
#   -m  input BAM
#   -v  output VCF
#   -t  number of threads
# Standard output is kept next to the VCF as a log file.
# All paths are quoted so that whitespace in a folder name cannot split them.
sniffles \
    -m "${output_folder}/crossstich/${sampleID}_phased_possorted_bam_MDtag.bam" \
    -v "${output_folder}/crossstich/${sampleID}.sniffles.n1.vcf" \
    -n 1 \
    -t 30 \
  > "${output_folder}/crossstich/${sampleID}.sniffles.n1.log"

#### b. filter SNVs & INDELs (for quality and genotype info)

In [None]:
# Keep only variants that carry the PASS filter tag and have DP of at least 20.
# NOTE(review): SnpSift.jar is given as a relative path, so this command must
# be run from the folder that contains the jar.
# Input and output paths are quoted to protect against word splitting.
java -jar SnpSift.jar \
    filter "(FILTER = 'PASS') & (DP >=20)" \
    "${input_folder}/${sampleID}/outs/phased_variants.vcf.gz" \
  > "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.vcf"

In [None]:
# Decompose multi-allelic variants into separate records.
# -s is a stand-alone switch (smart decomposition); the input VCF is the
# positional argument and -o names the output VCF.
# Paths are quoted to protect against word splitting.
vt decompose \
    -s \
    -o "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.decomposed.vcf" \
    "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.vcf"

In [None]:
# Filter out variants with missing genotype calls.
#   -g '^miss'  exclude sites that have a missing genotype
#   -O v        write uncompressed VCF
#   -o          output file
# Paths are quoted to protect against word splitting.
bcftools view \
    -g '^miss' \
    -O v \
    -o "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.decomposed.GTfilt.vcf" \
    "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.decomposed.vcf"


### 2: extract "hairs" using a modified HapCut2 pipeline

In [None]:
# HapCut2 step 1/3: extract the (still unlinked) haplotype-informative fragments
# from the phased BAM for the variants in the genotype-filtered VCF.
# --10X 1 switches on 10X linked-read mode; --mbq 0 is reflected in the
# ".mbq0" suffix of the output file.
# Paths are quoted to protect against word splitting.
extractHAIRS \
    --10X 1 \
    --mbq 0 \
    --bam "${input_folder}/${sampleID}/outs/phased_possorted_bam.bam" \
    --VCF "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.decomposed.GTfilt.vcf" \
    --out "${output_folder}/crossstich/${sampleID}_unlinked_fragment_file.mbq0"

In [None]:
# HapCut2 step 2/3: link the fragments produced by extractHAIRS, using the same
# BAM and the same VCF as in step 1/3.
# Reads  ${sampleID}_unlinked_fragment_file.mbq0
# Writes ${sampleID}_linked_fragment_file.mbq
# Paths are quoted to protect against word splitting.
python3 \
    /staging/leuven/stg_00002/lcb/cflerin/software/HapCUT2/utilities/LinkFragments.py \
    --bam "${input_folder}/${sampleID}/outs/phased_possorted_bam.bam" \
    --VCF "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.decomposed.GTfilt.vcf" \
    --fragments "${output_folder}/crossstich/${sampleID}_unlinked_fragment_file.mbq0" \
    --out "${output_folder}/crossstich/${sampleID}_linked_fragment_file.mbq"

In [None]:
# HapCut2 step 3/3: assemble haplotypes from the LINKED fragment file written
# by LinkFragments.py in step 2/3 (feeding the unlinked file here would leave
# the step 2/3 output unused).
# The VCF is the genotype-filtered one that steps 1/3 and 2/3 were run with:
# the fragment file refers to variants of that VCF, so all three steps have to
# use the same file.
/staging/leuven/stg_00002/lcb/cflerin/software/HapCUT2/build/HAPCUT2 \
    --nf 1 \
    --fragments "${output_folder}/crossstich/${sampleID}_linked_fragment_file.mbq" \
    --VCF "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.decomposed.GTfilt.vcf" \
    --output "${output_folder}/crossstich/${sampleID}_haplotype_output_file"

### 3: run crossstitch pipeline

In [None]:
# Run crossstitch from inside the sample's crossstich/ output folder.
# The folder is derived from ${output_folder} instead of repeating the
# hard-coded path, and the pipeline is only started when the cd succeeded, so
# it can never run in an unrelated working directory.
#
# Positional arguments, in order:
#   1. phased, decomposed SNV/indel VCF
#   2. sniffles structural-variant VCF
#   3. MD-tagged BAM
#   4. reference genome fasta
#   5-7. ${sampleID}, "female", "0"
#        NOTE(review): presumably output prefix, sex of the sample and the
#        "refine" switch - confirm against the usage message of crossstitch.sh.
cd "${output_folder}/crossstich" \
  && /staging/leuven/stg_00002/lcb/cflerin/software/crossstitch/src/crossstitch.sh \
        "${output_folder}/snpeff/${sampleID}_phased_variants.HQ.decomposed.vcf" \
        "${output_folder}/crossstich/${sampleID}.sniffles.n1.vcf" \
        "${output_folder}/crossstich/${sampleID}_phased_possorted_bam_MDtag.bam" \
        "${resources_folder}/fasta/genome.fa" \
        "${sampleID}" \
        female \
        0

This step creates two fasta files (hap1.fa and hap2.fa) and two chain files (refTohap1.chain and refTohap2.chain).

### 4. Create hap1ToRef.chain and hap2ToRef.chain files

##### Load necessary modules

In [None]:
# Load the environment modules needed for building the chain files,
# one at a time and in a fixed order.
for module_name in \
    Kent/20180816 \
    Parallel/20180622 \
    Anaconda/5-Python-3.6
do
    module load "${module_name}"
done

##### Create directories

In [None]:
# Folders for the split reference chromosomes and their lift files.
mkdir -p \
    "${resources_folder}/chrFiles/split" \
    "${resources_folder}/chrFiles/lift"

# Per-sample working folders for the liftover processing.
# These live under ${output_folder}; the notebook defines no ${output_dir}
# variable, so using that name would create the folders under "/".
mkdir -p \
    "${output_folder}/liftover_proc" \
    "${output_folder}/liftover_proc/psl"

##### Map HAP1 to reference genome
Only to canonical chromosomes

In [None]:
# Align the reference genome against the hap1 genome with blat.
# blat argument order: database (hap1 fasta), query (reference fasta),
# options, output psl.
# Paths use ${output_folder} (the variable this notebook defines) and the
# reference at fasta/genome.fa, the same file the calmd and crossstitch steps
# read.
# NOTE(review): the -ooc file must already exist; no step visible in this
# notebook creates it.
blat "${output_folder}/crossstich/${sampleID}.alleleseq/${sampleID}.hap1.fa" \
    "${resources_folder}/fasta/genome.fa" \
    -t=dna \
    -q=dna \
    -tileSize=11 \
    -fastMap \
    -minIdentity=95 \
    -noHead \
    -minScore=100 \
    -ooc="${output_folder}/liftover_proc/${sampleID}.hap1.fa.ooc" \
    "${output_folder}/liftover_proc/psl/hap1.psl"

##### Map HAP2 to reference genome

In [None]:
# Align the reference genome against the hap2 genome with blat.
# blat argument order: database (hap2 fasta), query (reference fasta),
# options, output psl.
# Paths use ${output_folder} (the variable this notebook defines) and the
# reference at fasta/genome.fa, the same file the calmd and crossstitch steps
# read.
# NOTE(review): the -ooc file must already exist; no step visible in this
# notebook creates it.
blat "${output_folder}/crossstich/${sampleID}.alleleseq/${sampleID}.hap2.fa" \
    "${resources_folder}/fasta/genome.fa" \
    -t=dna \
    -q=dna \
    -tileSize=11 \
    -fastMap \
    -minIdentity=95 \
    -noHead \
    -minScore=100 \
    -ooc="${output_folder}/liftover_proc/${sampleID}.hap2.fa.ooc" \
    "${output_folder}/liftover_proc/psl/hap2.psl"

This step is computationally intensive and can take quite a long time. One workaround is to split the reference genome into smaller chunks and run the blat jobs in parallel.


#### Alternative mapping strategy: Split target assembly (hg38)

In [None]:
# ${resources_folder}/chrFiles contains one fasta file per canonical chromosome.
# "faSplit size ... 3000" cuts each chromosome into pieces of 3000 bases (the
# number is a piece size, not a file count). With -oneFile all pieces of a
# chromosome go into a single output file, and -lift records in a .lft file
# how the pieces map back onto the chromosome.
for chrom in {1..22} X Y; do
    faSplit size \
        "${resources_folder}/chrFiles/chr${chrom}.fasta" \
        3000 \
        "${resources_folder}/chrFiles/split/hg38.${chrom}.split" \
        -lift="${resources_folder}/chrFiles/lift/hg38.${chrom}.lft" \
        -oneFile
done

# Split the faSplit output further with seqkit: "-s 5000" writes parts that
# hold 5000 sequences each, into one output folder per input file.

source activate seqkit

# Iterate with a glob instead of parsing `ls`, so unusual file names are
# handled correctly.
for split_path in "${resources_folder}/chrFiles/split"/*; do
    # Regular files only: this skips the 5k_3k_fa output folder (it lives
    # inside split/ and would otherwise be fed back in on a re-run) and the
    # literal pattern when the folder is empty.
    [[ -f "${split_path}" ]] || continue

    split_name=${split_path##*/}
    seqkit split "${split_path}" \
        -s 5000 \
        -O "${resources_folder}/chrFiles/split/5k_3k_fa/${split_name}"
done

# Map hap1 to the split fasta files: one blat job per line of the file list,
# at most 30 jobs at a time. The list is fed to parallel on stdin (no extra
# `cat` process) and {} is replaced by the current list entry.
# NOTE(review): the chunks are read from hg38_liftover_process_files/5k_3k_fa,
# whereas the seqkit step of this notebook writes to chrFiles/split/5k_3k_fa,
# and no visible step creates 5k_3k_fa_file_list - confirm how these are
# produced.
# NOTE(review): ${sampleID}.hap1.fa, its .ooc file and psl/ are relative to the
# current working directory; psl/ must already exist.
parallel -j 30 \
    blat "${sampleID}.hap1.fa" \
        "${resources_folder}/hg38_liftover_process_files/5k_3k_fa/{}" \
        -t=dna \
        -q=dna \
        -tileSize=11 \
        -fastMap \
        -minIdentity=95 \
        -noHead \
        -minScore=100 \
        -ooc="${sampleID}.hap1.fa.ooc" \
        "psl/hap1.{}.psl" \
  < "${resources_folder}/hg38_liftover_process_files/5k_3k_fa_file_list"


# Map hap2 to the split fasta files: one blat job per line of the file list,
# at most 30 jobs at a time. The list is fed to parallel on stdin (no extra
# `cat` process) and {} is replaced by the current list entry.
# NOTE(review): the chunks are read from hg38_liftover_process_files/5k_3k_fa,
# whereas the seqkit step of this notebook writes to chrFiles/split/5k_3k_fa,
# and no visible step creates 5k_3k_fa_file_list - confirm how these are
# produced.
# NOTE(review): ${sampleID}.hap2.fa, its .ooc file and psl/ are relative to the
# current working directory; psl/ must already exist.
parallel -j 30 \
    blat "${sampleID}.hap2.fa" \
        "${resources_folder}/hg38_liftover_process_files/5k_3k_fa/{}" \
        -t=dna \
        -q=dna \
        -tileSize=11 \
        -fastMap \
        -minIdentity=95 \
        -noHead \
        -minScore=100 \
        -ooc="${sampleID}.hap2.fa.ooc" \
        "psl/hap2.{}.psl" \
  < "${resources_folder}/hg38_liftover_process_files/5k_3k_fa_file_list"

# Combine the per-chunk psl files into one psl file per haplotype, in the
# order given by the file list.
# The combined file is rewritten (">") rather than appended to (">>"), so
# re-running this cell cannot duplicate alignments. "IFS= read -r" keeps each
# list entry exactly as written.
chunk_list="${resources_folder}/hg38_liftover_process_files/5k_3k_fa_file_list"

for hap in hap1 hap2; do
    while IFS= read -r chunk_name; do
        cat "psl/${hap}.${chunk_name}.psl"
    done < "${chunk_list}" > "psl/${hap}.psl"
done


##### Create chain files

In [None]:
# Build the hap1 -> hg38 and hap2 -> hg38 liftOver chain files from the blat
# alignments in psl/hap1.psl and psl/hap2.psl.
#
# Every stage is run for hap1 first and then for hap2. File names are built
# from ${sampleID}, the sample variable defined at the top of this notebook
# (no ${sample} variable is defined anywhere in it).
#
# All relative paths (psl/, liftup/, chain_raw/, chain_split.*/, net/, over/
# and the ${sampleID}.hapN.fa files) are resolved against the current working
# directory.
# NOTE(review): lift/hg38.combined.lft, hg38_canonical_chromosomes.fa and
# hg38_canonical_chromosomes.chr_length.txt are expected under
# ${resources_folder}/hg38_liftover_process_files; no step visible in this
# notebook creates them.
liftover_files="${resources_folder}/hg38_liftover_process_files"

# Working folders. liftup/ has to exist before liftUp writes into it, and -p
# makes the cell safe to re-run.
mkdir -p liftup chain_raw net over

# liftUp: translate the query side (-pslQ) of the alignments from split-chunk
# coordinates back to chromosome coordinates, using the combined lift file.
for hap in hap1 hap2; do
    liftUp -pslQ \
        "liftup/${hap}.combined.liftup.psl" \
        "${liftover_files}/lift/hg38.combined.lft" \
        warn \
        "psl/${hap}.psl"
done

# Make chain files: axtChain reads the lifted psl (-psl); target (haplotype)
# and query (hg38) sequences are both given as fasta files (-faT, -faQ).
for hap in hap1 hap2; do
    axtChain \
        -linearGap=medium \
        -faQ \
        -faT \
        -psl "liftup/${hap}.combined.liftup.psl" \
        "${sampleID}.${hap}.fa" \
        "${liftover_files}/hg38_canonical_chromosomes.fa" \
        "chain_raw/${sampleID}.${hap}.chain"
done

# Merge and sort the chains, then split them into one file per target
# sequence under chain_split.<hap>/.
for hap in hap1 hap2; do
    chainMergeSort "chain_raw/${sampleID}.${hap}.chain" \
      | chainSplit "chain_split.${hap}" stdin
done

# Sequence lengths of each haplotype genome (needed by chainNet).
for hap in hap1 hap2; do
    faSize "${sampleID}.${hap}.fa" -detailed > "${sampleID}.${hap}.chr_length.txt"
done

# Make alignment nets from the chain files.
# chainNet arguments: chain, target sizes, query sizes, target net, query net
# (the query net is discarded).
for hap in hap1 hap2; do
    for chain_file in "chain_split.${hap}"/*.chain; do
        # Skip the literal pattern when chainSplit produced no files.
        [[ -e "${chain_file}" ]] || continue
        tag=${chain_file##*/}
        chainNet \
            "${chain_file}" \
            "${sampleID}.${hap}.chr_length.txt" \
            "${liftover_files}/hg38_canonical_chromosomes.chr_length.txt" \
            "net/${tag}.net" \
            /dev/null
    done
done

# Create the liftOver chains: keep only the parts of each chain that are used
# by its net.
for hap in hap1 hap2; do
    for chain_file in "chain_split.${hap}"/*.chain; do
        [[ -e "${chain_file}" ]] || continue
        tag=${chain_file##*/}
        netChainSubset "net/${tag}.net" "${chain_file}" "over/${tag}"
    done
done

# Concatenate the per-sequence chains into one chain file per haplotype.
# NOTE(review): only files matching over/*hapN.chain are picked up, so the
# file names written by chainSplit must end in "hapN.chain" - confirm against
# the sequence names in the haplotype fasta files.
for hap in hap1 hap2; do
    cat over/*"${hap}.chain" > "${sampleID}.${hap}_to_hg38.over.chain"
done