In [1]:
! bcftools -v

bcftools 1.18
Using htslib 1.18
Copyright (C) 2023 Genome Research Ltd.
License Expat: The MIT/Expat license
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.


In [2]:
# Workspace root and the prefix shared by every input/output file name below.
working_dir = '/home/jupyter/AoU1-v1'
output_prefix = 'chr1'

# One-time staging of the phased BCFs from the workspace bucket (left disabled):
# ! mkdir -p {working_dir}/hiphase-outputs
# ! gsutil cp 'gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/HPRC_8X/T2T/Phasing/Hybridphasing/{output_prefix}_*bcf' {working_dir}/hiphase-outputs

# Each callset exists as a BCF and as a bgzipped VCF that share the same stem.
hiphase_outputs_dir = f'{working_dir}/hiphase-outputs'
snp_indel_stem = f'{hiphase_outputs_dir}/{output_prefix}_scaffold'
sv_stem = f'{hiphase_outputs_dir}/{output_prefix}_finalsv_scaffold'

hiphase_snp_indel_bcf = snp_indel_stem + '.bcf'
hiphase_sv_bcf = sv_stem + '.bcf'

hiphase_snp_indel_vcf_gz = snp_indel_stem + '.vcf.gz'
hiphase_sv_vcf_gz = sv_stem + '.vcf.gz'

# One-time BCF -> bgzipped VCF conversion plus tabix index (left disabled):
# ! bcftools view {hiphase_snp_indel_bcf} -Oz -o {hiphase_snp_indel_vcf_gz}
# ! bcftools index -t {hiphase_snp_indel_vcf_gz}
# ! bcftools view {hiphase_sv_bcf} -Oz -o {hiphase_sv_vcf_gz}
# ! bcftools index -t {hiphase_sv_vcf_gz}

In [3]:
preprocess_dir = f'{working_dir}/preprocess'
! mkdir -p {preprocess_dir}

In [6]:
af_threshold = 0.01
reg_af_tag = '5Mbp.af_pt01'

hiphase_sv_reg_af_vcf_gz = f'{preprocess_dir}/{output_prefix}.sv.{reg_af_tag}.vcf.gz'

! bcftools view -r chr1:1-5000000 -i 'AF>={af_threshold}' {hiphase_sv_vcf_gz} \
    -Oz -o {hiphase_sv_reg_af_vcf_gz}
! bcftools index -t {hiphase_sv_reg_af_vcf_gz}

In [9]:
# only SVs and atomize

atomize_tag = 'atomize'

hiphase_atomize_vcf_gz = f'{preprocess_dir}/{output_prefix}.{reg_af_tag}.{atomize_tag}.vcf.gz'

! bcftools norm {hiphase_sv_reg_af_vcf_gz} -a -Oz -o {hiphase_atomize_vcf_gz}

Lines   total/split/realigned/skipped:	2604/0/0/0


In [11]:
# BED file used for the TR annotation step. One-time staging (left disabled):
# ! gsutil cp gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/references/T2T/trf.bb.bed {working_dir}/resources

resources_dir = f'{working_dir}/resources'
tr_bed = f'{resources_dir}/trf.bb.bed'

In [12]:
tr_annot_tag = 'tr-annot'
hiphase_atomize_tr_annot_vcf_gz = f'{preprocess_dir}/{output_prefix}.{reg_af_tag}.{atomize_tag}.{tr_annot_tag}.vcf.gz'
min_overlap = 0.9

! bcftools view {hiphase_atomize_vcf_gz} -Ou | \
    bcftools annotate --no-version -a {tr_bed} --min-overlap :{min_overlap} -c CHROM,FROM,TO -m +TR \
    -h <(echo '##INFO=<ID=TR,Number=0,Type=Flag,Description="Overlaps TR region with --min-overlap :0.9">') \
    -Oz -o {hiphase_atomize_tr_annot_vcf_gz}

In [13]:
no_tr_tag = 'no-tr'
tag = f'{reg_af_tag}.{atomize_tag}.{tr_annot_tag}.{no_tr_tag}'
hiphase_preprocessed_vcf_gz = f'{preprocess_dir}/{output_prefix}.{tag}.vcf.gz'

! bcftools view -i 'AF>={af_threshold} && TR=0' {hiphase_atomize_tr_annot_vcf_gz} -Oz -o {hiphase_preprocessed_vcf_gz}

In [14]:
# v3.1.0 of PanGenie run-from-callset scripts

pangenie_scripts_dir = f'{working_dir}/pangenie/pipelines/run-from-callset/scripts'
ref_fa = f'{working_dir}/ref/chm13v2.0.ebv.fa'
run_from_callset_dir = f'{working_dir}/run-from-callset'

! mkdir -p {run_from_callset_dir}

In [15]:
# validate_vcf

! bcftools norm --check-ref e --fasta-ref {ref_fa} {hiphase_preprocessed_vcf_gz} &> {run_from_callset_dir}/validate-vcf.log

In [16]:
# prepare_vcf

frac_missing = 0.2

! bcftools view {hiphase_preprocessed_vcf_gz} | \
    python3 {pangenie_scripts_dir}/prepare-vcf.py --missing {frac_missing} \
    2> {run_from_callset_dir}/{output_prefix}.{tag}.prepare-vcf.log \
    1> {run_from_callset_dir}/{output_prefix}.{tag}.prepare.vcf

In [17]:
# add_ids

! cat {run_from_callset_dir}/{output_prefix}.{tag}.prepare.vcf | \
    python3 {pangenie_scripts_dir}/add-ids.py \
    2> {run_from_callset_dir}/{output_prefix}.{tag}.add-ids.log \
    1> {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.vcf

In [18]:
# normalize

! bcftools norm -m- {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.vcf \
    --threads $(nproc) \
    2> {run_from_callset_dir}/{output_prefix}.{tag}.normalize.log \
    1> {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.vcf

In [19]:
# merge_haplotypes

! python3 {pangenie_scripts_dir}/merge_vcfs.py merge \
    -vcf {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.vcf \
    -r {ref_fa} \
    -ploidy 2  \
    2> {run_from_callset_dir}/{output_prefix}.{tag}.merge-haplotypes.log \
    1> {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.vcf

In [20]:
! bcftools view {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.vcf \
    -Oz -o {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.vcf.gz
! bcftools index -t {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.vcf.gz

[W::vcf_parse] Contig 'chr1' is not defined in the header. (Quick workaround: index the file with tabix.)


In [21]:
# convert_back_biallelic_representation (pop bubbles and convert back to biallelic)

! bcftools view {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.vcf.gz | \
    python3 {pangenie_scripts_dir}/convert-to-biallelic-no-gq.py {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.vcf \
    > {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.unmergehap.vcf

In [22]:
! bcftools view {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.unmergehap.vcf \
    -Oz -o {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.unmergehap.vcf.gz
! bcftools index -t {run_from_callset_dir}/{output_prefix}.{tag}.prepare.id.split.mergehap.unmergehap.vcf.gz

[W::vcf_parse_format] FORMAT 'GQ' at chr1:1668 is not defined in the header, assuming Type=String


In [None]:
! 