In [1]:
! bcftools -v

bcftools 1.18
Using htslib 1.18
Copyright (C) 2023 Genome Research Ltd.
License Expat: The MIT/Expat license
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.


In [2]:
working_dir = '/home/jupyter/AoU1-v1'
output_prefix = 'chr1'

# ! mkdir -p {working_dir}/hiphase-outputs
# ! gsutil cp 'gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/HPRC_8X/T2T/Phasing/Hybridphasing/{output_prefix}_*bcf' {working_dir}/hiphase-outputs

hiphase_snp_indel_bcf = f'{working_dir}/hiphase-outputs/{output_prefix}_scaffold.bcf'
hiphase_sv_bcf = f'{working_dir}/hiphase-outputs/{output_prefix}_finalsv_scaffold.bcf'

hiphase_snp_indel_vcf_gz = f'{working_dir}/hiphase-outputs/{output_prefix}_scaffold.vcf.gz'
hiphase_sv_vcf_gz = f'{working_dir}/hiphase-outputs/{output_prefix}_finalsv_scaffold.vcf.gz'

# ! bcftools view {hiphase_snp_indel_bcf} -Oz -o {hiphase_snp_indel_vcf_gz}
# ! bcftools index -t {hiphase_snp_indel_vcf_gz}
# ! bcftools view {hiphase_sv_bcf} -Oz -o {hiphase_sv_vcf_gz}
# ! bcftools index -t {hiphase_sv_vcf_gz}

Copying gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/HPRC_8X/T2T/Phasing/Hybridphasing/chr1_finalsv_scaffold.bcf...
Copying gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/HPRC_8X/T2T/Phasing/Hybridphasing/chr1_scaffold.bcf...
| [2 files][499.5 MiB/499.5 MiB]                                                
Operation completed over 2 objects/499.5 MiB.                                    


In [3]:
preprocess_dir = f'{working_dir}/preprocess'
! mkdir -p {preprocess_dir}

In [15]:
# only short variants and atomize

hiphase_short_atomize_vcf_gz = f'{preprocess_dir}/{output_prefix}.short-atomize.vcf.gz'

! bcftools norm {hiphase_snp_indel_vcf_gz} -a -Oz -o {hiphase_short_atomize_vcf_gz}

Lines   total/split/realigned/skipped:	5086893/0/0/0


In [28]:
# threshold by AF

af_threshold = 0.005
af_suffix = 'af_pt005'

hiphase_short_atomize_af_vcf_gz = f'{preprocess_dir}/{output_prefix}.short-atomize.{af_suffix}.vcf.gz'

! bcftools view -i 'AF>={af_threshold}' {hiphase_short_atomize_vcf_gz} -Oz -o {hiphase_short_atomize_af_vcf_gz}

In [29]:
# v3.1.0 of PanGenie run-from-callset scripts

pangenie_scripts_dir = f'{working_dir}/pangenie/pipelines/run-from-callset/scripts'
ref_fa = f'{working_dir}/ref/chm13v2.0.ebv.fa'
run_from_callset_dir = f'{working_dir}/run-from-callset'

! mkdir -p {run_from_callset_dir}

In [30]:
# validate_vcf

! bcftools norm --check-ref e --fasta-ref {ref_fa} {hiphase_short_atomize_af_vcf_gz} &> {run_from_callset_dir}/validate-vcf.log

In [31]:
# prepare_vcf

frac_missing = 0.2

! bcftools view {hiphase_short_atomize_af_vcf_gz} | \
    python3 {pangenie_scripts_dir}/prepare-vcf.py --missing {frac_missing} \
    2> {run_from_callset_dir}/prepare-vcf.log \
    1> {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.vcf

In [32]:
# add_ids

! cat {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.vcf | \
    python3 {pangenie_scripts_dir}/add-ids.py \
    2> {run_from_callset_dir}/add-ids.log \
    1> {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.vcf

In [33]:
# normalize

! bcftools norm -m- {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.vcf \
    --threads $(nproc) \
    2> {run_from_callset_dir}/normalize.log \
    1> {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.vcf

In [34]:
# merge_haplotypes

! python3 {pangenie_scripts_dir}/merge_vcfs.py merge \
    -vcf {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.vcf \
    -r {ref_fa} \
    -ploidy 2  \
    2> {run_from_callset_dir}/merge-haplotypes.log \
    1> {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.vcf

In [35]:
! bcftools view {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.vcf \
    -Oz -o {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.vcf.gz
! bcftools index -t {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.vcf.gz

[W::vcf_parse] Contig 'chr1' is not defined in the header. (Quick workaround: index the file with tabix.)


In [36]:
# convert_back_biallelic_representation (pop bubbles and convert back to biallelic)

! bcftools view {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.vcf.gz | \
    python3 {pangenie_scripts_dir}/convert-to-biallelic-no-gq.py {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.vcf \
    > {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.unmergehap.vcf

In [37]:
! bcftools view {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.unmergehap.vcf \
    -Oz -o {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.unmergehap.vcf.gz
! bcftools index -t {run_from_callset_dir}/{output_prefix}.short-atomize.{af_suffix}.prepare.id.split.mergehap.unmergehap.vcf.gz

[W::vcf_parse_format] FORMAT 'GQ' at chr1:174 is not defined in the header, assuming Type=String


In [4]:
# NOTE(review): disabled experiment, kept for reference only; nothing in this
# cell executes. It AF-filters the short-variant BCF and the SV BCF into
# {preprocess_dir} and indexes both outputs.
# Caution if re-enabled: it reassigns the notebook-wide `af_threshold` to 0.01
# without touching `af_suffix` ('af_pt005'), so file names built from the suffix
# would no longer describe the threshold in effect.

# # try removing SNPs/indels that overlap with SVs first

# # threshold by AF first

# af_threshold = 0.01

# ! bcftools view -i 'AF>={af_threshold}' {hiphase_snp_indel_bcf} \
#     -Oz -o {preprocess_dir}/{output_prefix}.snp_indel.af.vcf.gz
# ! bcftools index -t {preprocess_dir}/{output_prefix}.snp_indel.af.vcf.gz

# ! bcftools view -i 'AF>={af_threshold}' {hiphase_sv_bcf} \
#     -Oz -o {preprocess_dir}/{output_prefix}.sv.af.vcf.gz
# ! bcftools index -t {preprocess_dir}/{output_prefix}.sv.af.vcf.gz

In [8]:
# NOTE(review): disabled experiment, kept for reference only; nothing in this
# cell executes. It partitions the AF-filtered short variants by whether the
# AF-filtered SV callset has a record at the same position, then prints the
# `bcftools stats` summary-number (SN) lines for each partition.
#   -n~11   records present in both files        -> *.in_sv.vcf.gz
#   -n~10   records present only in the first    -> *.out_sv.vcf.gz
#   -w1     write the records of the first (short-variant) file
#   -c all  treat records at the same position as matching regardless of alleles
# NOTE(review): isec matches records by position, so this presumably does not
# catch short variants lying inside an SV's span but starting at a different
# POS — confirm before relying on it for "overlap" filtering.

# ! bcftools isec \
#     {preprocess_dir}/{output_prefix}.snp_indel.af.vcf.gz \
#     {preprocess_dir}/{output_prefix}.sv.af.vcf.gz \
#     -n~11 -w1 -c all \
#     -Oz -o {preprocess_dir}/{output_prefix}.snp_indel.af.in_sv.vcf.gz
# ! bcftools index -t {preprocess_dir}/{output_prefix}.snp_indel.af.in_sv.vcf.gz
# ! bcftools stats {preprocess_dir}/{output_prefix}.snp_indel.af.in_sv.vcf.gz | grep SN

# ! bcftools isec \
#     {preprocess_dir}/{output_prefix}.snp_indel.af.vcf.gz \
#     {preprocess_dir}/{output_prefix}.sv.af.vcf.gz \
#     -n~10 -w1 -c all \
#     -Oz -o {preprocess_dir}/{output_prefix}.snp_indel.af.out_sv.vcf.gz
# ! bcftools index -t {preprocess_dir}/{output_prefix}.snp_indel.af.out_sv.vcf.gz
# ! bcftools stats {preprocess_dir}/{output_prefix}.snp_indel.af.out_sv.vcf.gz | grep SN

# SN, Summary numbers:
#   number of SNPs      .. number of rows with a SNP
#   number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs
#   counter. For example, a row with a SNP and an indel increments both the SNP and
# SN	[2]id	[3]key	[4]value
SN	0	number of samples:	1074
SN	0	number of records:	6020
SN	0	number of no-ALTs:	0
SN	0	number of SNPs:	1307
SN	0	number of MNPs:	0
SN	0	number of indels:	4712
SN	0	number of others:	1
SN	0	number of multiallelic sites:	0
SN	0	number of multiallelic SNP sites:	0
# SiS	[2]id	[3]allele count	[4]number of SNPs	[5]number of transitions	[6]number of transversions	[7]number of indels	[8]repeat-consistent	[9]repeat-inconsistent	[10]not applicable
# AF	[2]id	[3]allele frequency	[4]number of SNPs	[5]number of transitions	[6]number of transversions	[7]number of indels	[8]repeat-consistent	[9]repeat-inconsistent	[10]not applicable
# QUAL	[2]id	[3]Quality	[4]number of SNPs	[5]number of transitions (1st ALT)	[6]number 