# HapCUT2 phasing validation
  
**Date:** 2023-Sep-26  
**last update:** 2023-Oct-03

In [1]:
module load htslib/1.18 bcftools/1.18 samtools/1.18
unset $PYTHONPATH
export PYTHONPATH=$PYTHONPATH:~/.local/lib/python3.9/site-packages
export PATH=$PATH:~/_softwares/HapCUT2/build
export PATH=$PATH:~/_softwares/HapCUT2/utilities

module list


Currently Loaded Modules:
  1) git/2.35.1         6) rstudio/2023.06.1-524      11) GSL/2.7
  2) java/17            7) nodejs/18.16.1        (T)  12) perl/5.36.0
  3) texlive/20211108   8) julia/1.9.0                13) bcftools/1.18
  4) hdf5/1.13.1        9) proj/8.1.1                 14) htslib/1.18
  5) R/4.3.0           10) JupyterPython/2.0     (T)  15) samtools/1.18

  Where:
   T:  Testing

 



[HapCUT2](https://github.com/vibansal/HapCUT2/tree/master#readme) is for diploid organisms only and can assemble haplotypes for one individual at a time. VCF input should contain variants and genotypes for a single diploid individual.

## Molphase Test: 1 sample – 2x-N703-94_Am_Pla_pb0295_v3.5

### Initiate variables

In [2]:
baseDIR=/nfs/scistore18/bartogrp/apal/snap_hap/variants/molphase/validation
chrom=Chr6
sampleList=~/snap_hap/variants/molphase/validation/phaseVal_samples.list
bamList=~/snap_hap/sample_info/bam_info/bams_Am_all.txt
vcf=~/snap_hap/variants/stitch/Chr6/Am_all_stitch_Chr6_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz
threshold=30
SLURM_ARRAY_TASK_ID=1

# sample=2x-N703-94_Am_Pla_pb0295_v3.5
# bam=~/snap_hap/bams/v3.5/bams_merged_2023/2x-N703-94_Am_Pla_pb0295_v3.5.sorted.BXnum.merged.bam

sample=$(tail +2 $sampleList | cut -f1 | sed -n "${SLURM_ARRAY_TASK_ID}p")
bam=$(cat $bamList | grep $sample)
coverage=$(tail +2 $sampleList | cut -f2 | sed -n "${SLURM_ARRAY_TASK_ID}p")
if [ ! -d $baseDIR/cov-${coverage}_${chrom}_${sample} ]; then mkdir -p $baseDIR/cov-${coverage}_${chrom}_${sample}; fi
sampleDIR=$baseDIR/cov-${coverage}_${chrom}_${sample}

sampleHetVCF=cov_${coverage}_${chrom}_${sample}
sampleHomVCF=cov_${coverage}_${chrom}_${sample}

unLinkedFragments=cov-${coverage}_${chrom}_${sample}
LinkedFragments=cov-${coverage}_${chrom}_${sample}
outPhased=cov_${coverage}-${chrom}_${sample}_thres$threshold.hapcut2.output

echo sampleDIR: $sampleDIR
echo chrom: $chrom
echo sample: $sample
echo coverage: $coverage
echo BAM: $bam
echo VCF: $vcf
echo sample HET VCF: $sampleHetVCF
echo hapcut2 output: $outPhased

sampleDIR: /nfs/scistore18/bartogrp/apal/snap_hap/variants/molphase/validation/cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5
chrom: Chr6
sample: 2x-N703-94_Am_Pla_pb0295_v3.5
coverage: Low5x
BAM: /nfs/scistore18/bartogrp/apal/snap_hap/bams/v3.5/bams_merged_2023/2x-N703-94_Am_Pla_pb0295_v3.5.sorted.BXnum.merged.bam
VCF: /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr6/Am_all_stitch_Chr6_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz
sample HET VCF: cov_Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5
hapcut2 output: cov_Low5x-Chr6_2x-N703-94_Am_Pla_pb0295_v3.5_thres30.hapcut2.output


### Preprocessing 

In [33]:
# Extract sample VCF in truncated region – Chr6:1-10000000
cd $sampleDIR
#time bcftools view -s $sample -r Chr6:1-10000000 $vcf | bcftools view -i "AC=1" -Ov -o ${sampleHetVCF}_method1.het.vcf
#time bcftools view -s $sample -r Chr6:1-10000000 $vcf | awk '/^#/;/Chr/ {OFS="\t"}; !/^#/ && $10~/^0\/1/' > ${sampleHetVCF}_method2.het.vcf} 


real	3m59.703s
user	3m54.586s
sys	0m6.633s

real	3m47.957s
user	3m48.545s
sys	0m1.471s


In [3]:
cd $sampleDIR
time bcftools view -s $sample -r Chr6:1-100000 $vcf | awk '/^#/;/Chr/ {OFS="\t"}; !/^#/ && $10~/^0\/1/' > ${sampleHetVCF}.het.vcf


real	0m0.631s
user	0m0.607s
sys	0m0.044s


In [4]:
bcftools view -H ./cov_Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.het.vcf | wc -l
bcftools view -H ./cov_Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.het.vcf | cut -f1-2,4-5,8-10 | head -5

205
Chr6	11703	G	C	EAF=0.39067;INFO_SCORE=0.37028;HWE=1.2e-21;ERC=1931.52;EAC=2933.95;PAF=0.39699;AC=1;AN=2	GT:PG:GP:DS:PL	0/1:./.:0.38,0.478,0.142:0.762:1,0,5
Chr6	12087	T	G	EAF=0.5619;INFO_SCORE=0.31373;HWE=1.52e-37;ERC=267.897;EAC=220.888;PAF=0.54809;AC=1;AN=2	GT:PG:GP:DS:PL	0/1:./.:0.216,0.574,0.21:0.995:4,0,4
Chr6	12088	A	G	EAF=0.57147;INFO_SCORE=0.32385;HWE=5.13e-34;ERC=271.863;EAC=213.916;PAF=0.55964;AC=1;AN=2	GT:PG:GP:DS:PL	0/1:./.:0.207,0.577,0.216:1.009:4,0,4
Chr6	12089	A	G	EAF=0.56638;INFO_SCORE=0.31418;HWE=1.98e-37;ERC=268.895;EAC=214.895;PAF=0.55581;AC=1;AN=2	GT:PG:GP:DS:PL	0/1:./.:0.213,0.573,0.214:1:4,0,4
Chr6	12090	C	A	EAF=0.56422;INFO_SCORE=0.31399;HWE=1.4e-36;ERC=261.853;EAC=211.918;PAF=0.5527;AC=1;AN=2	GT:PG:GP:DS:PL	0/1:./.:0.215,0.573,0.212:0.998:4,0,4
cut: write error: Broken pipe


### HapCUT2

#### Extract unlinked fragments

In [5]:
time extractHAIRS --10X 1 --bam $bam --VCF ${sampleHetVCF}.het.vcf --region $chrom --out ${unLinkedFragments}.unlinked.fragments
echo -e '\n'
echo -e "No. of unlinked fragments:" $(wc -l ${unLinkedFragments}.unlinked.fragments)


Extracting haplotype informative reads from bamfiles /nfs/scistore18/bartogrp/apal/snap_hap/bams/v3.5/bams_merged_2023/2x-N703-94_Am_Pla_pb0295_v3.5.sorted.BXnum.merged.bam minQV 13 minMQ 20 maxIS 1000 

VCF file cov_Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.het.vcf has 205 variants 
adding chrom Chr6 to index 
vcffile cov_Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.het.vcf chromosomes 1 hetvariants 205 variants 205 
detected 0 variants with two non-reference alleles, these variants will not be phased
reading sorted bamfile /nfs/scistore18/bartogrp/apal/snap_hap/bams/v3.5/bams_merged_2023/2x-N703-94_Am_Pla_pb0295_v3.5.sorted.BXnum.merged.bam 
processing reads mapped to chrom "Chr6" 
final cleanup of fragment list: 35 current chrom 0 prev 0 

real	0m0.563s
user	0m0.541s
sys	0m0.018s


No. of unlinked fragments: 28 cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.unlinked.fragments


#### Extract unlinked fragments

In [7]:
cat ${unLinkedFragments}.unlinked.fragments | grep -ax '.*' | awk '!/[ABCD]00/' > ${unLinkedFragments}.no00.unlinked.fragments
echo -e "No. of unlinked no00 fragments" $(wc -l ${unLinkedFragments}.no00.unlinked.fragments)

No. of unlinked no00 fragments 28 cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.no00.unlinked.fragments


#### Convert unlinked to linked fragments

In [8]:
time python3 ~/_softwares/HapCUT2/utilities/LinkFragments.py  --bam $bam --VCF ${sampleHetVCF}.het.vcf --fragments ${unLinkedFragments}.no00.unlinked.fragments --out ${LinkedFragments}.linked.fragments -d 50000
echo -e "\nNo. of linked fragments" $(wc -l ${LinkedFragments}.linked.fragments)

Linking 10X fragments on chromosome: Chr6
  reading bedfile...
  generating new fragments for HAIRs in boundaries...

real	0m2.245s
user	0m2.151s
sys	0m0.053s

No. of linked fragments 16 cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.linked.fragments


#### Phase with HapCUT2

In [9]:
time HAPCUT2 --fragments ${LinkedFragments}.linked.fragments --VCF ${sampleHetVCF}.het.vcf --out $outPhased --nf 1 --threshold $threshold --error_analysis_mode 1 --call_homozygous 1 --outvcf 1  --v 0



[2023:10:03 15:56:07] input fragment file: cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.linked.fragments
[2023:10:03 15:56:07] input variantfile (VCF format):cov_Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.het.vcf
[2023:10:03 15:56:07] haplotypes will be output to file: cov_Low5x-Chr6_2x-N703-94_Am_Pla_pb0295_v3.5_thres30.hapcut2.output
[2023:10:03 15:56:07] solution convergence cutoff: 5
[2023:10:03 15:56:07] read 205 variants from cov_Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.het.vcf file 
[2023:10:03 15:56:07] read fragment file and variant file: fragments 16 variants 205
mean number of variants per read is 5.50 
[2023:10:03 15:56:07] building read-variant graph for phasing
Number of non-trivial connected components 10 max-Degree 6 connected variants 69 coverage-per-variant 1.275362 
[2023:10:03 15:56:07] fragments 16 snps 205 component(blocks) 10
[2023:10:03 15:56:07] starting Max-Likelihood-Cut based haplotype assembly algorithm
[2023:10:03 15:56:07] starting to post-process phased 

## Run HapCUT2 validation on cluster

In [None]:
sbatch --array=1-33 -J phaseVal ~/snap_hap/_scripts/sbatch/phase/job-molphase_validation.sbatch $baseDIR $chrom $sampleList $bamList $vcf $threshold

```
#SBATCH --partition=defaultp
#SBATCH --time=120:00:00
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=32G
```

#### Rerun with lower threshold
Also rerun with `threshold = 7` (default)

In [None]:
sbatch --array=1-33 -J rerunPhaseVal-thresDEF ~/snap_hap/_scripts/sbatch/phase/job-molphase_validation_rerun.sbatch $baseDIR $chrom $sampleList $bamList $vcf $threshold

#### Rerun after removing non-BX tags

In [None]:
sbatch --array=1-30 -J phaseVal-nonBX ~/snap_hap/_scripts/sbatch/phase/job-molphase_validation_rerun.sbatch $baseDIR $chrom $sampleList $bamList $vcf $threshold

## Postprocess output

In [5]:
cd ~/snap_hap/variants/molphase/validation/cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5

In [6]:
head cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.unlinked.fragments

1 A01641:90:H7F3LDSX3:4:2662:4607:28009 2 A59C09B46D96-1 -1 51 0000 FFFF
1 A01641:90:H7F3LDSX3:4:1516:27471:8390 2 A67C75B39D96-1 -1 59 1111 >>>>
1 A01641:90:H7F3LDSX3:4:2519:17788:36746 2 A22C23B34D96-1 -1 66 11 ;;
1 A01641:90:H7F3LDSX3:4:1360:15510:27727 2 A06C48B74D96-1 -1 159 11 >>
1 A01641:90:H7F3LDSX3:4:2256:9001:26788 2 A88C38B83D96-1 -1 191 11 >>
1 A01641:90:H7F3LDSX3:4:2124:25880:34914 2 A80C83B02D96-1 -1 198 1 F
1 A01641:90:H7F3LDSX3:4:2352:21169:31266 2 A46C69B02D96-1 -1 214 1 >
1 A01641:90:H7F3LDSX3:4:1537:8015:31563 2 A70C94B32D96-1 -1 231 1111 ;;;;
1 A01641:90:H7F3LDSX3:4:1231:3730:21386 2 A17C69B37D96-1 -1 251 11111111 >>>>>>>>
1 A01641:90:H7F3LDSX3:4:2476:15564:33959 2 A06C38B30D96-1 -1 260 0000000 FFFF:FF


In [8]:
head cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5.no00.linked.fragments

1 Chr6:19486-19707:A23C70B03D96-1 0 -1 -1 16 0000 FFFF
2 Chr6:19591-20075:A42C48B24D96-1 0 -1 -1 17 111 21 11 DDDFF
1 Chr6:34450-34601:A59C09B46D96-1 0 -1 -1 51 0000 FFFF
1 Chr6:34756-34907:A67C75B39D96-1 0 -1 -1 59 1111 >>>>
1 Chr6:35380-35499:A22C23B34D96-1 0 -1 -1 66 11 ;;
2 A01174:273:H75HTDSX3:3:2575:4182:33974 0 -1 -1 75 0000 80 0 FFFFF
1 A01174:273:H75HTDSX3:3:2501:7419:22545 0 -1 -1 77 000 F:F
1 Chr6:35560-36128:A83C69B84D96-1 0 -1 -1 77 000 ;;;
2 Chr6:36594-37991:A38C80B14D96-1 0 -1 -1 99 00000000000000000000 148 000 FFFFFFFFF:FFFF>>>F:FFFF
1 A01174:273:H75HTDSX3:3:1155:32669:16736 0 -1 -1 111 000 FFF


In [11]:
head -50 cov-Low5x_Chr6_2x-N703-94_Am_Pla_pb0295_v3.5_thres30.BX.hapcut2.output

BLOCK: offset: 16 len: 7 phased: 6 SPAN: 501 fragments 2
16	0	1	Chr6	19573	C	G	0/1:./.:0.21,0.765,0.025:0.814:6,0,15	0	100.00	37.00	1
17	0	1	Chr6	19635	T	A	0/1:./.:0.121,0.848,0.031:0.909:8,0,14	0	37.00	71.64	2
18	0	1	Chr6	19677	C	G	0/1:./.:0.018,0.808,0.174:1.156:17,0,7	0	100.00	71.64	2
19	0	1	Chr6	19701	T	G	0/1:./.:0.196,0.779,0.025:0.83:6,0,15	0	100.00	71.64	2
21	0	1	Chr6	19931	T	A	0/1:./.:0.05,0.857,0.093:1.043:12,0,10	0	73.99	37.00	1
22	0	1	Chr6	20074	A	T	0/1:./.:0.045,0.814,0.142:1.097:13,0,8	0	37.00	37.00	1
******** 
BLOCK: offset: 51 len: 4 phased: 4 SPAN: 133 fragments 1
51	0	1	Chr6	34450	C	T	0/1:./.:0.342,0.652,0.006:0.664:3,0,20	0	100.00	37.00	1
52	0	1	Chr6	34518	A	G	0/1:./.:0.109,0.816,0.075:0.966:9,0,10	0	37.00	37.00	1
53	0	1	Chr6	34559	G	A	0/1:./.:0.09,0.848,0.062:0.972:10,0,11	0	70.99	37.00	1
54	0	1	Chr6	34583	G	A	0/1:./.:0.27,0.714,0.016:0.745:4,0,16	0	37.00	37.00	1
******** 
BLOCK: offset: 59 len: 4 phased: 4 SPAN: 49 fragments 1
59	0	1	Chr6	34800	G	A	0/1:./.:0.26,0.72

In [2]:
echo -e "Sample\tphaseVal_Cov_bin\tTotalHetSites\tunLinkedFragments\tno00unLinkedFragments\tno00LinkedFragments\tBXunLinkedFragments\tBXLinkedFragments\tunPhasedSites_thres30_no00\tPhasedSites_thres30_no00\tunPhasedSites_thres30_BX\tPhasedSites_thres30_BX" > phaseVal.txt

for sampleFolder in /nfs/scistore18/bartogrp/apal/snap_hap/variants/molphase/validation/cov-*;
do 
    coverage=`basename $sampleFolder | cut -f1 -d_ | cut -f2 -d-`
    unLinkedFragments=`cat $sampleFolder/*[^no00][^BX].unlinked.fragments | wc -l`
    no00unLinkedFragments=`cat $sampleFolder/*.no00.unlinked.fragments | wc -l`
    BXunLinkedFragments=`cat $sampleFolder/*.BX.unlinked.fragments | wc -l`
    
    no00LinkedFragments=`cat $sampleFolder/*.no00.linked.fragments | wc -l`
    BXLinkedFragments=`cat $sampleFolder/*.BX.linked.fragments | wc -l`
    
    TotalHetSites=$(bcftools view -H $sampleFolder/*.het.vcf | wc -l)
    PhasedSites_thres30_no00=$(bcftools view -H $sampleFolder/*_thres30.no00.hapcut2.output.phased.VCF | grep -vc "0/1")
    unPhasedSites_thres30_no00=$(bcftools view -H $sampleFolder/*_thres30.no00.hapcut2.output.phased.VCF | grep -c "0/1")    

    PhasedSites_thres30_BX=$(bcftools view -H $sampleFolder/*_thres30.BX.hapcut2.output.phased.VCF | grep -vc "0/1")
    unPhasedSites_thres30_BX=$(bcftools view -H $sampleFolder/*_thres30.BX.hapcut2.output.phased.VCF | grep -c "0/1")    

#    PhasedSites_thres7=$(bcftools view -H $sampleFolder/*_thres6.98.hapcut2.output.phased.VCF | grep -vc "0/1")
#    unPhasedSites_thres7=$(bcftools view -H $sampleFolder/*_thres6.98.hapcut2.output.phased.VCF | grep -c "0/1")

    echo -e `basename $sampleFolder`"\t"$coverage"\t"$TotalHetSites"\t"$unLinkedFragments"\t"$no00unLinkedFragments"\t"$no00LinkedFragments"\t"$BXunLinkedFragments"\t"$BXLinkedFragments"\t"${unPhasedSites_thres30_no00}"\t"${PhasedSites_thres30_no00}"\t"${unPhasedSites_thres30_BX}"\t"${PhasedSites_thres30_BX} >> phaseVal.txt
    echo `basename $sampleFolder` - Done
done

cov-15to25x_Chr6_10x-10_Am_Pla_pb3403_v3.5 - Done
cov-15to25x_Chr6_10x-12_Am_Pla_pb4563_v3.5 - Done
cov-15to25x_Chr6_10x-1_Am_Pla_pb0174_v3.5 - Done
cov-15to25x_Chr6_10x2-13_Am_Pla_pb1196_v3.5 - Done
cov-15to25x_Chr6_10x2-15_Am_Pla_pb1253_v3.5 - Done
cov-15to25x_Chr6_10x2-16_Am_Pla_pb1682_v3.5 - Done
cov-15to25x_Chr6_10x2-17_Am_Pla_pb2190_v3.5 - Done
cov-15to25x_Chr6_10x2-18_Am_Pla_pb1687_v3.5 - Done
cov-15to25x_Chr6_10x-8_Am_Pla_pb0768_v3.5 - Done
cov-15to25x_Chr6_10x-9_Am_Pla_pb2333_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate10-18_Am_Pla_pb1172_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate10-57_Am_Pla_pb2320_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate10-66_Am_Pla_pb3368_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate10-80_Am_Pla_pb3432_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate10-81_Am_Pla_pb3434_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate9-37_Am_Pla_pb1606_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate9-41_Am_Pla_pb1843_v3.5 - Done
cov-5to15x_Chr6_10xNEW-Plate9-46_Am_Pla_pb1850_v3.5 - Done
cov-5to15x_Chr6_10xNE

In [18]:
less ~/snap_hap/variants/molphase/validation/phaseVal.txt | cut -f2,18-20 

Coverage	%HET	% unphased no00	% phased no00
0.5448	0.097	0.8238	0.1762
0.7354	0.1	0.7788	0.2212
0.7801	0.098	0.7825	0.2175
0.7954	0.104	0.7188	0.2812
0.7967	0.104	0.7531	0.2469
0.853	0.096	0.7482	0.2518
0.8822	0.105	0.696	0.304
0.8901	0.097	0.7408	0.2592
0.9758	0.101	0.6855	0.3145
0.9931	0.095	0.7424	0.2576
5.1464	0.115	0.1828	0.8172
5.2478	0.109	0.1931	0.8069
5.3559	0.117	0.1626	0.8374
5.704	0.115	0.1782	0.8218
5.7733	0.121	0.1548	0.8452
5.9796	0.117	0.1384	0.8616
5.9956	0.117	0.1925	0.8075
6.0562	0.12	0.1626	0.8374
6.3294	0.119	0.1517	0.8483
6.3483	0.121	0.1422	0.8578
16.5056	0.117	0.0586	0.9414
16.547	0.116	0.0673	0.9327
16.8621	0.116	0.0521	0.9479
17.5059	0.12	0.0406	0.9594
19.4842	0.123	0.0381	0.9619
19.7656	0.12	0.0417	0.9583
19.8641	0.118	0.0314	0.9686
20.2573	0.124	0.032	0.968
21.1414	0.127	0.0277	0.9723
21.7329	0.124	0.0357	0.9643
107.1166	0.129	0.0162	0.9838
109.5559	0.129	0.0134	0.9866
111.3961	0.134	0.0154	0.9846
