# Postprocessing STITCH output

- Step 1 - Extract all genotypes from high-confidence STITCH output VCF based on posterior genotype prob.  
- Step 2 - Merge all chrom_segment VCFs into one VCF per chromosome  
- Step 3 - Extract VCF stats

In [1]:
# Load the command-line tools used in this notebook (bcftools and vcftools at pinned versions, plus GNU parallel).
module load bcftools/1.18 vcftools/0.1.16 parallel
# Print the loaded modules so the environment is recorded next to the results.
module list


Currently Loaded Modules:
  1) git/2.35.1         7) R/4.3.1-b                  13) GSL/2.7
  2) java/17            8) rstudio/2023.06.1-524      14) htslib/1.18
  3) texlive/20211108   9) nodejs/18.16.1        (T)  15) perl/5.36.0
  4) hdf5/1.14.2       10) julia/1.10.1               16) bcftools/1.18
  5) gdal/3.4.2        11) proj/8.1.1                 17) vcftools/0.1.16
  6) glpk/5.0          12) JupyterPython/2.0     (T)  18) parallel/20220222

  Where:
   T:  Testing

 



## Steps for postprocessing STITCH VCF output

*NB: Change the baseDIR for stitch Run 2.*

### Initiate variables

In [2]:
## Input: chromosome to process and the STITCH output directory derived from it
chrom=Chr6
baseDIR=~/snap_hap/variants/stitch/${chrom}

## Report the settings, padded by two blank lines above and below
printf '\n\n'
printf 'chrom: %s\n' "$chrom"
printf 'baseDIR: %s\n' "$baseDIR"
printf '\n\n'



chrom: Chr6
baseDIR: /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr6
outVCF: /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr6/Am_all_stitch_Chr6_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz




### Extract all genotypes from high-confidence stitch output

In [None]:
echo -e '\nExtract all genotypes from STITCH'

## List every high-confidence chromosome-segment VCF, one absolute path per line.
## The `[^PL]` class rejects names whose last character before `.vcf.gz` is P or L,
## which keeps the `*.PL.vcf.gz` siblings out of the list.
stitchVcfList=${baseDIR}/stitchVcfs.highConf.list
realpath "$baseDIR"/stitch_chromSegments/*/stitch."${chrom}".*.*[^PL].vcf.gz > "$stitchVcfList"

## Execute in parallel mode
#time parallel -j10 "bash ~/snap_hap/_scripts/bash/stitch/add_PG_PL.sh {}" ::: ${baseDIR}/stitch_chromSegments/*/stitch.${chrom}.*.*[^PL].vcf.gz

## Execute one-by-one (parallel array job in cluster)
## SLURM_ARRAY_TASK_ID is set by hand here to mimic one array task: task N takes line N of the list.
SLURM_ARRAY_TASK_ID=1
inVCF=$(sed -n "${SLURM_ARRAY_TASK_ID}p" "$stitchVcfList")
echo $inVCF
if [[ -n $inVCF ]]; then
    time bash ~/snap_hap/_scripts/bash/stitch/add_PG_PL.sh "$inVCF"
else
    # Nothing on that line (empty list or index past the end): do not call the script without an input.
    echo "No VCF found at line ${SLURM_ARRAY_TASK_ID} of ${stitchVcfList}" >&2
fi

### Merge vcfs

In [None]:
## Make stitch-highConf-VCF list
echo -e '\nMerge highConf stitch VCFs'

## Paths used by this cell: the list of segment VCFs and the merged per-chromosome VCF
stitchVcfList_highConf=${baseDIR}/stitchVcfs.highConf.list
outVCF_highConf=$baseDIR/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz

## `[^PL]` keeps the *.PL.vcf.gz files out of the high-confidence list
realpath "$baseDIR"/stitch_chromSegments/*/stitch."${chrom}".*.*[^PL].vcf.gz > "$stitchVcfList_highConf"

## Concatenate the listed segments, then write bcftools stats next to the merged VCF (<name>.stats)
time bcftools concat -a -Oz -o "$outVCF_highConf" -f "$stitchVcfList_highConf"
time bcftools stats --threads 10 -s- --af-bins <(seq 0 0.1 1) --depth 0,25000,25 "$outVCF_highConf" > "${outVCF_highConf/.vcf.gz/.stats}"

In [None]:
## Make stitch-ALL-VCF list
echo -e '\nMerge ALL stitch VCFs'

## Paths used by this cell: the list of *.PL segment VCFs and the merged per-chromosome VCF
stitchVcfList_ALL=${baseDIR}/stitchVcfs.ALL.list
outVCF_ALL=$baseDIR/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz

realpath "$baseDIR"/stitch_chromSegments/*/stitch."${chrom}".*.*.PL.vcf.gz > "$stitchVcfList_ALL"

## Concatenate the listed segments, then write bcftools stats next to the merged VCF (<name>.stats)
time bcftools concat -a -Oz -o "$outVCF_ALL" -f "$stitchVcfList_ALL"
time bcftools stats --threads 10 -s- --af-bins <(seq 0 0.1 1) --depth 0,25000,25 "$outVCF_ALL" > "${outVCF_ALL/.vcf.gz/.stats}"

### Add AC,AN annotations 

In [None]:
# Placeholder: assign the VCF to annotate before running this cell,
# e.g. inputVCF=/path/to/merged.vcf.gz (a bash assignment takes no spaces around '=').
# The helper script below receives that path as its only argument.
bash ~/snap_hap/_scripts/bash/fill-AC-AN_vcfgzFormat.sh $inputVCF

### Create VCF statistics

In [None]:
## Calculate missing genotypes per site and individual, and heterozygosity per individual.
## `outVCF` is not assigned in any preceding cell; when it is unset, fall back to the
## merged high-confidence VCF (`outVCF_highConf`) from the merge step.
outVCF=${outVCF:-${outVCF_highConf:-}}
if [[ -z $outVCF ]]; then
    # Without an input path vcftools would only fail three times in a row; say why instead.
    echo "outVCF is not set: assign the merged VCF path before computing statistics" >&2
else
    echo -e '\nCalculating fraction of missing data'
    time vcftools --gzvcf "$outVCF" --missing-indv --out "${chrom}_missing-indv"
    time vcftools --gzvcf "$outVCF" --missing-site --out "${chrom}_missing-site"

    ## Calculate heterozygosity per individual
    echo -e '\nCalculating heterozygosity'
    time vcftools --gzvcf "$outVCF" --het --out "${chrom}_het-indv"
fi

## Cluster implementation: postprocessing STITCH output

### Make list of high-conf STITCH vcfs

In [8]:
## For every chromosome: report how many high-confidence segment VCFs exist, then
## save their absolute paths to <baseDIR>/stitchVcfs.highConf.list and print that path.
for chrom in Chr{1..8}; do
    baseDIR=~/snap_hap/variants/stitch/${chrom}
    # The count is left unquoted on purpose so any padding emitted by `wc` is dropped.
    printf '%s\t%s\t%s files\n' "$chrom" "$baseDIR" $(realpath "$baseDIR"/stitch_chromSegments/*/stitch."${chrom}".*.*[^PL].vcf.gz | wc -l)
    realpath "$baseDIR"/stitch_chromSegments/*/stitch."${chrom}".*.*[^PL].vcf.gz > "${baseDIR}/stitchVcfs.highConf.list"
    printf '%s\n\n' "${baseDIR}/stitchVcfs.highConf.list"
done

Chr1	/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr1	72 files
/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr1/stitchVcfs.highConf.list

Chr2	/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr2	78 files
/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr2/stitchVcfs.highConf.list

Chr3	/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr3	66 files
/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr3/stitchVcfs.highConf.list

Chr4	/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr4	55 files
/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr4/stitchVcfs.highConf.list

Chr5	/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr5	71 files
/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr5/stitchVcfs.highConf.list

Chr6	/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr6	56 files
/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr6/stitchVcfs.highConf.list

Chr7	/nfs/scistore18/bartogrp/apal/snap_

### Extract all genotypes from high-confidence stitch output

In [None]:
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/postSTITCH/extractGT

## Submit one SLURM array job per chromosome, with one array task per line of the
## chromosome's high-confidence VCF list.
## The array size is read from the list instead of being hard-coded
## (Run 1 used 72, 78, 66, 55, 71, 56, 56 and 58 for Chr1..Chr8).
## NOTE(review): assumes the sbatch script maps task N to line N of the list — confirm.
for chrom in Chr{1..8}; do
    baseDIR=~/snap_hap/variants/stitch/${chrom}
    vcfList=${baseDIR}/stitchVcfs.highConf.list

    # A missing or empty list would produce an invalid array range; skip the chromosome.
    if [[ ! -s $vcfList ]]; then
        echo "Skipping ${chrom}: ${vcfList} is missing or empty" >&2
        continue
    fi

    # Arithmetic expansion strips any whitespace padding that `wc -l` may emit.
    nSegments=$(( $(wc -l < "$vcfList") ))
    sbatch --array=1-"${nSegments}" -J "postSTITCH-extractGT_${chrom}" ~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-extractGT.sbatch "$chrom" "$vcfList"
done

```
#SBATCH --partition=defaultp
#SBATCH --time=00:30:00
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4G
```

Inside the SLURM script, timing the job with `time srun ...` reports a slightly longer run time than `srun time ...`. Each task generally takes around 10-15 minutes.

### Concat VCFs

In [None]:
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/postSTITCH/concatVcfs

## Per chromosome: rebuild both segment-VCF lists and submit one concat job for each of them.
for chrom in Chr{1..8}; do
    baseDIR=~/snap_hap/variants/stitch/${chrom}
    printf '%s\t%s\n' "$chrom" "$baseDIR"

    #highConf# segment VCFs (everything except the *.PL.vcf.gz files)
    stitchVcfList_highConf=${baseDIR}/stitchVcfs.highConf.list
    outVCF_highConf=$baseDIR/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
    realpath "$baseDIR"/stitch_chromSegments/*/stitch."${chrom}".*.*[^PL].vcf.gz > "$stitchVcfList_highConf"
    sbatch -J "postSTITCH-concatVcfs-highConf_${chrom}" ~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-concatVcfs.sbatch.sh "${chrom}" "${stitchVcfList_highConf}" "${outVCF_highConf}"

    #ALL# segment VCFs (the *.PL.vcf.gz files)
    stitchVcfList_ALL=${baseDIR}/stitchVcfs.ALL.list
    outVCF_ALL=$baseDIR/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz
    realpath "$baseDIR"/stitch_chromSegments/*/stitch."${chrom}".*.*.PL.vcf.gz > "$stitchVcfList_ALL"
    sbatch -J "postSTITCH-concatVcfs-ALL_${chrom}" ~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-concatVcfs.sbatch.sh "${chrom}" "${stitchVcfList_ALL}" "${outVCF_ALL}"
done

```
#SBATCH --partition=defaultp
#SBATCH --time=3:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=5
#SBATCH --mem-per-cpu=4G
```

In [None]:
User    Account      State        JobID    JobName  AllocCPUS AllocNodes    CPUTime     ReqMem     MaxRSS    Elapsed  AllocTRES 
--------- ---------- ---------- ------------ ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
     apal   bartogrp     FAILED 9568345      postSTITC+          1          1   00:50:34       16Gc              00:50:34 billing=1+ 
            bartogrp  COMPLETED 9568345.0          time          1          1   00:50:25       16Gc     30228K   00:50:25 cpu=1,mem+

### Add AC, AN annotations

**NB:** Don't run AC,AN fill on cluster due to incompatibilities of bcftools libraries.

In [None]:
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/postSTITCH/annotateVcfs

## Per chromosome: submit one AC/AN tagging job for the merged high-confidence VCF
## and one for the merged ALL (.PL) VCF.
for chrom in Chr{1..8}
do
    baseDIR=~/snap_hap/variants/stitch/${chrom}
    echo -e $chrom'\t'$baseDIR

    #highConf#
    inputVCF_highConf=$baseDIR/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
    sbatch -J "${chrom}_highConf_tagVCFs" ~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-fill_AC_AN.sbatch "${inputVCF_highConf}"

    #ALL#
    # The job name says ALL so the two jobs of a chromosome can be told apart in the queue
    # (it previously reused the highConf name).
    inputVCF_ALL=$baseDIR/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz
    sbatch -J "${chrom}_ALL_tagVCFs" ~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-fill_AC_AN.sbatch "${inputVCF_ALL}"
done

## stitch VCF statistics

In [1]:
basedir="/nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch"

echo -e 'No. of stitched records'
## Print "<chrom><TAB><count>": the count is field 4 of the "number of records:" line
## in each chromosome's *PL.stats file.
for chrom in Chr{1..8}
do
    records=$(grep "number of records:" $basedir/$chrom/*PL.stats | cut -f4)
    echo -e $chrom"\t"$records
done

No. of stitched records
Chr1	3781002
Chr2	4287044
Chr3	3400629
Chr4	3059258
Chr5	4295411
Chr6	3361343
Chr7	3079991
Chr8	3692065


## Extract imputed genotypes for validation against 30 KASP markers

In [15]:
# Work inside the validation directory: the cells below write and remove files by relative name.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/validation

In [16]:
## Read KASP coords positions
## Show columns 1, 4 and 8 of the coordinates table (snpID, CHROM and POS in the output below).
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
cut -f1,4,8 "$KASPcoords"

snpID	CHROM	POS
1	Chr1	955185
2	Chr1	45535107
3	Chr1	47381180
4	Chr1	51252783
5	Chr1	60537705
6	Chr1	71126303
1	Chr2	55473945
2	Chr2	58751341
1	Chr3	1055959
2	Chr3	3488331
3	Chr3	44197469
1	Chr4	7115521
2	Chr4	38397179
3	Chr4	44460308
4	Chr4	50972302
1	Chr6	15125731
2	Chr6	32503688
3	Chr6	52921613
4	Chr6	52922047
5	Chr6	52966811
6	Chr6	52999112
7	Chr6	53016551
8	Chr6	53061080
9	Chr6	53083229
10	Chr6	53090212
11	Chr6	53094790
12	Chr6	53104387
13	Chr6	53195127
14	Chr6	53944185
1	Chr7	49828363
1	Chr8	27807304
2	Chr8	31864104
3	Chr8	38142268
4	Chr8	44933278
5	Chr8	53715058


In [17]:
## Chr1
## For each KASP marker on this chromosome: look up its position, cut that single site
## out of the merged high-confidence and ALL (.PL) VCFs, extract the genotypes, and
## delete the temporary single-site files.
chrom=Chr1
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
vcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
for snpID in {1..6}
do
    markerID=${chrom/Chr}_${snpID}   # chromosome number + marker index, e.g. 1_3
    pos=$(grep -w "$markerID" "$KASPcoords" | cut -f8)
    echo -e "\n"$chrom"\t"$snpID"\t"$markerID"\t"$pos"\n"
    # Continue only with exactly one numeric position. An empty $pos would leave a bare
    # "${chrom}:" region and widen the cleanup glob below to "${snpID}_${chrom}_*".
    if [[ ! $pos =~ ^[0-9]+$ ]]; then
        echo "Skipping ${markerID}: no unique position found in ${KASPcoords}" >&2
        continue
    fi
    # Subset VCF with bcftools
    echo "Subsetting VCF with bcftools"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.vcf.gz" "$vcf"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.PL.vcf.gz" "${vcf/.vcf.gz/.PL.vcf.gz}"
    # Extract genotypes
    # NOTE(review): only the high-confidence subset is passed; the script presumably finds the .PL subset by name — confirm.
    echo "Extracting stitch genotypes"
    time bash ~/snap_hap/_scripts/bash/stitch/extract_stitchGT.sh "$chrom" "$snpID" "$pos" "${snpID}_${chrom}_${pos}.vcf.gz"
    rm "${snpID}_${chrom}_${pos}"*.vcf.gz*
done


Chr1	1	1_1	955185

Subsetting VCF with bcftools

real	0m0.139s
user	0m0.044s
sys	0m0.041s

real	0m0.099s
user	0m0.058s
sys	0m0.036s
Extracting stitch genotypes

The following have been reloaded with a version change:
  1) bcftools/1.18 => bcftools/1.16     3) perl/5.36.0 => perl/5.34.0
  2) htslib/1.18 => htslib/1.13         4) vcftools/0.1.16 => vcftools/20210912


Indexing highConf-stitch VCF

Get 012 values from stitch-highconfidence VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf 1_Chr1_955185.vcf.gz
	--chr Chr1
	--to-bp 955187
	--012
	--out 1-Chr1-955185_highConf
	--from-bp 955183

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 1.00 seconds

Extract all genotypes from STITCH

real	0m0.172s
user	0m0.052s
sys	0m0.121s

Get 012 values from stitch-ALL VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marck

In [18]:
## Chr2
## For each KASP marker on this chromosome: look up its position, cut that single site
## out of the merged high-confidence and ALL (.PL) VCFs, extract the genotypes, and
## delete the temporary single-site files.
chrom=Chr2
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
vcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
for snpID in {1..2}
do
    markerID=${chrom/Chr}_${snpID}   # chromosome number + marker index, e.g. 2_1
    pos=$(grep -w "$markerID" "$KASPcoords" | cut -f8)
    echo -e "\n"$chrom"\t"$snpID"\t"$markerID"\t"$pos"\n"
    # Continue only with exactly one numeric position. An empty $pos would leave a bare
    # "${chrom}:" region and widen the cleanup glob below to "${snpID}_${chrom}_*".
    if [[ ! $pos =~ ^[0-9]+$ ]]; then
        echo "Skipping ${markerID}: no unique position found in ${KASPcoords}" >&2
        continue
    fi
    # Subset VCF with bcftools
    echo "Subsetting VCF with bcftools"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.vcf.gz" "$vcf"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.PL.vcf.gz" "${vcf/.vcf.gz/.PL.vcf.gz}"
    # Extract genotypes
    # NOTE(review): only the high-confidence subset is passed; the script presumably finds the .PL subset by name — confirm.
    echo "Extracting stitch genotypes"
    time bash ~/snap_hap/_scripts/bash/stitch/extract_stitchGT.sh "$chrom" "$snpID" "$pos" "${snpID}_${chrom}_${pos}.vcf.gz"
    rm "${snpID}_${chrom}_${pos}"*.vcf.gz*
done


Chr2	1	2_1	55473945

Subsetting VCF with bcftools

real	0m0.060s
user	0m0.033s
sys	0m0.022s

real	0m0.093s
user	0m0.064s
sys	0m0.013s
Extracting stitch genotypes

The following have been reloaded with a version change:
  1) bcftools/1.18 => bcftools/1.16     3) perl/5.36.0 => perl/5.34.0
  2) htslib/1.18 => htslib/1.13         4) vcftools/0.1.16 => vcftools/20210912


Indexing highConf-stitch VCF

Get 012 values from stitch-highconfidence VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf 1_Chr2_55473945.vcf.gz
	--chr Chr2
	--to-bp 55473947
	--012
	--out 1-Chr2-55473945_highConf
	--from-bp 55473943

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 1.00 seconds

Extract all genotypes from STITCH

real	0m0.188s
user	0m0.053s
sys	0m0.110s

Get 012 values from stitch-ALL VCF

VCFtools - 0.1.17
(C) Adam Auton and Ant

In [23]:
## Chr3
## For each KASP marker on this chromosome: look up its position, cut that single site
## out of the merged high-confidence and ALL (.PL) VCFs, extract the genotypes, and
## delete the temporary single-site files.
chrom=Chr3
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
vcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
for snpID in {1..3}
do
    markerID=${chrom/Chr}_${snpID}   # chromosome number + marker index, e.g. 3_1
    pos=$(grep -w "$markerID" "$KASPcoords" | cut -f8)
    echo -e "\n"$chrom"\t"$snpID"\t"$markerID"\t"$pos"\n"
    # Continue only with exactly one numeric position. An empty $pos would leave a bare
    # "${chrom}:" region and widen the cleanup glob below to "${snpID}_${chrom}_*".
    if [[ ! $pos =~ ^[0-9]+$ ]]; then
        echo "Skipping ${markerID}: no unique position found in ${KASPcoords}" >&2
        continue
    fi
    # Subset VCF with bcftools
    echo "Subsetting VCF with bcftools"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.vcf.gz" "$vcf"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.PL.vcf.gz" "${vcf/.vcf.gz/.PL.vcf.gz}"
    # Extract genotypes
    # NOTE(review): only the high-confidence subset is passed; the script presumably finds the .PL subset by name — confirm.
    echo "Extracting stitch genotypes"
    time bash ~/snap_hap/_scripts/bash/stitch/extract_stitchGT.sh "$chrom" "$snpID" "$pos" "${snpID}_${chrom}_${pos}.vcf.gz"
    rm "${snpID}_${chrom}_${pos}"*.vcf.gz*
done


Chr3	1	3_1	1055959

Subsetting VCF with bcftools

real	0m0.194s
user	0m0.058s
sys	0m0.018s

real	0m0.370s
user	0m0.071s
sys	0m0.034s
Extracting stitch genotypes

The following have been reloaded with a version change:
  1) bcftools/1.18 => bcftools/1.16     3) perl/5.36.0 => perl/5.34.0
  2) htslib/1.18 => htslib/1.13         4) vcftools/0.1.16 => vcftools/20210912


Indexing highConf-stitch VCF

Get 012 values from stitch-highconfidence VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf 1_Chr3_1055959.vcf.gz
	--chr Chr3
	--to-bp 1055961
	--012
	--out 1-Chr3-1055959_highConf
	--from-bp 1055957

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00 seconds

Extract all genotypes from STITCH

real	0m0.177s
user	0m0.054s
sys	0m0.123s

Get 012 values from stitch-ALL VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony 

In [22]:
## Chr4
## For each KASP marker on this chromosome: look up its position, cut that single site
## out of the merged high-confidence and ALL (.PL) VCFs, extract the genotypes, and
## delete the temporary single-site files.
chrom=Chr4
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
vcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
for snpID in {1..4}
do
    markerID=${chrom/Chr}_${snpID}   # chromosome number + marker index, e.g. 4_1
    pos=$(grep -w "$markerID" "$KASPcoords" | cut -f8)
    echo -e "\n"$chrom"\t"$snpID"\t"$markerID"\t"$pos"\n"
    # Continue only with exactly one numeric position. An empty $pos would leave a bare
    # "${chrom}:" region and widen the cleanup glob below to "${snpID}_${chrom}_*".
    if [[ ! $pos =~ ^[0-9]+$ ]]; then
        echo "Skipping ${markerID}: no unique position found in ${KASPcoords}" >&2
        continue
    fi
    # Subset VCF with bcftools
    echo "Subsetting VCF with bcftools"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.vcf.gz" "$vcf"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.PL.vcf.gz" "${vcf/.vcf.gz/.PL.vcf.gz}"
    # Extract genotypes
    # NOTE(review): only the high-confidence subset is passed; the script presumably finds the .PL subset by name — confirm.
    echo "Extracting stitch genotypes"
    time bash ~/snap_hap/_scripts/bash/stitch/extract_stitchGT.sh "$chrom" "$snpID" "$pos" "${snpID}_${chrom}_${pos}.vcf.gz"
    rm "${snpID}_${chrom}_${pos}"*.vcf.gz*
done


Chr4	1	4_1	7115521

Subsetting VCF with bcftools

real	0m0.050s
user	0m0.031s
sys	0m0.012s

real	0m0.570s
user	0m0.039s
sys	0m0.027s
Extracting stitch genotypes

The following have been reloaded with a version change:
  1) bcftools/1.18 => bcftools/1.16     3) perl/5.36.0 => perl/5.34.0
  2) htslib/1.18 => htslib/1.13         4) vcftools/0.1.16 => vcftools/20210912


Indexing highConf-stitch VCF

Get 012 values from stitch-highconfidence VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf 1_Chr4_7115521.vcf.gz
	--chr Chr4
	--to-bp 7115523
	--012
	--out 1-Chr4-7115521_highConf
	--from-bp 7115519

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00 seconds

Extract all genotypes from STITCH

real	0m0.151s
user	0m0.046s
sys	0m0.108s

Get 012 values from stitch-ALL VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony 

In [21]:
## Chr6
## For each KASP marker on this chromosome: look up its position, cut that single site
## out of the merged high-confidence and ALL (.PL) VCFs, extract the genotypes, and
## delete the temporary single-site files.
chrom=Chr6
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
vcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
for snpID in {1..14}
do
    markerID=${chrom/Chr}_${snpID}   # chromosome number + marker index, e.g. 6_12
    pos=$(grep -w "$markerID" "$KASPcoords" | cut -f8)
    echo -e "\n"$chrom"\t"$snpID"\t"$markerID"\t"$pos"\n"
    # Continue only with exactly one numeric position. An empty $pos would leave a bare
    # "${chrom}:" region and widen the cleanup glob below to "${snpID}_${chrom}_*".
    if [[ ! $pos =~ ^[0-9]+$ ]]; then
        echo "Skipping ${markerID}: no unique position found in ${KASPcoords}" >&2
        continue
    fi
    # Subset VCF with bcftools
    echo "Subsetting VCF with bcftools"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.vcf.gz" "$vcf"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.PL.vcf.gz" "${vcf/.vcf.gz/.PL.vcf.gz}"
    # Extract genotypes
    # NOTE(review): only the high-confidence subset is passed; the script presumably finds the .PL subset by name — confirm.
    echo "Extracting stitch genotypes"
    time bash ~/snap_hap/_scripts/bash/stitch/extract_stitchGT.sh "$chrom" "$snpID" "$pos" "${snpID}_${chrom}_${pos}.vcf.gz"
    rm "${snpID}_${chrom}_${pos}"*.vcf.gz*
done


Chr6	1	6_1	15125731

Subsetting VCF with bcftools

real	0m0.663s
user	0m0.032s
sys	0m0.014s

real	0m0.182s
user	0m0.025s
sys	0m0.017s
Extracting stitch genotypes

The following have been reloaded with a version change:
  1) bcftools/1.18 => bcftools/1.16     3) perl/5.36.0 => perl/5.34.0
  2) htslib/1.18 => htslib/1.13         4) vcftools/0.1.16 => vcftools/20210912


Indexing highConf-stitch VCF

Get 012 values from stitch-highconfidence VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf 1_Chr6_15125731.vcf.gz
	--chr Chr6
	--to-bp 15125733
	--012
	--out 1-Chr6-15125731_highConf
	--from-bp 15125729

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 1.00 seconds

Extract all genotypes from STITCH

real	0m0.169s
user	0m0.063s
sys	0m0.102s

Get 012 values from stitch-ALL VCF

VCFtools - 0.1.17
(C) Adam Auton and Ant

In [20]:
## Chr7
## For the single KASP marker on this chromosome: look up its position, cut that site
## out of the merged high-confidence and ALL (.PL) VCFs, extract the genotypes, and
## delete the temporary single-site files.
chrom=Chr7
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
vcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
for snpID in 1
do
    markerID=${chrom/Chr}_${snpID}   # chromosome number + marker index, i.e. 7_1
    pos=$(grep -w "$markerID" "$KASPcoords" | cut -f8)
    echo -e "\n"$chrom"\t"$snpID"\t"$markerID"\t"$pos"\n"
    # Continue only with exactly one numeric position. An empty $pos would leave a bare
    # "${chrom}:" region and widen the cleanup glob below to "${snpID}_${chrom}_*".
    if [[ ! $pos =~ ^[0-9]+$ ]]; then
        echo "Skipping ${markerID}: no unique position found in ${KASPcoords}" >&2
        continue
    fi
    # Subset VCF with bcftools
    echo "Subsetting VCF with bcftools"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.vcf.gz" "$vcf"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.PL.vcf.gz" "${vcf/.vcf.gz/.PL.vcf.gz}"
    # Extract genotypes
    # NOTE(review): only the high-confidence subset is passed; the script presumably finds the .PL subset by name — confirm.
    echo "Extracting stitch genotypes"
    time bash ~/snap_hap/_scripts/bash/stitch/extract_stitchGT.sh "$chrom" "$snpID" "$pos" "${snpID}_${chrom}_${pos}.vcf.gz"
    rm "${snpID}_${chrom}_${pos}"*.vcf.gz*
done


Chr7	1	7_1	49828363

Subsetting VCF with bcftools

real	0m0.169s
user	0m0.082s
sys	0m0.015s

real	0m0.194s
user	0m0.078s
sys	0m0.021s
Extracting stitch genotypes

The following have been reloaded with a version change:
  1) bcftools/1.18 => bcftools/1.16     3) perl/5.36.0 => perl/5.34.0
  2) htslib/1.18 => htslib/1.13         4) vcftools/0.1.16 => vcftools/20210912


Indexing highConf-stitch VCF

Get 012 values from stitch-highconfidence VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf 1_Chr7_49828363.vcf.gz
	--chr Chr7
	--to-bp 49828365
	--012
	--out 1-Chr7-49828363_highConf
	--from-bp 49828361

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 0.00 seconds

Extract all genotypes from STITCH

real	0m0.156s
user	0m0.045s
sys	0m0.116s

Get 012 values from stitch-ALL VCF

VCFtools - 0.1.17
(C) Adam Auton and Ant

In [19]:
## Chr8
## For each KASP marker on this chromosome: look up its position, cut that single site
## out of the merged high-confidence and ALL (.PL) VCFs, extract the genotypes, and
## delete the temporary single-site files.
chrom=Chr8
KASPcoords=~/snap_hap/impute/stitch_tests/coords/coords_for_stitch_longVersion_corrected_200kb.txt
vcf=~/snap_hap/variants/stitch/${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz
for snpID in {1..5}
do
    markerID=${chrom/Chr}_${snpID}   # chromosome number + marker index, e.g. 8_1
    pos=$(grep -w "$markerID" "$KASPcoords" | cut -f8)
    echo -e "\n"$chrom"\t"$snpID"\t"$markerID"\t"$pos"\n"
    # Continue only with exactly one numeric position. An empty $pos would leave a bare
    # "${chrom}:" region and widen the cleanup glob below to "${snpID}_${chrom}_*".
    if [[ ! $pos =~ ^[0-9]+$ ]]; then
        echo "Skipping ${markerID}: no unique position found in ${KASPcoords}" >&2
        continue
    fi
    # Subset VCF with bcftools
    echo "Subsetting VCF with bcftools"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.vcf.gz" "$vcf"
    time bcftools view -r "${chrom}:${pos}" -Oz -o "${snpID}_${chrom}_${pos}.PL.vcf.gz" "${vcf/.vcf.gz/.PL.vcf.gz}"
    # Extract genotypes
    # NOTE(review): only the high-confidence subset is passed; the script presumably finds the .PL subset by name — confirm.
    echo "Extracting stitch genotypes"
    time bash ~/snap_hap/_scripts/bash/stitch/extract_stitchGT.sh "$chrom" "$snpID" "$pos" "${snpID}_${chrom}_${pos}.vcf.gz"
    rm "${snpID}_${chrom}_${pos}"*.vcf.gz*
done


Chr8	1	8_1	27807304

Subsetting VCF with bcftools

real	0m0.327s
user	0m0.040s
sys	0m0.016s

real	0m0.250s
user	0m0.031s
sys	0m0.031s
Extracting stitch genotypes

The following have been reloaded with a version change:
  1) bcftools/1.18 => bcftools/1.16     3) perl/5.36.0 => perl/5.34.0
  2) htslib/1.18 => htslib/1.13         4) vcftools/0.1.16 => vcftools/20210912


Indexing highConf-stitch VCF

Get 012 values from stitch-highconfidence VCF

VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--gzvcf 1_Chr8_27807304.vcf.gz
	--chr Chr8
	--to-bp 27807306
	--012
	--out 1-Chr8-27807304_highConf
	--from-bp 27807302

Using zlib version: 1.2.11
After filtering, kept 1074 out of 1074 Individuals
Writing 012 matrix files ... Done.
After filtering, kept 1 out of a possible 1 Sites
Run Time = 1.00 seconds

Extract all genotypes from STITCH

real	0m0.153s
user	0m0.032s
sys	0m0.125s

Get 012 values from stitch-ALL VCF

VCFtools - 0.1.17
(C) Adam Auton and Ant

## Extract INFO, DEPTH and QUAL for each of these KASP markers

In [9]:
## INFO from mpileup output
## Print columns 1, 2, 4, 5, 6 and 8 of the header-less records found at the KASP
## validation positions in each chromosome's bcftools VCF.

cd ~/snap_hap/variants/vcf_bcftools_Am_all/
for chrom in Chr{1..8}; do
    bcftools view -H -R ../stitch/KASP_validation.pos \
        "${chrom}/Am_all_bcftools_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.vcf.gz" \
        | cut -f1,2,4,5,6,8
done

Chr1	955185	C	A	26533.4	DP=2130;VDB=0.0403254;SGB=236.2;RPBZ=-4.67188;MQBZ=-3.6874;MQSBZ=-1.6169;BQBZ=0.448842;NMBZ=2.80605;SCBZ=-5.19645;FS=0;MQ0F=0.00892019;AC=853;AN=1246;DP4=388,266,563,415;MQ=42
Chr1	45535107	C	A	28863.3	DP=2464;VDB=0.00249496;SGB=270.867;RPBZ=0.0313099;MQBZ=-22.493;MQSBZ=-3.56976;BQBZ=-1.41371;NMBZ=22.3821;SCBZ=-0.146563;FS=0;MQ0F=0.000811688;AC=836;AN=1336;DP4=452,406,454,615;MQ=54
Chr1	47381180	C	A	16317.9	DP=2576;VDB=0.830319;SGB=237.663;RPBZ=2.0871;MQBZ=-12.9647;MQSBZ=-0.44103;BQBZ=0.708826;NMBZ=21.905;SCBZ=-0.106792;FS=0;MQ0F=0.00194099;AC=314;AN=1330;DP4=685,646,287,333;MQ=52
Chr1	51252783	A	C	18432.5	DP=2946;VDB=0.858512;SGB=259.479;RPBZ=0.499348;MQBZ=-16.429;MQSBZ=1.76715;BQBZ=0.737209;NMBZ=26.3499;SCBZ=0.468853;FS=0;MQ0F=0.00305499;AC=388;AN=1364;DP4=714,708,362,332;MQ=53
Chr1	60537705	T	C	14255.6	DP=2498;VDB=0.787776;SGB=291.111;RPBZ=0.954798;MQBZ=-9.90877;MQSBZ=3.08892;BQBZ=-2.68247;NMBZ=5.17687;SCBZ=4.84531;FS=0;MQ0F=0.0424339;AC=311;AN=1304;DP4=573,6

In [10]:
## INFO from stitch finalRun output
## Print columns 1, 2, 4, 5 and 8 of the header-less records found at the KASP
## validation positions in each chromosome's *.PL.vcf.gz.

cd ~/snap_hap/variants/stitch
for chrom in Chr{1..8}; do
    # The glob stays unquoted so it expands to the chromosome's merged .PL VCF.
    bcftools view -H -R KASP_validation.pos "${chrom}"/*.PL.vcf.gz \
        | cut -f1,2,4,5,8
done

Chr1	955185	C	A	EAF=0.61127;INFO_SCORE=0.69969;HWE=0.000312;ERC=1044.39;EAC=578.828;PAF=0.64341
Chr1	45535107	C	A	EAF=0.53496;INFO_SCORE=0.8312;HWE=0.625;ERC=1051.56;EAC=862.859;PAF=0.54928
Chr1	47381180	C	A	EAF=0.29956;INFO_SCORE=0.72775;HWE=0.236;ERC=658.886;EAC=1351.61;PAF=0.32772
Chr1	51252783	A	C	EAF=0.36975;INFO_SCORE=0.80042;HWE=0.503;ERC=809.807;EAC=1355.39;PAF=0.37401
Chr1	60537705	T	C	EAF=0.34552;INFO_SCORE=0.5466;HWE=0.173;ERC=613.451;EAC=1163.09;PAF=0.34531
Chr1	71126303	T	G	EAF=0.53355;INFO_SCORE=0.79213;HWE=0.854;ERC=1221.66;EAC=954.728;PAF=0.56133
Chr2	55473945	A	G	EAF=0.4309;INFO_SCORE=0.7368;HWE=0.9;ERC=787.793;EAC=947.772;PAF=0.45391
Chr2	58751341	A	G	EAF=0.38488;INFO_SCORE=0.61493;HWE=0.655;ERC=396.784;EAC=977.689;PAF=0.28868
Chr3	1055959	T	A	EAF=0.53935;INFO_SCORE=0.37925;HWE=5.79e-06;ERC=1016.29;EAC=841.455;PAF=0.54706
Chr3	3488331	C	T	EAF=0.56796;INFO_SCORE=0.61748;HWE=0.448;ERC=1122.85;EAC=928.436;PAF=0.54739
Chr3	44197469	G	C	EAF=0.37939;INFO_SCORE=0.76898;HWE=0

In [30]:
## INFO from stitch validation output
## Print columns 1, 2, 4, 5 and 8 of the header-less records found at the KASP
## validation positions in every matching validation-run *.PL.vcf.gz.

cd ~/snap_hap/impute/stitch_tests
for vcf in */run3.1*/run3_*-chr?_*K75_cov20*/*.PL.vcf.gz; do
    # echo $vcf
    # bash ~/snap_hap/_scripts/bash/stitch/add_PG_PL.sh $vcf
    bcftools view -H -R ~/snap_hap/variants/stitch/KASP_validation.pos "$vcf" \
        | cut -f1,2,4,5,8
done

Chr1	955185	C	A	EAF=0.61852;INFO_SCORE=0.71681;HWE=6.78e-05;ERC=1069.38;EAC=598.826;PAF=0.64104
Chr1	45535107	C	A	EAF=0.5282;INFO_SCORE=0.7465;HWE=0.143;ERC=1094.52;EAC=899.864;PAF=0.5488
Chr1	47381180	C	A	EAF=0.31531;INFO_SCORE=0.62547;HWE=0.0201;ERC=672.887;EAC=1400.6;PAF=0.32452
Chr1	51252783	A	C	EAF=0.36165;INFO_SCORE=0.72718;HWE=0.841;ERC=824.806;EAC=1377.38;PAF=0.37454
Chr1	60537705	T	C	EAF=0.34305;INFO_SCORE=0.58476;HWE=0.685;ERC=633.438;EAC=1188.07;PAF=0.34775
Chr1	71126303	T	G	EAF=0.54114;INFO_SCORE=0.72912;HWE=0.157;ERC=1273.65;EAC=982.725;PAF=0.56447
Chr2	55473945	A	G	EAF=0.42485;INFO_SCORE=0.76758;HWE=0.851;ERC=824.781;EAC=991.766;PAF=0.45404
Chr2	58751341	A	G	EAF=0.34912;INFO_SCORE=0.52485;HWE=0.351;ERC=431.774;EAC=1014.69;PAF=0.2985
Chr3	1055959	T	A	EAF=0.53891;INFO_SCORE=0.42271;HWE=0.0111;ERC=1037.29;EAC=871.45;PAF=0.54344
Chr3	3488331	C	T	EAF=0.54822;INFO_SCORE=0.61503;HWE=0.118;ERC=1132.85;EAC=952.426;PAF=0.54326
Chr3	44197469	G	C	EAF=0.37404;INFO_SCORE=0.7342;HWE=0.5

## Extract stitch INFO, freqs, genotypes and positions at all 30 KASP positions + 100 kb around each

In [28]:
## Extract stitch INFO, params, GTs final Runs
## Each params file is rebuilt from scratch: a loop's whole output is redirected once
## with '>', so re-running this cell does not append duplicate records.
## The query format (site fields plus one GT column per sample) is shared by both loops.
queryFormat="%CHROM\t%POS\t%EAF\t%INFO_SCORE\t%HWE\t%ERC\t%EAC\t%PAF[\t%GT]\n"
paramsDir=~/snap_hap/variants/stitch/INFO_acrossRuns

cd ~/snap_hap/variants/stitch
for chrom in Chr{1..8}
do
    vcf=${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz
    bcftools query -R ./INFO_acrossRuns/KASP_validation_100kb.pos --format "$queryFormat" "$vcf"
done > "$paramsDir/stitch.final.params"

## Extract stitch INFO, params, GTs validation runs
cd ~/snap_hap/impute/stitch_tests
for vcf in ./Chr?/run3.1*/run3_*-chr?_*K75_cov20*/*.PL.vcf.gz
do
    bcftools query -R "$paramsDir/KASP_validation_100kb.pos" --format "$queryFormat" "$vcf"
done > "$paramsDir/stitch.validation.params"

In [29]:
## Line count of each params file (one `wc` call per file, so no "total" line is printed)
for paramsFile in stitch.final.params stitch.validation.params; do
    wc -l ~/snap_hap/variants/stitch/INFO_acrossRuns/"$paramsFile"
done

142836 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/INFO_acrossRuns/stitch.final.params
197169 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/INFO_acrossRuns/stitch.validation.params


## Getting final STITCH parameters

In [None]:
cd ~/snap_hap/variants/stitch/

## Dump the per-site STITCH INFO fields of every chromosome's merged .PL VCF to
## <chrom>/Am_all_stitch_<chrom>.params, one record per line.
for chrom in Chr{1..8}
do
    # The format ends in an explicit "\n" (as in the other query calls in this notebook)
    # so records are newline-separated regardless of the bcftools version in use.
    time bcftools query --format "%CHROM\t%POS\t%EAF\t%INFO_SCORE\t%HWE\t%ERC\t%EAC\t%PAF\n" "${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz" > "${chrom}/Am_all_stitch_${chrom}.params"
done

## Extract allele frequencies and counts for stitched VCFs

In [None]:
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch

## Submit one statistics job per chromosome; the job receives the merged .PL VCF
## and the output prefix <chrom>/Am_all_stitch_<chrom>.
for chrom in Chr{1..8}; do
    sbatch -J "stat${chrom}" ~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-statVcfs.sbatch.sh \
        "${chrom}/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.vcf.gz" \
        "${chrom}/Am_all_stitch_${chrom}"
done

## Remove invariant sites from final VCFs

In [None]:
## Run-specific settings: input directory name and the prefix used for the final VCF name
stitchRun=stitch
outPrefixRun=stitchRun1
baseDIR=/nfs/scistore18/bartogrp/apal/snap_hap/variants/$stitchRun

cd "$baseDIR"
for chrom in Chr6; do
    inVCF=$baseDIR/$chrom/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30.PL.tagged.vcf.gz
    outVCF=$baseDIR/$chrom/Am_all_${outPrefixRun}_${chrom}.final.vcf.gz
    # Exclude sites where AC is 0 or equals AN, then index the result
    time bcftools view -e 'AC==0 | AC==AN' -Oz -o "$outVCF" "$inVCF"
    time bcftools tabix -f "$outVCF"
done

In [None]:
## Cluster implementation
stitchRun=stitch ## Run1
outPrefixRun=stitchRun1
baseDIR=/nfs/scistore18/bartogrp/apal/snap_hap/variants/$stitchRun

cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/postSTITCH_run1/removeSites
# NB. array information is the chromosome no. (tasks 1-8)
sbatch -J removeInvariantSites --array=1-8 \
    ~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-remove_InvariantSites.sbatch.sh \
    "$stitchRun" "$outPrefixRun"

In [None]:
## No. of sites for stitch Run 1
stitchRun=stitch
baseDIR=/nfs/scistore18/bartogrp/apal/snap_hap/variants/$stitchRun

## Print each chromosome's final VCF path, then the number of its lines starting with "Chr"
for chrom in Chr{1..8}; do
    # The pattern is stored literally; it expands to the real file name only where
    # $vcf is used unquoted below, so the quotes are left off on purpose.
    vcf=$baseDIR/$chrom/*final.vcf.gz
    echo $chrom $vcf
    zgrep -c '^Chr' $vcf
done

Chr1 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr1/Am_all_stitchRun1_Chr1.final.vcf.gz
2940701

Chr2 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr2/Am_all_stitchRun1_Chr2.final.vcf.gz
3301920

Chr3 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr3/Am_all_stitchRun1_Chr3.final.vcf.gz
2600018

Chr4 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr4/Am_all_stitchRun1_Chr4.final.vcf.gz
2317799

Chr5 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr5/Am_all_stitchRun1_Chr5.final.vcf.gz
3277515

Chr6 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr6/Am_all_stitchRun1_Chr6.final.vcf.gz
2533823

Chr7 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr7/Am_all_stitchRun1_Chr7.final.vcf.gz
2358818

Chr8 /nfs/scistore18/bartogrp/apal/snap_hap/variants/stitch/Chr8/Am_all_stitchRun1_Chr8.final.vcf.gz
2765149

# Postprocessing stitch Run2 and Run3

In [None]:
## Always initiate this cell!!!
## `run` names the job directory (postSTITCH_$run) and `stitchRun` the variants
## directory; stitchRun is derived from run so the two can never point at
## different runs.
run=run2
# run=run3 ### Run3
stitchRun=stitch_${run}

In [None]:
## Make a list of stitch highConfidence chromSegment files
## For each chromosome: print how many high-confidence segment VCFs were found
## and write their absolute paths to stitchVcfs.highConf.list.
for chrom in Chr{1..8}
do
    baseDIR=~/snap_hap/variants/$stitchRun/${chrom}
    highConfList=${baseDIR}/stitchVcfs.highConf.list

    # The [^PL] glob skips *.PL.vcf.gz, but *.PL.tagged.vcf.gz (written to the same
    # directories later in this notebook) ends in "d" and would slip through when
    # this cell is re-run. The grep drops every file whose name contains ".PL.".
    echo -e $chrom'\t'$baseDIR'\t'$(realpath $baseDIR/stitch_chromSegments/*/stitch.${chrom}.*.*[^PL].vcf.gz | grep -vc '\.PL\.[^/]*$') files
    realpath $baseDIR/stitch_chromSegments/*/stitch.${chrom}.*.*[^PL].vcf.gz | grep -v '\.PL\.[^/]*$' > ${highConfList}
    echo -e ${highConfList}'\n'
done

In [None]:
## Extract all GTs from stitch highConfidence output
## One array job per chromosome; each job receives the chromosome, the stitch
## run name and that chromosome's highConf VCF list.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/postSTITCH_$run/extractGT

extractScript=~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-extractGT.sbatch.sh

# Array size per chromosome, in order Chr1..Chr8.
# Status notes: Chr1 DONE, Chr2 DONE, Chr6 DONE.
arraySizes=(72 78 65 55 71 56 56 54)

for idx in "${!arraySizes[@]}"; do
    chrom=Chr$(( idx + 1 ))
    sbatch --array=1-${arraySizes[idx]} -J ${chrom}_postSTITCH-extractGT "$extractScript" \
        $chrom $stitchRun ~/snap_hap/variants/$stitchRun/$chrom/stitchVcfs.highConf.list
done

In [None]:
## Make a list of stitch all chromSegment files
## Prints the number of *.PL.vcf.gz segment files, then writes their absolute
## paths to stitchVcfs.ALL.list for the chromosome.
for chrom in Chr1; do #done: Chr6
    baseDIR=~/snap_hap/variants/$stitchRun/${chrom}
    nFiles=$(realpath $baseDIR/stitch_chromSegments/*/stitch.${chrom}.*.*.PL.vcf.gz | wc -l)
    printf '%s\t%s\t%s files\n' "$chrom" "$baseDIR" "$nFiles"
    realpath $baseDIR/stitch_chromSegments/*/stitch.${chrom}.*.*.PL.vcf.gz > ${baseDIR}/stitchVcfs.ALL.list
    # List path followed by an empty line.
    printf '%s\n\n' "${baseDIR}/stitchVcfs.ALL.list"
done

In [None]:
## Add AC,AN annotations to stitch ALL vcf file
## Runs the fill-AC-AN helper on every VCF named in stitchVcfs.ALL.list.
for chrom in Chr1 #done: Chr6
do
    # IFS= and -r keep each listed path verbatim (no trimming, no backslash processing).
    while IFS= read -r inputVCF
    do
        echo "$inputVCF"
        # The helper's stdin is detached so it can never consume the remaining
        # entries of the list feeding this loop.
        time bash ~/snap_hap/_scripts/bash/fill-AC-AN_vcfgzFormat.sh "$inputVCF" < /dev/null
    done < ~/snap_hap/variants/$stitchRun/$chrom/stitchVcfs.ALL.list
done

In [None]:
## Make a list of tagged (AC,AN) stitch all chromSegment files
## Prints the number of *.PL.tagged.vcf.gz segment files per chromosome, then
## writes their absolute paths to stitchVcfs.ALL.tagged.list.
for chrom in Chr{1..8}
do
    baseDIR=~/snap_hap/variants/$stitchRun/${chrom}
    taggedList=${baseDIR}/stitchVcfs.ALL.tagged.list
    echo -e $chrom'\t'$baseDIR'\t'$(realpath $baseDIR/stitch_chromSegments/*/stitch.${chrom}.*.*.PL.tagged.vcf.gz | wc -l) files
    realpath $baseDIR/stitch_chromSegments/*/stitch.${chrom}.*.*.PL.tagged.vcf.gz > ${taggedList}
    # Report the list that was actually written (previously printed the highConf list path).
    echo -e ${taggedList}'\n'
done

In [None]:
#### NOT DONE yet

## Merge VCF files
## Per chromosome, submit two concat jobs: one for the highConf segment list
## and one for the AC/AN-tagged ALL segment list.
cd /nfs/scistore18/bartogrp/apal/snap_hap/variants/jobs/postSTITCH_$run/mergeVCF

concatScript=~/snap_hap/_scripts/sbatch/impute/job-postSTITCH-concatVcfs.sbatch.sh

for chrom in Chr{1..8}; do
    baseDIR=~/snap_hap/variants/$stitchRun/$chrom
    echo -e $chrom'\t'$baseDIR

    # Shared part of both merged-VCF names.
    mergedStem=$baseDIR/Am_all_stitch_${chrom}_SnpGap5_biSNPs_filtered-DP_500-7732_QUAL20_MQ30

    #highConf#
    stitchVcfList_highConf=${baseDIR}/stitchVcfs.highConf.list
    outVCF_highConf=${mergedStem}.vcf.gz
    sbatch -J postSTITCH-concatVcfs-highConf_${chrom} $concatScript \
        ${chrom} ${stitchVcfList_highConf} ${outVCF_highConf}

    #ALL Tagged#
    stitchVcfList_ALL=${baseDIR}/stitchVcfs.ALL.tagged.list
    outVCF_ALL=${mergedStem}.PL.tagged.vcf.gz
    sbatch -J postSTITCH-concatVcfs-ALL_${chrom} $concatScript \
        ${chrom} ${stitchVcfList_ALL} ${outVCF_ALL}
done

In [None]:
## Remove monomorphic SNPs to make final VCF