# Run singer on snap_hap_repHZ

Chr2:45-65MB   
Run1  
Date: 03.09.2024  
last update: 12.09.2024

In [None]:
## Load modules
module load vcftools
module load bcftools
module load singer
module load python
cd ~/snap_hap_repHZ/singer

In [None]:
## Fix VCF for singer
# mkdir input
cd ~/snap_hap_repHZ/singer/input

bcftools view -h ./Flavia.vcf.gz | tail +1 > header.txt
bcftools view -H ./Flavia.vcf.gz > ./GTs.txt
cat header.txt GTs.txt > Flavia.sg.vcf
rm header.txt GTs.txt

In [None]:
## Compute pi for windows
inVcf=~/snap_hap_repHZ/singer/input/Flavia.vcf.gz
vcftools --gzvcf $inVcf --out win500k_vcftools --window-pi 500000

## Compute pi_within with Genomics General
inVcf=~/snap_hap_repHZ/singer/input/Flavia.vcf.gz
geno=~/snap_hap_repHZ/singer/input/Flavia.geno.gz
size=500000
popsFile=~/snap_hap_repHZ/singer/ALL.samples

export PATH=$PATH:$HOME/genomics_general:$HOME/genomics_general/VCF_processing
#time parseVCF.py -i $inVcf -o $geno --ploidy 2 --addRefTrack
time popgenWindows.py    -g $geno -o win500k_SHM.csv.gz -f phased \
                    --windType coordinate -w $size -s $size \
                    -p ALL --popsFile $popsFile --ploidy 2 \
                    --writeFailedWindow --addWindowID -T 16

## VCFtools
vcftools --gzvcf Flavia.vcf.gz --out win100k_vcftools --window-pi 100000

In [24]:
#%R
## Compute Ne for each window
## vcftools
mu = 5.7e-9
pi = read.table('~/snap_hap_repHZ/singer/input/win500k_vcftools.windowed.pi', header=T)
pi$Ne = round(pi$PI/(4*mu))
str(pi)
write.table(pi, '~/snap_hap_repHZ/singer/input/win500k_vcftools_Ne.windowed.pi', 
            col.names=T, row.names=F, sep='\t', quote=F)

'data.frame':	40 obs. of  6 variables:
 $ CHROM     : chr  "Chr2" "Chr2" "Chr2" "Chr2" ...
 $ BIN_START : int  45000001 45500001 46000001 46500001 47000001 47500001 48000001 48500001 49000001 49500001 ...
 $ BIN_END   : int  45500000 46000000 46500000 47000000 47500000 48000000 48500000 49000000 49500000 50000000 ...
 $ N_VARIANTS: int  7943 8312 10459 7109 5612 9628 7076 9000 11492 11662 ...
 $ PI        : num  0.00344 0.00328 0.00555 0.00374 0.00299 ...
 $ Ne        : num  151094 143781 243447 164236 131249 ...


In [19]:
#%R
## Compute Ne for each window
## SHM
mu = 5.7e-9
pi = read.csv('~/snap_hap_repHZ/singer/input/win500k_SHM.csv.gz', header=T)
pi = subset(pi, start>45000000)
pi$Ne = round(pi$pi_ALL/(4*mu))
pi$windowID = seq(0,nrow(pi)-1)
str(pi)
write.table(pi, '~/snap_hap_repHZ/singer/input/win500k_SHM_Ne.tsv', col.names=T, row.names=F, sep='\t', quote=F)

'data.frame':	40 obs. of  8 variables:
 $ windowID: int  0 1 2 3 4 5 6 7 8 9 ...
 $ scaffold: chr  "Chr2" "Chr2" "Chr2" "Chr2" ...
 $ start   : int  45000001 45500001 46000001 46500001 47000001 47500001 48000001 48500001 49000001 49500001 ...
 $ end     : int  45500000 46000000 46500000 47000000 47500000 48000000 48500000 49000000 49500000 50000000 ...
 $ mid     : num  45245645 45725597 46263989 46713650 47282978 ...
 $ sites   : int  7960 8330 10464 7119 5617 9646 7088 9016 11502 11667 ...
 $ pi_ALL  : num  0.216 0.197 0.265 0.263 0.266 ...
 $ Ne      : num  9491228 8631579 11631579 11535088 11684211 ...


In [None]:
## Load modules and set PATH
## -----
module load vcftools
module load bcftools
module load singer
module load python


## Initiate variables
## -----
baseDIR=~/snap_hap_repHZ/singer
winFile=$baseDIR/input/win500k_vcftools_Ne.windowed.pi #vcftools
# winFile=$baseDIR/input/win500k_SHM_Ne.tsv #SHM
chrom=Chr2

windowID=6
#vcftools
start=$(sed -n "$((windowID+2))p" $winFile | cut -f2)
end=$(sed -n "$((windowID+2))p" $winFile | cut -f3)
# #SHM
# start=$(sed -n "$((windowID+2))p" $winFile | cut -f3)
# end=$(sed -n "$((windowID+2))p" $winFile | cut -f4)
#Manual
# start=45000000
# end=65000000


inVcf=$baseDIR/input/Flavia.sg
outPrefix=$baseDIR/runs/$chrom.win$windowID.$start.$end/out
if [ ! -d $baseDIR/runs/$chrom.win$windowID.$start.$end ]; then mkdir $baseDIR/runs/$chrom.win$windowID.$start.$end; fi

mu=5.7e-9
ratio=1
Ne=$(sed -n "$((windowID+2))p" $winFile | cut -f6) #vcftools
# Ne=$(sed -n "$((windowID+2))p" $winFile | cut -f8) #SHM
# Ne=406694

mcmc_iters=100
thin=20
polar=0.5

echo "VCF: $inVcf.vcf"
echo "chrom: $chrom"
echo "window: $windowID"
echo "start: $start"
echo "end: $end"
echo "outPrefix: $outPrefix"
echo "mu: $mu"
echo "Ne: $Ne"
echo "mcmc iters: $mcmc_iters"
echo "mcmc thin: $thin"
echo "polar: $polar"
echo -e "-----\n\n\n"


## Run singer
# time singer_master  -vcf $inVcf \
time ~/_softwares/SINGER/releases/singer-0.1.7-beta-linux-x86_64/singer_master \
                    -vcf $inVcf \
                    -output $outPrefix \
                    -start $start \
                    -end $end \
                    -m $mu \
                    -ratio $ratio \
                    -Ne $Ne \
                    -n $mcmc_iters \
                    -thin $thin \
                    -polar $polar \
                    -seed 420

In [None]:
## Slurm script
cd ~/snap_hap_repHZ/singer/jobs/
baseDIR=~/snap_hap_repHZ/singer
chrom=Chr2
winFile=$baseDIR/input/win500k_vcftools_Ne.windowed.pi #vcftools
sbatch -J sg --array=0-40 ~/snap_hap_repHZ/singer/_scripts/job-singer_multiWindows.sbatch.sh $chrom $winFile
#arrays=0-39

```
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=16G
```

#NB: ~180mins on head node

In [None]:
## Convert to tskit format
baseDIR=~/snap_hap_repHZ/singer
chrom=Chr2
windowSize=500000

for windowID in {0..39};
do
    # windowID=0
    start=$((45000001+$windowID*$windowSize))
    end=$(($start+$windowSize-1))
    echo -e $windowID"\t"$start"\t"$end
    
    convert_to_tskit -input $baseDIR/runs/$chrom.win$windowID.$start.$end/out \
                     -output $baseDIR/runs/$chrom.win$windowID.$start.$end/out \
                     -start 99 -end 100 #-step 1
done

In [None]:
## Run on python to make newick files
module load python
cd ~/snap_hap_repHZ/singer
for treeFile in ./runs/Chr2.win*.*.*/out_99.trees; 
do 
    echo $treeFile
    python _scripts/convert_to_newick.py $treeFile ${treeFile/.trees/.newick.tsv}
    gzip ${treeFile/.trees/.newick.tsv}
done