# Masking invariant sites from multi-sequence alignment to minimize BEAST computational time
### This script generates a BED file for use with `augur mask`. 

Use [snp-sites](https://github.com/sanger-pathogens/snp-sites) to generate VCF from alignment file: 
    
    `snp-sites -v -o snps.vcf align.fasta`


In [100]:
import pandas as pd 
import re 

from pysam import VariantFile
from random import sample

In [101]:
# read in VCF
vcf = VariantFile("/Users/nashwa/Desktop/git/hmpxv_dynamics/out/snps/snps.vcf")

# get alignment length
alignment_length = 0 
for rec in vcf.header.records:
    match = (re.compile("\d{5}\d")).search(str(rec))
    if match:
        alignment_length = int(match.group())

# get variant sites, store in snps[] 
snps = [] 
for snp in vcf.fetch():
    snps.append(snp.pos)

#get invariant sites, store in invar[] 
invar = []
for i in range(alignment_length):
    if i not in snps:
        invar.append(i)
    
#get fraction of variants vs invariants 
var_frac = round((len(snps)/alignment_length),3)
mono_frac = 1 - var_frac


## sample a random 90% of invar[] to remove
prune = int(0.9*alignment_length) 
remove = sample(invar,prune)


196753 456 197209


In [150]:
###### append invariants to BED file  

### note: BED is a text format using 0-based positions + half-open notation range  

# import existing mask.bed file 
bed = pd.read_csv('/Users/nashwa/Desktop/git/monkeypox-build/config/mask.bed', delimiter='\t')
bed_made = pd.read_csv('/Users/nashwa/Desktop/git/monkeypox-build/config/mask.bed', delimiter='\t')

# append sites-to-mask to df  
for i in range((len(bed)+1), prune):
    bed_made.loc[i,'Chrom'] = 'chr'
    bed_made.loc[i,'ChromStart'] = remove[i]
    bed_made.loc[i,'ChromEnd'] = remove[i] + 1


In [None]:
# export new BED file. Use with `augur mask --mask [BED file]`. 
bed_made.to_csv("to_mask.bed", sep = "\t", index=False)

In [164]:
## report basic stats

print("before pruning:")
print("alignment length:", alignment_length)
print("variant sites:", len(snps), '(',var_frac,'%)')
print("invariant sites:", len(invar), '(', mono_frac,'%)')
print("\n")
print("after pruning:")
print("alignment length:", (len(invar) - prune) + len(snps), '(removed', prune, 'sites)' )  
print("variant sites:", len(snps),
      '(',round(  (len(snps) / ((len(invar) - prune)+len(snps))) ,3),'%)')
print("invariant sites:", len(invar) - prune, 
      '(', round( (len(invar)-prune) / ((len(invar) - prune)+len(snps)),3  )  ,'%)')

before pruning:
alignment length: 197209
variant sites: 456 ( 0.002 %)
invariant sites: 196753 ( 0.998 %)


after pruning:
alignment length: 19721 (removed 177488 sites)
variant sites: 456 ( 0.023 %)
invariant sites: 19265 ( 0.977 %)
