# Masking invariant sites from a multiple sequence alignment
### This script generates a BED file for use with `augur mask`
### Requires a VCF generated from the aligned FASTA.

Use [snp-sites](https://github.com/sanger-pathogens/snp-sites) to generate a VCF from the alignment file:

    snp-sites -v -o snps.vcf align.fasta


In [170]:
import pandas as pd 
import numpy as np
import re 

from pysam import VariantFile
from random import sample

In [171]:
# Read the VCF that snp-sites produced from the alignment.
vcf = VariantFile("/Users/nashwa/Desktop/git/hmpxv_dynamics/out/snps/snps.vcf")

# Get the alignment length from the header's `length=` field (the ##contig
# line). Matching on the field name, rather than on any six-digit run, also
# works for alignments shorter than 100,000 columns and cannot be fooled by
# other numbers in the header. As before, the last matching record wins.
alignment_length = 0
length_field = re.compile(r"length=(\d+)")
for rec in vcf.header.records:
    match = length_field.search(str(rec))
    if match:
        alignment_length = int(match.group(1))

if alignment_length == 0:
    # Fail here with a clear message instead of a ZeroDivisionError below.
    raise ValueError("could not read the alignment length from the VCF header")

# Variant sites, stored in snps[] as the 1-based positions reported by the VCF.
snps = [snp.pos for snp in vcf.fetch()]

# Invariant sites, stored in invar[] as 0-based column indices (the
# coordinate system BED uses). The 1-based SNP positions are shifted to
# 0-based before comparing against range(); a set keeps each lookup O(1).
snp_columns = {pos - 1 for pos in snps}
invar = [i for i in range(alignment_length) if i not in snp_columns]

# Fraction (0-1, not percent) of variant vs invariant sites.
var_frac = round(len(snps) / alignment_length, 3)
mono_frac = 1 - var_frac


## Sample invariant sites to remove. The number removed is 90% of the full
## alignment length (not 90% of invar[]), capped at the number of invariant
## sites available so that sample() cannot raise ValueError.
prune = min(int(0.9 * alignment_length), len(invar))
remove = sample(invar, prune)


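### Append invariant sites to the BED file
Note: BED uses 0-based, half-open intervals (start included, end excluded), so a single site at 0-based position `p` is written as `ChromStart = p`, `ChromEnd = p + 1`.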

In [209]:
# Load the existing mask; its rows are carried over unchanged.
bed = pd.read_csv('/Users/nashwa/Desktop/git/monkeypox-build/config/mask.bed', delimiter='\t')

# Build one single-site BED row per position in `remove`. Each site p is the
# half-open interval [p, p + 1). Explicit int64 columns keep the coordinates
# integral: extending the frame with empty rows first would introduce NaN,
# upcast the columns to float, and write positions as e.g. "123.0".
chrom_end = [x + 1 for x in remove]
new_rows = pd.DataFrame({
    'Chrom': 'chr',
    'ChromStart': pd.Series(remove, dtype='int64'),
    'ChromEnd': pd.Series(chrom_end, dtype='int64'),
    # The description applies to the appended rows only, so the comments on
    # the pre-existing mask entries are not overwritten.
    'Comment': 'invariant site',
})

# Append the sites-to-mask below the existing entries. Columns that exist
# only in the original mask file are left empty for the new rows.
bed_out = pd.concat([bed, new_rows], ignore_index=True)

In [210]:
# Report basic stats before and after pruning.
#
# var_frac, mono_frac and the ratios computed here are fractions in [0, 1].
# The `.1%` format spec multiplies by 100 and appends the percent sign, so the
# printed number and its unit agree (previously 0.006 was shown as "0.006 %").
kept_invar = len(invar) - prune          # invariant sites left after pruning
pruned_length = kept_invar + len(snps)   # alignment length after pruning

print("before pruning:")
print("alignment length:", alignment_length)
print(f"variable sites: {len(snps)} ({var_frac:.1%})")
print(f"invariant sites: {len(invar)} ({mono_frac:.1%})")
print("\n")
print("after pruning:")
print("alignment length:", pruned_length, '(removed', prune, 'sites)')
print(f"variable sites: {len(snps)} ({len(snps) / pruned_length:.1%})")
print(f"invariant sites: {kept_invar} ({kept_invar / pruned_length:.1%})")

before pruning:
alignment length: 197209
variant sites: 1270 ( 0.006 %)
invariant sites: 195939 ( 0.994 %)


after pruning:
alignment length: 19721 (removed 177488 sites)
variant sites: 1270 ( 0.064 %)
invariant sites: 18451 ( 0.936 %)


In [211]:
# Write the combined mask as a tab-separated BED file, without the DataFrame
# index column. Pass the result to `augur mask --mask [BED file]`.
to_mask_path = "/Users/nashwa/Desktop/git/monkeypox-build/config/to_mask.bed"
bed_out.to_csv(to_mask_path, index=False, sep="\t")