# SNP signature

* Assumption: the input is a set of alleles from an MLST scheme that has already been aligned with Clustal Omega (`clustalo`).

## Keep only the k-mers that overlap a SNP

In [72]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.Seq import reverse_complement
import pandas as pd
import math

class MLSTSchemeKmerSNP:
    """Per-allele k-mer and SNP tables built from one aligned MLST locus.

    The input is a FASTA multiple-sequence alignment read with Biopython's
    ``AlignIO``. Every alignment column that holds more than one distinct
    character is treated as a SNP. No character is filtered out, so a gap
    symbol differing from a base also makes a column variable. For each
    such column every allele contributes:

    * a ``"<column index>_<character>"`` entry to its SNP set, and
    * the canonical form of the ``kmer_size``-wide window around the
      column to its k-mer set.
    """

    # Width, in alignment columns, of the window cut around each SNP.
    kmer_size = 21

    def __init__(self, input_file):
        """Read ``input_file`` and build both tables immediately.

        Args:
            input_file: Path to a FASTA alignment of the alleles of one locus.
        """
        self._input_file = input_file

        (kmer_table, snp_table) = self._build_tables(input_file)

        self._kmer_table = kmer_table
        self._snp_table = snp_table

    def _canonical_kmer(self, kmer):
        """Return the upper-cased k-mer or its reverse complement.

        Whichever of the two sorts first lexicographically is returned, so
        both strands of the same window map to one string.
        """
        kmer_upper = kmer.upper()
        return min(kmer_upper, reverse_complement(kmer_upper))

    def _check_freq(self, column):
        """Count the characters of one alignment column.

        Args:
            column: The column as a string, one character per allele.

        Returns:
            A ``(counts, rarest)`` tuple. ``counts`` maps each character to
            its number of occurrences. ``rarest`` is the least frequent
            character (the first one seen wins a tie), or ``None`` when
            ``column`` is empty.
        """
        counts = {}
        for char in column:
            counts[char] = counts.get(char, 0) + 1

        # No numeric ceiling is used for the minimum, so columns in which
        # every character occurs thousands of times are handled as well.
        rarest = min(counts, key=counts.get) if counts else None
        return (counts, rarest)

    def _build_tables(self, input_file):
        """Scan the alignment column by column and collect SNPs and k-mers.

        Args:
            input_file: Path to a FASTA alignment.

        Returns:
            A ``(table_of_kmers, snp_table)`` tuple of dicts keyed by allele
            id. ``table_of_kmers`` values are sets of canonical k-mers;
            ``snp_table`` values are sets of ``"<column>_<character>"``
            strings. Both dicts are empty when no column is variable.
        """
        align = AlignIO.read(input_file, 'fasta')
        align_length = align.get_alignment_length()

        # Class attributes are not visible as bare names inside a method,
        # so the window size must be looked up through ``self``.
        kmer_size = self.kmer_size
        half_window = kmer_size // 2

        # Stores table like {'allele': set(kmers)}
        table_of_kmers = {}

        # Stores table like {'allele': set(position_snp)}
        snp_table = {}

        for i in range(align_length):
            (cf, _) = self._check_freq(align[:, i])

            # Not a SNP: every allele carries the same character here.
            if len(cf) <= 1:
                continue

            for ca in align[:, i:(i + 1)]:
                snp_table.setdefault(ca.id, set()).add(f"{i}_{str(ca.seq)}")

            # Centre the window on the SNP, then shift it back inside the
            # alignment when it would run past either end.
            left_edge = max(0, i - half_window)
            right_edge = left_edge + kmer_size
            if right_edge > align_length:
                right_edge = align_length
                # Clamp: the alignment may be shorter than one window.
                left_edge = max(0, right_edge - kmer_size)

            for allele in align[:, left_edge:right_edge]:
                kmer_string = self._canonical_kmer(str(allele.seq))
                table_of_kmers.setdefault(allele.id, set()).add(kmer_string)

        return (table_of_kmers, snp_table)

    def get_kmer_table(self):
        """Return the ``{allele id: set of canonical k-mers}`` table."""
        return self._kmer_table

    def get_snp_table(self):
        """Return the ``{allele id: set of "<column>_<character>"}`` table."""
        return self._snp_table

# Build the k-mer and SNP tables for a single locus (aspA).
input_file = "mlst-campy/aspA.aligned.fasta"
mlst_scheme = MLSTSchemeKmerSNP(input_file)
table_of_kmers, snp_table = (
    mlst_scheme.get_kmer_table(),
    mlst_scheme.get_snp_table(),
)

# The alignment is read again here only to report its number of columns.
align = AlignIO.read(input_file, 'fasta')

# Size of the k-mer set of the first allele in the table.
kmer_sets = list(table_of_kmers.values())
print(f"kmers: {len(kmer_sets[0])}")

# Size of the SNP set of the first allele, out of all alignment columns.
snp_sets = list(snp_table.values())
print(f"snps: {len(snp_sets[0])}/{align.get_alignment_length()}")

kmers: 254
snps: 262/477


In [81]:
import glob

# glob returns matches in arbitrary order; sort so the report is reproducible.
files = sorted(glob.glob('mlst-campy/*.aligned.fasta'))
total_snps = 0
total_kmers = 0
total_align_length = 0

# Loop-local names are used on purpose: the single-locus tables
# (`table_of_kmers`, `snp_table`) are read again by later cells and must not
# end up pointing at whichever file happened to be processed last.
for locus_file in files:
    locus_scheme = MLSTSchemeKmerSNP(locus_file)
    locus_kmers = locus_scheme.get_kmer_table()
    locus_snps = locus_scheme.get_snp_table()

    # Read again only to obtain the number of alignment columns.
    locus_align = AlignIO.read(locus_file, 'fasta')

    # Counts are taken from the first allele in each table. Both tables are
    # empty when the locus has no variable column, which is reported as 0
    # instead of aborting the whole summary.
    snp_count = len(next(iter(locus_snps.values()), ()))
    kmer_count = len(next(iter(locus_kmers.values()), ()))
    align_length = locus_align.get_alignment_length()

    total_snps += snp_count
    total_kmers += kmer_count
    total_align_length += align_length

    print(f"{locus_file}: kmers = {kmer_count}, snps = {snp_count}/{align_length}")
print(f"total snps: {total_snps}/{total_align_length}, total_kmers: {total_kmers}")

mlst-campy/tkt.aligned.fasta: kmers = 280, snps = 295/460
mlst-campy/gltA.aligned.fasta: kmers = 262, snps = 274/405
mlst-campy/uncA.aligned.fasta: kmers = 270, snps = 286/490
mlst-campy/pgm.aligned.fasta: kmers = 308, snps = 315/499
mlst-campy/glyA.aligned.fasta: kmers = 311, snps = 323/510
mlst-campy/glnA.aligned.fasta: kmers = 310, snps = 317/478
mlst-campy/aspA.aligned.fasta: kmers = 254, snps = 262/477
total snps: 2072/3319, total_kmers: 1995


## Count the k-mers that uniquely differentiate one allele from the rest

In [59]:
# Alleles in sorted-name order; this is also the row order of the DataFrame.
ordered_labels = sorted(table_of_kmers)

# For every allele, count the k-mers found in no other allele's k-mer set.
data = [
    len(
        table_of_kmers[label].difference(
            *(table_of_kmers[other] for other in ordered_labels if other != label)
        )
    )
    for label in ordered_labels
]

df = pd.DataFrame(data, index=ordered_labels)
df

Unnamed: 0,0
aspA_1,0
aspA_10,0
aspA_100,11
aspA_101,2
aspA_102,1
aspA_103,0
aspA_104,0
aspA_105,8
aspA_106,14
aspA_107,18


## Count the SNPs that uniquely differentiate one allele from the rest

In [66]:
# Alleles in sorted-name order; this is also the row order of the DataFrame.
ordered_labels = sorted(snp_table)

# For every allele, count the SNP entries that no other allele shares.
data = []
for label in ordered_labels:
    # Work on a copy so the table itself is never modified.
    remaining = set(snp_table[label])
    for other in ordered_labels:
        if other == label:
            continue
        remaining -= snp_table[other]
    data.append(len(remaining))

df2 = pd.DataFrame(data, index=ordered_labels, columns=['unique_snps'])
# Show only the alleles that own at least one private SNP.
df2.loc[df2['unique_snps'] > 0]

Unnamed: 0,unique_snps
aspA_100,1
aspA_139,1
aspA_143,1
aspA_151,1
aspA_167,21
aspA_169,1
aspA_176,1
aspA_177,1
aspA_179,1
aspA_184,1
