# SNP signature

* Assumption: the input is a set of alleles from an MLST scheme that has already been aligned with Clustal Omega (`clustalo`).

## Keep only the k-mers that overlap a SNP

In [14]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.Seq import reverse_complement
import pandas as pd
import math

# Read the alignment from a FASTA-formatted file.
align = AlignIO.read("aspA-10.aligned.fasta", "fasta")

# Width, in alignment columns, of the window extracted around each variable column.
kmer_size = 21

def canonical_kmer(kmer):
    """Return the canonical representation of a k-mer.

    The k-mer is upper-cased and compared with its reverse complement; the
    one that sorts first is returned, so a sequence and its reverse
    complement map to the same value.
    """
    forward = kmer.upper()
    reverse = reverse_complement(forward)
    return forward if forward < reverse else reverse

def check_freq(str):
    """Count the characters of ``str`` and identify the least frequent one.

    Args:
        str: The characters to tally (the caller passes one alignment
            column). The name shadows the builtin; it is kept so existing
            keyword callers keep working.

    Returns:
        A ``(freq, min_char)`` tuple. ``freq`` maps each distinct character
        to its number of occurrences. ``min_char`` is the character with the
        lowest count (the first one encountered when several tie), or
        ``None`` when ``str`` is empty.
    """
    freq = {}
    for c in str:
        freq[c] = freq.get(c, 0) + 1

    # Take the minimum over the actual counts rather than comparing against a
    # fixed sentinel, so inputs of any size still yield a least-frequent
    # character. Dict insertion order makes tie-breaking deterministic.
    min_char = min(freq, key=freq.get) if freq else None
    return (freq, min_char)

# Map each allele id to the set of canonical k-mers taken from windows that
# cover a variable column of the alignment.
table_of_kmers = {}
alignment_length = align.get_alignment_length()
half_window = kmer_size // 2
for i in range(alignment_length):
    col = align[:, i]
    (cf, min_char) = check_freq(col)

    # Only one distinct character in the column: nothing varies here.
    if len(cf) <= 1:
        continue

    # Centre a kmer_size-wide window on the variable column, shifting it
    # inwards when it would run past either end of the alignment.
    left_edge = max(0, i - half_window)
    right_edge = left_edge + kmer_size
    if right_edge > alignment_length:
        right_edge = alignment_length
        # Clamp at 0: if the alignment is shorter than kmer_size, a negative
        # start would be treated as an offset from the end and pick the
        # wrong columns.
        left_edge = max(0, right_edge - kmer_size)

    kmer_slice = align[:, left_edge:right_edge]

    for allele in kmer_slice:
        kmer_string = canonical_kmer(str(allele.seq))
        table_of_kmers.setdefault(allele.id, set()).add(kmer_string)

# Preview: print the sorted k-mers of the first 2 alleles only.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for every later cell.
for allele_id in list(table_of_kmers)[:2]:
    print(f"{allele_id}")
    for kmer in sorted(table_of_kmers[allele_id]):
        print(f"\t{kmer}")

aspA_1
	AATGTTTTTGAACCAGTTGCA
	AATTAATTCCCGTTCCAATAG
	ATGATAGGTGAAGATATACAA
	CATTAAGTTGTAATTGTCCAC
	CTAGAAAATTGATTTTAGAGA
	CTGAGGATTTGATCGAGGCGA
	GACTTTTAAGCAGTGGTCCAA
	GTAGTTCTATCATGCCAGGTA
	GTTCAAAAACATTAAGTTGTA
	TTACAGTTACATCTGCTCCAA
aspA_2
	AATGTTTTTGAACCAGTTGTA
	AATTAATTCCTGTTCCAATAG
	ATGATAGGTGAAGATATACAA
	CATTAAGTTGTAATTGTCCAC
	CTAGAAAATTGATTTTAGAGA
	CTGAGGATTTAATCGAGGCGA
	GACTTTTAAGTAGTGGTCCAA
	GTAGTTCTATCATGCCAGGTA
	GTTCAAAAACATTAAGTTGTA
	TTACAGTTACGTCTGCTCCAA


## Count the k-mers that uniquely differentiate each allele from the rest

In [15]:
# For every allele, count the k-mers that appear in no other allele's set.
ordered_labels = sorted(table_of_kmers.keys())

data = []
for allele1 in ordered_labels:
    # K-mer sets of every other allele, removed from this allele's set in one call.
    other_sets = [
        table_of_kmers[label] for label in ordered_labels if label != allele1
    ]
    unique_kmers1 = table_of_kmers[allele1].difference(*other_sets)
    data.append(len(unique_kmers1))

# One row per allele; the single column holds the number of unique k-mers.
df = pd.DataFrame(data, index=ordered_labels)
df

Unnamed: 0,0
aspA_1,0
aspA_10,0
aspA_2,0
aspA_3,0
aspA_4,0
aspA_5,3
aspA_6,0
aspA_7,0
aspA_8,0
aspA_9,0
