# SNP signature

* Assumption: the input is a set of alleles from an MLST scheme that has already been aligned with Clustal Omega (`clustalo`).

## Keep only the k-mers that overlap a SNP

In [54]:
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.Seq import reverse_complement
import pandas as pd
import math

# Load the pre-computed multiple sequence alignment (FASTA format) that every
# cell below works on.
align = AlignIO.read("aspA-10.aligned.fasta", "fasta")

# Width, in alignment columns, of the window extracted around each variable
# column.
kmer_size = 21

def canonical_kmer(kmer):
    """Return the canonical form of *kmer*.

    The k-mer is upper-cased and compared with its reverse complement; the
    lexicographically smaller of the two is returned, so a k-mer and its
    reverse complement map to the same value.
    """
    forward = kmer.upper()
    reverse = reverse_complement(forward)
    return forward if forward < reverse else reverse

def check_freq(str):
    """Count the occurrences of every distinct character in ``str``.

    Args:
        str: The sequence to tally, e.g. one alignment column as a string.
            The name shadows the builtin; it is kept unchanged so existing
            callers (including keyword callers) keep working.

    Returns:
        A ``(freqcy, min_char)`` tuple. ``freqcy`` is a plain dict mapping
        each character to its count. ``min_char`` is the least frequent
        character; ties go to the character that appears first in the
        input. ``min_char`` is ``None`` only when the input is empty.
    """
    # Function-scope import keeps this definition self-contained.
    from collections import Counter

    freqcy = dict(Counter(str))
    if not freqcy:
        return (freqcy, None)

    # min() returns the first minimum in iteration order and the dict keeps
    # first-occurrence order, so the choice is deterministic. Seeding the
    # search with a fixed ceiling (previously 1000) is avoided because it
    # yields None whenever every count reaches that ceiling.
    min_char = min(freqcy, key=freqcy.get)
    return (freqcy, min_char)

# Stores table like {'allele': set(kmers)}
table_of_kmers = {}

# Stores table like {'allele': set(position_snp)}
snp_table = {}

# Both values are loop-invariant, so compute them once.
alignment_length = align.get_alignment_length()
half_window = kmer_size // 2

for i in range(alignment_length):
    col = align[:, i]
    (cf, _) = check_freq(col)

    # Skip conserved columns: a SNP column (i.e., more than one distinct
    # character) is the only kind that contributes to the tables.
    if len(cf) <= 1:
        continue

    # Record, per allele, which character it carries at this column.
    for record, base in zip(align, col):
        snp_table.setdefault(record.id, set()).add(f"{i}_{base}")

    # Centre a kmer_size-wide window on the column, then shift it back inside
    # the alignment if it would run off either end. Clamping the left edge at
    # 0 matters when the alignment is shorter than kmer_size: a negative
    # start would be read as an offset from the end and select the wrong
    # columns. In that case the window is the whole (shorter) alignment.
    left_edge = max(0, i - half_window)
    right_edge = min(left_edge + kmer_size, alignment_length)
    left_edge = max(0, right_edge - kmer_size)

    kmer_slice = align[:, left_edge:right_edge]

    # NOTE(review): if the alignment contains gap characters they end up
    # inside these window strings unchanged -- confirm that is intended.
    for allele in kmer_slice:
        kmer_string = canonical_kmer(str(allele.seq))
        table_of_kmers.setdefault(allele.id, set()).add(kmer_string)

# Only print first 2 alleles (in the dict's insertion order)
for allele_id in list(table_of_kmers)[:2]:
    print(f"{allele_id}")
    for kmer in sorted(table_of_kmers[allele_id]):
        print(f"\t{kmer}")

# Print SNPs, one line per allele, alleles in sorted order
for allele, snps in sorted(snp_table.items()):
    print(f"{allele}: \t{sorted(snps)}")

aspA_1
	AATGTTTTTGAACCAGTTGCA
	AATTAATTCCCGTTCCAATAG
	ATGATAGGTGAAGATATACAA
	CATTAAGTTGTAATTGTCCAC
	CTAGAAAATTGATTTTAGAGA
	CTGAGGATTTGATCGAGGCGA
	GACTTTTAAGCAGTGGTCCAA
	GTAGTTCTATCATGCCAGGTA
	GTTCAAAAACATTAAGTTGTA
	TTACAGTTACATCTGCTCCAA
aspA_2
	AATGTTTTTGAACCAGTTGTA
	AATTAATTCCTGTTCCAATAG
	ATGATAGGTGAAGATATACAA
	CATTAAGTTGTAATTGTCCAC
	CTAGAAAATTGATTTTAGAGA
	CTGAGGATTTAATCGAGGCGA
	GACTTTTAAGTAGTGGTCCAA
	GTAGTTCTATCATGCCAGGTA
	GTTCAAAAACATTAAGTTGTA
	TTACAGTTACGTCTGCTCCAA
aspA_1: 	['173_G', '278_C', '341_C', '413_T', '449_A', '44_G', '458_T', '475_C', '83_G', '8_T']
aspA_10: 	['173_G', '278_C', '341_C', '413_T', '449_A', '44_A', '458_T', '475_C', '83_G', '8_C']
aspA_2: 	['173_A', '278_T', '341_C', '413_C', '449_A', '44_G', '458_T', '475_T', '83_A', '8_T']
aspA_3: 	['173_G', '278_C', '341_C', '413_T', '449_A', '44_G', '458_T', '475_C', '83_A', '8_T']
aspA_4: 	['173_G', '278_C', '341_T', '413_T', '449_A', '44_A', '458_T', '475_C', '83_G', '8_C']
aspA_5: 	['173_G', '278_C', '341_T', '413_T',

## Count the k-mers that uniquely differentiate one allele from the rest

In [2]:
ordered_labels = sorted(table_of_kmers)

# For every allele, discard each k-mer that also occurs in any other allele
# and record how many k-mers remain (i.e. are private to that allele).
data = [
    len(
        table_of_kmers[label].difference(
            *(table_of_kmers[other] for other in ordered_labels if other != label)
        )
    )
    for label in ordered_labels
]

# One row per allele, single column holding the private k-mer count.
df = pd.DataFrame(data, index=ordered_labels)
df

Unnamed: 0,0
aspA_1,0
aspA_10,0
aspA_2,0
aspA_3,0
aspA_4,0
aspA_5,3
aspA_6,0
aspA_7,0
aspA_8,0
aspA_9,0


## Count the SNPs that uniquely differentiate one allele from the rest

In [49]:
ordered_labels = sorted(snp_table)

data = []
for label in ordered_labels:
    # Work on a copy so the per-allele sets stored in snp_table stay intact.
    private_snps = set(snp_table[label])
    for other in ordered_labels:
        if other == label:
            continue
        if not private_snps:
            # Nothing left to eliminate; the count for this allele is 0.
            break
        private_snps.difference_update(snp_table[other])
    data.append(len(private_snps))

# One row per allele, single column holding the private SNP count.
df2 = pd.DataFrame(data, index=ordered_labels)
df2

Unnamed: 0,0
aspA_1,0
aspA_10,0
aspA_2,0
aspA_3,0
aspA_4,0
aspA_5,1
aspA_6,0
aspA_7,0
aspA_8,0
aspA_9,0
