# Get signature/canonical k-mers from MLST scheme

In [52]:
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import reverse_complement
import sys

def canonical_kmer(kmer):
    """Return the canonical representation of a k-mer.

    The k-mer is upper-cased and compared with its reverse complement;
    whichever of the two sorts first is returned, so a k-mer and its
    reverse complement map to the same value.
    """
    forward = kmer.upper()
    reverse = reverse_complement(forward)
    # When both strands are equal the choice is immaterial.
    return forward if forward < reverse else reverse

def load_kmers(file_path: "Path | str", kmer_size):
    """Collect the canonical k-mers of every record in a FASTA file.

    Args:
        file_path: Location of the FASTA file to read.
        kmer_size: Length of the k-mers to extract; must be positive.

    Returns:
        A dict mapping each record id to the set of canonical k-mers found
        in that record's sequence. Records sharing an id contribute to the
        same set.

    Raises:
        ValueError: If ``kmer_size`` is not positive, or if a record's
            sequence is shorter than ``kmer_size``.
    """
    # Reject nonsensical sizes up front: 0 would yield empty-string "k-mers"
    # and negative values would yield arbitrary slices.
    if kmer_size <= 0:
        raise ValueError(f"kmer size must be a positive integer, got {kmer_size}")

    kmer_table = {}

    with open(file_path, "r") as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            allele = str(record.id)
            sequence = str(record.seq)
            seq_length = len(sequence)

            # Number of k-mer start positions in this sequence.
            max_start = seq_length - kmer_size + 1

            if max_start <= 0:
                # Report the path itself rather than the file object's repr.
                raise ValueError(
                    f"Cannot extract any kmers from ({file_path}, {allele}): "
                    f"kmer size {kmer_size} longer than allele length {seq_length}"
                )

            kmer_table.setdefault(allele, set()).update(
                canonical_kmer(sequence[start:start + kmer_size])
                for start in range(max_start)
            )

    return kmer_table
                
# Quick check with 2-mers on a small FASTA file, printing the resulting table.
kmer_table = load_kmers('aspA-1.fas', kmer_size=2)
print(kmer_table)

{'aspA_1': {'AG', 'AC', 'AA', 'CG', 'GC', 'GA', 'CA', 'CC', 'TA', 'AT'}}


In [67]:
# Compare the 21-mer sets of the two alleles stored in the same FASTA file.
allele_kmers = load_kmers('aspA-2.fas', 21)
aspA_1 = allele_kmers['aspA_1']
aspA_2 = allele_kmers['aspA_2']

# Compute each set relation once, then report its size.
kmers_union = aspA_1 | aspA_2
kmers_shared = aspA_1 & aspA_2
kmers_only_1 = aspA_1 - aspA_2
kmers_only_2 = aspA_2 - aspA_1

print(f"Kmers in (1,2) = ({len(aspA_1)}, {len(aspA_2)})")
print(f"Kmers in union = {len(kmers_union)}")
print(f"Kmers in intersection = {len(kmers_shared)}")
print(f"Unique kmers aspA_1 = {len(kmers_only_1)}")
print(f"Unique kmers aspA_2 = {len(kmers_only_2)}")
print(f"Unique kmers in aspA_1 = {kmers_only_1}")

Kmers in (1,2) = (457, 457)
Kmers in union = 543
Kmers in intersection = 371
Unique kmers aspA_1 = 86
Unique kmers aspA_2 = 86
Unique kmers in aspA_1 = {'CTGCTTAAAAGTCTTAAGTCA', 'ATGACTTAAGACTTTTAAGCA', 'AGTTACAGTTACATCTGCTCC', 'ACTTTTAAGCAGTGGTCCAAA', 'CCTCGATCAAATCCTCAGCCA', 'AATGACTTAAGACTTTTAAGC', 'CTGAGGATTTGATCGAGGCGA', 'ATCAAATCCTCAGCCACAGTA', 'GTTACAGTTACATCTGCTCCA', 'ATTGGAGCAGATGTAACTGTA', 'GAACTGCTATTGGAACGGGAA', 'CAGGATGAGAATTAATTCCCG', 'ATACTGTGGCTGAGGATTTGA', 'AAGCAGTGGTCCAAAATGTGG', 'CCCGTTCCAATAGCAGTTCCA', 'GGATTTGATCGAGGCGACTCA', 'ATTTTGGACCACTGCTTAAAA', 'AATGTTTTTGAACCAGTTGCA', 'TGAGAATTAATTCCCGTTCCA', 'CCGTTCCAATAGCAGTTCCAC', 'GGAACTGCTATTGGAACGGGA', 'GGGAATTAATTCTCATCCTGA', 'ATGAGAATTAATTCCCGTTCC', 'ACTTAAGACTTTTAAGCAGTG', 'GATTTGATCGAGGCGACTCAA', 'AAGTTACAGTTACATCTGCTC', 'AACTGCTATTGGAACGGGAAT', 'GTCGCCTCGATCAAATCCTCA', 'TGCTATTGGAACGGGAATTAA', 'ACTGTGGCTGAGGATTTGATC', 'CAGTTACATCTGCTCCAATAA', 'ATCTGCTCCAATAACAAAATA', 'CTGCTATTGGAACGGGAATTA', 'GATCGAGGCGACTCAAGATAC