# Get signature/canonical k-mers from MLST scheme

In [16]:
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import reverse_complement
import pandas as pd

def canonical_kmer(kmer):
    """Return the canonical representation of a k-mer.

    The k-mer is upper-cased, its reverse complement is computed, and
    whichever of the two strings sorts first lexicographically is returned.

    Args:
        kmer: Nucleotide k-mer, in any letter case.

    Returns:
        The smaller of the upper-cased k-mer and its reverse complement.
    """
    forward = kmer.upper()
    reverse = reverse_complement(forward)
    # On a tie (palindromic k-mer) both strands are equal; return the
    # reverse-complement value, as the strict "<" comparison dictates.
    return forward if forward < reverse else reverse

def load_kmers(file_path: Path, kmer_size):
    """Build allele -> k-mers and k-mer -> alleles lookups from a FASTA file.

    Each FASTA record is scanned with a sliding window of ``kmer_size``
    characters (step 1), and every window is stored in canonical form via
    ``canonical_kmer``.

    Args:
        file_path: Location of the FASTA file; passed straight to ``open``.
        kmer_size: Window length. Must be at least 1.

    Returns:
        A tuple ``(allele_table, kmer_table)``:
        ``allele_table`` maps each record id to the set of canonical k-mers
        found in that record; ``kmer_table`` maps each canonical k-mer to the
        set of record ids that contain it.

    Raises:
        ValueError: If ``kmer_size`` is smaller than 1, or if any record is
            shorter than ``kmer_size`` (no k-mer could be extracted from it).
    """
    # A zero/negative window would silently yield empty-string "k-mers".
    if kmer_size < 1:
        raise ValueError(f"kmer size must be at least 1, got {kmer_size}")

    allele_table = {}
    kmer_table = {}

    with open(file_path, "r") as file:
        for record in SeqIO.parse(file, 'fasta'):
            allele = str(record.id)
            sequence = str(record.seq)
            seq_length = len(sequence)

            if seq_length < kmer_size:
                # Report the path, not the file handle object, so the message
                # tells the user which input is at fault.
                raise ValueError(f"Cannot extract any kmers from ({file_path}, {allele}): kmer size {kmer_size} longer than allele length {seq_length}")

            # setdefault (rather than plain assignment) keeps k-mers merged
            # if the same record id appears more than once in the file.
            kmers_of_allele = allele_table.setdefault(allele, set())

            for start in range(seq_length - kmer_size + 1):
                kmer = canonical_kmer(sequence[start:start + kmer_size])

                kmers_of_allele.add(kmer)
                kmer_table.setdefault(kmer, set()).add(allele)

    return (allele_table, kmer_table)
                
# Smoke test on a single allele with 2-mers so both tables are small enough
# to inspect by eye.
allele_table, kmer_table = load_kmers('aspA-1.fas', 2)
print(allele_table, kmer_table, sep="\n")

{'aspA_1': {'GA', 'GC', 'CC', 'AA', 'AT', 'CG', 'TA', 'AG', 'CA', 'AC'}}
{'AT': {'aspA_1'}, 'CA': {'aspA_1'}, 'GA': {'aspA_1'}, 'TA': {'aspA_1'}, 'AG': {'aspA_1'}, 'CC': {'aspA_1'}, 'AC': {'aspA_1'}, 'AA': {'aspA_1'}, 'GC': {'aspA_1'}, 'CG': {'aspA_1'}}


In [19]:
# Compare the 21-mer content of two alleles loaded from the same file.
allele_kmers, kmer_table = load_kmers('aspA-2.fas', 21)
aspA_1, aspA_2 = (allele_kmers[label] for label in ('aspA_1', 'aspA_2'))

# Set sizes, overlap, and the k-mers private to each allele.
print(f"Kmers in (1,2) = ({len(aspA_1)}, {len(aspA_2)})")
print(f"Kmers in union = {len(aspA_1 | aspA_2)}")
print(f"Kmers in intersection = {len(aspA_1 & aspA_2)}")
print(f"Unique kmers aspA_1 = {len(aspA_1 - aspA_2)}")
print(f"Unique kmers aspA_2 = {len(aspA_2 - aspA_1)}")
print(f"Unique kmers in aspA_1 = {aspA_1 - aspA_2}")

Kmers in (1,2) = (457, 457)
Kmers in union = 543
Kmers in intersection = 371
Unique kmers aspA_1 = 86
Unique kmers aspA_2 = 86
Unique kmers in aspA_1 = {'CTGCTATTGGAACGGGAATTA', 'AGCAAAAGTTACAGTTACATC', 'CAGGATGAGAATTAATTCCCG', 'CGATCAAATCCTCAGCCACAG', 'AAAAGTTACAGTTACATCTGC', 'GACCACATTTTGGACCACTGC', 'AGATGTAACTGTAACTTTTGC', 'TTACATCTGCTCCAATAACAA', 'CAAAAGTTACAGTTACATCTG', 'CTGAGGATTTGATCGAGGCGA', 'ATTGGAGCAGATGTAACTGTA', 'ATTTTGTTATTGGAGCAGATG', 'GACCACTGCTTAAAAGTCTTA', 'TACATCTGCTCCAATAACAAA', 'ACTGCTTAAAAGTCTTAAGTC', 'CCACTGCTTAAAAGTCTTAAG', 'CTGCTTAAAAGTCTTAAGTCA', 'CAAGCAAAAGTTACAGTTACA', 'ACCACTGCTTAAAAGTCTTAA', 'AAGCAAAAGTTACAGTTACAT', 'AATTCCCGTTCCAATAGCAGT', 'ATTAATTCCCGTTCCAATAGC', 'CATTTTGGACCACTGCTTAAA', 'TGAGAATTAATTCCCGTTCCA', 'ACTGTGGCTGAGGATTTGATC', 'ACAGTTACATCTGCTCCAATA', 'GACTTTTAAGCAGTGGTCCAA', 'AATGACTTAAGACTTTTAAGC', 'TTACAGTTACATCTGCTCCAA', 'AAAGTTACAGTTACATCTGCT', 'AAGCAGTGGTCCAAAATGTGG', 'AGTCGCCTCGATCAAATCCTC', 'ACCACATTTTGGACCACTGCT', 'ATGAGAATTAATTCCCGTTCC

In [144]:
# Load the 21-mer tables used by the cells below.
allele_kmers, kmer_table = load_kmers('aspA-10.fas', 21)
# Alternative input, left disabled:
#(allele_kmers, kmer_table) = load_kmers('mlst-campy/aspA.fas', 21)

# Unique k-mers for every pair of alleles

In [145]:
ordered_labels = sorted(allele_kmers)

# Cell (row, col) holds the number of k-mers present in the row allele but
# absent from the column allele.
data = [
    [len(allele_kmers[row_label] - allele_kmers[col_label]) for col_label in ordered_labels]
    for row_label in ordered_labels
]

df = pd.DataFrame(data, index=ordered_labels, columns=ordered_labels)
# For each column, show the row label holding the largest count.
print(df.idxmax())
df

aspA_1     aspA_2
aspA_10    aspA_2
aspA_2     aspA_5
aspA_3     aspA_5
aspA_4     aspA_2
aspA_5     aspA_2
aspA_6     aspA_5
aspA_7     aspA_5
aspA_8     aspA_5
aspA_9     aspA_5
dtype: object


Unnamed: 0,aspA_1,aspA_10,aspA_2,aspA_3,aspA_4,aspA_5,aspA_6,aspA_7,aspA_8,aspA_9
aspA_1,0,30,86,21,51,70,65,65,44,65
aspA_10,30,0,116,51,21,40,95,95,74,95
aspA_2,86,116,0,65,137,154,63,21,42,63
aspA_3,21,51,65,0,72,91,44,44,65,86
aspA_4,51,21,137,72,0,19,116,116,95,116
aspA_5,70,40,154,91,19,0,121,133,112,121
aspA_6,65,95,63,44,116,121,0,42,63,42
aspA_7,65,95,21,44,116,133,42,0,63,84
aspA_8,44,74,42,65,95,112,63,63,0,21
aspA_9,65,95,63,86,116,121,42,84,21,0


# K-mers unique to each allele relative to all other alleles

In [146]:
ordered_labels = sorted(allele_kmers)

# For each allele, remove every k-mer that also occurs in any other allele;
# the count of what survives is recorded.
data = []
for label in ordered_labels:
    private_kmers = allele_kmers[label]

    for other in ordered_labels:
        if other == label:
            continue
        if not private_kmers:
            # Nothing left to remove; further subtraction cannot change it.
            break
        private_kmers = private_kmers - allele_kmers[other]

    data.append(len(private_kmers))

df2 = pd.DataFrame(data, index=ordered_labels)
df2

Unnamed: 0,0
aspA_1,0
aspA_10,0
aspA_2,0
aspA_3,0
aspA_4,0
aspA_5,19
aspA_6,0
aspA_7,0
aspA_8,0
aspA_9,0


In [147]:
# List, in sorted order, every k-mer that occurs in exactly one allele,
# together with that allele.
singleton_kmers = [k for k in sorted(kmer_table) if len(kmer_table[k]) == 1]
for kmer in singleton_kmers:
    print(f"{kmer}\t{kmer_table[kmer]}")

AAAAACGTTAAGTTGTAATTG	{'aspA_5'}
AAAACGTTAAGTTGTAATTGT	{'aspA_5'}
AAACGTTAAGTTGTAATTGTC	{'aspA_5'}
AACGTTAAGTTGTAATTGTCC	{'aspA_5'}
AACGTTTTTGAACCAGTTGCA	{'aspA_5'}
AACTGGTTCAAAAACGTTAAG	{'aspA_5'}
AACTTAACGTTTTTGAACCAG	{'aspA_5'}
AATTACAACTTAACGTTTTTG	{'aspA_5'}
ACAACTTAACGTTTTTGAACC	{'aspA_5'}
ACGTTAAGTTGTAATTGTCCA	{'aspA_5'}
ACTGGTTCAAAAACGTTAAGT	{'aspA_5'}
ATTACAACTTAACGTTTTTGA	{'aspA_5'}
CAACTGGTTCAAAAACGTTAA	{'aspA_5'}
CAACTTAACGTTTTTGAACCA	{'aspA_5'}
CGTTAAGTTGTAATTGTCCAC	{'aspA_5'}
GCAACTGGTTCAAAAACGTTA	{'aspA_5'}
GGTGGACAATTACAACTTAAC	{'aspA_5'}
GTTCAAAAACGTTAAGTTGTA	{'aspA_5'}
TTACAACTTAACGTTTTTGAA	{'aspA_5'}


# Minimal k-mers differentiating one allele from all others

In [148]:
ordered_labels = sorted(allele_kmers)

# Row 0: total number of k-mers per allele.
data_orig = [len(allele_kmers[label]) for label in ordered_labels]

# Row 1: number of k-mers per allele that are missing from at least one
# allele in the table (union of all pairwise differences).
data_union = []
for label in ordered_labels:
    own_kmers = allele_kmers[label]
    distinguishing = set()

    for other in ordered_labels:
        # The self-comparison contributes an empty set, so it is harmless.
        distinguishing |= own_kmers - allele_kmers[other]

    data_union.append(len(distinguishing))

df3 = pd.DataFrame([data_orig, data_union], columns=ordered_labels)
df3

Unnamed: 0,aspA_1,aspA_10,aspA_2,aspA_3,aspA_4,aspA_5,aspA_6,aspA_7,aspA_8,aspA_9
0,457,457,457,457,457,457,457,457,457,457
1,163,163,163,163,163,163,163,163,163,163


In [149]:
# Allele labels in sorted (string) order for this cell.
ordered_labels = sorted(allele_kmers)

def get_unique_kmers(kmers_alleles_table):
    """Print a summary of each allele's k-mers that lie outside the shared core.

    The "core" is the intersection of every allele's k-mer set. For each
    allele, taken in sorted label order, the k-mers not in the core are
    collected. Four lines are printed:

    1. the size of the k-mer set of the last allele in the table's
       iteration order,
    2. the size of the core,
    3. the list of per-allele non-core k-mer counts (sorted label order),
    4. the non-core k-mer set of the first allele in sorted label order.

    Args:
        kmers_alleles_table: Mapping of allele label to its set of k-mers.

    Returns:
        None. Results are only printed.

    Raises:
        IndexError: If ``kmers_alleles_table`` is empty.
    """
    # Derive the label order from the argument itself rather than from a
    # module-level ``ordered_labels``, which may be undefined or may belong
    # to a different table.
    labels = sorted(kmers_alleles_table)

    kmer_sets = list(kmers_alleles_table.values())
    last_set = kmer_sets[-1]
    print(len(last_set))

    # Intersect the last set with all the others to obtain the shared core.
    core = last_set.intersection(*kmer_sets[:-1])
    print(len(core))

    data = [kmers_alleles_table[label] - core for label in labels]
    print(f"{[len(d) for d in data]}")
    print(data[0])
    
# Print the shared-core summary for the currently loaded alleles.
get_unique_kmers(allele_kmers)

457
294
[163, 163, 163, 163, 163, 163, 163, 163, 163, 163]
{'ATGCAACCAGGTAGTTCTATC', 'CTGCTATTGGAACGGGAATTA', 'AATCAATTTTCTAGCTTCTAA', 'AGCAAAAGTTACAGTTACATC', 'CAGGATGAGAATTAATTCCCG', 'CAAATTGATCTCTAAAATCAA', 'CGATCAAATCCTCAGCCACAG', 'AAAAGTTACAGTTACATCTGC', 'GGGTGGACAATTACAACTTAA', 'CCCAAATTGATCTCTAAAATC', 'GTAGTTCTATCATGCCAGGTA', 'AAATTGATTTTAGAGATCAAT', 'GACCACATTTTGGACCACTGC', 'AGATGTAACTGTAACTTTTGC', 'TTACATCTGCTCCAATAACAA', 'CAAAAGTTACAGTTACATCTG', 'CTGAGGATTTGATCGAGGCGA', 'AGGGTGGACAATTACAACTTA', 'AATACTCTTTGTATATCTTCA', 'ATTGGAGCAGATGTAACTGTA', 'TATTAGAAGCTAGAAAATTGA', 'CTAGAAAATTGATTTTAGAGA', 'GACCACTGCTTAAAAGTCTTA', 'ATTTTGTTATTGGAGCAGATG', 'ATTACAACTTAATGTTTTTGA', 'TACATCTGCTCCAATAACAAA', 'ACTGCTTAAAAGTCTTAAGTC', 'GGTGAAGATATACAAAGAGTA', 'ATGATAGGTGAAGATATACAA', 'CCACTGCTTAAAAGTCTTAAG', 'CTGCTTAAAAGTCTTAAGTCA', 'CAAGCAAAAGTTACAGTTACA', 'CAACCAGGTAGTTCTATCATG', 'ACCACTGCTTAAAAGTCTTAA', 'TGATAGAACTACCTGGTTGCA', 'AAGCAAAAGTTACAGTTACAT', 'ATGATAGAACTACCTGGTTGC', 'AATTCCCGTTCCAA

# Minimal k-mer sets after also encoding absent k-mers (marked with `~`)

In [150]:
def insert_non_kmers(kmers_alleles_table):
    """Extend each allele's k-mer set with markers for the k-mers it lacks.

    The union of all k-mers across the table is computed first. For every
    allele, each k-mer in that union which the allele does not contain is
    added to a copy of the allele's set as the string ``"~" + kmer``.

    Args:
        kmers_alleles_table: Mapping of allele label to its set of k-mers.
            The input sets are not modified.

    Returns:
        A new dict with the same keys, each mapping to a new set holding the
        allele's own k-mers plus the ``~``-prefixed absent k-mers.
    """
    all_kmers = set().union(*kmers_alleles_table.values())

    augmented = {}
    for allele, present in kmers_alleles_table.items():
        absent_markers = {f"~{k}" for k in all_kmers - present}
        augmented[allele] = present | absent_markers

    return augmented

def get_unique_kmers(kmers_alleles_table):
    """Find, for every allele, the entries that no other allele contains.

    Args:
        kmers_alleles_table: Mapping of allele label to a set of entries
            (k-mers, optionally including ``~``-prefixed absence markers).

    Returns:
        A tuple ``(data, ordered_labels)`` where ``ordered_labels`` is the
        sorted list of allele labels and ``data[i]`` is the set of entries
        found only in allele ``ordered_labels[i]``.
    """
    ordered_labels = sorted(kmers_alleles_table)

    data = []
    for label in ordered_labels:
        remaining = kmers_alleles_table[label]

        for other in ordered_labels:
            if other == label:
                continue
            if not remaining:
                # Already empty: subtracting more sets cannot change it.
                break
            remaining = remaining - kmers_alleles_table[other]

        data.append(remaining)

    return (data, ordered_labels)
    
# Add "~kmer" absence markers to every allele, then look for entries
# (present or absent k-mers) that single out one allele from all others.
allele_non_kmers = insert_non_kmers(allele_kmers)
#print(allele_non_kmers['aspA_1'] - allele_non_kmers['aspA_2'])
data, labels = get_unique_kmers(allele_non_kmers)
print(data)

# Reduce each set to its size for the summary table.
data = list(map(len, data))

df4 = pd.DataFrame(data, index=labels)
df4

[set(), set(), set(), set(), set(), {'ACGTTAAGTTGTAATTGTCCA', 'AATTACAACTTAACGTTTTTG', '~CAACTTAATGTTTTTGAACCA', 'AACGTTAAGTTGTAATTGTCC', '~ACTGGTTCAAAAACATTAAGT', 'GGTGGACAATTACAACTTAAC', 'GCAACTGGTTCAAAAACGTTA', 'CAACTTAACGTTTTTGAACCA', 'ACAACTTAACGTTTTTGAACC', 'AAAACGTTAAGTTGTAATTGT', 'AACTTAACGTTTTTGAACCAG', 'AACTGGTTCAAAAACGTTAAG', 'AAACGTTAAGTTGTAATTGTC', '~AACTTAATGTTTTTGAACCAG', '~CAACTGGTTCAAAAACATTAA', 'AAAAACGTTAAGTTGTAATTG', 'AACGTTTTTGAACCAGTTGCA', 'CGTTAAGTTGTAATTGTCCAC', 'TTACAACTTAACGTTTTTGAA', '~AACTGGTTCAAAAACATTAAG', 'CAACTGGTTCAAAAACGTTAA', 'ACTGGTTCAAAAACGTTAAGT', 'ATTACAACTTAACGTTTTTGA', 'GTTCAAAAACGTTAAGTTGTA'}, set(), set(), set(), set()]


Unnamed: 0,0
aspA_1,0
aspA_10,0
aspA_2,0
aspA_3,0
aspA_4,0
aspA_5,24
aspA_6,0
aspA_7,0
aspA_8,0
aspA_9,0
