# Frequent Words with Mismatches

### Plan:
1. get all k-mers (every substring of length k) from the original sequence
2. for each k-mer from step 1, generate all k-mers that differ from it in up to d positions
3. find the most frequent k-mer(s) among all those generated in step 2
4. bam!

In [1]:
from itertools import combinations, product
from collections import Counter

In [2]:
def find_words_with_mismatches(sequence, k, d, bases='ATCG'):
    """Find the most frequent k-mers with up to d mismatches in SEQUENCE.

    Every length-k window of SEQUENCE is expanded into the set of words
    produced by ``mutations`` for that window, and each distinct word is
    counted once per window. The words with the highest count are returned.

    Args:
        sequence (str): Input text string.
        k (int): Length of the k-mers.
        d (int): Maximum Hamming distance.
        bases (str): Alphabet used when substituting characters.

    Returns:
        List(str): All k-mers sharing the highest count (ties included).
                   The candidates pass through a set, so the order of the
                   list is not guaranteed. An empty list is returned when
                   no candidate k-mer is generated at all (for example
                   when k is larger than the length of SEQUENCE).

    Example:
    >>> sorted(find_words_with_mismatches('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1))
    ['ATGC', 'ATGT', 'GATG']
    """

    # {k-mer: number of windows of SEQUENCE that can mutate into it}
    frequency = Counter()

    for kmer in find_kmers(sequence, k):
        # mutations() can yield the same word several times for one window;
        # the set makes each window contribute at most 1 to a word's count.
        frequency.update(set(mutations(kmer, d, charset=bases)))

    # Nothing was generated: there is no maximum to look for.
    if not frequency:
        return []

    max_freq = max(frequency.values())

    # Keep every k-mer tied for the maximum, in first-seen order.
    return [kmer for kmer, freq in frequency.items() if freq == max_freq]

In [3]:
# for string of length n, n-k+1 possibilities
def find_kmers(sequence, k):
    """Return every length-K window of SEQUENCE, in left-to-right order.

    Overlapping windows are all included and repeated substrings are kept,
    so the result has ``len(sequence) - k + 1`` items (none when that
    number is not positive).

    >>> find_kmers('GTAGAGCTGT', 5)
    ['GTAGA', 'TAGAG', 'AGAGC', 'GAGCT', 'AGCTG', 'GCTGT']
    """

    # Index of the last position at which a full window still fits.
    last_start = len(sequence) - k

    return [sequence[start:start + k] for start in range(last_start + 1)]

In [4]:
def mutations(word, hamming_distance, charset='ATCG'):
    """Generates all mutations of a word with a Hamming distance less than or equal to a given number.

    Positions are chosen in groups of ``hamming_distance`` and every
    combination of characters from ``charset`` is written into them. A
    replacement may equal the character already there, which is how words
    at a smaller distance (including WORD itself) are reached. For the same
    reason the same word can be yielded more than once; wrap the result in
    ``set`` when unique words are needed.

    Args:
        word (str): Input text string.
        hamming_distance (int): Maximum Hamming distance. Values larger
            than ``len(word)`` are treated as ``len(word)``.
        charset (str): Available alphabet.

    Yields:
        str: The next mutation of WORD in the sequence.

    Raises:
        ValueError: If ``hamming_distance`` is negative (raised by
            ``itertools.combinations`` when iteration starts).

    Example:
    >>> list(mutations('GTAGA', 1))
    ['ATAGA', 'TTAGA', 'CTAGA', 'GTAGA', 'GAAGA', 'GTAGA', 'GCAGA', 'GGAGA', 'GTAGA', 'GTTGA', 'GTCGA', 'GTGGA', 'GTAAA', 'GTATA', 'GTACA', 'GTAGA', 'GTAGA', 'GTAGT', 'GTAGC', 'GTAGG']
    """

    # A word cannot differ in more positions than it has. Without this cap,
    # combinations() yields nothing for hamming_distance > len(word) and the
    # generator would be empty instead of covering every word of that length.
    positions_to_change = min(hamming_distance, len(word))

    for indices in combinations(range(len(word)), positions_to_change):
        for replacements in product(charset, repeat=positions_to_change):
            mutation = list(word)

            for index, replacement in zip(indices, replacements):
                mutation[index] = replacement

            yield "".join(mutation)

# Testing

In [5]:
# for testing on multiple datasets
import unittest 

In [6]:
class TestMismatch(unittest.TestCase):
    """Checks find_words_with_mismatches against known answers.

    The result order is irrelevant, so every case compares sets.
    """

    def _check(self, seq, k, d, expected):
        """Run the search on SEQ and compare the result, as a set, to EXPECTED."""
        found = find_words_with_mismatches(seq, k, d)
        self.assertEqual(set(found), expected)

    def test_1(self):
        # Short sample sequence, k=4, d=1.
        self._check(
            'ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1,
            {'GATG', 'ATGC', 'ATGT'},
        )

    def test_2(self):
        # Long sequence, k=10, d=2.
        self._check(
            'CACAGTAGGCGCCGGCACACACAGCCCCGGGCCCCGGGCCGCCCCGGGCCGGCGGCCGCCGGCGCCGGCACACCGGCACAGCCGTACCGGCACAGTAGTACCGGCCGGCCGGCACACCGGCACACCGGGTACACACCGGGGCGCACACACAGGCGGGCGCCGGGCCCCGGGCCGTACCGGGCCGCCGGCGGCCCACAGGCGCCGGCACAGTACCGGCACACACAGTAGCCCACACACAGGCGGGCGGTAGCCGGCGCACACACACACAGTAGGCGCACAGCCGCCCACACACACCGGCCGGCCGGCACAGGCGGGCGGGCGCACACACACCGGCACAGTAGTAGGCGGCCGGCGCACAGCC',
            10, 2,
            {'GCACACAGAC', 'GCGCACACAC'},
        )

    def test_appears(self):
        # Homopolymer input, k=2, d=1.
        self._check(
            'AAAAAAAAAA', 2, 1,
            {'AA', 'AC', 'AG', 'CA', 'AT', 'GA', 'TA'},
        )

    def test_swapping(self):
        # Repeated 4-mer, k=4, d=2.
        self._check(
            'AGTCAGTC', 4, 2,
            {
                'TCTC', 'CGGC', 'AAGC', 'TGTG', 'GGCC', 'AGGT',
                'ATCC', 'ACTG', 'ACAC', 'AGAG', 'ATTA', 'TGAC',
                'AATT', 'CGTT', 'GTTC', 'GGTA', 'AGCA', 'CATC',
            },
        )

    def test_complement(self):
        # d=0: exact matches only.
        self._check('AATTAATTGGTAGGTAGGTA', 4, 0, {'GGTA'})

    def test_cardinality(self):
        # Single window (k equals the sequence length), d=1.
        self._check(
            'ATA', 3, 1,
            {'GTA', 'ACA', 'AAA', 'ATC', 'ATA', 'AGA', 'ATT', 'CTA', 'TTA', 'ATG'},
        )

    def test_complement_2(self):
        # Single window, d=0.
        self._check('AAT', 3, 0, {'AAT'})

    def test_last(self):
        # Short sequence, k=2, d=1.
        self._check('TAGCG', 2, 1, {'GG', 'TG'})

In [7]:
# Build the suite from the TestCase class itself. loadTestsFromTestCase is
# the loader entry point for a single TestCase class, whereas
# loadTestsFromModule expects a module object rather than a test instance.
suite = unittest.TestLoader().loadTestsFromTestCase(TestMismatch)
unittest.TextTestRunner().run(suite)

........
----------------------------------------------------------------------
Ran 8 tests in 0.507s

OK


<unittest.runner.TextTestResult run=8 errors=0 failures=0>