Breaks a known genome apart into reads of length R and creates k-mers of length K from those reads.  The k-mers are used to build a de Bruijn graph, from which an Eulerian path is constructed.  The Eulerian path is used to reconstruct the genome, which is then compared with the known genome to determine the number of errors.  The number of errors is returned.

In [38]:
from typing import List, Dict, Tuple
from collections import defaultdict

def generate_kmers(genome: str, k: int) -> List[str]:
    """Return every length-``k`` window of ``genome`` in left-to-right order.

    Consecutive windows start one position apart, so for ``1 <= k <= n``
    (``n`` being ``len(genome)``) the result has ``n - k + 1`` entries.
    When ``k`` is larger than ``n`` the result is empty.

    A one-line summary with the number of k-mers is printed as a side effect.
    """
    start_offsets = range(len(genome) - k + 1)
    windows = [genome[begin:begin + k] for begin in start_offsets]
    print(f"Generated {len(windows)} {k}-mers")
    return windows

def de_bruijn_kmers(k_mers: List[str]) -> Dict[str, List[str]]:
    """Build a de Bruijn adjacency list from a sequence of k-mers.

    Each k-mer adds one directed edge from its prefix (everything but the
    last character) to its suffix (everything but the first character).
    A k-mer that occurs several times adds the edge several times, so edge
    multiplicity is kept.  Only nodes with at least one outgoing edge
    appear as keys; keys are in order of first appearance.
    """
    edges: Dict[str, List[str]] = {}
    for kmer in k_mers:
        prefix = kmer[:-1]
        suffix = kmer[1:]
        edges.setdefault(prefix, []).append(suffix)
    return edges

def count_in_out_degrees(graph: Dict[str, List[str]]) -> Tuple[Dict[str, int], Dict[str, int]]:
    """Compute in-degree and out-degree tables for an adjacency-list graph.

    Returns ``(in_degree, out_degree)``.  Both are ``defaultdict(int)``
    instances, so looking up a node that was never recorded yields 0.
    ``out_degree`` has an entry for every key of ``graph``; ``in_degree``
    has an entry only for nodes that appear as an edge target.  Repeated
    targets in an adjacency list are counted once per occurrence.
    """
    out_degree = defaultdict(int, {source: len(targets) for source, targets in graph.items()})

    in_degree = defaultdict(int)
    for targets in graph.values():
        for target in targets:
            in_degree[target] += 1

    return in_degree, out_degree

def is_eulerian_path(in_degree: Dict[str, int], out_degree: Dict[str, int]) -> bool:
    """Check the degree condition for an open Eulerian path.

    Returns True when exactly one node has ``out - in == 1`` (the start),
    exactly one node has ``in - out == 1`` (the end), and every other node
    has equal in- and out-degree.

    A node missing from either mapping is treated as having degree 0 there,
    so plain dicts work as well as ``defaultdict``s, and neither argument is
    modified.

    Only degrees are inspected.  Connectivity is not checked, and a graph
    in which every node is balanced (an Eulerian cycle) yields False.
    """
    start_count = 0
    end_count = 0
    for node in set(in_degree) | set(out_degree):
        # .get avoids KeyError on plain dicts and avoids inserting keys
        # into a caller-owned defaultdict.
        surplus = out_degree.get(node, 0) - in_degree.get(node, 0)
        if surplus == 1:
            start_count += 1
        elif surplus == -1:
            end_count += 1
        elif surplus != 0:
            # An imbalance larger than one can never be part of a single path.
            return False
    return start_count == 1 and end_count == 1

def count_paths(graph: Dict[str, List[str]], start: str, end: str, path: List[str] = None) -> int:
    """Count the simple paths from ``start`` to ``end`` by depth-first search.

    ``path`` holds the nodes already on the current walk; callers normally
    omit it.  A neighbour is followed only if it is not yet on the walk,
    or if it is ``end`` itself.  Reaching ``end`` counts as one path and
    stops that branch.  Parallel edges (a neighbour listed more than once)
    are counted separately.

    Returns 1 when ``start == end`` and 0 when ``start`` has no entry in
    ``graph``.
    """
    # Build a new list instead of mutating the caller's, so sibling
    # branches of the recursion do not see each other's nodes.
    walked = ([] if path is None else path) + [start]

    if start == end:
        return 1
    if start not in graph:
        return 0

    return sum(
        count_paths(graph, neighbour, end, walked)
        for neighbour in graph[start]
        if neighbour == end or neighbour not in walked
    )

def count_reconstructions(graph: Dict[str, List[str]]) -> int:
    """Return a reconstruction count for a de Bruijn adjacency list.

    If the degree test in ``is_eulerian_path`` passes, the result is 1;
    alternative traversals are not enumerated in that case.

    Otherwise the result is the total number of simple paths, as counted by
    ``count_paths``, over every (start, end) pair, where

    * starts are keys of ``graph`` with more outgoing than incoming edges,
      falling back to all keys of ``graph`` when there are none;
    * ends are nodes with more incoming than outgoing edges, falling back
      to every node that appears as an edge target when there are none.
    """
    indeg, outdeg = count_in_out_degrees(graph)
    if is_eulerian_path(indeg, outdeg):
        return 1

    all_nodes = set(indeg) | set(outdeg)

    sources = [node for node in graph if outdeg[node] > indeg[node]]
    if not sources:
        sources = list(graph.keys())

    sinks = [node for node in all_nodes if indeg[node] > outdeg[node]]
    if not sinks:
        sinks = list({target for targets in graph.values() for target in targets})

    total = 0
    for source in sources:
        for sink in sinks:
            total += count_paths(graph, source, sink)
    return total

def assemble_and_count(genome: str, k: int) -> Tuple[int, int]:
    """Run the k-mer -> de Bruijn graph -> count pipeline for one ``k``.

    Splits ``genome`` into k-mers, builds the adjacency list, and asks
    ``count_reconstructions`` for the count.  Prints ``k`` and the count,
    then returns them as ``(k, count)``.
    """
    de_bruijn_graph = de_bruijn_kmers(generate_kmers(genome, k))
    n_reconstructions = count_reconstructions(de_bruijn_graph)

    print(f"K-mer length: {k}")
    print(f"Number of possible reconstructions: {n_reconstructions}")

    return (k, n_reconstructions)

def process_multiple_k(genome: str, k_values: List[int]) -> List[Tuple[int, int]]:
    """Run ``assemble_and_count`` on ``genome`` once per value in ``k_values``.

    Returns the ``(k, count)`` tuples in the same order as ``k_values``.
    """
    return [assemble_and_count(genome, k) for k in k_values]

# Example usage
# Runs at module top level: prints the genome, runs the pipeline once per
# k-mer length, then prints a one-line-per-k summary.
known_genome = "ATCCGAAATCCGAATCCGAAACTACTACTACTTCCGATATCCGAATCCGAAACTACTACTACTTCCGACCGAAACTACATCCGAATCCGAAACTACTACTACTTCCGATACTACTTCCGA"
# k-mer lengths to try, processed in this order.
k_values = [2, 3, 4, 5, 6]

print(f"Original genome: {known_genome}")
# Each call prints its own progress lines; results is a list of (k, count).
results = process_multiple_k(known_genome, k_values)

print("\nSummary:")
for k, reconstructions in results:
    print(f"K-mer length {k}: {reconstructions} possible reconstruction(s)")

Original genome: ATCCGAAATCCGAATCCGAAACTACTACTACTTCCGATATCCGAATCCGAAACTACTACTACTTCCGACCGAAACTACATCCGAATCCGAAACTACTACTACTTCCGATACTACTTCCGA
Generated 119 2-mers
K-mer length: 2
Number of possible reconstructions: 12644
Generated 118 3-mers
K-mer length: 3
Number of possible reconstructions: 1
Generated 117 4-mers
K-mer length: 4
Number of possible reconstructions: 1
Generated 116 5-mers
K-mer length: 5
Number of possible reconstructions: 1
Generated 115 6-mers
K-mer length: 6
Number of possible reconstructions: 1

Summary:
K-mer length 2: 12644 possible reconstruction(s)
K-mer length 3: 1 possible reconstruction(s)
K-mer length 4: 1 possible reconstruction(s)
K-mer length 5: 1 possible reconstruction(s)
K-mer length 6: 1 possible reconstruction(s)
