## Code Challenge: Solve the Eulerian Cycle Problem.

Input: The adjacency list of an Eulerian directed graph.  
Output: An Eulerian cycle in this graph.

In [222]:
from random import choice
from typing import Dict, List
from collections import defaultdict

def build_graph(text: List[str]) -> Dict[str, List[str]]:
    """
    Builds a graph from a list of strings representing edges.

    Args:
        text: A list of strings, where each string lists the outgoing edges of one
              node in the format "source_node: destination_node1 destination_node2 ...".
              Surrounding whitespace (such as the newline kept by ``readlines()``) is
              ignored and blank lines are skipped. A node without outgoing edges may
              be written as "source_node:". If a source node appears on several
              lines, its destinations are accumulated.

    Returns:
        A defaultdict with keys as the source nodes and values as lists of destination nodes.

    Raises:
        ValueError: If a non-blank line has no "source:" part.
    """
    edges = defaultdict(list)
    for raw_line in text:
        line = raw_line.strip()
        if not line:
            # Tolerate empty lines, e.g. a trailing newline at the end of a file.
            continue
        source, separator, targets = line.partition(': ')
        if not separator:
            # No ": " found: the only valid form left is a bare "source:".
            if not line.endswith(':'):
                raise ValueError(f"Malformed adjacency line: {raw_line!r}")
            source, targets = line[:-1], ''
        # split() without arguments ignores repeated whitespace, so no empty
        # node names are produced.
        edges[source].extend(targets.split())
    return edges

def display_cycle(cycle: List[str]) -> str:
    """
    Renders a sequence of visited nodes as a single space-separated line.

    Args:
        cycle: The nodes in visiting order; each one is converted with ``str``.

    Returns:
        The nodes joined by single spaces, e.g. "node1 node2 ... node1".
    """
    labels = (str(node) for node in cycle)
    return ' '.join(labels)



from random import choice
from typing import Dict, List

def find_eulerian_cycle(string_map: Dict[str, List[str]]) -> List[str]:
    """
    Finds an Eulerian cycle in a graph represented by a dictionary.

    The walk is an iterative depth-first traversal: follow unused edges until
    the current node has none left, then back up, recording nodes as they are
    abandoned. The recorded order, reversed, is the cycle.

    Args:
        string_map (Dict[str, List[str]]): A dictionary representing the graph where each key is a node and its value is a list of connected nodes.
            The dictionary and its lists are not modified.

    Returns:
        List[str]: A list of nodes representing the Eulerian cycle (first and
        last node are the same for an Eulerian graph). An empty graph yields
        an empty list. The start node is chosen at random, so the result can
        differ between calls.

    """
    # Work on private copies so the caller's adjacency lists survive the walk.
    remaining = {node: list(neighbors) for node, neighbors in string_map.items()}
    if not remaining:
        return []

    # Prefer a start node that actually has outgoing edges; fall back to any
    # node when the graph has no edges at all.
    candidates = [node for node, neighbors in remaining.items() if neighbors]
    start = choice(candidates or list(remaining))

    stack = [start]
    finished = []  # Nodes in the order they are abandoned (reverse of the cycle)
    while stack:
        # .get() so that a node appearing only as a destination is treated as
        # having no outgoing edges instead of raising KeyError.
        unused = remaining.get(stack[-1])
        if unused:
            stack.append(unused.pop())
        else:
            # Appending and reversing once avoids the O(n) cost of
            # inserting at the front of the list on every step.
            finished.append(stack.pop())

    finished.reverse()
    return finished


In [223]:
def is_path(cycle, graph):
    """
    Tells whether a walk uses exactly the edges of a graph.

    Args:
        cycle: A list of strings representing the nodes visited, in order.
        graph: A dictionary with keys as nodes and values as lists of neighboring nodes.

    Returns:
        True when the consecutive node pairs of the walk, taken with
        multiplicity, are the same as the graph's edges; False otherwise.
    """
    # Flatten the adjacency lists into (source, target) pairs.
    graph_edges = []
    for source, targets in graph.items():
        for target in targets:
            graph_edges.append((source, target))
    graph_edges.sort()

    # Consecutive nodes of the walk form its edges.
    walked_edges = sorted(zip(cycle, cycle[1:]))

    return graph_edges == walked_edges


def is_eulerian(cycle):
    """
    Checks that a walk never uses the same (source, target) edge twice.

    Only the walk itself is inspected: the function does not verify that the
    walk is closed or that it covers any particular graph.

    Args:
        cycle: A list of strings representing the nodes visited, in order.

    Returns:
        True when every consecutive node pair in the walk is distinct,
        False when at least one pair occurs more than once.
    """
    walked_edges = list(zip(cycle, cycle[1:]))
    # Duplicates collapse in a set, so equal sizes mean all edges are unique.
    return len(set(walked_edges)) == len(walked_edges)

In [224]:
# Sample adjacency list in the "source: target1 target2 ..." format read by build_graph.
sample_input = [
    '0: 3',
    '1: 0',
    '2: 1 6',
    '3: 2',
    '4: 2',
    '5: 4',
    '6: 5 8',
    '7: 9',
    '8: 7',
    '9: 6',
]

# One acceptable answer, kept for reference only (it is not compared below).
sample_output = '6 8 7 9 6 5 4 2 1 0 3 2 6' # could be different

graph = build_graph(sample_input)
cycle = find_eulerian_cycle(graph)
# Sanity checks: the walk ends where it started and repeats no edge.
assert cycle[0] == cycle[-1]
assert is_eulerian(cycle)
# assert is_path(cycle, graph)
print(display_cycle(cycle))

5 4 2 1 0 3 2 6 8 7 9 6 5


In [225]:
# Read the challenge dataset as a list of lines (newlines kept by readlines()).
input_filename = 'dataset_203_2'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.readlines()

In [226]:
# Solve the dataset and run the same sanity checks as for the sample.
graph = build_graph(test_input)
cycle = find_eulerian_cycle(graph)
assert cycle[0] == cycle[-1]
assert is_eulerian(cycle)
# assert is_path(cycle, graph)

In [227]:
# 'dataset_<ids>' becomes 'submission_<ids>': drop the first '_'-separated
# token of the input name and keep the rest.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write(display_cycle(cycle))

## Code Challenge: Solve the Eulerian Path Problem.

Input: The adjacency list of a directed graph that has an Eulerian path.  
Output: An Eulerian path in this graph.

In [228]:
# from functools import reduce
# from operator import add
# from collections import Counter
# from typing import Dict, List


# def find_eulerian_path(graph):
#     """
#     Finds an Eulerian path in a graph represented as a dictionary of edges.

#     Args:
#         graph: A dictionary with keys as nodes and values as lists of neighboring nodes.

#     Returns:
#         A list of strings representing the nodes visited in the Eulerian path.
#     """
#     # Ensure every node appears as a key in the graph dictionary
#     for node in set(sum(graph.values(), [])):
#         if node not in graph:
#             graph[node] = []

#     # Determine if the graph has an Eulerian path
#     in_degrees = {node: 0 for node in graph}
#     out_degrees = {node: len(neighbors) for node, neighbors in graph.items()}
#     for node in graph:
#         in_degrees[node] = sum(1 for neighbors in graph.values() if node in neighbors)
#     start_node, end_node = None, None
#     for node in graph:
#         delta = out_degrees[node] - in_degrees[node]
#         if delta == 1 and not start_node:
#             start_node = node
#         elif delta == -1 and not end_node:
#             end_node = node
#         elif delta != 0:
#             return None

#     # If an Eulerian path exists, find it
#     path = []
#     stack = [start_node]
#     while stack:
#         node = stack[-1]
#         if graph[node]:
#             stack.append(graph[node].pop(0))
#         else:
#             path.append(stack.pop())
#     path.reverse()
#     if path[0] != start_node or path[-1] != end_node:
#         return None
#     return path


In [229]:
# from collections import deque, Counter

# def find_eulerian_path(edges):
#     """
#     Finds an Eulerian path in a directed graph represented as an adjacency dictionary.

#     Args:
#         edges (dict): Adjacency dictionary representing the directed graph.

#     Returns:
#         list: List of nodes representing the Eulerian path.
#     """
#     in_nodes = list(edges.keys())
#     out_nodes = sum(edges.values(), [])  # Flatten the list of outgoing nodes
#     in_degrees = Counter(out_nodes)
#     out_degrees = Counter({key: len(value) for key, value in edges.items()})
#     start = end = None

#     # Find start and end nodes based on the differences in degrees
#     for node in set(out_nodes + in_nodes):
#         difference = out_degrees[node] - in_degrees[node]
#         if difference > 0:
#             start = node
#         if difference < 0:
#             end = node

#     if not start or not end:
#         return []

#     # Create an augmented graph with an additional edge from end to start
#     augmented_edges = {end: [start], **edges}

#     # Find Eulerian cycle in the augmented graph
#     cycle = find_eulerian_cycle(augmented_edges)

#     # Rotate the cycle to start at the original end node
#     path = deque(cycle[:-1])
#     path.rotate(-1 - path.index(end))

#     return list(path)


In [230]:
from typing import Dict, List

def find_eulerian_path(string_map: Dict[str, List[str]]) -> List[str]:
    """
    Find an Eulerian path in a given string map.

    Args:
        string_map: A dictionary representing the adjacency list of the graph.
            Nodes that only appear as destinations do not need their own key.
            The dictionary and its lists are not modified.

    Returns:
        A list representing the Eulerian path, or an empty list for an empty graph.

    Algorithm:
    1. Choose a starting vertex 'start' using the 'get_start' function.
    2. Copy the adjacency lists so edges can be consumed without touching the input.
    3. Initialize a stack with 'start' as the only element.
    4. While the stack is not empty:
        a. Look at the vertex on top of the stack.
        b. If it still has unused outgoing edges, consume one and push its target.
        c. Otherwise pop the vertex and record it.
    5. Vertices are recorded in reverse path order, so reverse the record and return it.
    """
    if not string_map:
        return []

    start = get_start(string_map)
    # Private copies: the traversal pops edges as it uses them.
    remaining = {vertex: list(edges) for vertex, edges in string_map.items()}

    stack = [start]
    finished = []
    while stack:
        # .get() treats a vertex without a key (e.g. the end of the path) as
        # having no outgoing edges.
        unused = remaining.get(stack[-1])
        if unused:
            stack.append(unused.pop())
        else:
            # Append now and reverse once at the end instead of paying O(n)
            # for a front insertion on every step.
            finished.append(stack.pop())

    finished.reverse()
    return finished


def get_start(string_map: Dict[str, List[str]]) -> str:
    """
    Find the starting vertex for an Eulerian path in a given string map.

    Args:
        string_map: A dictionary representing the adjacency list of the graph.

    Returns:
        The first vertex (in order of first appearance, as a key or as a
        destination) whose out-degree exceeds its in-degree. If there is no such
        vertex, the first vertex encountered is returned.

    Raises:
        IndexError: If the graph has no vertices.

    Algorithm:
    1. Build a table mapping each vertex to [out-degree, in-degree].
    2. For every vertex in 'string_map', record the length of its edge list as
       its out-degree and add one to the in-degree of each destination.
    3. Return the first vertex whose out-degree is greater than its in-degree.
    4. If every vertex is balanced, return the first vertex in the table.
    """
    degrees = {}  # vertex -> [out-degree, in-degree]
    for vertex, edges in string_map.items():
        degrees.setdefault(vertex, [0, 0])[0] = len(edges)
        for target in edges:
            degrees.setdefault(target, [0, 0])[1] += 1

    if not degrees:
        raise IndexError("cannot choose a start vertex in an empty graph")

    for vertex, (out_degree, in_degree) in degrees.items():
        if out_degree > in_degree:
            return vertex
    return next(iter(degrees))


In [231]:
# Sample adjacency list; node '4' only appears as a destination.
sample_input = [
    '0: 2',
    '1: 3',
    '2: 1',
    '3: 0 4',
    '6: 3 7',
    '7: 8',
    '8: 9',
    '9: 6',
]

graph = build_graph(sample_input)
path = find_eulerian_path(graph)  
# display_cycle only joins the nodes with spaces, so it serves for paths too.
print(display_cycle(path))

6 7 8 9 6 3 0 2 1 3 4


In [232]:
# Read the dataset as a list of lines, find an Eulerian path, and write it out.
input_filename = 'dataset_203_6'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.readlines()

graph = build_graph(test_input)
path = find_eulerian_path(graph)

# 'dataset_<ids>' becomes 'submission_<ids>'.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write(display_cycle(path))

## Code Challenge: Solve the String Reconstruction Problem.

Input: An integer k followed by a list of k-mers Patterns.  
Output: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.)

In [233]:
# From Week_1.ipynb 

def path_to_genome(path):
    """
    Constructs a genome sequence from a given path.

    The first k-mer is taken whole; every following k-mer contributes only its
    last character.

    Args:
        path (list): A list of strings representing a path of kmers.

    Returns:
        str: The constructed genome sequence, or an empty string for an empty path.

    Example:
        >>> path_to_genome(['AC', 'CT', 'TT'])
        'ACTT'
    """
    if not path:
        return ''
    # A single join avoids rebuilding the string once per k-mer.
    return path[0] + ''.join(kmer[-1] for kmer in path[1:])
     

def prefix(kmer):
    """
    Returns a k-mer without its final character.

    Args:
        kmer: A string representing a k-mer.

    Returns:
        Everything in the k-mer except the last character; an empty string
        when the k-mer has at most one character.
    """
    cut = len(kmer) - 1
    return kmer[:cut]


def suffix(kmer):
    """
    Returns a k-mer without its first character.

    Args:
        kmer: A string representing a k-mer.

    Returns:
        Everything in the k-mer except the first character; an empty string
        when the k-mer has at most one character.
    """
    return kmer[1:len(kmer)]


def debruijn_graph_from_kmers(kmers):
    """
    Builds the De Bruijn graph of a collection of k-mers.

    Every k-mer becomes one edge from its prefix to its suffix, so a k-mer
    that occurs several times produces parallel edges.

    Args:
        kmers: A list of strings representing k-mers.

    Returns:
        A defaultdict mapping each prefix node to the list of suffix nodes it
        points to, in input order.
    """
    adjacency = defaultdict(list)
    for pattern in kmers:
        source = prefix(pattern)
        target = suffix(pattern)
        adjacency[source].append(target)
    return adjacency



In [234]:
# Sample k-mers (k = 4) together with the expected reconstruction.
sample_input = [
    'CTTA',
    'ACCA',
    'TACC',
    'GGCT',
    'GCTT',
    'TTAC',
]
sample_output = 'GGCTTACCA'

# Build the De Bruijn graph, walk an Eulerian path, and spell out the genome.
graph = debruijn_graph_from_kmers(sample_input)
path = find_eulerian_path(graph)
# print(path)
genome = path_to_genome(path)
print(genome)
assert path_to_genome(path) == sample_output

GGCTTACCA


In [235]:
# A larger set of 4-mers; the result is printed without being compared to a
# reference string.
sample_input = [
    'AAAT',
    'AATG',
    'ACCC',
    'ACGC',
    'ATAC',
    'ATCA',
    'ATGC',
    'CAAA',
    'CACC',
    'CATA',
    'CATC',
    'CCAG',
    'CCCA',
    'CGCT',
    'CTCA',
    'GCAT',
    'GCTC',
    'TACG',
    'TCAC',
    'TCAT',
    'TGCA',
]

graph = debruijn_graph_from_kmers(sample_input)
path = find_eulerian_path(graph)
# print(path)
genome = path_to_genome(path)
print(genome)

CAAATGCATCATACGCTCACCCAG


In [161]:
input_filename = 'dataset_203_7'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.readlines()

# Only the second line is used: per the problem statement the first line holds
# k and the k-mers follow, separated by whitespace.
# The list is named `kmers` (not `input`) so the builtin input() stays usable
# in later cells.
kmers = test_input[1].split()
graph = debruijn_graph_from_kmers(kmers)
path = find_eulerian_path(graph)
genome = path_to_genome(path)

# 'dataset_<ids>' becomes 'submission_<ids>'.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write(genome)

## Code Challenge: Solve the k-Universal Circular String Problem.

Input: An integer k.  
Output: A k-universal circular string.

In [162]:
from itertools import product

def binary_strings(k):
    """
    Enumerates every string of length k over the alphabet {'0', '1'}.

    Args:
        k (int): The length of binary strings.

    Returns:
        list: All 2**k binary strings, sorted in lexicographic order.
    """
    return sorted(map(''.join, product('01', repeat=k)))

def universal_circular_string(k):
    """
    Solves the k-Universal Circular String Problem.

    All binary k-mers are turned into a De Bruijn graph; an Eulerian cycle in
    that graph visits every k-mer exactly once. Spelling the cycle and dropping
    the part that wraps around gives the circular string.

    Args:
        k (int): The length of k-mers.

    Returns:
        str: A k-universal circular string. The Eulerian cycle starts at a
        randomly chosen node, so different calls may return different (equally
        valid) strings.

    Example:
        One possible result for k = 3 is '00111010'.
    """
    if k == 1:
        # The De Bruijn nodes would be empty strings and the slice below would
        # become cycle[:0]; answer the trivial case directly.
        return '01'
    kmers = binary_strings(k)
    graph = debruijn_graph_from_kmers(kmers)
    cycle = find_eulerian_cycle(graph)
    # The last k-1 nodes of the cycle repeat the start of the string once it
    # is read circularly, so they are left out.
    genome = path_to_genome(cycle[:-(k - 1)])
    return genome


In [163]:
# Show one 4-universal circular string (the cell output is the returned value).
sample_k = 4
universal_circular_string(sample_k)

'1101111001010000'

In [164]:
# The dataset's first line is parsed as the integer k.
input_filename = 'dataset_203_11'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.readlines()
    test_k = int(test_input[0].strip())

result = universal_circular_string(test_k)

# 'dataset_<ids>' becomes 'submission_<ids>'.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write(result)

## String Reconstruction from Read-Pairs Problem: Reconstruct a string from its paired composition.

Input: A collection of paired k-mers PairedReads and an integer d.  
Output: A string Text with (k,d)-mer composition equal to PairedReads (if such a string exists).

In [165]:
from typing import List, Tuple

def paired_composition(k: int, d: int, text: str) -> List[Tuple[str, str]]:
    """
    Lists the (k, d)-mers of a text in sorted order.

    A (k, d)-mer is a pair of k-mers taken from the text with exactly d
    characters between the end of the first and the start of the second.

    Args:
        k (int): Length of each k-mer.
        d (int): Number of characters between the two k-mers of a pair.
        text (str): Input text.

    Returns:
        List[Tuple[str, str]]: The pairs sorted lexicographically; empty when
        the text is shorter than 2 * k + d.
    """
    span = 2 * k + d  # characters covered by one pair, gap included
    second_offset = k + d
    return sorted(
        (text[start:start + k], text[start + second_offset:start + span])
        for start in range(len(text) - span + 1)
    )


def display_kdmers(kdmers: List[Tuple[str, str]]) -> str:
    """
    Formats paired k-mers as "(first|second)" items separated by spaces.

    Args:
        kdmers (List[Tuple[str, str]]): List of paired k-mers.

    Returns:
        str: One "(first|second)" item per pair, joined by single spaces.
    """
    formatted = [f'({pair[0]}|{pair[1]})' for pair in kdmers]
    return ' '.join(formatted)

# Demonstration: list the (3, 2)-mers of a short text.
k = 3
d = 2
text = 'TAATGCCATGGGATGTT'
kdmers = paired_composition(k, d, text)
display = display_kdmers(kdmers)

print("Paired k-mers:")
print(display)

Paired k-mers:
(AAT|CAT) (ATG|ATG) (ATG|ATG) (CAT|GAT) (CCA|GGA) (GCC|GGG) (GGG|GTT) (TAA|CCA) (TGC|TGG) (TGG|TGT)


## Code Challenge: Solve the String Reconstruction from Read-Pairs Problem.

Input: Integers k and d followed by a collection of paired k-mers PairedReads.  
Output: A string Text with (k, d)-mer composition equal to PairedReads.

In [219]:
def de_bruijn_paired_kmers(pairs: List[str]) -> Dict[str, List[str]]:
    """
    Construct a de Bruijn graph from paired k-mers.

    Args:
        pairs: A list of paired k-mers represented as "first|second" strings.
            Surrounding whitespace is ignored and blank entries (for example
            an empty last line from ``readlines()``) are skipped.

    Returns:
        A defaultdict representing the de Bruijn graph. Nodes are
        "first|second" strings of paired (k-1)-mers.

    Raises:
        ValueError: If a non-blank entry contains no '|' separator.

    Algorithm:
    1. Initialize an empty defaultdict 'kmer_map' to store the graph.
    2. For each entry in 'pairs':
        a. Strip whitespace, skip the entry if nothing is left, and split it on '|'.
        b. Build the source node from the first k-1 characters of each k-mer, joined by '|'.
        c. Build the target node from the last k-1 characters of each k-mer, joined by '|'.
        d. Append the target node to the list stored under the source node.
    3. Return 'kmer_map'.
    """
    kmer_map = defaultdict(list)
    for pair in pairs:
        stripped = pair.strip()
        if not stripped:
            continue
        reads = stripped.split("|")
        if len(reads) < 2:
            raise ValueError(f"Paired k-mer without '|' separator: {pair!r}")
        first, second = reads[0], reads[1]
        # Local names avoid 'prefix'/'suffix', which are module-level helpers.
        source_node = "{}|{}".format(first[:-1], second[:-1])
        target_node = "{}|{}".format(first[1:], second[1:])
        kmer_map[source_node].append(target_node)

    return kmer_map


def paired_genome_path(pairs: List[str], k: int, d: int) -> str:
    """
    Spell the string described by a path through a paired de Bruijn graph.

    Args:
        pairs: The nodes of the path in order, each a "first|second" string of
            paired (k-1)-mers.
        k: The length of each k-mer in the original read-pairs.
        d: The number of characters between the two k-mers of a read-pair.

    Returns:
        The spelled string, or an empty string for an empty path. Positions
        that no node covers (possible when the path is short compared to 'd')
        are left as '*'. Where the two halves overlap, the second half wins;
        their agreement is not checked.

    Algorithm:
    1. Let 'n' be the number of nodes. The last node's second half ends at
       position (n - 1) + k + d + (k - 1), so the text has n + 2k + d - 2 characters.
    2. Initialize 'result' with that many '*' characters.
    3. For each node with index 'i':
        a. Split the node into its first and second half at '|'.
        b. Write the first half at position 'i'.
        c. Write the second half at position 'i + k + d'.
    4. Return 'result' joined into a string.
    """
    n = len(pairs)
    if n == 0:
        return ""

    node_length = k - 1
    # Size the buffer for the whole text up front so every write lands at its
    # intended position, whatever the value of d.
    result = ['*'] * (n + 2 * k + d - 2)
    for i, pair in enumerate(pairs):
        first, second = pair.split('|')
        second_start = i + k + d
        result[i:i + node_length] = list(first)
        result[second_start:second_start + node_length] = list(second)
    return "".join(result)

def reconstruction_from_read_pairs(pairs, k, d):
    """
    Reconstruct a string from its paired k-mers.

    Args:
        pairs: A list of paired k-mers represented as "first|second" strings.
        k: The length of each k-mer.
        d: The number of characters between the two k-mers of a pair.

    Returns:
        The string spelled by an Eulerian path through the paired de Bruijn
        graph of 'pairs'.
    """
    node_path = find_eulerian_path(de_bruijn_paired_kmers(pairs))
    return paired_genome_path(node_path, k, d)

In [199]:
# Sample read-pairs (k = 4, d = 2) with the expected reconstruction, which is
# kept for reference and not asserted.
sample_k = 4
sample_d = 2
sample_input = [
    'GAGA|TTGA',
    'TCGT|GATG',
    'CGTG|ATGT',
    'TGGT|TGAG',
    'GTGA|TGTT',
    'GTGG|GTGA',
    'TGAG|GTTG',
    'GGTC|GAGA',
    'GTCG|AGAT',
]
sample_output = 'GTGGTCGTGAGATGTTGA'
sample_result = reconstruction_from_read_pairs(sample_input, sample_k, sample_d)
print(sample_result)

GTGGTCGTGAGATGTTGA


In [239]:
# A second sample (k = 3, d = 1); 'GAT|CTG' is listed twice.
sample_k = 3
sample_d = 1
sample_input = [
    'ACC|ATA',
    'ACT|ATT',
    'ATA|TGA',
    'ATT|TGA',
    'CAC|GAT',
    'CCG|TAC',
    'CGA|ACT',
    'CTG|AGC',
    'CTG|TTC',
    'GAA|CTT',
    'GAT|CTG',
    'GAT|CTG',
    'TAC|GAT',
    'TCT|AAG',
    'TGA|GCT',
    'TGA|TCT',
    'TTC|GAA',
]
sample_result = reconstruction_from_read_pairs(sample_input, sample_k, sample_d)
print(sample_result)

CACTGATTCTGATACCGAAACTT


In [200]:
# Line 1 is parsed as "k d"; line 2 as whitespace-separated "first|second" pairs.
input_filename = 'dataset_204_16'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.readlines()
    test_params = test_input[0].strip().split(' ')
    test_k = int(test_params[0])
    test_d = int(test_params[1])
    test_pairs = test_input[1].split()

In [202]:
# Reconstruct the text from the loaded read-pairs and write it out.
test_result = reconstruction_from_read_pairs(test_pairs, test_k, test_d)

# 'dataset_<ids>' becomes 'submission_<ids>'.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write(test_result)

## Contig Generation Problem: Generate the contigs from a collection of reads (with imperfect coverage).

Input: A collection of k-mers Patterns.  
Output: All contigs in DeBruijn(Patterns).

In [208]:
from collections import defaultdict
from typing import List, Dict

def get_incoming_edges(graph: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Invert an adjacency list so each node maps to its predecessors.

    Args:
        graph: A dictionary representing the adjacency list of the graph.

    Returns:
        A defaultdict mapping each node that has at least one incoming edge to
        the list of nodes pointing at it (one entry per edge, so parallel
        edges repeat the predecessor).

    Algorithm:
    1. Enumerate every (source, target) edge of the graph.
    2. Append the source to the predecessor list of the target.
    3. Return the predecessor table.
    """
    predecessors = defaultdict(list)
    all_edges = (
        (source, target)
        for source, targets in graph.items()
        for target in targets
    )
    for source, target in all_edges:
        predecessors[target].append(source)
    return predecessors


def is_one_in_one_out(kmer: str, ins: Dict[str, List[str]], outs: Dict[str, List[str]]) -> bool:
    """
    Tell whether a node has in-degree 1 and out-degree 1.

    Args:
        kmer: A k-mer string naming the node.
        ins: A dictionary representing the incoming edges for each node.
        outs: A dictionary representing the outgoing edges for each node.

    Returns:
        True when the node has exactly one incoming and exactly one outgoing
        edge, otherwise False.

    Algorithm:
    1. Look the node up in both dictionaries and count its edges.
    2. Return True only if both counts are 1.
    """
    # Both lookups always happen, whatever the first count is.
    in_degree = len(ins[kmer])
    out_degree = len(outs[kmer])
    return in_degree == 1 and out_degree == 1


def generate_contigs(kmers: List[str]) -> List[str]:
    """
    Generate contigs from a list of k-mers.

    A contig is the string spelled by a maximal non-branching path of the
    De Bruijn graph: a path whose inner nodes all have exactly one incoming and
    one outgoing edge. Isolated cycles made only of such nodes are contigs too.

    Args:
        kmers: A list of k-mers represented as strings.

    Returns:
        A list of contigs: first those starting at branching nodes (in graph
        key order), then those spelled by isolated cycles.

    Algorithm:
    1. Construct the De Bruijn graph from the given list of k-mers.
    2. Get the incoming edges for each node in the graph.
    3. For each node that is not one-in-one-out, and for each of its outgoing
       edges, follow the edge and keep walking while the current node is
       one-in-one-out. The start node plus the last character of every node
       reached spells one contig. Remember the one-in-one-out nodes passed.
    4. Any one-in-one-out node not passed in step 3 lies on an isolated cycle;
       walk each such cycle once and spell it the same way.
    5. Return the list of contigs.
    """
    outs_graph = debruijn_graph_from_kmers(kmers)
    ins_graph = get_incoming_edges(outs_graph)

    result = []
    visited = set()  # one-in-one-out nodes already placed on some contig

    # Iterate over snapshots of the keys: the degree checks may add entries
    # to the default dictionaries while we loop.
    for kmer in list(outs_graph.keys()):
        if is_one_in_one_out(kmer, ins_graph, outs_graph):
            continue
        for out in outs_graph[kmer]:
            # Collect pieces in a list and join once instead of growing a
            # string character by character.
            pieces = [kmer, out[-1]]
            current = out
            while is_one_in_one_out(current, ins_graph, outs_graph):
                visited.add(current)
                current = outs_graph[current][0]
                pieces.append(current[-1])
            result.append(''.join(pieces))

    # Isolated cycles: every node on them is one-in-one-out, so none of them
    # was reachable from a branching node above.
    for kmer in list(outs_graph.keys()):
        if kmer in visited or not is_one_in_one_out(kmer, ins_graph, outs_graph):
            continue
        visited.add(kmer)
        pieces = [kmer]
        current = outs_graph[kmer][0]
        while current != kmer:
            visited.add(current)
            pieces.append(current[-1])
            current = outs_graph[current][0]
        # Close the cycle by spelling the edge back to the start node.
        pieces.append(current[-1])
        result.append(''.join(pieces))

    return result

def display_contigs(contigs: List[str]) -> str:
    """
    Format contigs as one space-separated line.

    Args:
        contigs: A list of contigs.

    Returns:
        The contigs joined by single spaces, in the given order.
    """
    separator = ' '
    return separator.join(contigs)

In [212]:
# Sample 3-mers; 'ATG' occurs twice and therefore yields two identical contigs.
sample_input = [
    'ATG',
    'ATG',
    'TGT',
    'TGG',
    'CAT',
    'GGA',
    'GAT',
    'AGA',
]
sample_output = 'AGA ATG ATG CAT GAT TGGA TGT'
# Contigs are sorted before comparing, so their generation order does not matter.
assert sample_output == display_contigs(sorted(generate_contigs(sample_input)))

In [217]:
# The whole file is split on whitespace, giving one k-mer per token.
input_filename = 'dataset_205_5'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.read().split()

contigs = generate_contigs(test_input)

# 'dataset_<ids>' becomes 'submission_<ids>'.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write(display_contigs(contigs))

## Challenge: Carsonella ruddii

Given a collection of simulated error-free read-pairs (with exact distance d = 1000 between reads of length k = 120 within a read-pair), use the paired de Bruijn graph to reconstruct the Carsonella ruddii genome. Compare this assembly to the assembly obtained from the classic de Bruijn graph (i.e., when all we know is the reads themselves and do not know the distance between paired reads) in order to better appreciate the benefits of read-pairs. For each k, what is the minimum value of d needed to enable reconstruction of the entire Carsonella ruddii genome from its (k,d)-mer composition?

In [220]:
# Read length and read-pair distance given in the challenge statement.
k = 120
d = 1000

# Plain string literal: the path has no placeholders, so no f-string is needed.
# NOTE(review): assumes reads.txt holds one "first|second" read-pair per line - confirm.
with open('data/reads.txt', 'r') as reads_file:
    read_pairs = reads_file.readlines()

result = reconstruction_from_read_pairs(read_pairs, k, d)