In [1]:
def read_In(fileName):
    """Read k-mers from a text file (one per line) and return them sorted.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A lexicographically sorted list of the non-empty lines of the file,
        each with its trailing newline removed. Duplicate lines are kept.
    """
    with open(fileName, 'r') as f:
        # Only the line terminator is removed; any other characters on the
        # line are preserved as part of the k-mer.
        stripped = (line.rstrip('\n') for line in f)
        # Blank lines (e.g. a trailing empty line) are skipped: an empty
        # k-mer has no prefix/suffix and has no last character to append
        # when contigs are spelled out.
        composition = [kmer for kmer in stripped if kmer]
    composition.sort()
    return composition

In [2]:
def DeBruijnGraph(composition):
    """Build a De Bruijn adjacency list from a collection of k-mers.

    Every k-mer contributes one edge from its prefix (all characters but the
    last) to its suffix (all characters but the first). Repeated k-mers
    produce repeated edges.

    Args:
        composition: Iterable of k-mer strings.

    Returns:
        A dict mapping each prefix to the list of suffixes it points to, in
        the order the k-mers were supplied.
    """
    adjacency = {}
    for kmer in composition:
        prefix, suffix = kmer[:-1], kmer[1:]
        adjacency.setdefault(prefix, []).append(suffix)
    return adjacency

In [3]:
def remove_edge(DeBruijn, from_node, to_node):
    """Delete one edge from_node -> to_node from the adjacency list in place.

    Only the first occurrence of ``to_node`` is removed, so parallel edges
    are consumed one at a time. When the last outgoing edge of ``from_node``
    is removed, the node's key is dropped from the dict as well.

    Args:
        DeBruijn: Dict mapping a node to the list of its successors.
        from_node: Source node of the edge.
        to_node: Target node of the edge.

    Returns:
        The same dict object that was passed in, after modification.
    """
    successors = DeBruijn[from_node]
    successors.remove(to_node)
    if len(successors) == 0:
        DeBruijn.pop(from_node)
    return DeBruijn

In [4]:
def maximal_non_branching_paths(DeBruijn):
    """Return every maximal non-branching path of a directed graph.

    A node is "1-in-1-out" when it has exactly one incoming and one outgoing
    edge. A maximal non-branching path starts and ends at nodes that are not
    1-in-1-out and passes only through 1-in-1-out nodes in between. Cycles
    made up solely of 1-in-1-out nodes (isolated cycles) are reported too,
    with the start node repeated at the end.

    The input graph is not modified; edges are consumed from a private copy.

    Args:
        DeBruijn: Dict mapping each node to the list of its successor nodes.
            Parallel edges are represented by repeated successors.

    Returns:
        A list of paths, each path being a list of nodes. Paths starting at
        branching nodes come first, followed by isolated cycles.
    """
    # Private working copy so the caller's graph survives the traversal.
    # Nodes with an empty successor list carry no edges and are left out.
    graph = {node: list(targets) for node, targets in DeBruijn.items() if targets}

    # degrees[node] == [in_degree, out_degree]
    degrees = {}
    for source, targets in graph.items():
        degrees.setdefault(source, [0, 0])[1] += len(targets)
        for target in targets:
            degrees.setdefault(target, [0, 0])[0] += 1

    def take_edge(node):
        """Consume the first remaining outgoing edge of node; return its target."""
        targets = graph[node]
        successor = targets.pop(0)
        if not targets:
            # A node without remaining edges must disappear so that
            # "node in graph" / "while graph" reflect unused edges only.
            del graph[node]
        return successor

    paths = []

    # Phase 1: paths that start at a node which is not 1-in-1-out.
    for node, (in_degree, out_degree) in degrees.items():
        if (in_degree, out_degree) == (1, 1) or out_degree == 0:
            continue
        # One path per outgoing edge of the start node.
        while node in graph:
            path = [node, take_edge(node)]
            # Keep walking while the current end is 1-in-1-out; such a node
            # is entered through its only incoming edge, so its single
            # outgoing edge is still available here.
            while degrees[path[-1]] == [1, 1]:
                path.append(take_edge(path[-1]))
            paths.append(path)

    # Phase 2: whatever edges remain belong to isolated cycles, i.e. cycles
    # consisting only of 1-in-1-out nodes that phase 1 could never reach.
    while graph:
        start = next(iter(graph))
        cycle = [start, take_edge(start)]
        while cycle[-1] != start:
            cycle.append(take_edge(cycle[-1]))
        paths.append(cycle)

    return paths

In [48]:
def return_path(path_composition):
    """Spell each node path as a contig and return all contigs as one string.

    A contig is the first node of the path followed by the last character of
    every subsequent node.

    Args:
        path_composition: Iterable of paths, each a non-empty sequence of
            node strings.

    Returns:
        The contigs sorted lexicographically and joined by single spaces.
    """
    spelled = sorted(
        ''.join([path[0], *(node[-1] for node in path[1:])])
        for path in path_composition
    )
    return ' '.join(spelled)

In [57]:
# Script entry point: read k-mers, build the De Bruijn graph, extract the
# maximal non-branching paths and print the resulting contigs.
if __name__ == '__main__':
    # Sorted k-mers, one per line of 'rtext.txt' (path is relative to the
    # current working directory).
    graph = read_In('rtext.txt')
    # Adjacency list: (k-1)-mer prefix -> list of (k-1)-mer suffixes.
    DeBruijn = DeBruijnGraph(graph)
    # List of node paths, one per contig.
    path_composition = maximal_non_branching_paths(DeBruijn)
    # Space-separated, sorted contig strings.
    path = return_path(path_composition)
    print(path)

AAACAGACCCACGTGTTGC AAACAGACCCACGTGTTGC AAACATGTTCAAACGTTTG AAACATGTTCAAACGTTTG AAACCATGCAATACGTAAT AAACCATGCAATACGTAAT AAACTTCTTGATATGGTCAATGCGCGACTGACGCA AAACTTGGCCAGTCATGTACCTGGGAAAGCT AACAGACCCACGTGTTGCC AACAGACCCACGTGTTGCC AACAGTTAGGTTGCCCGATAGCTGCGAAAGAGGTCAC AACATGTTCAAACGTTTGG AACATGTTCAAACGTTTGG AACCAACTATATAGTACAT AACCAACTATATAGTACAT AACCATGCAATACGTAATC AACCATGCAATACGTAATC AACGCCTCCATGAACCCAAGACTTCCCGGTCG AAGACTTCCCGGTCGATCTGGACTTACCCTACG AAGATCTCAGGCGTGCTAG AAGATCTCAGGCGTGCTAG AAGCGCAGTGTGTCTTATCCCCGACTGGAAGAG AAGGGTGAAGATATCTAAA AAGGGTGAAGATATCTAAA AAGTATGCAGAGTACAAGT AAGTATGCAGAGTACAAGT AATGCGCGACTGACGCAGG AATGCGCGACTGACGCAGG AATTCGTGGAGAACTCAAC AATTCGTGGAGAACTCAAC AATTGAAGAAACGTGTAAT AATTGAAGAAACGTGTAAT ACAAGCGCAGTGTGTCTTA ACAAGCGCAGTGTGTCTTA ACAGACCCACGTGTTGCCA ACAGACCCACGTGTTGCCA ACATGTTCAAACGTTTGGC ACATGTTCAAACGTTTGGC ACCAACTATATAGTACATTCTGCCTACAGTTGTCGT ACCATGCAATACGTAATCG ACCATGCAATACGTAATCG ACCATGTTAGCCTTGTGCA ACCATGTTAGCCTTGTGCA ACGATGTGGGCGTACAGTA ACGATGTGGGCGTACA