<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_3J.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

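# String Reconstruction from Read-Pairs Problem

Given integers $k$ and $d$ and a collection of paired $k$-mers written as `left|right`, reconstruct a string whose paired $(k, d)$-mer composition equals those pairs by finding an Eulerian path in the paired de Bruijn graph.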

### Functions

In [1]:
import random
import copy

In [2]:
def prefix(string):
    """Return `string` without its final element (all but the last character)."""
    last_index = len(string) - 1
    return string[:last_index]

def suffix(string):
    """Return `string` without its first element (all but the first character)."""
    return string[1:len(string)]

In [3]:
def SearchCycle(graph, start_node):
    """Take one random walk through `graph`, using each edge at most once.

    Starting at `start_node`, repeatedly follow a randomly chosen unused
    outgoing edge until the current node has no unused outgoing edge left.
    Despite the name, the walk is not required to end where it started.

    Args:
        graph: dict mapping a node to the list of its successor nodes.
            Repeated successors are treated as parallel edges.
            The dict and its lists are never modified.
        start_node: node to start walking from. It may be a node that has no
            entry in `graph` (a pure sink); the walk is then just that node.

    Returns:
        A tuple `(cycle, avail_nodes)`:
          * `cycle` - the list of nodes visited, in order, beginning with
            `start_node`.
          * `avail_nodes` - every key of `graph` (in dict order) that still
            has at least one unused outgoing edge after the walk, whether or
            not the walk visited it.
    """
    # Private, consumable copy of the adjacency lists.
    unused = {node: list(targets) for node, targets in graph.items()}

    cycle = [start_node]
    current = start_node

    # `.get` makes nodes that never appear as keys behave as dead ends
    # instead of raising KeyError.
    while unused.get(current):
        successor = random.choice(unused[current])  # random unused edge
        unused[current].remove(successor)           # mark that edge as used
        cycle.append(successor)
        current = successor

    avail_nodes = [node for node, targets in unused.items() if targets]
    return cycle, avail_nodes

In [4]:
def EulerianPath(graph):
    """Return an Eulerian path (or cycle) of a directed graph.

    Uses Hierholzer's algorithm: every edge is consumed exactly once, so the
    running time is linear in the number of edges and the result is
    deterministic for a given `graph` (edges are tried in list order).

    Args:
        graph: dict mapping a node to the list of its successor nodes.
            Nodes that only appear as successors need no entry of their own.
            Repeated successors are treated as parallel edges.
            The dict and its lists are never modified.

    Returns:
        The list of nodes visited along a walk that uses every edge exactly
        once. For an empty `graph` the result is `[]`; for a graph without
        edges it is a list holding only the first key.

    Raises:
        ValueError: if no walk can use every edge exactly once, either
            because node degrees are unbalanced or because some edges cannot
            be reached from the start node.
    """
    if not graph:
        return []

    # Net degree (out-degree minus in-degree) of every node, including nodes
    # that only ever appear as edge targets.
    balance = {}
    edge_count = 0
    for node, targets in graph.items():
        balance[node] = balance.get(node, 0) + len(targets)
        edge_count += len(targets)
        for target in targets:
            balance[target] = balance.get(target, 0) - 1

    # An Eulerian path allows at most one node with one extra outgoing edge
    # (the start) and at most one with one extra incoming edge (the end).
    surplus = [node for node, diff in balance.items() if diff > 0]
    deficit = [node for node, diff in balance.items() if diff < 0]
    if (len(surplus) > 1 or len(deficit) > 1
            or any(abs(diff) > 1 for diff in balance.values())):
        raise ValueError("graph has no Eulerian path: node degrees are unbalanced")

    if surplus:
        start_node = surplus[0]
    else:
        # Every node is balanced (Eulerian cycle): start at the first node
        # that has an outgoing edge, or the first key if there are no edges.
        start_node = next(iter(graph))
        for node, targets in graph.items():
            if targets:
                start_node = node
                break

    # Consumable copy of the adjacency lists, reversed so that pop() hands
    # out the edges in their original list order.
    remaining = {node: list(targets)[::-1] for node, targets in graph.items()}

    # Iterative Hierholzer: walk forward while edges remain; once a node is
    # exhausted it is final and gets emitted (the path is built back to front).
    stack = [start_node]
    path = []
    while stack:
        current = stack[-1]
        if remaining.get(current):
            stack.append(remaining[current].pop())
        else:
            path.append(stack.pop())
    path.reverse()

    # A path over E edges visits E + 1 nodes; anything shorter means part of
    # the graph was unreachable from the start node.
    if len(path) != edge_count + 1:
        raise ValueError("graph has no Eulerian path: not all edges are reachable")

    return path

In [5]:
def PairedDeBruijn(patterns):
    """Build the paired de Bruijn graph of a collection of read-pairs.

    Each pattern is a string "left|right". It contributes one edge from the
    node "prefix(left)|prefix(right)" to the node "suffix(left)|suffix(right)".
    Patterns are processed in sorted order.

    Args:
        patterns: iterable of "left|right" strings.

    Returns:
        dict mapping each source node to the list of its target nodes
        (one list entry per contributing pattern).
    """
    graph = {}

    for read_pair in sorted(patterns):
        halves = read_pair.split('|')
        left_kmer, right_kmer = halves[0], halves[1]

        # Same construction as an ordinary de Bruijn graph, applied to both
        # k-mers of the pair at once.
        source = "|".join((prefix(left_kmer), prefix(right_kmer)))
        target = "|".join((suffix(left_kmer), suffix(right_kmer)))

        graph.setdefault(source, []).append(target)

    return graph

In [8]:
def ReadPairsStrRecon(k, d, pairs):
    """Reconstruct a string from its paired (k, d)-mer composition.

    Builds the paired de Bruijn graph of `pairs`, takes an Eulerian path
    through it, and spells the string from that path:

      1. the left halves of the path nodes spell a prefix string,
      2. the right halves spell a suffix string,
      3. the suffix string starts k + d positions after the prefix string,
         so the result is the prefix string followed by the last k + d
         characters of the suffix string.

    Args:
        k: length of each k-mer in a read-pair.
        d: offset term used together with k; the right k-mers are taken to
           start k + d positions after the left ones.
        pairs: iterable of read-pairs written as "left|right".

    Returns:
        The reconstructed string, or "" when `pairs` yields no edges.

    Raises:
        ValueError: if the path is too short for the two halves to meet, or
            if the halves disagree where they overlap (the chosen Eulerian
            path does not spell any string).
    """
    # Create paired De Bruijn graph and find Eulerian path
    paired_db_graph = PairedDeBruijn(pairs)
    if not paired_db_graph:
        return ""  # no read-pairs, nothing to spell
    path = EulerianPath(paired_db_graph)

    # Split every "left|right" node into its two halves.
    left_parts, right_parts = [], []
    for node in path:
        halves = node.split("|")
        left_parts.append(halves[0])
        right_parts.append(halves[1])

    # First half in full, then the last nucleotide of each following half.
    prefix_string = left_parts[0] + "".join(part[-1] for part in left_parts[1:])
    suffix_string = right_parts[0] + "".join(part[-1] for part in right_parts[1:])

    offset = k + d
    overlap = len(prefix_string) - offset  # characters covered by both strings

    # Slicing the path with a negative start (as `path[len(path)-k-d:]` does
    # for short paths) silently drops characters, so work on the spelled
    # strings and reject inputs whose halves cannot meet.
    if overlap < 0:
        raise ValueError(
            "too few read-pairs to bridge the gap between the paired k-mers")

    if prefix_string[offset:] != suffix_string[:overlap]:
        raise ValueError(
            "the Eulerian path does not spell a string consistent with the read-pairs")

    return prefix_string + suffix_string[overlap:]

In [7]:
# Sample dataset: nine read-pairs, each written as "left|right"
pairs = [
    "GAGA|TTGA", "TCGT|GATG", "CGTG|ATGT",
    "TGGT|TGAG", "GTGA|TGTT", "GTGG|GTGA",
    "TGAG|GTTG", "GGTC|GAGA", "GTCG|AGAT",
]

# Reconstruct the string with k = 4 and d = 2
print(ReadPairsStrRecon(4, 2, pairs))

GTGGTCGTGAGATGTTGA


### Test Cases

In [9]:
# Create a function for test suite
def TestSuite(function, cases):
    """Run `function` on every test case and print a pass/fail report.

    Args:
        function: callable invoked as `function(k, d, pairs)`.
        cases: sequence of `(k, d, pairs, answer)` tuples; a case passes when
            the call's result equals `answer`.

    Returns:
        None. The report is written to standard output.
    """
    banner = "*" * 50

    print(banner)
    print("TEST SUITE\n")

    passed = 0
    for number, (k, d, pairs, answer) in enumerate(cases, start=1):
        result = function(k, d, pairs)

        if result == answer:
            passed += 1
            report = "- Test Case {} Passed. Expected: {}, Actual: {}"
        else:
            report = "- Test Case {} Failed. Expected: {}, Actual: {}"
        print(report.format(number, answer, result))

    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print(banner)

In [10]:
# Test cases for the suite: each one is a (k, d, read-pairs, expected string) tuple
case1 = (
    4,
    2,
    [
        "GAGA|TTGA", "TCGT|GATG", "CGTG|ATGT",
        "TGGT|TGAG", "GTGA|TGTT", "GTGG|GTGA",
        "TGAG|GTTG", "GGTC|GAGA", "GTCG|AGAT",
    ],
    "GTGGTCGTGAGATGTTGA",
)

cases = [case1]

TestSuite(ReadPairsStrRecon, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: GTGGTCGTGAGATGTTGA, Actual: GTGGTCGTGAGATGTTGA

1 out of 1 passed. END OF TEST SUITE.
**************************************************
