In [143]:
# Final script: times the whole script (module-level timer) and, separately, the reconstruction step (timer started after the input data is loaded and stopped before the output is printed)
# !/usr/bin/env python3
import sys
from collections import defaultdict
import random
import time
# Wall-clock timestamp taken when the script starts executing; the module-level
# print at the bottom of this script reports the time elapsed since this point.
start_time = time.time()


########################################################################
# File:problem14.py
#  executable: problem14.py
#  purpose: Reconstruct a String from its k-mer Composition
#  stderr: errors and status
#  stdout:
#
# Author: Arushi Mithal
#
#
# Notes:  1. To run the program from command line terminals:
#          Unix/Windows: python  problem14.py < input.txt > output.out
#
#
# Specs of the laptop on which the tests were run:
#        Windows 10-64bit. Processor i-5 5200U CPU @2.20GHz 2.20 GHz
#        Internal RAM  4.00 GB
########################################################################


class StringReconstruction:
    """
        Reconstruct a string from its k-mer composition.

        The k-mers are turned into a De Bruijn graph (nodes are (k-1)-mers, every
        k-mer is one edge prefix -> suffix), an Eulerian path through that graph is
        found, and the path is spelled out as the reconstructed string.

        Typical use:
            s = StringReconstruction(k, kmers)
            s.deBruijn()          # build the graph
            text = s.cycle()      # consume the graph and return the string

        cycle() uses up the edges of the graph, so deBruijn() must be called
        again before every further call to cycle().
    """

    def __init__(self, k, kmers):
        """StringReconstruction Constructor.

        Args:
            k: length of the k-mers (stored only; the methods derive everything
                from the k-mer strings themselves).
            kmers: re-iterable collection (e.g. list) of k-mer strings.
        """
        self.k = k
        self.patterns = kmers
        # Adjacency lists of the De Bruijn graph: node -> sorted list of successors.
        self.sortedGraph = {}
        # Start (out-degree = in-degree + 1) and end (in-degree = out-degree + 1)
        # nodes of the Eulerian path; None until cycle() finds them, and None
        # afterwards if the graph is balanced (an Eulerian cycle).
        self.vnode = None
        self.knode = None

    def deBruijn(self):
        """ Build the De Bruijn graph of the k-mers and store it in self.sortedGraph.

        Every k-mer contributes one edge from its (k-1)-prefix to its (k-1)-suffix;
        repeated k-mers yield parallel edges. Keys and successor lists are sorted.
        Empty strings (e.g. from blank input lines) are ignored.
        """
        graph = defaultdict(list)
        for kmer in self.patterns:
            if kmer:
                graph[kmer[:-1]].append(kmer[1:])
        self.sortedGraph = {node: sorted(graph[node]) for node in sorted(graph)}

    def cycle(self):
        """ Find an Eulerian path in the graph and return the string it spells.

        The graph built by deBruijn() is consumed in the process.

        Returns:
            The reconstructed string, or '' when the graph has no edges.

        Raises:
            ValueError: if the k-mers admit no Eulerian path (wrong degree
                balance, or the edges do not form one connected component).
        """
        graph = self.sortedGraph
        self.vnode = None
        self.knode = None
        if not any(graph.values()):
            return ''

        # Count every edge individually so that parallel edges (repeated k-mers)
        # contribute their full multiplicity to the in-degree.
        in_degree = defaultdict(int)
        for targets in graph.values():
            for node in targets:
                in_degree[node] += 1

        nodes = list(graph) + [node for node in in_degree if node not in graph]
        for node in nodes:
            surplus = len(graph.get(node, ())) - in_degree.get(node, 0)
            if surplus == 0:
                continue
            if surplus == 1 and self.vnode is None:
                self.vnode = node  # one more outgoing edge: the path starts here
            elif surplus == -1 and self.knode is None:
                self.knode = node  # one more incoming edge: the path ends here
            else:
                raise ValueError(
                    "k-mers do not admit an Eulerian path: node %r is unbalanced" % (node,))

        if self.vnode is not None:
            # Close the path into a cycle with an artificial edge end -> start.
            graph.setdefault(self.knode, []).append(self.vnode)

        walk = self.eulerianGraph()
        if any(graph.values()):
            raise ValueError("k-mers do not admit an Eulerian path: graph is not connected")
        walk.reverse()  # now in traversal order: start, ..., start

        if self.vnode is None:
            # Already balanced: the cycle itself spells a string with this composition.
            return self.genomeString(walk)

        walk.pop()  # drop the repeated start so the list can be rotated freely
        for i, node in enumerate(walk):
            # walk[i - 1] wraps around to the last element when i == 0.
            if walk[i - 1] == self.knode and node == self.vnode:
                # Cut the cycle at the artificial edge to obtain the path.
                return self.genomeString(walk[i:] + walk[:i])
        raise ValueError("k-mers do not admit an Eulerian path")

    def eulerianGraph(self):
        """ Return an Eulerian cycle of self.sortedGraph (Hierholzer's algorithm).

        The walk starts at the first node that has an outgoing edge and removes
        every edge it uses from self.sortedGraph.

        Returns:
            List of nodes in reverse traversal order whose first and last
            elements are both the start node; [] if the graph has no edges.
        """
        graph = self.sortedGraph
        start = next((node for node, targets in graph.items() if targets), None)
        if start is None:
            return []
        stack = [start]
        cycle = []
        while stack:
            targets = graph.get(stack[-1])
            if targets:
                stack.append(targets.pop())  # follow an unused edge
            else:
                cycle.append(stack.pop())  # dead end: node is final in the tour
        return cycle

    def genomeString(self, kmerlist):
        """ Spell the string described by a list of consecutive, overlapping k-mers.

        The first element is taken whole; each later element contributes its last
        character if it overlaps its predecessor (suffix == prefix), otherwise it
        is skipped.

        Args:
            kmerlist: list of equal-length strings in path order.

        Returns:
            The spelled string, or '' for an empty list.
        """
        if not kmerlist:
            return ''
        pieces = [kmerlist[0]]
        for previous, current in zip(kmerlist, kmerlist[1:]):
            if previous[1:] == current[:-1]:
                pieces.append(current[-1])
        return ''.join(pieces)


def main(filename="rosalind_ba3h_762_7_dataset.txt"):
    """ Read a k-mer dataset, reconstruct the string and print it with timing.

    The file holds k on its first non-blank line and one k-mer on each following
    line; blank lines are ignored.

    Prints, in order: the time spent on reconstruction alone (input loading
    excluded), the reconstructed string, and its length.

    Args:
        filename: path of the dataset file to read.

    Raises:
        ValueError: if the file contains no data or its first line is not an integer.
    """
    with open(filename) as file:
        lines = [line.strip() for line in file if line.strip()]
    if not lines:
        raise ValueError("input file %r contains no data" % (filename,))
    k = int(lines[0])
    kmers = lines[1:]

    # Timer for the reconstruction only; kept separate from the module-level
    # timer that covers the whole script.
    compute_start = time.time()
    s1 = StringReconstruction(k, kmers)
    s1.deBruijn()
    a = s1.cycle()
    print("--- %s seconds ---" % (time.time() - compute_start))
    print(a)
    print(len(a))

# Run the reconstruction only when executed as a script (not when imported).
if __name__ == '__main__':
    main()
# Not part of the guard above: this always runs and reports the time elapsed
# since the module-level start_time was taken.
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.42470550537109375 seconds ---
AATGAACCGATACTGCGCCACGGGGACCGTATCGGATGCAAGTAGCCCGCCTATTGCCGACTACACCTTCTTATGTGCCGTTCTAGGGCCGTGTCCAAGGCCTACCGATGCGCGGTACATCAGAGCGCCGGGACTTCGCATCCTGGTAGTGTTCGGTTGCAACGAACGAGTCAGACGTATCCACTCGGTGTGTGCCCCCGAGACGGCGGCTTGTAATGATATGCGACCCCATCGTACGCTGAAGATCTGCGCAGAGGTGCTCGACTGTATTTTTCCTGGGAAGGTCGTAATGGTCGATCAGTTACACCATTGTGTATCTATTACAGAGGACAAGCCGCCTATTAACCAGATGGCAGAGGATCTCGTATAGAGGTTAGAGGTTCAGGCTCGATACATCACAACAGATACCAGCTTAGTCGCGTTACGGTCCACTATGGAAAAATGAGCTCATTTAGGTCCAAGACCGGCTCCTGGGATCGCCAACGAACCAGAAAAATAATAGTTTGAGACCCAACGAGGAAGATGTACTACCGTTTGTGGAAGACGTGCCCTAGCCACACGAGTTTTGTAACAAACTTAAATTTCCCGGGACGACGATTTGGGCACTGTTATTCAGTCCACTAGGTCTCGCTGTAAGATTTTGTAGAAATTTATTAGGGCCTTCTATAGAAGGGCCGGTATACATGAGCAGTTCCGTATCCCCGTTAAATCGCATGTCCATTTTCAAATCGAGAGTTCTACCTATGGAGCGAAAGATGAGTTGTGATTCTATGCTCATCGTGCATCTCGGATGGGGGCGAGCGAAGCTGAGTCATTCTAAATCGTAATGTGAGGCATTGCCAATTGCGAATGGGAGGGTAAGTGACAGTCCATTATCCTTATACGCGGAGCTAGAGAGCGCGGCCAAGGAGAAGTATGTAGCACCATGTAGCGTCTCTCGGTTTAAAGGGTTGGAAGAGGCTCA

In [93]:
# Final script, with the reconstruction repeated 100 times in a loop
# !/usr/bin/env python3
import sys
from collections import defaultdict
import random
import time
# Wall-clock timestamp taken when the script starts executing; main() and the
# module-level print at the bottom both report the time elapsed since this point.
start_time = time.time()


########################################################################
# File:problem14.py
#  executable: problem14.py
#  purpose: Reconstruct a String from its k-mer Composition
#  stderr: errors and status
#  stdout:
#
# Author: Arushi Mithal
#
#
# Notes:  1. To run the program from command line terminals:
#          Unix/Windows: python  problem14.py < input.txt > output.out
#
#
# Specs of the laptop on which the tests were run:
#        Windows 10-64bit. Processor i-5 5200U CPU @2.20GHz 2.20 GHz
#        Internal RAM  4.00 GB
########################################################################


class StringReconstruction:
    """
        Reconstruct a string from its k-mer composition.

        The k-mers are turned into a De Bruijn graph (nodes are (k-1)-mers, every
        k-mer is one edge prefix -> suffix), an Eulerian path through that graph is
        found, and the path is spelled out as the reconstructed string.

        Typical use:
            s = StringReconstruction(k, kmers)
            s.deBruijn()          # build the graph
            text = s.cycle()      # consume the graph and return the string

        cycle() uses up the edges of the graph, so deBruijn() must be called
        again before every further call to cycle().
    """

    def __init__(self, k, kmers):
        """StringReconstruction Constructor.

        Args:
            k: length of the k-mers (stored only; the methods derive everything
                from the k-mer strings themselves).
            kmers: re-iterable collection (e.g. list) of k-mer strings.
        """
        self.k = k
        self.patterns = kmers
        # Adjacency lists of the De Bruijn graph: node -> sorted list of successors.
        self.sortedGraph = {}
        # Start (out-degree = in-degree + 1) and end (in-degree = out-degree + 1)
        # nodes of the Eulerian path; None until cycle() finds them, and None
        # afterwards if the graph is balanced (an Eulerian cycle).
        self.vnode = None
        self.knode = None

    def deBruijn(self):
        """ Build the De Bruijn graph of the k-mers and store it in self.sortedGraph.

        Every k-mer contributes one edge from its (k-1)-prefix to its (k-1)-suffix;
        repeated k-mers yield parallel edges. Keys and successor lists are sorted.
        Empty strings (e.g. from blank input lines) are ignored.
        """
        graph = defaultdict(list)
        for kmer in self.patterns:
            if kmer:
                graph[kmer[:-1]].append(kmer[1:])
        self.sortedGraph = {node: sorted(graph[node]) for node in sorted(graph)}

    def cycle(self):
        """ Find an Eulerian path in the graph and return the string it spells.

        The graph built by deBruijn() is consumed in the process.

        Returns:
            The reconstructed string, or '' when the graph has no edges.

        Raises:
            ValueError: if the k-mers admit no Eulerian path (wrong degree
                balance, or the edges do not form one connected component).
        """
        graph = self.sortedGraph
        self.vnode = None
        self.knode = None
        if not any(graph.values()):
            return ''

        # Count every edge individually so that parallel edges (repeated k-mers)
        # contribute their full multiplicity to the in-degree.
        in_degree = defaultdict(int)
        for targets in graph.values():
            for node in targets:
                in_degree[node] += 1

        nodes = list(graph) + [node for node in in_degree if node not in graph]
        for node in nodes:
            surplus = len(graph.get(node, ())) - in_degree.get(node, 0)
            if surplus == 0:
                continue
            if surplus == 1 and self.vnode is None:
                self.vnode = node  # one more outgoing edge: the path starts here
            elif surplus == -1 and self.knode is None:
                self.knode = node  # one more incoming edge: the path ends here
            else:
                raise ValueError(
                    "k-mers do not admit an Eulerian path: node %r is unbalanced" % (node,))

        if self.vnode is not None:
            # Close the path into a cycle with an artificial edge end -> start.
            graph.setdefault(self.knode, []).append(self.vnode)

        walk = self.eulerianGraph()
        if any(graph.values()):
            raise ValueError("k-mers do not admit an Eulerian path: graph is not connected")
        walk.reverse()  # now in traversal order: start, ..., start

        if self.vnode is None:
            # Already balanced: the cycle itself spells a string with this composition.
            return self.genomeString(walk)

        walk.pop()  # drop the repeated start so the list can be rotated freely
        for i, node in enumerate(walk):
            # walk[i - 1] wraps around to the last element when i == 0.
            if walk[i - 1] == self.knode and node == self.vnode:
                # Cut the cycle at the artificial edge to obtain the path.
                return self.genomeString(walk[i:] + walk[:i])
        raise ValueError("k-mers do not admit an Eulerian path")

    def eulerianGraph(self):
        """ Return an Eulerian cycle of self.sortedGraph (Hierholzer's algorithm).

        The walk starts at the first node that has an outgoing edge and removes
        every edge it uses from self.sortedGraph.

        Returns:
            List of nodes in reverse traversal order whose first and last
            elements are both the start node; [] if the graph has no edges.
        """
        graph = self.sortedGraph
        start = next((node for node, targets in graph.items() if targets), None)
        if start is None:
            return []
        stack = [start]
        cycle = []
        while stack:
            targets = graph.get(stack[-1])
            if targets:
                stack.append(targets.pop())  # follow an unused edge
            else:
                cycle.append(stack.pop())  # dead end: node is final in the tour
        return cycle

    def genomeString(self, kmerlist):
        """ Spell the string described by a list of consecutive, overlapping k-mers.

        The first element is taken whole; each later element contributes its last
        character if it overlaps its predecessor (suffix == prefix), otherwise it
        is skipped.

        Args:
            kmerlist: list of equal-length strings in path order.

        Returns:
            The spelled string, or '' for an empty list.
        """
        if not kmerlist:
            return ''
        pieces = [kmerlist[0]]
        for previous, current in zip(kmerlist, kmerlist[1:]):
            if previous[1:] == current[:-1]:
                pieces.append(current[-1])
        return ''.join(pieces)


def main(filename="rosalind_ba3h_762_11_dataset.txt", repeats=100):
    """ Read a k-mer dataset and run the reconstruction repeatedly for timing.

    The file holds k on its first non-blank line and one k-mer on each following
    line; blank lines are ignored. The graph is rebuilt before every run because
    finding the path consumes it.

    Prints the reconstructed string of the last run, then the time elapsed
    since the module-level start_time.

    Args:
        filename: path of the dataset file to read.
        repeats: how many times to build the graph and reconstruct the string.

    Raises:
        ValueError: if the file contains no data or its first line is not an integer.
    """
    with open(filename) as file:
        lines = [line.strip() for line in file if line.strip()]
    if not lines:
        raise ValueError("input file %r contains no data" % (filename,))
    k = int(lines[0])
    kmers = lines[1:]

    s1 = StringReconstruction(k, kmers)
    a = ""  # stays empty only if repeats is 0
    for _ in range(repeats):
        s1.deBruijn()
        a = s1.cycle()
    print(a)
    print("%s seconds" % (time.time() - start_time))
    
    


# Run the timed reconstruction only when executed as a script (not when imported).
if __name__ == '__main__':
    main()
# Not part of the guard above: this always runs and reports the time elapsed
# since the module-level start_time was taken.
print("%s seconds" % (time.time() - start_time))

CAGAAGTCATGGGTCCTCCGGAGCCTGGAGGGAGGCCTGTTTCATTTGGCAACTGGGGCGCCGAACGTAAGGACTGGAAAGATAGAACTGTCGCATGGTCAGATATAAGCTGCGGGCCACACCCCCGTGAAATCTATAAGTACGCACAACAGTAATCAACAAATTCTAGCGTATTCTGAACGAACGCCATACGGAGTCATACCGCAGTAGTAATATATCGATGGAGCTCTAATCAAGTTCGCACGCCCCGGAGCCTCTTTTTGCTTAAACGTAGCATTCTCCCCTTGCACAATCTCCTTGCCGGCCTTCAAACCTCTTGAATATTCAGGATGATCCTCTTATGCTTGTCCATCTACCGTAGTCTGTATAAAATCTGCACCTCTGGATTACGAACTTATCCAGTCTGTCCAGACGGCTCTTATATTATTAGCGATATAACCAAGCCCAGGCTAACGCTCATCTGGTCCTGTAACCGAGCCTCTTAGAAGAACTGTATTGCATCATGGTATTAGGGTGAAAGAGCTTTTGCCCCACCTCTCGGGGGGCCGATACTCACATTATCGAGCATATGGAAAGTTGACACATACGGGCGCCACCGGCAAATACCGCATGGTTTGATTAAGTTGATAAATCGAAAACCCCTAAATCAGTGCGTGCTATGTTGTTACGGTCAAGGCGCCCAGAGAGGACTTAGAAAGGTGACGCAGCAACCTAGACTGCGTTGGTCAATCCCAGGAATCGGCCAATCAACAAAAACCTATCGGCATTCCCTTAGGCGGCAGAGGGTCGTCTTTCCACAAACACATACTATTACGCCGTGCCCTAAGTCTTCTTTACCGTACGGAAAGTTCTATTGGGTGGTCTATCAGGTTTATATCAACCTCGCCCGAGTCCCGCGTGTCGCGTGCATTATTTTGTTGTATCGGGCAATGGGCTAAACGGGCGAACTCTATTGAGATCGCCCTCGACAACGGGAATTAATAGTCGTGATCTACAGTTT