In [26]:
import random
import toyplot

In [27]:
def get_kmer_count_from_sequence(sequence, k=3, cyclic=True):
    """
    Count the kmers that occur in a sequence.

    Parameters
    ----------
    sequence : str
        The sequence to scan.
    k : int
        Length of the kmers to count; must be at least 1.
    cyclic : bool
        If True the sequence is treated as circular, so a kmer is read from
        every start position and windows that run past the end continue from
        the beginning (wrapping around more than once if k exceeds the
        sequence length). If False only windows that fit entirely inside the
        sequence are counted.

    Returns
    -------
    dict
        Maps each kmer that occurs (not every possible kmer) to the number
        of times it occurs, in order of first occurrence.

    Raises
    ------
    ValueError
        If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # dict to store kmers
    kmers = {}
    seqlen = len(sequence)

    # an empty sequence contains no kmers
    if seqlen == 0:
        return kmers

    if cyclic:
        # append the k-1 characters that follow the end of a circular
        # sequence; the sequence is repeated first so that enough characters
        # are available even when k - 1 exceeds its length
        wraps = (k - 1) // seqlen + 1
        template = sequence + (sequence * wraps)[:k - 1]
        nstarts = seqlen
    else:
        # only start positions with a full k-length window inside the sequence
        template = sequence
        nstarts = max(seqlen - k + 1, 0)

    # count occurrence of the kmer starting at each position
    for start in range(nstarts):
        kmer = template[start:start + k]
        kmers[kmer] = kmers.get(kmer, 0) + 1

    return kmers

In [28]:
def short_read_sequencing(sequence, nreads, readlen):
    """
    Generate short reads from a circular genome.

    Parameters
    ----------
    sequence : str
        The genome to sample reads from; treated as circular.
    nreads : int
        Number of reads to generate.
    readlen : int
        Length of every read; must be shorter than the sequence.

    Returns
    -------
    list
        `nreads` strings of length `readlen`. Start positions are drawn with
        the `random` module, so seed it for reproducible reads.

    Raises
    ------
    AssertionError
        If `readlen` is not shorter than the sequence.
    """
    seqlen = len(sequence)

    # do not allow reads to be as long as, or longer than, the genome
    assert seqlen > readlen, "readlen must be shorter than sequence"

    # return reads as a list, generate reads by slicing from sequence
    reads = []
    for _ in range(nreads):
        # randrange excludes its upper bound, so each of the seqlen start
        # positions is equally likely; an inclusive bound would also allow
        # start == seqlen, which produces the same read as start == 0
        start = random.randrange(seqlen)
        end = start + readlen

        # if read extends past end then loop to beginning of sequence
        if end > seqlen:
            read = sequence[start:] + sequence[:end - seqlen]
        else:
            read = sequence[start:end]

        reads.append(read)
    return reads

In [29]:
def get_kmer_count_from_reads(reads, k=3):
    "Sums the kmer counts from 'get_kmer_count_from_sequence()' over all reads"

    # running totals, keyed by kmer
    totals = {}

    for single_read in reads:

        # kmers of this one read; cyclic=False so no kmer wraps around
        # the end of a read
        per_read = get_kmer_count_from_sequence(single_read, k, cyclic=False)

        # fold this read's counts into the totals
        for kmer in per_read:
            count = per_read[kmer]
            totals[kmer] = totals[kmer] + count if kmer in totals else count

    return totals

In [30]:
def get_debruijn_edges_from_kmers(kmers):
    """
    Build the edges of a de Bruijn graph from a collection of kmers.

    Nodes are (k-1)mers and edges are kmers: every kmer contributes one
    directed edge from its (k-1) prefix to its (k-1) suffix. Two kmers that
    overlap by k-1 characters therefore share a node, which chains their
    edges together.

    Parameters
    ----------
    kmers : iterable of str
        The kmers to use, e.g. a dict keyed by kmer. Only the kmers
        themselves are used; counts stored as dict values are ignored.

    Returns
    -------
    set
        A set of (prefix, suffix) tuples, one per distinct kmer.
    """
    # one edge per kmer. Pairing up overlapping kmers instead would need a
    # comparison of every kmer against every other, and would lose the edge
    # of any kmer that has no successor (such as the last kmer of a linear
    # sequence) or whose only successor is itself
    return {(kmer[:-1], kmer[1:]) for kmer in kmers}

In [31]:
def plot_debruijn_graph(edges, width=500, height=500):
    "builds a toyplot graph from (source, target) edges and returns the result"
    # split each edge into its first and second member
    sources = [pair[0] for pair in edges]
    targets = [pair[1] for pair in edges]

    # appearance of vertices, vertex labels and edges
    vertex_style = {"stroke": "black", "stroke-width": 2, "fill": "white"}
    vertex_label_style = {"font-size": "11px"}
    edge_style = {"stroke": "black", "stroke-width": 2}

    # force-directed layout drawn with curved edges
    graph_layout = toyplot.layout.FruchtermanReingold(
        edges=toyplot.layout.CurvedEdges())

    return toyplot.graph(
        sources,
        targets,
        width=width,
        height=height,
        tmarker=">",
        vsize=25,
        vstyle=vertex_style,
        vlstyle=vertex_label_style,
        estyle=edge_style,
        layout=graph_layout,
    )

In [32]:
def random_sequence(seqlen):
    "Build a DNA string of `seqlen` bases, each picked at random from A, C, G, T"
    bases = []
    for _ in range(seqlen):
        # one random.choice call per base, in order
        bases.append(random.choice("ACGT"))
    return "".join(bases)

In [None]:
# seed Python's `random` module so the genome generated below is the same
# on every run
random.seed(123)

# generate a random 25-base sequence to act as the genome; the bare name on
# the last line displays it as the cell's output
genome1 = random_sequence(25)
genome1

In [None]:
# count the 3-mers in genome1; `cyclic` is left at its default (True), so
# kmers that wrap from the end of the genome to its start are included.
# With only 25 start positions, not all 64 possible 3-mers can occur.
kmers = get_kmer_count_from_sequence(genome1, k=3)
kmers

In [None]:
# edges of the de Bruijn (db) graph built from the current `kmers`
# (the k=3 counts when cells are run in order); each edge is a tuple of
# two (k-1)mer nodes
edges = get_debruijn_edges_from_kmers(kmers)
edges

In [None]:
# count 6-mers with genome1 treated as circular (kmers wrap past the end);
# note this rebinds the notebook-level `kmers`
kmers = get_kmer_count_from_sequence(genome1, k=6, cyclic=True)

# build the db graph edges from those 6-mers (rebinds `edges`)
edges = get_debruijn_edges_from_kmers(kmers)

# plot db graph; the trailing semicolon suppresses the cell's output repr
plot_debruijn_graph(edges, width=600, height=400);

# print the true sequence for comparison with the plotted graph
print("the true sequence: {}".format(genome1))

In [None]:
# count 6-mers with genome1 treated as linear: cyclic=False skips the
# windows that would run past the end of the sequence
kmers = get_kmer_count_from_sequence(genome1, k=6, cyclic=False)

# build the db graph edges from those 6-mers (rebinds `edges`)
edges = get_debruijn_edges_from_kmers(kmers)

# plot db graph; the trailing semicolon suppresses the cell's output repr
plot_debruijn_graph(edges, width=600, height=400);

# print the true sequence for comparison with the plotted graph
print("the true sequence: {}".format(genome1))

In [None]:
# repeat the linear (cyclic=False) analysis of genome1 with shorter kmers, k=4
kmers = get_kmer_count_from_sequence(genome1, k=4, cyclic=False)
edges = get_debruijn_edges_from_kmers(kmers)
# no trailing semicolon here, so the value returned by the plotting call is
# also shown as the cell's output
plot_debruijn_graph(edges, width=800, height=400)

In [None]:
# reseed so this cell gives the same genome and reads on every run
random.seed(123)
# a new random genome of 50 bases
genome = random_sequence(50)
# sample 1000 reads of 15 bases each from the genome, treated as circular
reads = short_read_sequencing(genome, 1000, 15)
# pool the 8-mer counts of all reads (kmers never wrap within a read)
kmers = get_kmer_count_from_reads(reads, k=8)
# build the db graph edges from the pooled 8-mers and plot them
edges = get_debruijn_edges_from_kmers(kmers)
plot_debruijn_graph(edges);
# print the genome the reads were sampled from, for comparison with the graph
print(genome)