In [1]:
import numpy as np
import pandas as pd
from os import path,makedirs
from pathlib import Path
import matplotlib.pyplot as plt
import sys
import shelve
import pprint

In [2]:
def ReadFasta(filename):
    """Load a text file and hand back its lines.

    Parameters
    ----------
    filename : str or path-like
        Location of the file, opened in text mode for reading.

    Returns
    -------
    list of str
        One entry per line, in file order. Nothing is stripped here, so each
        entry still carries its line terminator.
    """
    with open(filename, 'r') as handle:
        lines = list(handle)
    return lines

In [3]:
def ParseReads(data):
    """Turn raw FASTA lines into a ``{header: sequence}`` dictionary.

    Parameters
    ----------
    data : iterable of str
        Lines of a FASTA file (e.g. the output of ``ReadFasta``). Trailing
        whitespace, including the newline, is stripped from every line.

    Returns
    -------
    dict
        Maps each header line (leading ``'>'`` kept, trailing whitespace
        removed) to its sequence. A sequence wrapped over several lines is
        joined into one string; a header with no sequence lines maps to ``''``.
        If a header occurs more than once, the last record wins.

    Notes
    -----
    Lines that appear before the first header are ignored.
    """
    chunks = {}
    key = None
    for raw_line in data:
        line = raw_line.rstrip()  # remove newline char
        if line.startswith('>'):
            # A new record starts; reset in case the header was seen before.
            key = line
            chunks[key] = []
        elif key is not None:
            # Every non-header line belongs to the most recent header, so
            # wrapped sequences are kept whole instead of being truncated.
            chunks[key].append(line)
    return {header: ''.join(parts) for header, parts in chunks.items()}

In [4]:
def ConvertToBinary(s):
    """Map every element of ``s`` to its built-in ``hash`` value.

    Parameters
    ----------
    s : iterable of hashable
        Typically a string; each character is hashed on its own.

    Returns
    -------
    numpy.ndarray
        1-D float64 array with one hash value per element of ``s`` (empty
        array for empty input).

    Notes
    -----
    Python salts ``str`` hashes per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so the values are only comparable within a
    single run.
    """
    # Build the array once: calling np.append inside a loop copies the whole
    # array on every iteration, which is quadratic in len(s).
    return np.array([hash(letter) for letter in s], dtype=np.float64)

In [5]:
class GraphNode:
    """A single node record used by the graph code in this notebook.

    Each instance starts with the sequence it was created from, two empty
    edge lists and a frequency counter of one. The lists and the counter are
    filled in or updated by code outside this class.
    """

    def __init__(self, seq):
        """Create a node for ``seq`` with no edges and a frequency of 1."""
        # Sequence text this node stands for.
        self.seq = seq
        # Outgoing / incoming edge containers; separate fresh lists per node.
        self.next, self.prev = [], []
        # Occurrence counter, starting at one for the creating occurrence.
        self.freq = 1

In [6]:
def construct_sequence_from_nodes(node_list):
    """Spell out the sequence described by a path of overlapping nodes.

    Parameters
    ----------
    node_list : sequence of objects with a ``seq`` string attribute
        Nodes in path order.

    Returns
    -------
    str
        The first character of every node except the last, followed by the
        full ``seq`` of the last node. An empty path yields ``''``.
    """
    # An empty path has no last node; return an empty sequence rather than
    # failing with IndexError (callers use this while building error messages).
    if not node_list:
        return ''
    leading = ''.join(n.seq[0] for n in node_list[:-1])
    return leading + node_list[-1].seq

In [7]:
class DBGraph:
    """Empty placeholder class; it defines no attributes or methods of its own."""

In [8]:
def RemoveEdge(edge_list,edge_to_remove):
    """Remove the first occurrence of ``edge_to_remove`` from ``edge_list``.

    Parameters
    ----------
    edge_list : list
        Edge container; it is modified in place.
    edge_to_remove : object
        Edge value to delete (compared with ``==``).

    Returns
    -------
    list
        The same ``edge_list`` object, i.e. the edges that REMAIN after the
        removal -- not the removed edge.

    Notes
    -----
    A missing edge is not an error: a message naming the edge is printed and
    the list is returned unchanged.
    """
    try:
        edge_list.remove(edge_to_remove)
    except ValueError:
        # Wrap in a 1-tuple so tuple-valued edges format correctly too.
        print('Edge %s not found in list! Please check.' % (edge_to_remove,))
    return edge_list

In [9]:
def SelectNextNode(current_node,node_table):
    """Pick the edge label to follow out of ``current_node``.

    Parameters
    ----------
    current_node : object with ``seq`` (str) and ``next`` (list) attributes
        Node whose outgoing edges are considered.
    node_table : mapping
        Maps a node sequence to its node object. The successor reached via
        edge label ``x`` is looked up under ``current_node.seq[1:] + x``.

    Returns
    -------
    str-like or None
        One of the distinct labels in ``current_node.next`` (a NumPy string
        scalar), or ``None`` when the node has no outgoing edges.

    Notes
    -----
    With several distinct labels the choice is random (``np.random.choice``,
    global NumPy RNG state) among the best available group, in this order:
    successors with more than one outgoing edge, then successors with at
    least one, then any label at all. A successor that is missing from
    ``node_table`` counts as having no outgoing edges instead of raising
    ``KeyError``.
    """
    options = np.unique(current_node.next)
    if len(options) == 0:
        return None
    if len(options) == 1:
        return options[0]

    suffix = current_node.seq[1:]

    def _out_degree(base):
        # Out-degree of the successor reached via `base`; 0 when that
        # successor was never stored in the table.
        key = suffix + str(base)
        return len(node_table[key].next) if key in node_table else 0

    #node degree of all our possible next nodes
    possible_next_node_degrees = np.array([_out_degree(x) for x in options])

    #prefer successors that aren't bridges (degree > 1), then any that can
    #still be left (degree > 0)
    for threshold in (1, 0):
        candidates = options[possible_next_node_degrees > threshold]
        if len(candidates) > 0:
            return np.random.choice(candidates)
    #none of them have any edges: just pick one at random, we will stop here anyways!
    return np.random.choice(options)

In [35]:
# reads = ParseReads(ReadFasta('../data/READS.fasta'))
# 
# h = np.histogram([len(x) for x in list(reads.values())],bins=100)
# read_lengths = [len(x) for x in list(reads.values())]
# plt.hist([len(x) for x in list(reads.values())],bins=100)
# 
# %%time
# #read and parse all our k-mers
# kmersize = 27
# node_table = {}
# for read in reads.values():
#     idx = 0;
#     previousNode = None
#     while idx<len(read)-kmersize+1:
#         kmer = read[idx:kmersize+idx]
#         kmer_1 = kmer[0:-1]
#         kmer_2 = kmer[-1]
#         if kmer_1 in node_table:#grab the existing node
#             currentNode = node_table[kmer_1]
#             #add one to the frequency
#             currentNode.freq+=1
#         else:#create new node
#             currentNode = GraphNode(kmer_1)
#         #add our next node
#         currentNode.next.append(kmer_2)
#         #add the previous node if its here
#         if previousNode:
#             currentNode.prev.append(previousNode)
#         #save it to our node table!
#         node_table[kmer_1] = currentNode
#         #set our previous node for reversal of the graph
#         previousNode = kmer_1[0]
#         idx+=1
# 
# outname = '../data/DeBruijneGraph'
# graph = shelve.open(outname)
# graph['node_table'] = node_table
# graph['kmersize'] = kmersize
# graph['reads'] = reads
# graph.close()

In [10]:
# Load the previously saved graph objects from the shelf on disk.
# NOTE(review): shelve unpickles whatever the file contains, so only open shelf
# files you created yourself. Presumably GraphNode must already be defined in
# this session for the stored nodes to load -- confirm against the cell that
# wrote the shelf.
outname = '../data/DeBruijneGraph'
# flag='r' opens read-only, so a wrong path fails immediately instead of
# silently creating an empty shelf; the with-block closes the file handle
# once the three values have been copied into memory.
with shelve.open(outname, flag='r') as graph:
    node_table = graph['node_table']
    kmersize = graph['kmersize']
    reads = graph['reads']

In [11]:
# Load the query FASTA file and keep the sequence of its first record.
query_seq = [*ParseReads(ReadFasta('../data/QUERY.fasta')).values()][0]

In [12]:
# Thread the query sequence through the graph: look up the node for every
# window of kmersize-1 characters, check that the base following the window is
# one of that node's outgoing edges, and consume (remove) that edge.
# NOTE: this edits the node objects in node_table in place, so re-running the
# cell without reloading node_table starts from an already-consumed graph.
idx = 0
solved_nodes = []
removed_edges = []
# A query of length n contains n-(kmersize-1)+1 = n-kmersize+2 windows.
while idx<len(query_seq)-kmersize+2:
    kmer_1 = query_seq[idx:kmersize+idx-1]
    try:
        cn = node_table[kmer_1]
    except KeyError:
        raise ValueError('Kmer not found %s'%kmer_1)
    #check that our next node solution is valid!
    # The base after this window sits at index kmersize+idx-1. Every window
    # except the very last one has such a base, including the second-to-last
    # window, which the previous `len(query_seq) > kmersize+idx` test skipped.
    if kmersize+idx-1 < len(query_seq):
        next_node = query_seq[kmersize+idx-1]
        if next_node not in cn.next:
            # Nothing may be solved yet; avoid indexing into an empty path.
            partial = construct_sequence_from_nodes(solved_nodes) if solved_nodes else ''
            raise ValueError('No solution found! Stopping at this sequence: %s'%partial)
        #remove the edge!
        # RemoveEdge returns the edges that remain, so record the removed
        # label explicitly instead of extending with its return value.
        RemoveEdge(cn.next,next_node)
        removed_edges.append(next_node)
    solved_nodes.append(cn)
    idx+=1

In [13]:
#extend our solution!
# Walk on from the last solved node, consuming one outgoing edge per step,
# until there is no edge left, the successor is unknown, or the path reaches
# the node cap below.
max_solution_nodes = 5000  # cap on the total path length (value kept from the original cell)
cn = solved_nodes[-1]
keep_looping = True
while keep_looping:
    next_na = SelectNextNode(cn, node_table)
    #if we can't find a next node, stop looping
    if next_na is None:
        keep_looping = False
    #if we can, look for it in the node table
    else:
        next_seq = cn.seq[1:]+next_na
        if next_seq not in node_table:
            print('Sequence %s not found in the node table!'%next_seq)
            # Stop without touching the graph or the path: there is no node to step onto.
            keep_looping = False
        else:
            next_node = node_table[next_seq]
            # RemoveEdge returns the remaining edges, so log the consumed label ourselves.
            RemoveEdge(cn.next,next_na)
            removed_edges.append(str(next_na))
            # Step onto the successor; without this the walk never leaves the start node.
            cn = next_node
            solved_nodes.append(cn)
    # >= so the cap also holds when the path is already at or past the limit.
    if len(solved_nodes) >= max_solution_nodes:
        keep_looping = False

<__main__.GraphNode at 0x20a0efcf850>