In [4]:
from boltons.iterutils import pairwise
from boltons.iterutils import windowed_iter as windowed

from boink import libboink
from boink.dbg import dBG
from boink.hashing import CanUnikmerShifter
from boink.storage import SparseppSetStorage
from boink.traversal import STATES

make_masked = libboink.make_masked

from debruijnal_enhance_o_tron.generators.generator import SequenceGenerator

from cppyy.gbl import std

In [2]:
class Tag:
    def __init__(self, link, L, R):
        self.link = link
        self.L = L
        self.R = R
    
    def __repr__(self):
        return f'<Tag link={self.link}, L={self.L}, R={self.R}>'

class Tip:
    def __init__(self, kmer, partner, neighbor_tag, unitig_dir):
        self.kmer = kmer
        self.partner = partner
        self.neighbor_tag = neighbor_tag
        self.unitig_dir = unitig_dir

class TagGraph:
    def __init__(self, tagger):
        self.tagger = tagger
        self.dbg = tagger.dbg
        self.tag_map = {}
        self.tip_map = {}
    
    def find_new_tags(self, sequence):
        segments = self.tagger.build_new_segments(self.tagger.filter_new_extensions(self.tagger.find_new_extensions(sequence)))
        links = []
        tags = {}
        
        for segment in segments:
            segment_links = []
            for u, v in pairwise(segment):
                uh, up, unb = u
                vh, vp, vnb = v
            
                if self.tagger.is_tag(uh, vh):
                    segment_links.append((uh, vh))
            links.append(segment_links)
        
        make_link = lambda pair: (pair[0].value, pair[1].value) 
        
        for link_chain, segment in zip(links, segments):
            
            tags[make_link(link_chain[0])] = Tag(make_link(link_chain[0]), None, make_link(link_chain[1]))
            for ulink, vlink, wlink in windowed(link_chain, 3):
                tags[make_link(vlink)] = Tag(make_link(vlink), make_link(ulink), make_link(wlink))
            tags[make_link(link_chain[-1])] = Tag(make_link(link_chain[-1]), make_link(link_chain[-2]), None)
        
        return tags
    
    def find_segments(self, sequence):
        return self.tagger.build_new_segments(self.tagger.filter_new_extensions(self.tagger.find_new_extensions(sequence)))
    
    def solve_internal_cdbg(self, sequence, segments):
        for segment in segments:
            for segment_u, segment_v in pairwise(segments):
                
    
    def find_split_seeds(self, sequence, segments):
        new = std.set[self.dbg.hash_type]()
        for segment in segments:
            for h, upos, (_, _) in segment:
                new.insert(h)
        
        masked = make_masked(self.dbg, new)
        
        for segment in segments:
            front_h, front_pos, (front_lneighbors, _) = segment[0]
            
            search_from = masked.filter_nodes(front_lneighbors)
            masked.set_cursor(sequence[front_pos:front_pos+K])
            
            

SyntaxError: unexpected EOF while parsing (<ipython-input-2-e46dd97bc70c>, line 60)

In [109]:
K = 31

tagger_type = libboink.cdbg.USparseGraph[SparseppSetStorage].Graph
store = SparseppSetStorage.build()
hasher = CanUnikmerShifter.build(K, 7)

graph = dBG[type(store), CanUnikmerShifter].build(store, hasher)
tagger = tagger_type.build(graph, hasher.ukhs_map)

generator = SequenceGenerator(ksize=K, rseed=42)

tg = TagGraph(tagger)
seq = generator.random_unitig(100)

In [111]:
tg.find_new_tags(seq)

{(7934545079051529215,
  2764121419379474404): <Tag link=(7934545079051529215, 2764121419379474404), L=None, R=(818812955200927988, 15129357215172498693)>,
 (818812955200927988,
  15129357215172498693): <Tag link=(818812955200927988, 15129357215172498693), L=(7934545079051529215, 2764121419379474404), R=(830867074220822841, 3680904267476707804)>,
 (830867074220822841,
  3680904267476707804): <Tag link=(830867074220822841, 3680904267476707804), L=(818812955200927988, 15129357215172498693), R=(8408950140949573313, 324211508350410007)>,
 (8408950140949573313,
  324211508350410007): <Tag link=(8408950140949573313, 324211508350410007), L=(830867074220822841, 3680904267476707804), R=(11481326741199404490, 2400874475101700531)>,
 (11481326741199404490,
  2400874475101700531): <Tag link=(11481326741199404490, 2400874475101700531), L=(8408950140949573313, 324211508350410007), R=(1566111346320143109, 2625455515602636957)>,
 (1566111346320143109,
  2625455515602636957): <Tag link=(156611134632014

In [102]:
tg.tag_map

{(7934545079051529215,
  2764121419379474404): <Tag link=(7934545079051529215, 2764121419379474404), L=None, R=(818812955200927988, 15129357215172498693)>,
 (818812955200927988,
  15129357215172498693): <Tag link=(818812955200927988, 15129357215172498693), L=(7934545079051529215, 2764121419379474404), R=(830867074220822841, 3680904267476707804)>,
 (830867074220822841,
  3680904267476707804): <Tag link=(830867074220822841, 3680904267476707804), L=(818812955200927988, 15129357215172498693), R=(8408950140949573313, 324211508350410007)>,
 (8408950140949573313,
  324211508350410007): <Tag link=(8408950140949573313, 324211508350410007), L=(830867074220822841, 3680904267476707804), R=(11481326741199404490, 2400874475101700531)>,
 (11481326741199404490,
  2400874475101700531): <Tag link=(11481326741199404490, 2400874475101700531), L=(8408950140949573313, 324211508350410007), R=(1566111346320143109, 2625455515602636957)>}