In [1]:
from copy import deepcopy

In [2]:
class Graph:
    """Directed multigraph stored as an adjacency list.

    ``adjacency_list`` maps every node to the list of nodes it has an outgoing
    edge to; a neighbor listed several times represents parallel edges.
    ``bypass`` and ``all_eulerian_cycles`` build and clean up node names with
    string operations, so they require string node labels.
    """

    def __init__(self, adjacency_list):
        """Create a graph from ``adjacency_list`` (node -> list of successors).

        The mapping is deep-copied, so the graph never shares lists with the
        caller's object.
        """
        self.adjacency_list = deepcopy(adjacency_list)

    def __str__(self):
        """Return the string form of the underlying adjacency list."""
        return str(self.adjacency_list)

    def get_all_nodes(self):
        """Return a list of all nodes (the keys of the adjacency list)."""
        return list(self.adjacency_list)

    def get_neighbors(self, node):
        """Return the successors of ``node``.

        The returned list is the graph's internal list, not a copy.
        Raises ``KeyError`` if ``node`` is not in the graph.
        """
        return self.adjacency_list[node]

    def num_neighbors(self, node):
        """Return the number of successors of ``node`` (parallel edges count)."""
        return len(self.get_neighbors(node))

    def add_neighbor(self, node, neighbor):
        """Add the edge ``node -> neighbor``; ``node`` must already exist."""
        self.adjacency_list[node].append(neighbor)

    def remove_neighbor(self, node, neighbor):
        """Remove one edge ``node -> neighbor``.

        Raises ``ValueError`` if there is no such edge.
        """
        self.adjacency_list[node].remove(neighbor)

    def add_node(self, node):
        """Add ``node`` with no outgoing edges; do nothing if it already exists."""
        if node not in self.adjacency_list:
            self.adjacency_list[node] = []

    def remove_node(self, node):
        """Remove ``node`` and every edge that points to it."""
        self.adjacency_list.pop(node, None)

        # Only existing keys are reassigned, so iterating over items() is safe.
        for key, neighbors in self.adjacency_list.items():
            self.adjacency_list[key] = [n for n in neighbors if n != node]

    def outbound_edges(self, node):
        """Return the edges leaving ``node`` as ``(node, neighbor)`` tuples."""
        return [(node, neighbor) for neighbor in self.get_neighbors(node)]

    def inbound_edges(self, node):
        """Return the edges entering ``node`` as ``(source, node)`` tuples."""
        return [
            (source, node)
            for source, neighbors in self.adjacency_list.items()
            for neighbor in neighbors
            if neighbor == node
        ]

    def out_degree(self, node):
        """Return the number of edges leaving ``node``."""
        return len(self.outbound_edges(node))

    def in_degree(self, node):
        """Return the number of edges entering ``node``."""
        return len(self.inbound_edges(node))

    def get_all_edges(self):
        """Return every edge of the graph, grouped by source node."""
        return [
            edge
            for node in self.get_all_nodes()
            for edge in self.outbound_edges(node)
        ]

    def get_unvisited_edges(self, node, unvisited_edges):
        """Return the edges in ``unvisited_edges`` that start at ``node``.

        Only ``unvisited_edges`` is consulted; ``node`` does not have to be a
        key of the adjacency list.
        """
        return [edge for edge in unvisited_edges if edge[0] == node]

    def has_unvisited_edges(self, node, unvisited_edges):
        """Return True if some edge in ``unvisited_edges`` starts at ``node``."""
        return any(edge[0] == node for edge in unvisited_edges)

    def eulerian_cycle(self):
        """Find one Eulerian cycle by repeatedly walking and splicing cycles.

        Returns:
            A list of nodes whose first and last entries are the same node and
            whose consecutive pairs use every edge exactly once. An edgeless
            graph yields ``[]``.

        Raises:
            ValueError: if the graph has no Eulerian cycle, i.e. a walk gets
                stuck away from where it started (unbalanced node) or some
                edges cannot be reached from the cycle built so far.
        """
        unvisited_edges = self.get_all_edges()
        if not unvisited_edges:
            return []

        # Start at the first node that actually has an outgoing edge. This is
        # the first node of the graph whenever that node has any edge.
        current_node = unvisited_edges[0][0]
        cycle = []

        while unvisited_edges:
            walk_start = current_node
            current_cycle = []

            # Follow unused edges until the current node has none left.
            while True:
                candidates = self.get_unvisited_edges(current_node, unvisited_edges)
                if not candidates:
                    break
                selected_edge = candidates[0]
                unvisited_edges.remove(selected_edge)
                current_node = selected_edge[1]
                current_cycle.append(current_node)

            # In a balanced graph a walk can only get stuck where it began.
            if current_node != walk_start:
                raise ValueError(
                    'graph has no Eulerian cycle: walk from '
                    f'{walk_start!r} got stuck at {current_node!r}'
                )

            cycle += current_cycle

            if not unvisited_edges:
                break

            # Rotate the cycle so that it ends at a node that still has unused
            # edges; the next walk is then appended right after that node.
            for i, node in enumerate(cycle):
                if self.has_unvisited_edges(node, unvisited_edges):
                    cycle = cycle[i + 1:] + cycle[:i + 1]
                    current_node = node
                    break
            else:
                raise ValueError(
                    'graph has no Eulerian cycle: some edges are not '
                    'reachable from the rest of the graph'
                )

        # The list holds only the "arrival" nodes; prepend the closing node so
        # the result starts and ends at the same node.
        return [cycle[-1]] + cycle

    def bypass(self, u, v, w):
        """Reroute the path ``u -> v -> w`` through a fresh copy of ``v``.

        The edges ``u -> v`` and ``v -> w`` are replaced by ``u -> v' -> w``,
        where ``v'`` is ``v`` followed by as many ``'*'`` characters as the
        in-degree of ``v`` before the change.
        """
        new_node = v + '*' * self.in_degree(v)
        self.add_node(new_node)

        self.remove_neighbor(u, v)
        self.remove_neighbor(v, w)
        self.add_neighbor(u, new_node)
        self.add_neighbor(new_node, w)

    def is_simple(self):
        """Return True if no node has in-degree greater than 1."""
        return all(self.in_degree(node) <= 1 for node in self.get_all_nodes())

    def get_unvisited_neighbors(self, node, visited_nodes):
        """Return the successors of ``node`` that are not in ``visited_nodes``."""
        return [
            neighbor
            for neighbor in self.get_neighbors(node)
            if neighbor not in visited_nodes
        ]

    def has_unvisited_neighbors(self, node, visited_nodes):
        """Return True if ``node`` has a successor outside ``visited_nodes``."""
        return len(self.get_unvisited_neighbors(node, visited_nodes)) > 0

    def is_connected(self):
        """Return True if every node is reachable from the first node.

        Uses an iterative depth-first search that follows edge directions.
        An empty graph is reported as connected.
        """
        all_nodes = self.get_all_nodes()
        if not all_nodes:
            return True

        visited_nodes = set()
        stack = [all_nodes[0]]

        while stack:
            current_node = stack[-1]
            visited_nodes.add(current_node)

            unvisited_neighbors = self.get_unvisited_neighbors(current_node, visited_nodes)
            if unvisited_neighbors:
                # Descend into a single neighbor; the others are picked up
                # when the search comes back to this node.
                stack.append(unvisited_neighbors[0])
            else:
                stack.pop()

        return len(visited_nodes) == len(all_nodes)

    def all_eulerian_cycles(self):
        """Enumerate Eulerian cycles by untangling the graph with bypasses.

        Every graph with a node of in-degree > 1 is replaced by one new graph
        per (inbound edge, outbound edge) pair of such a node, each with that
        pair bypassed. Graphs that are no longer connected are dropped. Each
        resulting simple graph contributes one cycle.

        Returns:
            A set of tuples of node labels. All ``'*'`` characters are removed
            from the labels, which maps bypass nodes back to their originals.
        """
        simple_graphs = []
        non_simple_graphs = []

        if self.is_simple():
            simple_graphs.append(self)
        else:
            non_simple_graphs.append(self)

        while non_simple_graphs:
            current_graph = non_simple_graphs.pop(0)

            # A non-simple graph always has a node with in-degree > 1.
            non_simple_node = next(
                node
                for node in current_graph.get_all_nodes()
                if current_graph.in_degree(node) > 1
            )

            inbound = current_graph.inbound_edges(non_simple_node)
            outbound = current_graph.outbound_edges(non_simple_node)

            for (u, _) in inbound:
                for (_, w) in outbound:
                    # A self-loop paired with itself is not untangled by a bypass.
                    if u == non_simple_node and non_simple_node == w:
                        continue

                    new_graph = Graph(current_graph.adjacency_list)
                    new_graph.bypass(u, non_simple_node, w)

                    if new_graph.is_connected():
                        if new_graph.is_simple():
                            simple_graphs.append(new_graph)
                        else:
                            non_simple_graphs.append(new_graph)

        all_cycles = [
            tuple(node.replace('*', '') for node in graph.eulerian_cycle())
            for graph in simple_graphs
        ]

        # Different bypass orders can lead to the same cycle.
        return set(all_cycles)

    def get_unbalanced_nodes(self):
        """Return the nodes whose in-degree differs from their out-degree."""
        return [
            node
            for node in self.get_all_nodes()
            if self.in_degree(node) != self.out_degree(node)
        ]

    def close_to_cycle(self):
        """Connect the two unbalanced nodes with one extra edge.

        The edge runs from the node with surplus in-degree to the other one,
        which balances both of them when each is off by exactly one.

        Raises:
            ValueError: if the graph does not have exactly two unbalanced nodes.
        """
        unbalanced_nodes = self.get_unbalanced_nodes()
        if len(unbalanced_nodes) != 2:
            raise ValueError(
                'close_to_cycle expects exactly two unbalanced nodes, found '
                f'{len(unbalanced_nodes)}'
            )

        u, v = unbalanced_nodes

        if self.in_degree(u) > self.out_degree(u):
            self.add_neighbor(u, v)
        else:
            self.add_neighbor(v, u)

<img src="assets/graph_eulerian_cycle.jpg" width="400">

In [3]:
# Example directed graph: each key maps to the list of nodes it points to.
# Every node here has equal in- and out-degree.
adjacency_list = {
    'A' : ['B', 'E'],
    'B' : ['C'],
    'C' : ['A', 'E'],
    'D' : ['C'],
    'E' : ['D', 'F'],
    'F' : ['G'],
    'G' : ['A']
}

In [4]:
# Build the graph; the constructor deep-copies the adjacency list.
g = Graph(adjacency_list)

In [5]:
# Print the graph's adjacency list (via Graph.__str__).
print(g)

{'A': ['B', 'E'], 'B': ['C'], 'C': ['A', 'E'], 'D': ['C'], 'E': ['D', 'F'], 'F': ['G'], 'G': ['A']}


In [6]:
# List every node, i.e. the keys of the adjacency list.
g.get_all_nodes()

['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [7]:
# Nodes that E has an outgoing edge to.
g.get_neighbors('E')

['D', 'F']

In [8]:
# Number of nodes that C points to.
g.num_neighbors('C')

2

In [9]:
# Add the edge B -> D (mutates g in place) and show the result.
g.add_neighbor('B', 'D')
print(g)

{'A': ['B', 'E'], 'B': ['C', 'D'], 'C': ['A', 'E'], 'D': ['C'], 'E': ['D', 'F'], 'F': ['G'], 'G': ['A']}


In [10]:
# Remove the edge B -> D again, restoring the original graph.
g.remove_neighbor('B', 'D')
print(g)

{'A': ['B', 'E'], 'B': ['C'], 'C': ['A', 'E'], 'D': ['C'], 'E': ['D', 'F'], 'F': ['G'], 'G': ['A']}


In [11]:
# Add a new node H together with the edges H -> D and B -> H.
g.add_node('H')
g.add_neighbor('H', 'D')
g.add_neighbor('B', 'H')
print(g)

{'A': ['B', 'E'], 'B': ['C', 'H'], 'C': ['A', 'E'], 'D': ['C'], 'E': ['D', 'F'], 'F': ['G'], 'G': ['A'], 'H': ['D']}


In [12]:
# Remove H and every edge pointing to it; the graph is back to its original form.
g.remove_node('H')
print(g)

{'A': ['B', 'E'], 'B': ['C'], 'C': ['A', 'E'], 'D': ['C'], 'E': ['D', 'F'], 'F': ['G'], 'G': ['A']}


In [13]:
# Edges leaving B, as (source, target) tuples.
g.outbound_edges('B')

[('B', 'C')]

In [14]:
# Edges entering C, as (source, target) tuples.
g.inbound_edges('C')

[('B', 'C'), ('D', 'C')]

In [15]:
# Number of edges leaving B.
g.out_degree('B')

1

In [16]:
# Number of edges entering C.
g.in_degree('C')

2

In [17]:
# Every edge of the graph, grouped by source node.
g.get_all_edges()

[('A', 'B'),
 ('A', 'E'),
 ('B', 'C'),
 ('C', 'A'),
 ('C', 'E'),
 ('D', 'C'),
 ('E', 'D'),
 ('E', 'F'),
 ('F', 'G'),
 ('G', 'A')]

In [18]:
# From a hand-picked list of unvisited edges, keep only those that start at A.
unvisited_edges = [('A', 'E'), ('E', 'F'), ('F', 'G'), ('G', 'A')]
g.get_unvisited_edges('A', unvisited_edges)

[('A', 'E')]

In [19]:
# Check whether any of the listed unvisited edges starts at A.
unvisited_edges = [('A', 'E'), ('E', 'F'), ('F', 'G'), ('G', 'A')]
g.has_unvisited_edges('A', unvisited_edges)

True

In [20]:
# One Eulerian cycle as a node sequence; the first and last node coincide.
g.eulerian_cycle()

['A', 'B', 'C', 'A', 'E', 'D', 'C', 'E', 'F', 'G', 'A']

In [21]:
# Reroute G -> A -> B through a new node. The node is named 'A**' because A
# has in-degree 2 at this point.
# NOTE: this mutates g in place, so the cells below work on the modified graph.
g.bypass('G', 'A', 'B')
print(g)

{'A': ['E'], 'B': ['C'], 'C': ['A', 'E'], 'D': ['C'], 'E': ['D', 'F'], 'F': ['G'], 'G': ['A**'], 'A**': ['B']}


In [22]:
# False here: after the bypass, C and E still have in-degree 2.
g.is_simple()

False

In [23]:
# Neighbors of E that are not in the given collection of visited nodes.
visited_nodes = ['A', 'E', 'F', 'G',]
g.get_unvisited_neighbors('E', visited_nodes)

['D']

In [24]:
# Check whether E has at least one neighbor outside the visited nodes.
visited_nodes = ['A', 'E', 'F', 'G',]
g.has_unvisited_neighbors('E', visited_nodes)

True

In [25]:
# Depth-first search from the first node: are all nodes reachable?
g.is_connected()

True

In [26]:
# All Eulerian cycles of the (bypassed) graph as a set of node tuples;
# '*' characters are stripped, so 'A**' appears as 'A' in the result.
g.all_eulerian_cycles()

{('A', 'E', 'D', 'C', 'E', 'F', 'G', 'A', 'B', 'C', 'A'),
 ('A', 'E', 'F', 'G', 'A', 'B', 'C', 'E', 'D', 'C', 'A')}

Klasa <code>DeBruijn</code> nasleđuje sve osobine bazne klase <code>Graph</code> i omogućava konstrukciju DeBruijn-ovog grafa na osnovu datih očitavanja (read-ova) genoma koji se sekvencira.

In [27]:
class DeBruijn(Graph):
    """De Bruijn graph built from the k-mers of a set of reads."""

    def get_kmers(self, reads, k):
        """Return the k-mer composition of ``reads``.

        Every overlapping substring of length ``k`` of every read is listed,
        in order of appearance; repeated k-mers are kept.
        """
        return [
            read[start:start + k]
            for read in reads
            for start in range(len(read) - k + 1)
        ]

    def __init__(self, reads, k):
        """Build the graph from the k-mer composition of ``reads``.

        Nodes are the prefixes and suffixes (length ``k - 1``) of the k-mers,
        and every k-mer contributes one edge from its prefix to its suffix.
        """
        successors = {}

        for kmer in self.get_kmers(reads, k):
            prefix, suffix = kmer[:-1], kmer[1:]
            # Register the prefix first and the suffix second so that node
            # order follows first appearance in the k-mer list.
            successors.setdefault(prefix, []).append(suffix)
            successors.setdefault(suffix, [])

        super().__init__(successors)

In [28]:
# Build the De Bruijn graph of a single read with k = 3: nodes are 2-mers and
# each 3-mer of the read becomes one edge.
reads = ['TAATGCCATGGGATGTT']   
db_graph = DeBruijn(reads, 3)
print(db_graph)

{'TA': ['AA'], 'AA': ['AT'], 'AT': ['TG', 'TG', 'TG'], 'TG': ['GC', 'GG', 'GT'], 'GC': ['CC'], 'CC': ['CA'], 'CA': ['AT'], 'GG': ['GG', 'GA'], 'GA': ['AT'], 'GT': ['TT'], 'TT': []}


<img src="assets/DeBruijn_graph.png" width="400">

In [29]:
# Join the two unbalanced nodes (adds the edge TT -> TA, mutating db_graph in
# place) so that an Eulerian cycle exists, then enumerate all such cycles.
db_graph.close_to_cycle()
db_graph.all_eulerian_cycles()

{('TA',
  'AA',
  'AT',
  'TG',
  'GC',
  'CC',
  'CA',
  'AT',
  'TG',
  'GG',
  'GG',
  'GA',
  'AT',
  'TG',
  'GT',
  'TT',
  'TA'),
 ('TA',
  'AA',
  'AT',
  'TG',
  'GG',
  'GG',
  'GA',
  'AT',
  'TG',
  'GC',
  'CC',
  'CA',
  'AT',
  'TG',
  'GT',
  'TT',
  'TA')}