# Graph Link Compression

This can only be done after false linkages between contigs have been successfully eliminated.

Consider the example graph, with the contig coverages shown first and the link coverages shown after them:
```python
contig_coverage = {
    'c2': 50,
    'c1': 150,
    'c3': 250,
    'c4': 50
}
```
link coverages as below
```python
link_coverage = {
    'c2+c1+': 50,
    'c1+c1+': 50,
    'c1+c3+': 50,
    'c3+c4+': 50,
    'c3+c3+': 150,
    'c4+c3+': 50
}
```

This is taken from the genome `c2-c1-c1-c3-c3-c4-c3-c3-c3`.

The compressed expression is `c2-c1*x-c3*y-c4-c3*z`.
In reality, deriving `y` and `z` is nearly impossible; instead, an expression that generalizes all possible arrangements can be derived. The coverage information can be used for this.

In [39]:
# all the imports
# python imports
import copy

# Sample data used by the rest of the notebook.
# Coverage value recorded for each contig.
contig_coverage = dict(
    c1=50,
    c2=150,
    c3=250,
    c4=50,
    c5=50,
)

# Coverage value recorded for each link; a link name is two oriented
# contig names written back to back (e.g. 'c1+' followed by 'c2+').
link_coverage = dict([
    ('c1+c2+', 50),
    ('c2+c2+', 50),
    ('c2+c3+', 50),
    ('c3+c3+', 50),
    ('c3+c2+', 150),
    ('c2+c4+', 50),
    ('c4+c2+', 50),
    ('c2+c5+', 50),
])

# Names only, in insertion order.
links = [*link_coverage]
contigs = [*contig_coverage]

print(links)
print()
print(contigs)

['c1+c2+', 'c2+c2+', 'c2+c3+', 'c3+c3+', 'c3+c2+', 'c2+c4+', 'c4+c2+', 'c2+c5+']

['c1', 'c2', 'c3', 'c4', 'c5']


In [140]:
class EulerGraph:
    """Decompose a directed graph of contig links into edge trails.

    A link name such as ``'c1+c2+'`` is read as a directed edge from the
    oriented contig ``'c1+'`` to the oriented contig ``'c2+'``.  Neighbours
    are kept in sets, so a link listed more than once yields a single edge,
    and a self-link (``'c1+c1+'``) only registers its vertex.
    """

    def __init__(self, links):
        """Remember the link names; the graph is built by ``run_resolution``.

        Args:
            links: iterable of link names, e.g. ``['c1+c2+', 'c2+c3+']``.
        """
        self._links = links
        # successor map: vertex -> set of vertices it points to
        self._graph = None
        # predecessor map: vertex -> set of vertices pointing to it
        self._graph_p = None

    def _name_decomposer(self, link_name):
        """Split a link name into alternating contig names and signs.

        ``'c1+c2-'`` becomes ``['c1', '+', 'c2', '-']``.  Characters after
        the last sign are dropped.
        """
        decomposed = []
        name_chars = []

        for char in link_name:
            if char in ("+", "-"):
                # a sign closes the contig name collected so far
                decomposed.extend(["".join(name_chars), char])
                name_chars = []
            else:
                name_chars.append(char)
        return decomposed

    # TODO Polish the logic more
    def _ensure_euler_path(self):
        """Return True when zero or exactly two vertices are unbalanced.

        A vertex is unbalanced when its in-degree differs from its
        out-degree.  This is a degree count only: connectivity and the
        direction of the imbalance are not checked.
        """
        out_degree = {v: len(succ) for v, succ in self._graph.items()}
        in_degree = {v: len(pred) for v, pred in self._graph_p.items()}

        # A vertex absent from one map has degree 0 there.  (The previous
        # default was an empty list, which never compares equal to an int.)
        vertices = set(in_degree) | set(out_degree)
        odds = sum(
            1
            for v in vertices
            if in_degree.get(v, 0) != out_degree.get(v, 0)
        )

        return odds == 0 or odds == 2

    def _build_graph(self):
        """Build the successor and predecessor maps from ``self._links``.

        Every endpoint of every link becomes a key in both maps.  The
        resulting maps are printed and stored on the instance.
        """
        graph = {}
        graph_p = {}

        for link in self._links:
            parts = self._name_decomposer(link)
            source = "".join(parts[0:2])
            target = "".join(parts[2:4])

            # Key insertion order is kept as before (source first in the
            # successor map, target first in the predecessor map) because
            # _euler_trace falls back to the first key as a start vertex.
            graph.setdefault(source, set())
            graph.setdefault(target, set())
            graph_p.setdefault(target, set())
            graph_p.setdefault(source, set())

            # avoid cycles: a self-link contributes no edge
            if source == target:
                continue

            graph[source].add(target)
            graph_p[target].add(source)

        print("INFO:: graph", graph)
        print("INFO:: graph_p", graph_p)

        self._graph, self._graph_p = graph, graph_p

    def _traverse_until_cycle(self, graph, head):
        """Walk unused edges from ``head`` until the walk has to stop.

        The walk stops when it returns to ``head``, when it steps onto a
        vertex already visited in this walk, or when the current vertex has
        no remaining outgoing edge.  Each edge walked is removed from a deep
        copy of ``graph``; the argument itself is left untouched.

        Args:
            graph: successor map (vertex -> set of successors).
            head: vertex to start from; must be a key of ``graph``.

        Returns:
            ``(visited_order, remaining_graph)``: the vertices in walk order
            and the copied graph without the walked edges.
        """
        graph = copy.deepcopy(graph)
        visited_order = [head]
        visited = {head}
        vertex = head

        while True:
            successors = graph[vertex]

            if head in successors:
                # an edge back to the start closes the cycle
                successors.remove(head)
                visited_order.append(head)
                break

            if not successors:
                # Dead end: stop here.  The earlier stack-based version went
                # on to pop a sibling of an older vertex and failed with
                # KeyError while removing a non-existent edge.
                break

            # same pick as popping the last element pushed on a stack
            next_vertex = list(successors)[-1]
            successors.remove(next_vertex)
            visited_order.append(next_vertex)

            if next_vertex in visited:
                break
            visited.add(next_vertex)
            vertex = next_vertex

        return visited_order, graph

    def _euler_trace(self):
        """Split the built graph into trails that use each edge once.

        Start vertex selection:
        * a vertex without predecessors (``head``) and a vertex without
          successors (``tip``) both exist: an extra ``tip -> head`` edge is
          added to a working copy and the walk starts at ``head``;
        * only ``head`` exists: the walk starts there, no edge is added;
        * otherwise the walk starts at the first key of the graph.

        Returns:
            list of trails, each a list of vertices in walk order.  An
            empty graph yields an empty list.
        """
        if not self._graph:
            return []

        head = next(
            (v for v, pred in self._graph_p.items() if len(pred) == 0), None
        )
        tip = next(
            (v for v, succ in self._graph.items() if len(succ) == 0), None
        )

        # Work on a shallow copy so the closing edge never leaks into
        # self._graph; only the tip's neighbour set is replaced.
        graph = dict(self._graph)

        if head is not None and tip is not None:
            # connect graph
            graph[tip] = graph[tip] | {head}
        elif head is None:
            head = next(iter(graph))

        trails = []
        while head is not None:
            trail, graph = self._traverse_until_cycle(graph, head)
            trails.append(trail)

            # Next start: the first vertex that still has unused outgoing
            # edges, looking at the newest trail first and then at older
            # ones so that edges hanging off earlier trails are not lost.
            head = next(
                (v for t in reversed(trails) for v in t if graph[v]),
                None,
            )
        return trails

    def run_resolution(self):
        """Build the graph and return its trails.

        Raises:
            Exception: when the degree check in ``_ensure_euler_path``
                fails.
        """
        self._build_graph()

        if self._ensure_euler_path():
            return self._euler_trace()
        else:
            raise Exception('Unable to secure an euler path')
        
        
# Alternative input (the link names of the sample data):
# ['c1+c2+', 'c2+c2+', 'c2+c3+', 'c3+c3+', 'c3+c2+', 'c2+c4+', 'c4+c2+', 'c2+c5+']
eg = EulerGraph(["c1+c3+", "c1+c2+", "c2+c3+", "c3+c1+"])

# Build the graph and collect the trails.
trails = eg.run_resolution()

# One trail per output line.
for x in trails:
    print(x)

INFO:: graph {'c1+': {'c3+', 'c2+'}, 'c3+': {'c1+'}, 'c2+': {'c3+'}}
INFO:: graph_p {'c3+': {'c2+', 'c1+'}, 'c1+': {'c3+'}, 'c2+': {'c1+'}}
['c1+', 'c2+', 'c3+', 'c1+']
['c1+', 'c3+']
