In [5]:
from __future__ import division
import os
import glob
import math
import pandas as pd
import numpy as np
import matplotlib as plt
from pyveplot import *
import networkx as nx
import random
from IPython.display import SVG, display_html
%matplotlib inline

## What is this analysis about?

This notebook attempts to find alternative, clearer plots for inter- and intra-chromosomal structural variations. In other words, the idea is to go from

A typical Circos plot:

<img src="https://www.genomatix.de/online_help/help_regionminer/SV_circos_genome.png" height=300 width=300/>

To this:

<img src="img/hyplot_intra_inter.png">

Starting from a table of structural variants like this:

<img src="img/sv_table.png">

The pipeline used to generate the original input datasets was `bcbio-nextgen`:

https://bcbio-nextgen.readthedocs.org/en/latest/contents/teaching.html

And then [`vcfToBedpe`](https://github.com/ctsa/svtools/blob/master/vcfToBedpe) was used to convert a plain [VCF](https://samtools.github.io/hts-specs/VCFv4.2.pdf) to [BEDPE format](http://bedtools.readthedocs.org/en/latest/content/general-usage.html#bedpe-format) to generate a paired end version where structural variations are seen as pairs (`chrom` and `chrom_b` columns).

Finally, [pyveplot](https://github.com/CSB-IG/pyveplot), a Python implementation of [Hive plots](http://www.hiveplot.net/), was used to draw the plots.

In [6]:
# Stroke colour for each structural-variant type; looked up by the value of
# the "sv" column when the hive plot edges are drawn.
event_colors = dict(
    DEL='red',
    INV='yellow',
    DUP='blue',
    BND='green',
    complex='purple',
)

## Plot a hiveplot

In [16]:
def hiveplot(fname, dataframe):
    """Draw the structural variants in ``dataframe`` as a three-axis hive plot.

    Every distinct chromosome becomes a node that is placed on all three
    axes. Events whose two ends are on different chromosomes are connected
    between axis0 and axis1; events whose two ends are on the same chromosome
    are connected between axis1 and axis2. Each curve is coloured by its SV
    type through the module-level ``event_colors`` mapping.

    Parameters
    ----------
    fname : str
        Base name of the drawing; the Hiveplot is created with the file name
        ``'<fname>.svg'`` and saved at the end.
    dataframe : pandas.DataFrame
        Must have exactly three columns named ``chrom``, ``chrom_b`` and
        ``sv``, in that order (rows are unpacked positionally).

    Side effects
    ------------
    Prints the per-SV-type row counts and saves the hive plot.

    Raises
    ------
    KeyError
        If an ``sv`` value has no entry in ``event_colors``.
    """
    # Normalise chromosome names to strings. pandas infers an integer dtype
    # for a column holding only numeric chromosomes, in which case the
    # ``.str`` accessor below would raise and 1 / "1" would be distinct nodes.
    dataframe = dataframe.assign(chrom=dataframe["chrom"].astype(str),
                                 chrom_b=dataframe["chrom_b"].astype(str))

    # Remove duplicates and filter out ALTS (GL000226.1, GL000224.1 ...).
    # Both ends are checked: an event that starts on a primary chromosome but
    # lands on an ALT contig would otherwise still add that contig as a node.
    is_alt = (dataframe["chrom"].str.contains("GL")
              | dataframe["chrom_b"].str.contains("GL"))
    dataframe = dataframe[~is_alt]
    dataframe = dataframe.drop_duplicates(keep="first") ## XXX: Perhaps I should group/count those instead?

    # a network
    # NOTE(review): nx.Graph holds one edge per node pair, so several events
    # between the same two chromosomes presumably collapse into the last one
    # added -- confirm whether a MultiGraph is wanted here.
    g = nx.Graph()

    # our hiveplot object
    h = Hiveplot('{}.svg'.format(fname))

                  # start      end
    axis0 = Axis((200,200), (200,100), stroke="grey")
    axis1 = Axis((200,200), (300,300), stroke="blue", stroke_width=1.2)
    axis2 = Axis((200,200), (10,310), stroke="black", stroke_width=3)

    h.axes = [ axis0, axis1, axis2 ]

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(dataframe.groupby("sv").count())

    for _, chrom, chrom_b, sv in dataframe.itertuples():
        g.add_node(chrom)
        # Count = 1 looks better than parametrized with groupby
        g.add_edge(chrom, chrom_b, event=sv, count=1)

    for n in g.nodes():
        # XXX: Find a better (more uniform) function than ord?
        # A small hash function would be prob better here.
        # Calculates the offset of the chromosomes in the axis.
        name = str(n)

        if len(name) == 1:
            code = ord(name)
            raw_offsets = (code - 30, code - 20, code - 30)
        else:
            total = sum(ord(char) for char in name)
            raw_offsets = (total, total, total)

        # Separate Node instances for each axis, otherwise arcs go to itself.
        # Dividing by 120.0 keeps this a true division even without the
        # file-level ``from __future__ import division``.
        for axis, raw in zip((axis0, axis1, axis2), raw_offsets):
            axis.add_node(Node(n), raw / 120.0)

    for e in g.edges():
        edge_data = g.get_edge_data(*e)

        # inter-chromosomal axis
        if e[0] != e[1] and (e[0] in axis0.nodes) and (e[1] in axis1.nodes):
            h.connect(axis0, e[0], 45,
                      axis1, e[1], -45,
                      stroke_width=edge_data['count'],
                      stroke=event_colors[edge_data['event']])

        # intra-chromosomal axis
        elif e[0] == e[1] and (e[0] in axis1.nodes) and (e[1] in axis2.nodes):
            h.connect(axis1, e[0], 15,
                      axis2, e[1], -15,
                      stroke_width=edge_data['count'],
                      stroke=event_colors[edge_data['event']])

    h.save()

In [17]:
data_path = "data"

# Draw one hive plot per TSV file found directly under data_path. Each file
# is read as a header-less table whose columns are named chrom, chrom_b, sv.
tsv_pattern = os.path.join(data_path, "*.tsv")
for dataset in glob.iglob(tsv_pattern):
    sv_table = pd.read_table(dataset, names=["chrom", "chrom_b", "sv"])
    hiveplot(os.path.basename(dataset), sv_table)

     chrom  chrom_b
sv                 
BND     48       48
DEL     17       17
DUP      6        6
INV     13       13
