In [1]:
from __future__ import division
import os
import glob
import math
import pandas as pd
import numpy as np
import matplotlib as plt
from pyveplot import *
#from collections import namedtuple
import networkx as nx
import random
from IPython.display import SVG
%matplotlib inline

## Circos vs Hiveplots

This notebook attempts to find alternative, clearer plots for inter- and intra-chromosomal structural variations. In other words, the idea is to go from

A typical Circos plot:

<img src="https://www.genomatix.de/online_help/help_regionminer/SV_circos_genome.png" height=300 width=300/>

To this:

<img src="img/hyplot_intra_inter.png">

From this:

<img src="img/sv_table.png">

Some of the preliminary data comes from the <a href="http://www.pancreaticcancer.net.au/">Australian Pancreatic Cancer Genome Initiative</a>, and the rest from the <a href="https://www.synapse.org/#!Synapse:syn312572">ICGC-TCGA DREAM challenge</a>, processed via the <a href="https://github.com/chapmanb/bcbio-nextgen/blob/master/config/examples/cancer-dream-syn3.yaml">bcbio cancer pipeline</a>. That pipeline takes a considerable amount of time to run because the inputs are large and it <a href="http://bcb.io/2015/03/05/cancerval/">runs several variant callers</a>.

For pedagogical reasons, the resulting tab-separated `.tsv` files have been pre-generated for easy analysis. If a more upstream run or (re-)analysis is required, there's a <a href="https://bcbio-nextgen.readthedocs.org/en/latest/contents/teaching.html">reduced dataset that focuses on chromosome 6</a>.

[`vcfToBedpe`](https://github.com/ctsa/svtools/blob/master/vcfToBedpe) was then used to convert a plain [VCF](https://samtools.github.io/hts-specs/VCFv4.2.pdf) to [BEDPE format](http://bedtools.readthedocs.org/en/latest/content/general-usage.html#bedpe-format), generating a paired-end version where structural variations are seen as pairs (`chrom` and `chrom_b` columns).

Finally, [pyveplot](https://github.com/CSB-IG/pyveplot), a Python implementation of [Hive plots](http://www.hiveplot.net/) was used to show the representation above.

In [2]:
# Stroke colour for each structural variation event type; looked up by
# hiveplot() with the value stored in the `sv` column of the input table.
event_colors = dict(
    DEL='red',
    INV='yellow',
    DUP='blue',
    BND='green',
    complex='purple',
)

## Plot a hiveplot

In [6]:
def hiveplot(fname, dataframe):
    """Draw the structural variation events of one dataset as a hive plot.

    Parameters
    ----------
    fname : str
        Label of the dataset. It is echoed in the printed summary and used
        to build the output name ``'<fname>.svg'`` handed to ``Hiveplot``.
    dataframe : pandas.DataFrame
        Table whose columns are, in order, ``sample``, ``chrom``,
        ``chrom_b``, ``sv`` and ``counts`` (rows are read positionally).
        Values in ``sv`` must be keys of ``event_colors``.

    Side effects
    ------------
    Prints the per-event row counts and saves the plot through
    ``Hiveplot.save()``. The input frame itself is not modified.
    """
    # Remove duplicates and filter out ALTS (GL000226.1, GL000224.1 ...).
    # Cast to str first: pandas parses all-numeric chromosome columns as
    # integers, and the `.str` accessor raises on non-string data.
    is_alt = dataframe["chrom"].astype(str).str.contains("GL")
    if "chrom_b" in dataframe.columns:
        # The partner breakend can sit on an ALT contig as well; without this
        # check those contigs would still show up as nodes on the axes.
        is_alt = is_alt | dataframe["chrom_b"].astype(str).str.contains("GL")
    dataframe = dataframe[~is_alt]
    dataframe = dataframe.drop_duplicates(keep="first") ## XXX: Perhaps should group/count dups better?

    # a network
    g = nx.Graph()

    # our hiveplot object
    h = Hiveplot('{}.svg'.format(fname))

                  # start      end
    axis0 = Axis((200,200), (200,100), stroke="grey")
    axis1 = Axis((200,200), (300,300), stroke="blue", stroke_width=1.2)
    axis2 = Axis((200,200), (10,310), stroke="black", stroke_width=3)

    h.axes = [ axis0, axis1, axis2 ]

    # Single-argument print(...) behaves identically as a Python 2 statement
    # and as a Python 3 function call.
    print("Structural variation events for ''{fname}'' have the following event counts:\n\n{groupby}\n".format(
           groupby=dataframe.groupby("sv").count(), fname=fname))

    for row in dataframe.itertuples():
        # idx, u'sample', u'chrom', u'chrom_b', u'sv', u'counts'
        # add_edge() creates both endpoints when missing. nx.Graph keeps a
        # single edge per pair, so a later row for the same pair overwrites
        # the stored event.
        # Count = 1 looks better than parametrized with groupby
        g.add_edge(row[2], row[3], event=row[4], count=1)

    # Float divisor: keeps true division even without
    # `from __future__ import division` on Python 2.
    off = 120.0

    for n in g.nodes():
        # XXX: Find a better (more uniform) function than ord?
        # A small hash function would be prob better here.
        # Position of the chromosome along each axis: sum of the character
        # codes of its name, scaled down by `off`.
        offset = sum(ord(char) for char in str(n)) / off

        # Separate instances for the axis, otherwise arcs go to itself.
        axis0.add_node(Node(n), offset)
        axis1.add_node(Node(n), offset)
        axis2.add_node(Node(n), offset)

    for e in g.edges():
        edge_data = g.get_edge_data(*e)

        # inter-chromosomal axis
        if e[0] != e[1] and (e[0] in axis0.nodes) and (e[1] in axis1.nodes):
            h.connect(axis0, e[0], 45,
                      axis1, e[1], -45,
                      stroke_width=edge_data['count'],
                      stroke=event_colors[edge_data['event']])

        # intra-chromosomal axis
        elif e[0] == e[1] and (e[0] in axis1.nodes) and (e[1] in axis2.nodes):
            h.connect(axis1, e[0], 15,
                      axis2, e[1], -15,
                      stroke_width=edge_data['count'],
                      stroke=event_colors[edge_data['event']])

    h.save()

### Shipped TSVs do not share the same structure, so normalize them

In [7]:
def normalize(data):
    """Return a copy of ``data`` laid out as sample/chrom/chrom_b/sv/counts.

    Two input layouts are handled:

    * No ``counts`` column: the frame is taken to hold exactly three columns,
      which are renamed to ``chrom``, ``chrom_b`` and ``sv``. ``sample`` is
      added as the first column and ``counts`` as the last, both NaN.
    * ``counts`` present: a ``chrom_b`` column filled with ``0`` is inserted
      at position 2, unless one already exists.

    The input frame is left untouched, and calling the function again on its
    own result returns an equivalent frame instead of raising because
    ``chrom_b`` already exists.

    Parameters
    ----------
    data : pandas.DataFrame
        Table as loaded from one of the shipped ``.tsv`` files.

    Returns
    -------
    pandas.DataFrame
        A new, normalized frame.

    Raises
    ------
    ValueError
        Raised by pandas when a frame without ``counts`` does not have
        exactly three columns.
    """
    # Work on a copy so re-running the calling cell never sees a
    # half-normalized frame.
    normalized = data.copy()

    if "counts" not in normalized.columns:
        normalized.columns = ["chrom", "chrom_b", "sv"]
        normalized.insert(0, 'sample', np.nan)
        normalized.insert(len(normalized.columns), 'counts', np.nan)
    elif "chrom_b" not in normalized.columns:
        normalized.insert(2, 'chrom_b', 0)

    return normalized

### Process and plot all the TSVs

In [5]:
# Directory holding the shipped tab-separated datasets.
data_path = "data"

# glob gives no ordering guarantee (it depends on the file system), so sort
# the paths to make the order of the printed summaries reproducible.
for dataset in sorted(glob.glob(os.path.join(data_path, "*.tsv"))):
    study_raw = pd.read_table(dataset)
    study = normalize(study_raw)

    # plot them all; the file's base name (extension included) labels the
    # summary and names the resulting SVG
    hiveplot(os.path.basename(dataset), study)

Structural variation events for ''2156.tsv'' have the following event counts:

     sample  chrom  chrom_b  counts
sv                                 
BND       0     48       48       0
DEL       0     17       17       0
DUP       0      6        6       0
INV       0     13       13       0

Structural variation events for ''APGI1953_Tumor_lumpy.tsv'' have the following event counts:

         sample  chrom  chrom_b  counts
sv                                     
DEL          20     20       20      20
DUP          19     19       19      19
INV           8      8        8       8
complex      23     23       23      23

Structural variation events for ''APGI1955_Tumor_lumpy.tsv'' have the following event counts:

         sample  chrom  chrom_b  counts
sv                                     
DEL          14     14       14      14
DUP          15     15       15      15
INV           1      1        1       1
complex      24     24       24      24

Structural variation events for 