In [60]:
from __future__ import division
import math
import pandas as pd
import numpy as np
import matplotlib as plt
from pyveplot import *
import networkx as nx
import random
from IPython.display import SVG
%matplotlib inline

In [61]:
# Stroke colour for each structural-variant class; looked up by SV type when
# the hive plot edges are drawn.
event_colors = dict(
    DEL='red',
    INV='yellow',
    DUP='blue',
    BND='green',
    complex='purple',
)

In [62]:
# Chromosome names are labels ("1", "X", "GL000226.1"), not numbers. Pin them to
# str so the string comparisons and .str accessors used further down keep working
# even when a file happens to contain only numeric chromosome names (pandas would
# otherwise infer an integer column).
apgi_1953 = pd.read_table("APGI1953_Tumor_lumpy.tsv", dtype={"chrom": str})
apgi_1955 = pd.read_table("APGI1955_Tumor_lumpy.tsv", dtype={"chrom": str})
apgi_2049 = pd.read_table("APGI2049_Tumor_lumpy.tsv", dtype={"chrom": str})

# 2156.tsv is read with explicit column names: both breakpoint chromosomes
# (chrom, chrom_b) and the SV type.
panc_2156 = pd.read_table("2156.tsv", names=["chrom", "chrom_b", "sv"],
                          dtype={"chrom": str, "chrom_b": str})

In [63]:
# Drop rows whose first breakpoint sits on an ALT contig (GL000226.1, GL000224.1 ...),
# then collapse exact duplicate rows, keeping the first occurrence.
# NOTE(review): only "chrom" is screened; rows whose "chrom_b" is a GL contig are kept.
# XXX: Perhaps the duplicates should be grouped/counted instead of dropped?
panc_2156 = (
    panc_2156
    .loc[lambda frame: ~frame["chrom"].str.contains("GL")]
    .drop_duplicates(keep="first")
)

### The APGI datasets contain only intra-chromosomal events

In [64]:
# Example: every SV row recorded on chromosome 4 of sample APGI1953.
apgi_1953.loc[apgi_1953["chrom"].eq("4")]

Unnamed: 0,sample,chrom,sv,counts
30,APGI1953_Tumor,4,complex,15
33,APGI1953_Tumor,4,DEL,8
51,APGI1953_Tumor,4,DUP,1


## Inter- and intra-chromosomal events in the panc datasets

In [65]:
# First few inter-chromosomal rows: the two breakpoints lie on different chromosomes.
panc_2156.loc[panc_2156["chrom"].ne(panc_2156["chrom_b"])].head()

Unnamed: 0,chrom,chrom_b,sv
0,1,6,BND
1,1,3,BND
6,1,15,BND
7,1,8,BND
12,2,12,BND


In [66]:
# All events whose first breakpoint is on chromosome 1.
panc_2156.loc[panc_2156["chrom"].eq("1")]

Unnamed: 0,chrom,chrom_b,sv
0,1,6,BND
1,1,3,BND
3,1,1,DEL
6,1,15,BND
7,1,8,BND
10,1,1,INV


In [67]:
# Deletions (DEL) whose first breakpoint is on chromosome 1.
panc_2156.loc[panc_2156["chrom"].eq("1") & panc_2156["sv"].eq("DEL")]

Unnamed: 0,chrom,chrom_b,sv
3,1,1,DEL


In [68]:
# Breakends (BND) whose first breakpoint is on chromosome 1.
panc_2156.loc[panc_2156["chrom"].eq("1") & panc_2156["sv"].eq("BND")]

Unnamed: 0,chrom,chrom_b,sv
0,1,6,BND
1,1,3,BND
6,1,15,BND
7,1,8,BND


In [84]:
# Number of non-null entries per column for each SV type.
panc_2156.groupby(by="sv").count()

Unnamed: 0_level_0,chrom,chrom_b
sv,Unnamed: 1_level_1,Unnamed: 2_level_1
BND,48,48
DEL,17,17
DUP,6,6
INV,13,13


In [87]:
# Per-column row counts for events that stay on one chromosome (intra) versus
# events that join two different chromosomes (inter).
intra = panc_2156.loc[panc_2156["chrom"].eq(panc_2156["chrom_b"])].count()
inter = panc_2156.loc[panc_2156["chrom"].ne(panc_2156["chrom_b"])].count()

(intra, inter)

(chrom      36
 chrom_b    36
 sv         36
 dtype: int64, chrom      48
 chrom_b    48
 sv         48
 dtype: int64)

In [69]:
def groupby_chrom_sv(df):
    """Return the untouched frame together with a (chrom, sv, counts) grouping.

    Frames that carry a "chrom_b" column have no "counts" column of their own,
    so the non-null "chrom_b" entries per (chrom, sv) pair are counted first
    and the grouping is built on that summary. Frames without "chrom_b" are
    expected to already provide "counts" and are grouped directly.

    Returns a 2-tuple ``(df, grouped)`` where ``df`` is the very object passed
    in and ``grouped`` is a pandas GroupBy keyed on chrom, sv and counts.

    XXX: The counts may not matter much, since overlapping strokes and opacity
         already convey them in the plot.
    """
    group_keys = ["chrom", "sv", "counts"]

    # Frames without "chrom_b" already have their counts: group them as they are.
    if "chrom_b" not in df.columns:
        return df, df.groupby(group_keys)

    # Frames with "chrom_b": derive the counts before grouping.
    per_pair = df.groupby(["chrom", "sv"]).count()
    tallied = per_pair["chrom_b"].reset_index(name="counts")
    return df, tallied.groupby(group_keys)

In [70]:
#panc_2156_cnts = panc_2156.groupby(["chrom", "sv"]).count()["chrom_b"].reset_index(name="counts")
#panc_2156_cnts.head()

#apgi_1953_grps = groupby_chrom_sv(apgi_1953)
#panc_2156_cnts.groupby(["chrom", "sv", "counts"]).groups, apgi_1953_grps.groups

## Plot a hiveplot given a pandas dataframe

In [88]:
def _chrom_sort_key(name):
    """Sort key placing numeric chromosome labels first (by value), then the rest alphabetically."""
    label = str(name)
    if label.isdigit():
        return (0, int(label), label)
    return (1, 0, label)


def _axis_offsets(names):
    """Map every node name to its own evenly spaced offset strictly between 0 and 1."""
    ordered = sorted(names, key=_chrom_sort_key)
    slots = float(len(ordered) + 1)
    return dict((name, (rank + 1) / slots) for rank, name in enumerate(ordered))


def hiveplot(fname, dataframe):
    """Draw structural-variant events as a three-axis hive plot saved to ``<fname>.svg``.

    Parameters
    ----------
    fname : str
        Output file name without extension; ".svg" is appended.
    dataframe : tuple
        The ``(df, grouped)`` pair returned by ``groupby_chrom_sv``. Only element 0
        is read; it must provide the columns "chrom", "chrom_b" and "sv".

    Every row becomes one curve. Rows with ``chrom != chrom_b`` (inter-chromosomal)
    are drawn between axis0 and axis1; rows with ``chrom == chrom_b``
    (intra-chromosomal) are drawn between axis1 and axis2. The stroke colour is
    looked up in ``event_colors`` by SV type.

    Raises
    ------
    KeyError
        If a required column is missing or an SV type has no entry in ``event_colors``.
    """
    # A MultiGraph keeps one edge per input row. A plain Graph stores a single
    # edge per chromosome pair, so e.g. a DEL and an INV on the same chromosome
    # would silently overwrite each other.
    g = nx.MultiGraph()

    # our hiveplot object
    h = Hiveplot('{}.svg'.format(fname))

                  # start      end
    axis0 = Axis((200, 200), (200, 100), stroke="grey")
    axis1 = Axis((200, 200), (300, 300), stroke="blue", stroke_width=1.2)
    axis2 = Axis((200, 200), (10, 310), stroke="black", stroke_width=3)

    h.axes = [axis0, axis1, axis2]

    # Select columns by name so extra columns or a different column order do
    # not break the unpacking.
    events = dataframe[0][["chrom", "chrom_b", "sv"]]
    for chrom, chrom_b, sv in events.itertuples(index=False):
        g.add_edge(chrom, chrom_b, event=sv, count=1)

    # Rank-based offsets: each chromosome gets a unique slot along the axis.
    # Summing character codes made "11"/"20", "12"/"21" and "13"/"22" collide
    # and could push long contig names past the end of the axis.
    offsets = _axis_offsets(g.nodes())

    for n in g.nodes():
        offset = offsets[n]
        # Separate Node instances per axis, otherwise an edge loops in itself.
        axis0.add_node(Node(n), offset)
        axis1.add_node(Node(n), offset)
        axis2.add_node(Node(n), offset)

    for source, target, edge_data in g.edges(data=True):
        # inter-chromosomal axis
        if source != target and (source in axis0.nodes) and (target in axis1.nodes):
            h.connect(axis0, source, 45,
                      axis1, target, -45,
                      stroke_width=edge_data['count'],
                      stroke=event_colors[edge_data['event']])

        # intra-chromosomal axis
        elif source == target and (source in axis1.nodes) and (target in axis2.nodes):
            h.connect(axis1, source, 15,
                      axis2, target, -15,
                      stroke_width=edge_data['count'],
                      stroke=event_colors[edge_data['event']])

    h.save()

In [89]:
# Render the panc_2156 events; hiveplot() saves the figure as "panc_2156.svg".
# The APGI calls are disabled. NOTE(review): they are inconsistent with each other:
# two pass only element [1] of the groupby_chrom_sv() result, whereas hiveplot()
# reads element [0] of the tuple it receives. Confirm before re-enabling.
#hiveplot("apgi_1953", groupby_chrom_sv(apgi_1953)[1])
#hiveplot("apgi_1955", groupby_chrom_sv(apgi_1955)[1])
#hiveplot("apgi_2049", groupby_chrom_sv(apgi_2049))
hiveplot("panc_2156", groupby_chrom_sv(panc_2156))