In [1]:
%matplotlib inline
import math
from matplotlib.collections import BrokenBarHCollection
import matplotlib.pyplot as plt
import pandas as pd
import pybedtools
import pysam
import requests
from matplotlib import rcParams
from seaplotlib.helper import (
    despine,
    save_fig_in_dir,
)


# Location of the chromosome band table downloaded by get_ideogram(): the
# drosophila-melanogaster.json file from the `ideogram` package, pinned to
# version 1.9.0 and served through unpkg.
IDEOGRAM_URL = 'https://unpkg.com/ideogram@1.9.0/dist/data/bands/native/drosophila-melanogaster.json'

In [2]:
# Tab-separated FASTA index; its first column supplies the chromosome names
# used throughout the notebook (presumably a samtools faidx file - confirm).
FASTA_INDEX = '../../reference-data/dm6-r6.17.fa.fai'
# The remaining variables will be replaced when run through papermill
INSERTIONS = 'calls.gff'  # interval file of insertion calls, loaded with pybedtools
LABEL = 'prosGFP guts'  # used in the output file name and as the output directory
# NOTE(review): OUT_DIR is not referenced by any other visible cell; the figure
# is saved under a directory named after LABEL. Confirm which one is intended.
OUT_DIR = 'ProsGFP guts'

In [3]:
# Parameters
# These assignments override the defaults given in the previous cell.
INSERTIONS = "0.5.21.filtered_calls_prosgfp_guts.gff"
LABEL = "ProsGFP calls guts"
# NOTE(review): SAMPLE_FILE is not read by any visible cell - confirm it is
# still required by this notebook.
SAMPLE_FILE = "ProsGFP_gut_samples.csv"


In [4]:
# Map chromosome name -> sequence length using the first two tab-separated
# columns of the FASTA index. The file order is preserved, and it later
# determines the top-to-bottom order of the chromosomes in the plot.
with open(FASTA_INDEX) as fai:
    # Split every line only once; strip the line terminator so it can never
    # end up inside a value.
    fai_rows = (line.rstrip('\r\n').split('\t') for line in fai)
    # Blank or truncated lines (e.g. a trailing empty line) are skipped
    # instead of raising IndexError. Lengths are kept as the strings read
    # from the file, so existing consumers see the same values as before.
    chrom_len = {row[0]: row[1] for row in fai_rows if len(row) >= 2}
    
    
def get_ideogram(chromosomes=None):
    """
    Download the chromosome band table and return it as a DataFrame.

    Parameters
    ----------
    chromosomes : iterable of str, optional
        Chromosome names to keep. When omitted, the module-level
        `chromosome_list` is used, which must be defined before the call.

    Returns
    -------
    pandas.DataFrame
        One row per band with columns
        ['chrom', 'start', 'end', 'name', 'gieStain', 'width', 'colors'].
        'start', 'end' and 'width' are numeric; 'colors' holds an RGB tuple
        looked up from 'gieStain'.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If a band carries a stain that has no entry in the colour table.
    """
    if chromosomes is None:
        # Backward-compatible default: fall back to the notebook-level list.
        chromosomes = chromosome_list

    # Fail loudly on HTTP errors and never hang forever on a stalled request.
    response = requests.get(IDEOGRAM_URL, timeout=30)
    response.raise_for_status()
    bands = response.json()

    # Each band is a space-separated string; keep the fields holding the
    # chromosome, start, end, band name and stain, in that order.
    ideo = pd.DataFrame.from_records([b.split(' ') for b in bands['chrBands']])
    ideo = ideo[[0, 3, 4, 2, 7]].copy()
    ideo.columns = ['chrom', 'start', 'end', 'name', 'gieStain']
    ideo['start'] = pd.to_numeric(ideo['start'])
    ideo['end'] = pd.to_numeric(ideo['end'])

    # Filter out chromosomes not in our list. The explicit copy makes the
    # column assignments below operate on an independent frame instead of a
    # slice of the unfiltered one.
    ideo = ideo[ideo['chrom'].isin(list(chromosomes))].copy()

    # Add a new column for width
    ideo['width'] = ideo['end'] - ideo['start']

    # Colors for different chromosome stains
    color_lookup = {
        'gneg': (1., 1., 1.),
        'gpos': (0., 0., 0.,),
        'gpos25': (.6, .6, .6),
        'gpos50': (.4, .4, .4),
        'gpos75': (.2, .2, .2),
        'gpos100': (0., 0., 0.),
        'acen': (.8, .4, .4),
        'gvar': (.8, .8, .8),
        'stalk': (.9, .9, .9),
    }
    # Direct indexing (not .map) so an unknown stain raises KeyError rather
    # than silently producing a missing colour.
    ideo['colors'] = ideo['gieStain'].apply(lambda stain: color_lookup[stain])
    return ideo


def interval_to_df(path, chromosome_list, color='red', width=50000):
    """
    Load an interval file into a DataFrame ready for chromosome_collections.

    Parameters
    ----------
    path : str
        File readable by pybedtools.BedTool whose dataframe exposes the
        columns 'seqname', 'start', 'end' and 'feature'.
    chromosome_list : iterable of str
        Chromosome names to keep; rows on other sequences are dropped.
    color : str, optional
        Value stored in the 'colors' column for every row.
    width : int, optional
        Value stored in the 'width' column for every row. It is a fixed
        drawing width and is independent of each interval's start/end.

    Returns
    -------
    pandas.DataFrame
        Columns ['chrom', 'start', 'end', 'name', 'width', 'colors'].
    """
    raw = pybedtools.BedTool(path).to_dataframe()
    # Work on an explicit copy so the column assignments below never write
    # through to (or warn about) the frame returned by pybedtools.
    df = raw[['seqname', 'start', 'end', 'feature']].copy()
    df.columns = ['chrom', 'start', 'end', 'name']
    df['width'] = width
    df['colors'] = color
    return df[df['chrom'].isin(list(chromosome_list))].copy()


# Here's the function that we'll call for each dataframe (once for chromosome
# ideograms, once for genes).  The rest of this script will be prepping data
# for input to this function
#
def chromosome_collections(df, y_positions, height,  **kwargs):
    """
    Yields BrokenBarHCollection of features that can be added to an Axes
    object.

    Parameters
    ----------
    df : pandas.DataFrame
        Must at least have columns ['chrom', 'start', 'end', 'colors']. If no
        column 'width', it will be calculated from start/end.
    y_positions : dict
        Keys are chromosomes, values are y-value at which to anchor the
        BrokenBarHCollection
    height : float
        Height of each BrokenBarHCollection
    Additional kwargs are passed to BrokenBarHCollection

    Yields
    ------
    BrokenBarHCollection
        One collection per chromosome present in `df`, in groupby order.

    Raises
    ------
    KeyError
        If a chromosome in `df` has no entry in `y_positions`.

    Notes
    -----
    `df` is never modified. A missing 'width' column is computed on a
    copy, so the caller's frame stays untouched even when the generator is
    abandoned before it is exhausted.
    """
    # NOTE(review): BrokenBarHCollection is deprecated in recent Matplotlib
    # releases - confirm the pinned version still provides it.
    if 'width' in df.columns:
        frame = df
    else:
        # assign() returns a new frame; the input keeps its original columns.
        frame = df.assign(width=df['end'] - df['start'])
    for chrom, group in frame.groupby('chrom'):
        # (y anchor, bar height) shared by every bar on this chromosome
        yrange = (y_positions[chrom], height)
        # One (x start, x width) pair per feature
        xranges = group[['start', 'width']].values
        yield BrokenBarHCollection(
            xranges, yrange, facecolors=group['colors'], **kwargs)
        

# ---- Track geometry (y-axis data units) ----
chrom_height = 0.5   # thickness of one ideogram bar
chrom_spacing = 1.2  # gap between consecutive ideograms
# Thickness of the gene track; keep it below `chrom_spacing` so the track
# fits between two neighbouring ideograms.
gene_height = 0.8
gene_padding = 0.1   # gap between the top of a gene track and its ideogram

# Figure size as (width, height) in inches
figsize = (6, 8)

# Plot every chromosome named in the FASTA index, in file order
chromosome_list = list(chrom_len)

# Per-chromosome y anchors: bottom of the ideogram, bottom of the gene track,
# and the ideogram midline (used for the y tick labels).
chrom_ybase, gene_ybase, chrom_centers = {}, {}, {}

# Walk the list back to front so that the first chromosome gets the largest
# y value and therefore ends up at the top of the plot.
ybase = 0
for chrom in reversed(chromosome_list):
    chrom_ybase[chrom] = ybase
    gene_ybase[chrom] = ybase - gene_height - gene_padding
    chrom_centers[chrom] = ybase + chrom_height / 2.
    ybase += chrom_height + chrom_spacing
    
    
# Inputs: chromosome bands and the insertion intervals to overlay on them.
ideo = get_ideogram()
genes_df = interval_to_df(INSERTIONS, chromosome_list)

fig, ax = plt.subplots(figsize=figsize)

# Draw the ideograms first, then the semi-transparent insertion track that
# sits below each ideogram.
track_specs = (
    (ideo, chrom_ybase, chrom_height, {}),
    (genes_df, gene_ybase, gene_height, {'alpha': 0.3, 'linewidths': 10}),
)
for frame, anchors, bar_height, style in track_specs:
    for collection in chromosome_collections(frame, anchors, bar_height, **style):
        ax.add_collection(collection)

# Label each ideogram at its vertical centre.
ax.set_yticks([chrom_centers[name] for name in chromosome_list])
ax.set_yticklabels(chromosome_list)
ax.set_title('Density of de-novo insertions across genome')
ax.axis('tight')
despine(ax)
plt.show()

# NOTE(review): the output directory is LABEL, while OUT_DIR from the
# parameters cell is unused - confirm this is intended.
save_fig_in_dir(
    fig,
    filename='insertion_density %s.pdf' % LABEL,
    directory=LABEL,
    bbox_inches='tight',
)

