# Footprint calling and plotting of a single DHS


In [1]:
import numpy as np
import pandas as pd
import scipy as sc
import scipy.stats as stats

import pysam

from genome_tools import bed, genomic_interval
from footprint_tools import cutcounts
from footprint_tools.modeling import bias, predict, dispersion
from footprint_tools.stats import fdr, windowing, utils

# Import Data

In [2]:
# Genomic region for which data are loaded and plotted
interval = genomic_interval('chr19', 48363826, 48364602)

# Models loaded from disk: the 6-mer bias model and the dispersion model
bm = bias.kmer_model('../Data/vierstra_et_al.6mer-model.txt')
dm = dispersion.load_dispersion_model('../Data/dm.json')

# Readers for the reference FASTA and the BAM file of reads
fasta_reader = pysam.FastaFile('../Data/hg38.all.fa')
counts_reader = cutcounts.bamfile('../Data/reads.bam')

# Perform footprint detection

In [3]:
# Build the predictor from the reads, the reference sequence and the bias model,
# then compute observed, expected and windowed counts over the interval.
predictor = predict.prediction(
    counts_reader,
    fasta_reader,
    bm,
    half_win_width=5,
    smoothing_half_win_width=50,
    smoothing_clip=0.01,
)
obs_counts, exp_counts, win_counts = predictor.compute(interval)

In [4]:
# Merge the per-strand counts into a single track for each of the observed,
# expected and windowed counts. The '+' strand drops its first element and the
# '-' strand drops its last, so the two strands are added at a one-position
# offset and each merged track is one element shorter than its inputs.
# Note: windowed counts are used to generate expected counts
# and are not used directly in statistical testing
obs, exp, win = (
    strand_counts['+'][1:] + strand_counts['-'][:-1]
    for strand_counts in (obs_counts, exp_counts, win_counts)
)

In [5]:
# Per-nucleotide p-values from the dispersion model, computed from the merged
# expected and observed counts (arguments passed in that order: expected, observed)
pvals = dm.p_values(exp, obs)

In [6]:
# Windowed p-values with Stouffer's Z
def winpvals_func(x):
    """Combine the p-values in x using windowing.stouffers_z.

    The input is made C-contiguous before the call; the second argument (3)
    is presumably the window half-width -- confirm against footprint_tools.
    """
    contiguous = np.ascontiguousarray(x)
    return windowing.stouffers_z(contiguous, 3)


winpvals = np.array(winpvals_func(pvals))

In [7]:
# Resample from expected distributions for empirical adjustment of p-values for multiple testing
# NOTE: the result below is stored under the name `fdr`, which rebinds the
# `fdr` module imported at the top of the notebook. Bind the module to a
# separate alias first so this cell can be re-run without hitting
# "AttributeError: ... has no attribute 'emperical_fdr'".
from footprint_tools.stats import fdr as fdr_module

_, pvals_null = dm.sample(exp, 1000)
# Apply the same windowing to the sampled p-values along axis 0
winpvals_null = np.apply_along_axis(winpvals_func, 0, pvals_null)
# Kept as `fdr` so later cells that read the FDR values still work
fdr = fdr_module.emperical_fdr(winpvals_null, winpvals)

# Call footprints at 0.01 FDR
# Segment offsets are relative to the interval start, so shift them back to
# genomic coordinates.
fps = [
    genomic_interval(interval.chrom, interval.start + x, interval.start + y)
    for x, y in utils.segment(np.array(fdr), 0.01, 3, decreasing=True)
]

# Plot genomic footprinting data

In [8]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as mgridspec
import matplotlib.ticker as mticker
import matplotlib.collections as mcollections

from genome_tools.plotting import signal_plot, add_scale_bar

def discrete_cmap(N, base_cmap=None):
    """Create an N-bin discrete colormap from the specified input map.

    Parameters
    ----------
    N : int
        Number of colors to sample, evenly spaced over [0, 1] of the base map.
    base_cmap : str, Colormap or None, optional
        Passed to ``plt.get_cmap``; ``None`` selects Matplotlib's default
        colormap.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        Colormap named ``<base name><N>`` built from the N sampled colors.
    """
    # Imported locally so the function does not depend on an extra
    # notebook-level import.
    from matplotlib.colors import LinearSegmentedColormap

    # plt.get_cmap accepts a name, None, or a Colormap instance; the older
    # plt.cm.get_cmap was removed in Matplotlib 3.9.
    base = plt.get_cmap(base_cmap)
    color_list = base(np.linspace(0, 1, N))
    cmap_name = base.name + str(N)

    # from_list is defined on LinearSegmentedColormap only; calling it on the
    # class (rather than on `base`) also works when `base` is a
    # ListedColormap such as viridis.
    return LinearSegmentedColormap.from_list(cmap_name, color_list, N)

def make_nb_plot(expected, observed, dm, ax, lo=0, hi=125):
    """Plot the dispersion model's negative binomial distribution on ``ax``.

    Parameters
    ----------
    expected : number
        Expected cleavage count; passed to ``dm.fit_mu`` and ``dm.fit_r``.
    observed : number
        Observed cleavage count; truncated with ``int`` and used as the upper
        (exclusive) bound of the shaded region.
    dm : object
        Dispersion model providing ``fit_mu`` and ``fit_r``.
    ax : matplotlib Axes
        Axes to draw on; modified in place.
    lo, hi : int, optional
        Range of counts ``[lo, hi)`` over which the PMF is evaluated and
        plotted.
    """
    x = np.arange(lo, hi)

    mu = dm.fit_mu(expected)
    r = dm.fit_r(expected)
    # Convert (mean, r) to the success probability used by scipy's nbinom
    p = r / (r + mu)

    y = sc.stats.nbinom.pmf(x, r, p)
    ax.plot(x, y, label="Expected cleavage rate")

    # Shade the counts below the observed value. Select by value rather than
    # by position so the shading stays correct when lo != 0 (a positional
    # slice x[:observed] is only right for lo == 0).
    below_observed = x < int(observed)
    ax.fill_between(x[below_observed], 0, y[below_observed],
                    edgecolor='none', label="Prob. observed")

    ax.set_xlim(left=lo, right=hi)

    ax.set_xlabel("Cleavages")
    ax.set_ylabel("Density")

    # Hide the top and right spines
    for loc in ("top", "right"):
        ax.spines[loc].set_color("none")

    ax.xaxis.set_ticks_position("bottom")
    ax.xaxis.set_tick_params(direction="out")
    ax.xaxis.set(major_locator=mticker.MaxNLocator(4))

    ax.yaxis.set_ticks_position("left")
    ax.yaxis.set_tick_params(direction="out")
    ax.yaxis.set(major_locator=mticker.MaxNLocator(3))
    ax.set_ylim(bottom=0)

SyntaxError: invalid syntax (track.py, line 48)