In [1]:
from density import ReadDensity
import os
import numpy as np
import pandas as pd

In [2]:
# Inputs handed to ReadDensity below: one BAM plus the ".pos.bw" / ".neg.bw"
# tracks, all carrying the same 206_01_HNRNPK.merged.r2 prefix.
# NOTE(review): the variable is called hnrnpc_bam but the path names an HNRNPK
# file -- looks like a naming slip; rename together with its use in the next cell.
hnrnpc_bam = '/projects/ps-yeolab3/encode/analysis/encode_master/206_01_HNRNPK.merged.r2.bam'
pos = '/projects/ps-yeolab3/encode/analysis/encode_master/206_01_HNRNPK.merged.r2.norm.pos.bw'
neg = '/projects/ps-yeolab3/encode/analysis/encode_master/206_01_HNRNPK.merged.r2.norm.neg.bw'

In [3]:
# Build a ReadDensity from the pos/neg tracks with the IP BAM attached and
# display rdd.bam.count(). The displayed value (4062675) is the number that is
# later hard-coded as the IP read total when calling read_entropy.
# NOTE(review): presumably count() is the number of reads in the BAM -- confirm.
rdd = ReadDensity.ReadDensity(pos, neg, bam=hnrnpc_bam)
rdd.bam.count()

4062675

In [4]:
# Same count for the input BAM; the displayed value (2944770) is later
# hard-coded as the input read total when calling read_entropy.
# `rdd` is rebound here, so the IP-backed object from the previous cell is lost.
# NOTE(review): pos/neg are still the 206_01_HNRNPK tracks; only the BAM differs.
input_bam = '/projects/ps-yeolab3/encode/analysis/encode_master/206-INPUT_S10_R1.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.bam'
rdd = ReadDensity.ReadDensity(pos, neg, bam=input_bam)
rdd.bam.count()

2944770

In [5]:
# Locate the raw IP and input density matrices inside the working directory.
wd = '/home/bay001/projects/gabe_qc_20170612/data'
ip = os.path.join(
    wd,
    '206_01_HNRNPK.merged.r2.1.conf1.HNRNPK-BGHLV12-HepG2-included-upon-knockdown.ip.raw_density.txt'
)
inp = os.path.join(
    wd,
    '206_01_HNRNPK.merged.r2.1.conf1.HNRNPK-BGHLV12-HepG2-included-upon-knockdown.input.raw_density.txt'
)

# Both files are comma-separated with the event label in the first column;
# read_csv's default separator is ',', matching read_table(..., sep=',').
ip_df, inp_df = (pd.read_csv(path, index_col=0) for path in (ip, inp))

In [6]:
def clean(density_df):
    """
    Normalise the two sentinel encodings used in a density matrix.

    The input holds density values (columns) for a set of regions (rows).
    Two special values are rewritten, in this order:

    * NaN  -> 0    (position carries no density)
    * -1   -> NaN  (position must not be counted at all, e.g. it lies
                    beyond a premature region boundary)

    Parameters
    ----------
    density_df : pandas.DataFrame
        Table of densities

    Returns
    -------
    pandas.DataFrame
        New frame with the sentinels rewritten; the argument is not modified.
    """
    # Order matters: zero-fill the original NaNs first so they are not
    # confused with the NaNs introduced for the -1 markers.
    return density_df.fillna(0).replace(-1, np.nan)

def read_entropy(density_df, input_density_df, pseudocount, input_pseudocount,
                 min_density_threshold=0):
    """
    Return the relative-entropy term of IP over input at each position,
    i.e. p_ip * log2(p_ip / p_input).

    Logic
    -----
    Turn normalized RPM densities to reads:
        (density matrix -> read matrix)
    Add 1 read to entire dataframe (except for nan positions):
        (read matrix -> read matrix + 1)
    Divide each position by total mapped reads:
        (read matrix + 1 -> probability matrix)
    Calculate p_ip * log2(p_ip / p_input)

    Parameters
    ----------
    density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in ip CLIP
    input_density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in input CLIP
    pseudocount : float
        RPM-normalized read density of one read in ip CLIP
    input_pseudocount : float
        RPM-normalized read density of one read in input CLIP
    min_density_threshold : int
        Rows of ``density_df`` whose summed density is not strictly greater
        than this value are dropped before any other processing.

    Returns
    -------
    en : pandas.DataFrame
        Same columns as the inputs. Because the arithmetic aligns on labels,
        an event present only in the input frame (including one removed
        from the ip frame by the threshold) comes back as a row of NaN.
    """

    total_ip_mapped_reads = 1000000 / pseudocount
    total_input_mapped_reads = 1000000 / input_pseudocount

    # NOTE(review): the filter runs on the raw values, so -1 "do not count"
    # markers lower a row's sum. Kept as-is to preserve existing results.
    density_df = density_df[density_df.sum(axis=1) > min_density_threshold]

    # Make sure every ip event also has an input row. Events the input lacks
    # are added as all-NaN rows, which clean() turns into zero density.
    # (Replaces DataFrame.append + .ix, both removed from current pandas.)
    missing = density_df.index.difference(input_density_df.index)
    if len(missing) > 0:
        filler = pd.DataFrame(
            np.nan, index=missing, columns=input_density_df.columns
        )
        input_density_df = pd.concat([input_density_df, filler])

    rpm = clean(density_df)
    rpmi = clean(input_density_df)

    # RPM density -> read counts, plus one pseudo-read per countable position
    # (NaN positions stay NaN).
    r = rpm / pseudocount + 1
    ri = rpmi / input_pseudocount + 1

    # read counts -> per-position probabilities
    pr = r / total_ip_mapped_reads
    pri = ri / total_input_mapped_reads

    en = pr.multiply(np.log2(pr.div(pri)))
    return en

In [7]:
# Mapped-read totals as displayed by rdd.bam.count() in cells In [3] (IP)
# and In [4] (input); one read expressed in RPM is 1e6 / total.
ip_mapped_reads = 4062675
input_mapped_reads = 2944770

test_df = read_entropy(
    density_df=ip_df,
    input_density_df=inp_df,
    pseudocount=1000000.0 / ip_mapped_reads,
    input_pseudocount=1000000.0 / input_mapped_reads,
    min_density_threshold=0
)
test_df.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
"1002\tENSG00000046647.9\tGEMIN8\tchrX\t-\t14044170\t14044340\t14039582\t14039630\t14047895\t14048011\t1002\t13,20\t1,1\t9,10\t8,12\t197\t99\t5.49792728484e-05\t0.00241964223187\t0.867,0.91\t0.361,0.295\t0.561",-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,...,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,,
"10038\tENSG00000073792.11\tIGF2BP2\tchr3\t-\t185390328\t185390457\t185376128\t185376197\t185393083\t185393219\t10038\t366,276\t23,19\t168,148\t43,25\t197\t99\t7.84316385394e-07\t5.59504652835e-05\t0.889,0.88\t0.663,0.748\t0.179",-6.858051e-07,-6.065646e-07,-5.04406e-07,-6.065646e-07,-5.04406e-07,-6.065646e-07,-6.065646e-07,-6.065646e-07,-6.065646e-07,-6.065646e-07,...,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08
"10241\tENSG00000213523.5\tSRA1\tchr5\t-\t139931566\t139931769\t139930645\t139930754\t139936731\t139936917\t10241\t101,73\t1,1\t49,62\t4,7\t197\t99\t0.00348954255947\t0.0728233504117\t0.981,0.973\t0.86,0.817\t0.138",-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-3.604214e-07,...,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07
"10300\tENSG00000008294.16\tSPAG9\tchr17\t-\t49053223\t49053262\t49052131\t49052308\t49054468\t49054582\t10300\t86,108\t60,58\t8,16\t119,149\t137\t99\t0.0\t0.0\t0.509,0.574\t0.046,0.072\t0.482",-1.142782e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,...,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07
"10325\tENSG00000142687.13\tKIAA0319L\tchr1\t-\t35908758\t35908820\t35908506\t35908629\t35909761\t35909904\t10325\t11,12\t78,88\t3,0\t128,176\t160\t99\t4.14739964705e-07\t3.18780235547e-05\t0.08,0.078\t0.014,0.0\t0.072",,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Previously written normalized matrix, used as the expected result below.
# NOTE(review): this file carries the ".r2.2.conf0.95" prefix while the raw
# density inputs are ".r2.1.conf1" -- confirm they belong to the same run.
normed_incl = '/home/bay001/projects/gabe_qc_20170612/data/206_01_HNRNPK.merged.r2.2.conf0.95.HNRNPK-BGHLV12-HepG2-included-upon-knockdown.normed_matrix.txt'

# Comma-separated with the event label in the first column.
expect_df = pd.read_csv(normed_incl, index_col=0)
expect_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
"1002\tENSG00000046647.9\tGEMIN8\tchrX\t-\t14044170\t14044340\t14039582\t14039630\t14047895\t14048011\t1002\t13,20\t1,1\t9,10\t8,12\t197\t99\t5.49792728484e-05\t0.00241964223187\t0.867,0.91\t0.361,0.295\t0.561",-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,...,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,,
"10038\tENSG00000073792.11\tIGF2BP2\tchr3\t-\t185390328\t185390457\t185376128\t185376197\t185393083\t185393219\t10038\t366,276\t23,19\t168,148\t43,25\t197\t99\t7.84316385394e-07\t5.59504652835e-05\t0.889,0.88\t0.663,0.748\t0.179",-6.858051e-07,-6.065646e-07,-5.04406e-07,-6.065646e-07,-5.04406e-07,-6.065646e-07,-6.065646e-07,-6.065646e-07,-6.065646e-07,-6.065646e-07,...,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08,8.911922e-08
"10241\tENSG00000213523.5\tSRA1\tchr5\t-\t139931566\t139931769\t139930645\t139930754\t139936731\t139936917\t10241\t101,73\t1,1\t49,62\t4,7\t197\t99\t0.00348954255947\t0.0728233504117\t0.981,0.973\t0.86,0.817\t0.138",-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-3.604214e-07,...,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07
"10300\tENSG00000008294.16\tSPAG9\tchr17\t-\t49053223\t49053262\t49052131\t49052308\t49054468\t49054582\t10300\t86,108\t60,58\t8,16\t119,149\t137\t99\t0.0\t0.0\t0.509,0.574\t0.046,0.072\t0.482",-1.142782e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,...,-3.604214e-07,-3.604214e-07,-3.604214e-07,-3.604214e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07,-1.142782e-07
"10325\tENSG00000142687.13\tKIAA0319L\tchr1\t-\t35908758\t35908820\t35908506\t35908629\t35909761\t35909904\t10325\t11,12\t78,88\t3,0\t128,176\t160\t99\t4.14739964705e-07\t3.18780235547e-05\t0.08,0.078\t0.014,0.0\t0.072",,,,,,,,,,,...,,,,,,,,,,


In [9]:
from pandas.testing import assert_frame_equal

In [10]:
# Regression check: the matrix recomputed in this notebook must match the
# stored one; silence (no output) means the frames compare equal.
pd.testing.assert_frame_equal(test_df, expect_df)

In [14]:
# Spot-check a single event / position in the raw IP density matrix.
raw_matrix = pd.read_table(
    '/home/bay001/projects/gabe_qc_20170612/data/206_01_HNRNPK.merged.r2.1.conf1.HNRNPK-BGHLV12-HepG2-included-upon-knockdown.ip.raw_density.txt',
    sep=',', index_col=0
)
# Row label of the LSS event being inspected (tab-joined annotation fields).
lss_event = '53793\tENSG00000160285.10\tLSS\tchr21\t-\t47609490\t47609641\t47608054\t47608735\t47610989\t47611149\t53793\t2,3\t11,7\t0,1\t38,54\t197\t99\t0.000550185615318\t0.0164295510216\t0.084,0.177\t0.0,0.009\t0.126'
# Column labels read from the header are strings, so the old `.ix[792]`
# fell back to positional access; `.iloc` states that explicitly and
# works on pandas versions where `.ix` no longer exists.
raw_matrix.loc[lss_event].iloc[792]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


3.4460053443900001

In [17]:
# Spot-check the same event / position in the raw input density matrix.
raw_input_matrix = pd.read_table(
    '/home/bay001/projects/gabe_qc_20170612/data/206_01_HNRNPK.merged.r2.1.conf1.HNRNPK-BGHLV12-HepG2-included-upon-knockdown.input.raw_density.txt',
    sep=',', index_col=0
)
# Row label of the LSS event being inspected (tab-joined annotation fields).
lss_event = '53793\tENSG00000160285.10\tLSS\tchr21\t-\t47609490\t47609641\t47608054\t47608735\t47610989\t47611149\t53793\t2,3\t11,7\t0,1\t38,54\t197\t99\t0.000550185615318\t0.0164295510216\t0.084,0.177\t0.0,0.009\t0.126'
# Column labels read from the header are strings, so the old `.ix[792]`
# fell back to positional access; `.iloc` states that explicitly and
# works on pandas versions where `.ix` no longer exists.
raw_input_matrix.loc[lss_event].iloc[792]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


0.67917019128800005

In [18]:
# Spot-check the same event / position in the stored normalized matrix.
normed_matrix = pd.read_table(
    '/home/bay001/projects/gabe_qc_20170612/data/206_01_HNRNPK.merged.r2.2.conf0.95.HNRNPK-BGHLV12-HepG2-included-upon-knockdown.normed_matrix.txt',
    sep=',', index_col=0
)
# Row label of the LSS event being inspected (tab-joined annotation fields).
lss_event = '53793\tENSG00000160285.10\tLSS\tchr21\t-\t47609490\t47609641\t47608054\t47608735\t47610989\t47611149\t53793\t2,3\t11,7\t0,1\t38,54\t197\t99\t0.000550185615318\t0.0164295510216\t0.084,0.177\t0.0,0.009\t0.126'
# Column labels read from the header are strings, so the old `.ix[792]`
# fell back to positional access; `.iloc` states that explicitly and
# works on pandas versions where `.ix` no longer exists.
normed_matrix.loc[lss_event].iloc[792]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


6.8587311541400004e-06