In [109]:
import pandas as pd
import numpy as np
import os

In [174]:

def clean(density_df):
    """
    These functions expect a dataframe with density values (columns)
    across a number of regions (rows). These dataframes may also contain
    information regarding premature boundaries for each region (marked as -1)
    and no-density regions (marked by nan). This cleans the dataframe.

    Parameters
    ----------
    density_df : pandas.DataFrame
        Table of densities

    Returns
    -------
    pandas.DataFrame
    """

    # NaNs are regions which contain zero density
    # -1 are regions which should not be counted at all
    density_df = density_df.fillna(0)
    return density_df.replace(-1, np.nan)

def per_region_subtract_and_normalize(density_df, input_density_df,
                                      pseudocount, input_pseudocount,
                                      min_density_threshold=0):
    """
    Normalizes ip matrix of m x n (where m is the row of each event in a
    feature, and n is the column relating to nucleotide position).

    Parameters
    ----------
    density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in ip CLIP
    input_density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in input CLIP
    pseudocount : float
        RPM-normalized read density of one read in ip CLIP
    input_pseudocount : float
        RPM-normalized read density of one read in input CLIP
    min_density_threshold : int

    Returns
    -------
    subtracted : pandas.DataFrame
    """

    df_indices = density_df.index
    dfi_indices = input_density_df.index
    missing = set(df_indices) - set(dfi_indices)

    input_density_df = input_density_df.append(input_density_df.ix[missing])

    subtracted = density_df.sub(input_density_df)
    pdf = calculate_pdf(
        subtracted, pseudocount, min_density_threshold
    )

    return pdf


def calculate_pdf(density_df, pseudocount=None, min_density_threshold=0):
    """
    Calculates the PDF of a density matrix (makes all rows sum to 1).

    Parameters
    ----------
    density_df : pandas.DataFrame
        r x c matrix of densities.
        May contain NaN corresponding to values
        in which no density was returned. These values should be counted.

        May also contain -1 corresponding to values in which a particular
        region is shorter than the full DataFrame length. These
        values should NOT be counted.
    pseudocount : float
        value added to the entire dataframe before calculating pdf.
    min_density_threshold : int
        minimum total density_df across a row.
        (deprecated - possibly removed in the future)

    Returns
    -------
    pdf : pandas.DataFrame
        r x c matrix of densities normalized across each respective
        (r)ow as a probability density_df func.
    """

    # df = clean(density_df)  # moved this out, it doesn't belong here and i don't think we use it ever
    df = density_df

    # print('before pdf: ', df.shape, min_density_threshold)
    # df = df[df.sum(axis=1) >= min_density_threshold]
    # print('after filtering min threshold pdf: ', df.shape)
    min_read = pseudocount if pseudocount else min(
        [item for item in df.unstack().values if item > 0]
    )
    # print("min read: {}".format(min_read))
    # df = df + min_read
    df_wpseudocount = add_min_read(df, min_read)
    pdf = df.div(df_wpseudocount.abs().sum(axis=1), axis=0)

    return pdf  # , mean, sem


def add_min_read(df, min_read):
    df = np.where(df >= 0, df + min_read, df)
    df = np.where(df < 0, df - min_read, df)
    return pd.DataFrame(df)


In [175]:
test_ip = pd.DataFrame([[0, 1, 2],[3,4,5]])
test_ip

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5


In [176]:
test_inp = pd.DataFrame([[1, 1, 1],[1, 1, 1]])
test_inp

Unnamed: 0,0,1,2
0,1,1,1
1,1,1,1


In [177]:
subtracted = test_ip.sub(test_inp)
subtracted

Unnamed: 0,0,1,2
0,-1,0,1
1,2,3,4


In [178]:
def add_min_read(df, min_read):
    df = np.where(df >= 0, df + min_read, df)
    df = np.where(df < 0, df - min_read, df)
    return pd.DataFrame(df)

In [226]:
test_ip = pd.DataFrame([[0, 1, 2, 3],[3, 4, 5, -1],[0,0,0,0]])
test_inp = pd.DataFrame([[1, 1, 1, 1],[1, 1, 1, -1],[0,0,0,0]])
test_ip

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,3,4,5,-1
2,0,0,0,0


In [227]:
test_inp

Unnamed: 0,0,1,2,3
0,1,1,1,1
1,1,1,1,-1
2,0,0,0,0


In [228]:
df_indices = test_ip.index
dfi_indices = test_inp.index
missing = set(df_indices) - set(dfi_indices)

test_inp = test_inp.append(test_inp.ix[missing])
test_inp

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


Unnamed: 0,0,1,2,3
0,1,1,1,1
1,1,1,1,-1
2,0,0,0,0


In [229]:
subtracted = clean(test_ip).sub(clean(test_inp))
subtracted

Unnamed: 0,0,1,2,3
0,-1,0,1,2.0
1,2,3,4,
2,0,0,0,0.0


In [185]:
calculate_pdf(subtracted, 1)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3
0,-0.125,0.0,0.125,0.25
1,0.166667,0.25,0.333333,
2,0.0,0.0,0.0,0.0


In [225]:
# set every value less than 1 to zero
# normalize (pdf) by adding pseudocount of 1 read (ip)
subtracted[subtracted < 0] = 0
subtracted

Unnamed: 0,0,1,2,3
0,0,0,1,2.0
1,2,3,4,
2,0,0,0,0.0


In [34]:
subtracted+1

Unnamed: 0,0,1,2,3
0,1,1,2,3.0
1,3,4,5,


In [35]:
def per_region_subtract_and_normalize(density_df, input_density_df,
                                      pseudocount, input_pseudocount,
                                      min_density_threshold=0):
    """
    Normalizes ip matrix of m x n (where m is the row of each event in a
    feature, and n is the column relating to nucleotide position).

    Parameters
    ----------
    density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in ip CLIP
    input_density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in input CLIP
    pseudocount : float
        RPM-normalized read density of one read in ip CLIP
    input_pseudocount : float
        RPM-normalized read density of one read in input CLIP
    min_density_threshold : int

    Returns
    -------
    subtracted : pandas.DataFrame
    """

    df_indices = density_df.index
    dfi_indices = input_density_df.index
    missing = set(df_indices) - set(dfi_indices)

    input_density_df = input_density_df.append(input_density_df.ix[missing])

    subtracted = clean(density_df).sub(clean(input_density_df))

    pdf = calculate_pdf(
        subtracted, pseudocount, min_density_threshold
    )

    return pdf

In [101]:
# this is the old pdf 

def clean(density_df):
    """
    These functions expect a dataframe with density values (columns)
    across a number of regions (rows). These dataframes may also contain
    information regarding premature boundaries for each region (marked as -1)
    and no-density regions (marked by nan). This cleans the dataframe.
    Parameters
    ----------
    density_df : pandas.DataFrame
        Table of densities
    Returns
    -------
    pandas.DataFrame
    """

    # NaNs are regions which contain zero density
    # -1 are regions which should not be counted at all
    density_df = density_df.fillna(0)
    return density_df.replace(-1, np.nan)

def normalize_and_per_region_subtract(density_df, input_density_df,
                                      pseudocount, input_pseudocount,
                                      min_density_threshold=0):
    """
    Normalizes ip matrix of m x n (where m is the row of each event in a
    feature, and n is the column relating to nucleotide position).
    Parameters
    ----------
    density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in ip CLIP
    input_density_df : pandas.DataFrame
        matrix of RPM-normalized read densities in input CLIP
    pseudocount : float
        RPM-normalized read density of one read in ip CLIP
    input_pseudocount : float
        RPM-normalized read density of one read in input CLIP
    min_density_threshold : int
    Returns
    -------
    subtracted : pandas.DataFrame
    """

    df_indices = density_df.index
    dfi_indices = input_density_df.index
    missing = set(df_indices) - set(dfi_indices)

    input_density_df = input_density_df.append(input_density_df.ix[missing])
    print('before pdf normalization of ip: {}'.format(density_df.shape))
    pdf = calculate_pdf(
        density_df, pseudocount, min_density_threshold
    )
    print('after pdf normalization of ip: {}'.format(pdf.shape))
    pdfi = calculate_pdf(
        input_density_df, input_pseudocount, min_density_threshold
    )
    print('before pdf normalization of ip: {}'.format(density_df.shape))
    subtracted = pdf.sub(pdfi)
    return subtracted

def calculate_pdf(density_df, pseudocount=None, min_density_threshold=0):
    """
    Calculates the PDF of a density matrix (makes all rows sum to 1).
    Parameters
    ----------
    density_df : pandas.DataFrame
        r x c matrix of densities.
        May contain NaN corresponding to values
        in which no density was returned. These values should be counted.
        May also contain -1 corresponding to values in which a particular
        region is shorter than the full DataFrame length. These
        values should NOT be counted.
    pseudocount : float
        value added to the entire dataframe before calculating pdf.
    min_density_threshold : int
        minimum total density_df across a row.
        (May be deprecated - possibly removed in the future)
    Returns
    -------
    pdf : pandas.DataFrame
        r x c matrix of densities normalized across each respective
        (r)ow as a probability density_df func.
    """
    print("before", density_df.shape)
    print("are there nans?", density_df.isnull().values.any())
    df = clean(density_df)
    print("after clean", df.shape)
    print("are there nans?", df.isnull().values.any())
    print("min density threshold: {}".format(min_density_threshold))
    df = df[df.sum(axis=1) >= 0]
    print("after filter", df.shape)
    return df
    print("are there nans?", df.isnull().values.any())
    min_read = pseudocount if pseudocount else min(
        [item for item in df.unstack().values if item > 0]
    )

    df = df + min_read
    print("after adding pseudocount", df.shape)
    print("are there nans?", df.isnull().values.any())
    pdf = df.div(df.sum(axis=1), axis=0)
    print("after pdf", df.shape)
    print("are there nans?", df.isnull().values.any())
    return pdf # , mean, sem



In [102]:
df = pd.read_table(
    '/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/whole_read/204_01_RBFOX2.merged.r2.0.HepG2_constitutive_exons.ip.raw_density.txt',
    sep=',',
    index_col=0
)
dfi = pd.read_table(
    '/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/whole_read/204_01_RBFOX2.merged.r2.0.HepG2_constitutive_exons.input.raw_density.txt',
    sep=',',
    index_col=0
)
dfp = 0.23389345358
dfip = 0.265550504785

df = normalize_and_per_region_subtract(
    density_df=df, input_density_df=dfi,
    pseudocount=dfp, input_pseudocount=dfip
)

print(df.shape)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


before pdf normalization of ip: (7351, 1400)
('before', (7351, 1400))
('are there nans?', False)
('after clean', (7351, 1400))
('are there nans?', True)
min density threshold: 0
('after filter', (7351, 1400))
(7351, 1400)


In [103]:
df = df[df.sum(axis=1) >= 0]
df.shape

(7351, 1400)

In [86]:
# subtracted[subtracted.sum(axis=1) >= 0].shape

df = df[df.sum(axis=1) >= 0]
df = df + dfp
df.shape

(2721, 1400)

In [59]:
subtracted2.shape

(7351, 1400)

In [61]:
from pandas.util.testing import assert_frame_equal

assert_frame_equal(subtracted, subtracted2)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_table(
    '/home/bay001/projects/codebase/rbp-maps/examples/se/outputs/368_01_PTBP1.merged.r2.4.K562_constitutive_exons.ip.raw_density.txt',
    sep=',',
    index_col=0
)
df = clean(df)

In [None]:
sns.heatmap(df, xticklabels=False, yticklabels=False, vmin=min(df.min()), vmax=max(df.max()))

In [21]:
pd.Series(np.nan + 3)

0   NaN
dtype: float64

In [None]:
max(df[['1']])

In [None]:
df[['1']].max()

In [98]:
pd.__version__

u'0.22.0'

In [234]:
def add_min_read(row, min_read):
    return row.abs().sum() + min_read*len(row.dropna())

In [223]:
x = df_w.apply(add_min_read, axis=1, args=(1,))

In [235]:
subtracted

Unnamed: 0,0,1,2,3
0,-1,0,1,2.0
1,2,3,4,
2,0,0,0,0.0


In [237]:
summed = subtracted.apply(add_min_read, axis=1, args=(1,))

In [238]:
subtracted.div(summed, axis=0)

Unnamed: 0,0,1,2,3
0,-0.125,0.0,0.125,0.25
1,0.166667,0.25,0.333333,
2,0.0,0.0,0.0,0.0


In [242]:
pdf = pd.read_table(
    '/oasis/tscc/scratch/bay001/df.csv',
    sep=',',
    index_col=0
)
pdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
"10072\tENSG00000055130.11\tCUL1\tchr7\t+\t148456395\t148456446\t148454074\t148454242\t148456639\t148456730\t10072\t418,306\t13,5\t209,215\t26,16\t149\t99\t3.11531619412e-06\t0.000342740711339\t0.955,0.976\t0.842,0.899\t0.095",-0.529109,-0.529109,-0.529109,-0.106158,-0.106158,-0.106158,-0.106158,0.070212,0.246581,0.246581,...,-0.17637,-0.17637,-0.17637,-0.17637,-0.17637,-0.17637,-0.17637,-0.17637,-0.17637,-0.17637
"10082\tENSG00000163714.13\tU2SURP\tchr3\t+\t142742816\t142742860\t142741694\t142741906\t142745990\t142746095\t10082\t234,237\t8,4\t154,137\t22,15\t142\t99\t2.0643736337e-06\t0.000240689931719\t0.953,0.976\t0.83,0.864\t0.117",-0.282528,-0.282528,-0.282528,-0.282528,-0.705479,-0.881849,-0.881849,-0.529109,-0.705479,-0.705479,...,-0.282528,-0.705479,-0.705479,-0.705479,-0.705479,-0.529109,-0.529109,-0.529109,-0.352739,-0.352739
"10256\tENSG00000271816.1\tRP11-574K11.28\tchr10\t-\t75484532\t75484596\t75482146\t75482335\t75485718\t75485798\t10256\t14,9\t8,6\t6,8\t17,23\t162\t99\t0.00110373600952\t0.0432103202553\t0.517,0.478\t0.177,0.175\t0.322",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"10295\tENSG00000114956.15\tDGUOK\tchr2\t+\t74184251\t74184367\t74173845\t74174033\t74185272\t74185372\t10295\t158,122\t78,70\t116,110\t124,137\t197\t99\t4.18676377456e-06\t0.000445932116795\t0.504,0.467\t0.32,0.287\t0.182",-0.17637,-0.17637,-0.17637,-0.17637,-0.352739,-0.352739,-0.529109,-0.529109,-0.529109,-0.705479,...,-0.17637,-0.17637,-0.17637,-0.17637,-0.17637,0.246581,0.246581,0.246581,0.246581,0.246581
"10452\tENSG00000100023.13\tPPIL2\tchr22\t+\t22024552\t22024665\t22024201\t22024251\t22024854\t22024900\t10452\t81,59\t255,226\t16,13\t213,207\t197\t99\t2.03060457338e-10\t6.09638258043e-08\t0.138,0.116\t0.036,0.031\t0.093",1.092483,1.092483,1.092483,1.268853,1.268853,1.268853,1.268853,1.268853,1.268853,1.268853,...,-1.553063,-1.553063,-1.553063,-1.553063,-1.553063,-1.553063,,,,


In [272]:
df = pd.read_table('/oasis/tscc/scratch/bay001/density.csv', sep=',', index_col=0)
dfi = pd.read_table('/oasis/tscc/scratch/bay001/inputdensity.csv', sep=',', index_col=0)
subtracted = pd.read_table('/oasis/tscc/scratch/bay001/subtracted.csv', sep=',', index_col=0)
pdf = pd.read_table('/oasis/tscc/scratch/bay001/pdf.csv', sep=',', index_col=0)

In [273]:
idx = 'chr10|+|124186547-124187791|124187832-124189139|124186547-124189139\t124186457-124186547\t124187791-124187832\t124189139-124191866\t5714,2458\t36893,17211'
(df.ix[idx] - dfi.ix[idx]).head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


0   -0.031657
1   -0.031657
2   -0.031657
3   -0.031657
4   -0.031657
Name: chr10|+|124186547-124187791|124187832-124189139|124186547-124189139\t124186457-124186547\t124187791-124187832\t124189139-124191866\t5714,2458\t36893,17211, dtype: float64

In [274]:
subtracted.ix[idx].head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


0   -0.031657
1   -0.031657
2   -0.031657
3   -0.031657
4   -0.031657
Name: chr10|+|124186547-124187791|124187832-124189139|124186547-124189139\t124186457-124186547\t124187791-124187832\t124189139-124191866\t5714,2458\t36893,17211, dtype: float64

In [281]:
new_series = []
pseudocount = 0.23389345358
for v in subtracted.ix[idx]:
    if v < 0:
        new_series.append(v-pseudocount)
    else:
        new_series.append(v+pseudocount)
pd.Series(new_series).abs().sum()

880.74100861151169

In [282]:
subtracted.ix[idx].div(pd.Series(new_series).abs().sum())

0      -0.000036
1      -0.000036
2      -0.000036
3      -0.000036
4      -0.000036
5      -0.000036
6      -0.000036
7      -0.000036
8      -0.000302
9      -0.000302
10     -0.000302
11     -0.000302
12     -0.000302
13     -0.000302
14     -0.000302
15     -0.000036
16     -0.000036
17     -0.000036
18     -0.000036
19     -0.000036
20     -0.000036
21     -0.000036
22     -0.000036
23     -0.000036
24     -0.000036
25     -0.000036
26     -0.000036
27     -0.000036
28     -0.000036
29     -0.000036
          ...   
1370    0.000531
1371    0.000531
1372    0.000531
1373    0.000531
1374    0.000531
1375    0.000266
1376    0.000266
1377    0.000266
1378    0.000266
1379    0.000266
1380    0.000000
1381    0.000000
1382    0.000000
1383    0.000000
1384    0.000000
1385   -0.000302
1386   -0.000302
1387   -0.000905
1388   -0.000905
1389   -0.000905
1390   -0.000905
1391   -0.001206
1392   -0.001206
1393   -0.001508
1394   -0.001242
1395   -0.001242
1396   -0.001242
1397   -0.0012

In [277]:
pdf.ix[idx]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


0      -0.000036
1      -0.000036
2      -0.000036
3      -0.000036
4      -0.000036
5      -0.000036
6      -0.000036
7      -0.000036
8      -0.000302
9      -0.000302
10     -0.000302
11     -0.000302
12     -0.000302
13     -0.000302
14     -0.000302
15     -0.000036
16     -0.000036
17     -0.000036
18     -0.000036
19     -0.000036
20     -0.000036
21     -0.000036
22     -0.000036
23     -0.000036
24     -0.000036
25     -0.000036
26     -0.000036
27     -0.000036
28     -0.000036
29     -0.000036
          ...   
1370    0.000531
1371    0.000531
1372    0.000531
1373    0.000531
1374    0.000531
1375    0.000266
1376    0.000266
1377    0.000266
1378    0.000266
1379    0.000266
1380    0.000000
1381    0.000000
1382    0.000000
1383    0.000000
1384    0.000000
1385   -0.000302
1386   -0.000302
1387   -0.000905
1388   -0.000905
1389   -0.000905
1390   -0.000905
1391   -0.001206
1392   -0.001206
1393   -0.001508
1394   -0.001242
1395   -0.001242
1396   -0.001242
1397   -0.0012

In [284]:
def get_rowtype(row):
    print(type(row))
    
df.head().apply(get_rowtype, axis=1)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


chr10|+|111890244-111892062|111892158-111893083|111890244-111893083\t111890120-111890244\t111892062-111892158\t111893083-111893969\t1877,1526\t19425,11386    None
chr10|+|124186547-124187791|124187832-124189139|124186547-124189139\t124186457-124186547\t124187791-124187832\t124189139-124191866\t5714,2458\t36893,17211    None
chr10|+|79797062-79799961|79799983-79800372|79797062-79800372\t79796951-79797062\t79799961-79799983\t79800372-79800455\t48407,32190\t438372,256983            None
chr10|+|88930731-88935645|88935852-88939831|88930731-88939831\t88930596-88930731\t88935645-88935852\t88939831-88940060\t3756,1002\t29549,7742                 None
chr10|+|93024277-93031394|93031454-93038025|93024277-93038025\t93024187-93024277\t93031394-93031454\t93038025-93044088\t2525,808\t19336,5297                  None
dtype: object

In [288]:
df = pd.DataFrame(
    [[0/10., 1/10., 2/10., 3/10.],
     [3/17., 4/17., 5/17., -1/17.],
     [0/4., 0/4., 0/4., 0/4.]]
)
df

Unnamed: 0,0,1,2,3
0,0.0,0.1,0.2,0.3
1,0.176471,0.235294,0.294118,-0.058824
2,0.0,0.0,0.0,0.0


In [302]:
df = pd.DataFrame(
        [[0, 1, 2, 3, np.nan]])


In [305]:
def has_negative_values(df):
    """
    Checks a dataframe for negative values. Return True if
    there are negative values, return False otherwise.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------

    """
    print(df, df[(df < 0).any(axis=1)].shape)
    return df[(df < 0).any(axis=1)].shape[0] > 0

has_negative_values(df)

(   0  1  2  3   4
0  0  1  2  3 NaN, (0, 5))


False

In [306]:
df = pd.DataFrame(
    [
        [2,1],
        [1,1],
        [1,1],
        [-1,1],
        [1,1],
        [1,1],
        [1,1],
        [1,1],
        [1,1],
        [1,1]
    ]
)

In [314]:
df.mean()

0    0.9
1    1.0
dtype: float64

In [315]:
2+1+1+1+1+1+1+1+1+1-1

10

In [317]:
df.mean()

0    0.9
1    1.0
dtype: float64