# Suite of tests for the normalization function we will be using for making RBP maps. 

In [7]:
import pandas as pd
import numpy as np
import os

In [8]:
'''
Created on May 3, 2016

@author: Gabe
'''
import pyBigWig
import pysam
import numpy as np

class ReadDensity():
    """
    ReadDensity class
    Attributes:
        self.pos(positive *.bw file)
        self.neg(negative *.bw file)
    """
    def __init__(self, pos, neg, name = None, bam = None):
        try:
            self.pos = pyBigWig.open(pos)
            self.neg = pyBigWig.open(neg)
            self.name = name if name is not None else pos.replace('pos','*').replace('neg','*')
            print(bam)
            self.bam = pysam.AlignmentFile(bam)
        except Exception as e:
            print("couldn't open the bigwig files!")
            print(e)
            return 1
    
    def pseudocount(self):
        print(self.bam.count())
        pseudocount = 1000000.0/self.bam.count()
        print('pseudocount: {}'.format(pseudocount))
        return pseudocount
        
    def values(self, chrom, start, end, strand):
        """
        Given a chromosome coordinate, return a list of values
        pertaining to the rbpmaps over each nucleotide position.
        Reverse the list if going in the negative strand.
        
        Args:
            chrom (str): (eg. chr1)
            start (int): 0-based start (first position in chromosome is 0)
            end (int): 1-based end (last position is not included)
            strand (char): either '+' or '-'
        """
        try:
            if strand == "+":
                return self.pos.values(chrom, start, end)
            elif strand == "-":
                return list(reversed(self.neg.values(chrom, start, end)))
            else:
                raise("Strand neither + or -")
        except RuntimeError:
            # usually occurs when no chromosome exists in the bigwig file
            return [np.NaN]*abs(start-end)
        

In [9]:
#!/usr/local/bin/python2.7
# encoding: utf-8
'''
Created on May 3, 2016

@author: brianyee
'''
import pandas as pd
import itertools


def multiply(n):
    # type: (int) -> list
    '''
    Multiplies n by 100: (e.g. n = 5, returns [5,5,5, (x100), 5]
    '''
    return [n]*100

def rename_index(interval_name):
    # type: (str) -> str
    '''
    Reformats a BedTool Interval name into a non-tabbed format.
    '''
    chrom, start, end, name, score, strand = str(interval_name).strip().split('\t')
    return "{}:{}-{}:{}:{}".format(chrom, start, end, name, strand)

def get_scale(wiggle):
    # type: (Series) -> Series
    '''
    Returns a wiggle of any N that is divisible by 100.
    
    '''
    if(len(wiggle)==100): # no need to do any calculating.
        return wiggle
    elif len(wiggle) == 1:
        return pd.Series(list(itertools.chain.from_iterable([multiply(w) for w in wiggle])))
    elif len(wiggle) < 100: 
        wiggle = pd.Series(list(itertools.chain.from_iterable([multiply(w) for w in wiggle])))
        
    dist = [0]*100
    x = 0
    step = 0.01
    y = 0
        
    for pos, value in enumerate(wiggle):
        if(float(pos+1)/len(wiggle)) < step:
            y = y + 1
            dist[x] = dist[x] + value            
        else:
            dist[x] = dist[x] / y
                
            step = step + 0.01
            x = x + 1
            dist[x] = value
            y = 1
    dist[x] = dist[x] / y
    return(pd.Series(dist))
    
def some_range(rbp, interval, left_flank = 0, right_flank = 0):
    # type: (ReadDensity, BedTools.Interval) -> list
    if interval.strand == "+":
        wiggle = rbp.values(interval.chrom, interval.start - left_flank, interval.end + right_flank, interval.strand)
    elif interval.strand == "-":
        wiggle = rbp.values(interval.chrom, interval.start - left_flank, interval.end + right_flank, interval.strand)
    else:
        print "Strand not correct", interval.strand
        raise()
    return wiggle   

def five_prime_site(rbp,                # type: ReadDensity
                    upstream_interval,  # type: BedTools.Interval
                    interval,           # type: BedTools.Interval
                    exon_offset,        # type: int
                    intron_offset,      # type: int
                    trunc = True):      # type: boolean
    # type: (...) -> (int, list, int)
    '''
    Given an upstream exon and a focus exon, return a list of density 
    values of the surrounding 5' intron/exon boundary given 
    exon_offset and intron_offset parameters. Also returns the 
    list of padded values which can be appended to either end of
    the returned list in order to conform to a uniform length. 
    
    Args:
        rbp: ReadDensity object containing *.pos and *.neg bigwig files
        upstream interval: Interval describing an exon/feature upstream of
            the current feature.
        interval: The focus interval/exon.
        exon_offset: the number of nt from the 5' Exon boundary into the exon.
        intron_offset: the number of nt from the 5' Exon boundary into the intron.
        trunc: if trunc is True, then consider instances where 
            exon_offset > length of the exon.
    Returns: 
        fivep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of upstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                fivep_pad = 3: NNN1111111
        wiggle: list of densities given a region.
        threep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of downstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                threep_pad = 3: 1111111NNN
    '''
    exon = exon_offset
    intron = intron_offset
    
    fivep_pad = 0
    threep_pad = 0
    # [    ]-----|-----[2  |  |  8]-----|----[10   15]
    if interval.strand == "+":
        if(trunc == True):
            if interval.start + exon_offset > interval.end:
                # middle = int((interval.end + interval.start)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                threep_pad = exon - exon_offset
            if interval.start - intron_offset < upstream_interval.end:
                intron_offset = interval.start - upstream_interval.end
                # middle = int((interval.start + upstream_interval.end)/2)
                # intron_offset = interval.start - middle
                fivep_pad = intron - intron_offset
        wiggle = rbp.values(interval.chrom, (interval.start - intron_offset), (interval.start + exon_offset), interval.strand)
    elif interval.strand == "-":
        if(trunc == True):
            if interval.end - exon_offset < interval.start:
                # middle = int((interval.start + interval.end)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                threep_pad = exon - exon_offset
            if interval.end + intron_offset > upstream_interval.start:
                intron_offset = upstream_interval.start - interval.end
                # middle = int((upstream_interval.start + interval.end)/2)
                # intron_offset = upstream_interval.start - middle
                fivep_pad = intron - intron_offset
                
        wiggle = rbp.values(interval.chrom, (interval.end - exon_offset), (interval.end + intron_offset), interval.strand)
    return fivep_pad, wiggle, threep_pad

def three_prime_site(rbp,                   # type: ReadDensity
                     downstream_interval,   # type: BedTools.Interval
                     interval,              # type: BedTools.Interval
                     exon_offset,           # type: int
                     intron_offset,         # type: int
                     trunc = True):         # type: Boolean
    # [      ]-----|-----[   |   ]-----|----[   ]
    # type: (...) -> (int, list, int)
    '''
    Given an downstream exon and a focus exon, return a list of density 
    values of the surrounding 3' intron/exon boundary given 
    exon_offset and intron_offset parameters. Also returns the 
    list of padded values which can be appended to either end of
    the returned list in order to conform to a uniform length. 
    
    Args:
        rbp: ReadDensity object containing *.pos and *.neg bigwig files
        upstream interval: Interval describing an exon/feature upstream of
            the current feature.
        interval: The focus interval/exon.
        exon_offset: the number of nt from the 5' Exon boundary into the exon.
        intron_offset: the number of nt from the 5' Exon boundary into the intron.
        trunc: if trunc is True, then consider instances where 
            exon_offset > length of the exon.
    Returns: 
        fivep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of upstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                fivep_pad = 3: NNN1111111
        wiggle: list of densities given a region.
        threep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of downstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                threep_pad = 3: 1111111NNN
    '''
    exon = exon_offset
    intron = intron_offset
    
    fivep_pad = 0
    threep_pad = 0
    
    if interval.strand == "+":
        if(trunc == True):
            if interval.end - exon_offset < interval.start:
                # middle = int((interval.start + interval.end)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                fivep_pad = exon - exon_offset
            if interval.end + intron_offset > downstream_interval.start:
                # middle = int((interval.end + downstream_interval.start)/2)
                # intron_offset = downstream_interval.start - middle
                intron_offset = downstream_interval.start - interval.end
                threep_pad = intron - intron_offset
        wiggle = rbp.values(interval.chrom, interval.end - exon_offset, interval.end + intron_offset, interval.strand)
    elif interval.strand == "-":
        if(trunc == True):
            if interval.start + exon_offset > interval.end:
                # middle = int((interval.start + interval.end)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                fivep_pad = exon - exon_offset
            if interval.start - intron_offset < downstream_interval.end:
                # middle = int((interval.start + downstream_interval.end)/2)
                # intron_offset = interval.start - middle
                intron_offset = interval.start - downstream_interval.end
                threep_pad = intron - intron_offset
        wiggle = rbp.values(interval.chrom, interval.start - intron_offset, interval.start + exon_offset, interval.strand)
    return fivep_pad, wiggle, threep_pad

In [63]:
def clean(density):
    """
    These functions expect a dataframe with density values (columns)
    across a number of regions (rows). These dataframes may also contain
    information regarding premature boundaries for each region (marked as -1)
    and no-density regions (marked by nan). This cleans the dataframe.
    """
    density = density.fillna(0) # NaNs are regions which contain zero density
    return density.replace(-1, np.nan) # -1 are regions which should not be counted at all

def normalize_and_per_region_subtract(density, input_density, 
                                      pseudocount, ipseudocount, 
                                      min_density_threshold = 0):
    """
    Normalizes ip matrix of m x n (where m is the row of each event in a feature,
    and n is the column relating to nucleotide position). 
    """
    # logger.info("Starting normalization (per region subtraction)")
    df_indices = density.index
    dfi_indices = input_density.index
    missing = set(df_indices) - set(dfi_indices)
    
    input_density = input_density.append(input_density.ix[missing])
    
    pdf = calculate_pdf(density, pseudocount, min_density_threshold)
    pdfi = calculate_pdf(input_density, ipseudocount, min_density_threshold)
    subtracted = pdf.sub(pdfi)
    # logger.info("Starting normalization (per region subtraction)")
    return pdf, pdfi, subtracted

def calculate_pdf(density, pseudocount = None, min_density_threshold = 0):
    """
    Calculates the PDF of a density matrix.
    Logic:
    
    Args: 
        density (pandas.DataFrame) : r x c matrix of densities. May contain
            NaN corresponding to values in which no density was returned.
            These values should be counted.
            May contain -1 corresponding to values in which a particular
            region is shorter than the full DataFrame length. These 
            values should not be counted.
        min_density_threshold (integer) : minimum total density across
            a row. (Deprecated - may be removed in the future)
    
    Returns:
        pdf (pandas.DataFrame) : r x c matrix of densities normalized
            across each respective (r)ow as a probability density func.
    """
    df = clean(density)
    min_read = pseudocount if pseudocount else min([item for item in df.unstack().values if item > 0])
    
    
    df = df + min_read
    
    pdf = df.div(df.sum(axis=1), axis=0)
    return pdf # , mean, sem

# Test case 1: should be zero
 - df contains uniform density across each position so should be normalized evenly during pdf calculations
 - dfi contains uniform density across each position, normalized evenly
 - subtraction for each region should yield zero

In [64]:
ps = 1
psi = 1
df = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0]])
dfi = pd.DataFrame([[0.1,0.1,0.1],[.9,.9,.9],[1.9,1.9,1.9],[1.9,1.9,1.9]])

In [65]:
df

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0


In [66]:
dfi

Unnamed: 0,0,1,2
0,0.1,0.1,0.1
1,0.9,0.9,0.9
2,1.9,1.9,1.9
3,1.9,1.9,1.9


In [67]:
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [68]:
pdf

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333


In [69]:
pdfi

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333


In [70]:
# wat
pdf - pdfi

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,-5.5511150000000004e-17,-5.5511150000000004e-17,-5.5511150000000004e-17
2,-5.5511150000000004e-17,-5.5511150000000004e-17,-5.5511150000000004e-17
3,-5.5511150000000004e-17,-5.5511150000000004e-17,-5.5511150000000004e-17


# Test case 1: if input has more regions than ip
- add minimum PDF
- should be zero

In [71]:
ps = 1
psi = 1
testip = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0]])
testinput = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0],[4,4,4]])

In [72]:
testip

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0


In [73]:
testinput

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0
4,4.0,4.0,4.0


In [75]:
ip_pdf, input_pdf, sub = normalize_and_per_region_subtract(testip, testinput, ps, psi, 0)

In [76]:
ip_pdf

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333


In [77]:
input_pdf

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333
4,0.333333,0.333333,0.333333


In [78]:
# the final matrix
sub

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,,,


# Test case 2: when there are more ip regions than input
- should return zeroes if the extra region is flat, otherwise imputed input region should be subtracted from ip
- the ip region density will be normalized to even probability density
- the minimum probability density will be added to the input
- subtraction should be zero

In [80]:
ps = 1
psi = 1
df = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0],[4,4,4]])
dfi = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0]])

In [81]:
# test ip matrix
df

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0
4,4.0,4.0,4.0


In [82]:
# test input matrix
dfi

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0


In [84]:
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [85]:
# ip pdf
pdf

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333
4,0.333333,0.333333,0.333333


In [86]:
# input pdf
pdfi

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333
4,0.333333,0.333333,0.333333


In [87]:
sub

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


# Test case 3: if there is a NaN in the IP

In [88]:
ps = 1
psi = 1
df = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0]])
dfi = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0]])

In [89]:
# ip matrix
df

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,


In [90]:
# input matrix
dfi

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0


In [91]:
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [92]:
# ip pdf
pdf

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.444444,0.444444,0.111111


In [93]:
# input pdf
pdfi

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333


In [94]:
# final matrix
sub

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.111111,0.111111,-0.222222


# Test case 4: if there is a NaN in the Input

In [95]:
ps = 1
psi = 1
df = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0]])
dfi = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0]])

In [96]:
# test ip
df

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0


In [97]:
# test input
dfi

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,


In [98]:
# apply function to get the pdf, input pdf (pdfi) and subtracted matrix (for which we'll take the mean of)
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [99]:
pdf

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333


In [100]:
pdfi

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.444444,0.444444,0.111111


In [101]:
sub

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,-0.111111,-0.111111,0.222222


# Test case 5: when both have nans
- count them both (result will be flat)

In [102]:
ps = 1
psi = 1
df = pd.DataFrame([[0.0,2.0,4.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0,3.0],[]])
dfi = pd.DataFrame([[0.0,0.0,0.0],[1.0,1.0,1.0],[2.0,2.0,2.0],[3.0,3.0],[]])
df

Unnamed: 0,0,1,2
0,0.0,2.0,4.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,3.0
4,,,


In [103]:
dfi

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,1.0,1.0,1.0
2,2.0,2.0,2.0
3,3.0,3.0,
4,,,


In [104]:
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [105]:
pdf

Unnamed: 0,0,1,2
0,0.111111,0.333333,0.555556
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333
4,0.333333,0.333333,0.333333


In [106]:
pdfi

Unnamed: 0,0,1,2
0,0.333333,0.333333,0.333333
1,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333
3,0.444444,0.444444,0.111111
4,0.333333,0.333333,0.333333


In [107]:
sub

Unnamed: 0,0,1,2
0,-0.222222,0.0,0.222222
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,-0.111111,-0.111111,0.222222
4,0.0,0.0,0.0


# Test Case 6: short region
- when the exon region runs over the edge of the exon start

In [108]:
ps = 1
psi = 1
df = pd.DataFrame([[-1.0,0.0,0.0,4.0],[1.0,1.0,1.0,1.0],[2.0,2.0,2.0,2.0],[3.0,3.0,3.0,3.0]])
dfi = pd.DataFrame([[-1.0,0.0,0.0,4.0],[1.0,1.0,1.0,1.0],[2.0,2.0,2.0,2.0],[3.0,3.0,3.0,3.0]])
df

Unnamed: 0,0,1,2,3
0,-1.0,0.0,0.0,4.0
1,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0


In [109]:
dfi

Unnamed: 0,0,1,2,3
0,-1.0,0.0,0.0,4.0
1,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0


In [110]:
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [111]:
pdf

Unnamed: 0,0,1,2,3
0,,0.142857,0.142857,0.714286
1,0.25,0.25,0.25,0.25
2,0.25,0.25,0.25,0.25
3,0.25,0.25,0.25,0.25


In [112]:
pdfi

Unnamed: 0,0,1,2,3
0,,0.142857,0.142857,0.714286
1,0.25,0.25,0.25,0.25
2,0.25,0.25,0.25,0.25
3,0.25,0.25,0.25,0.25


In [113]:
sub

Unnamed: 0,0,1,2,3
0,,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0


# Test Case 7: short region2
- short region at boundary

In [114]:
ps = 1
psi = 1
df = pd.DataFrame([[-1.0,2.0,4.0,-1.0],[1.0,1.0,1.0,1.0],[2.0,2.0,2.0,2.0],[3.0,3.0,3.0,3.0]])
dfi = pd.DataFrame([[-1.0,2.0,4.0,-1.0],[1.0,1.0,1.0,1.0],[2.0,2.0,2.0,2.0],[3.0,3.0,3.0,3.0]])
df

Unnamed: 0,0,1,2,3
0,-1.0,2.0,4.0,-1.0
1,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0


In [115]:
dfi

Unnamed: 0,0,1,2,3
0,-1.0,2.0,4.0,-1.0
1,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0


In [116]:
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [117]:
pdf

Unnamed: 0,0,1,2,3
0,,0.375,0.625,
1,0.25,0.25,0.25,0.25
2,0.25,0.25,0.25,0.25
3,0.25,0.25,0.25,0.25


In [118]:
pdfi

Unnamed: 0,0,1,2,3
0,,0.375,0.625,
1,0.25,0.25,0.25,0.25
2,0.25,0.25,0.25,0.25
3,0.25,0.25,0.25,0.25


In [119]:
sub

Unnamed: 0,0,1,2,3
0,,0.0,0.0,
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0


# Test Case 8: short region3
- two short region simulation

In [120]:
ps = 1
psi = 1
df = pd.DataFrame([[-1.0,2.0,4.0,-1.0, 5.0],[1.0,1.0,1.0,1.0],[2.0,2.0,2.0,2.0],[3.0,3.0,3.0,3.0]])
dfi = pd.DataFrame([[-1.0,2.0,4.0,-1.0, 1.0],[1.0,1.0,1.0,1.0],[2.0,2.0,2.0,2.0],[3.0,3.0,3.0,3.0]])
df

Unnamed: 0,0,1,2,3,4
0,-1.0,2.0,4.0,-1.0,5.0
1,1.0,1.0,1.0,1.0,
2,2.0,2.0,2.0,2.0,
3,3.0,3.0,3.0,3.0,


In [121]:
dfi

Unnamed: 0,0,1,2,3,4
0,-1.0,2.0,4.0,-1.0,1.0
1,1.0,1.0,1.0,1.0,
2,2.0,2.0,2.0,2.0,
3,3.0,3.0,3.0,3.0,


In [122]:
pdf, pdfi, sub = normalize_and_per_region_subtract(df, dfi, ps, psi, 0)

In [123]:
pdf

Unnamed: 0,0,1,2,3,4
0,,0.214286,0.357143,,0.428571
1,0.222222,0.222222,0.222222,0.222222,0.111111
2,0.230769,0.230769,0.230769,0.230769,0.076923
3,0.235294,0.235294,0.235294,0.235294,0.058824


In [124]:
pdfi

Unnamed: 0,0,1,2,3,4
0,,0.3,0.5,,0.2
1,0.222222,0.222222,0.222222,0.222222,0.111111
2,0.230769,0.230769,0.230769,0.230769,0.076923
3,0.235294,0.235294,0.235294,0.235294,0.058824


In [125]:
sub

Unnamed: 0,0,1,2,3,4
0,,-0.085714,-0.142857,,0.228571
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0


# Test Case ... something different between eric and my version

In [126]:
line_in_question = '7616	ENSG00000164877.14	MICALL2	chr7	-	1474736	1474783	1474001	1474308	1476377	1476492	7616	148,73	3,0	66,42	8,6	146	100	0.000151433567852	0.0261605473558	0.971,1.0	0.85,0.827	0.147'
df = pd.read_table('/home/bay001/projects/maps_20160420/analysis/tests/included.ip.204_01_RBFOX2.included.feature.se.raw_density_matrix.csv',
                  sep=',',index_col=0)
dfi = pd.read_table('/home/bay001/projects/maps_20160420/analysis/tests/included.input.204_01_RBFOX2.included.feature.se.raw_density_matrix.csv',
                   sep=',',index_col=0)

In [170]:
values = dfi.ix[line_in_question]
values

0       0.26555
1       0.26555
2       0.26555
3       0.26555
4       0.26555
5       0.26555
6       0.26555
7       0.26555
8       0.26555
9       0.26555
10      0.26555
11      0.00000
12      0.00000
13      0.00000
14      0.00000
15      0.00000
16      0.00000
17      0.00000
18      0.00000
19      0.00000
20      0.26555
21      0.26555
22      0.26555
23      0.26555
24      0.26555
25      0.26555
26      0.26555
27      0.26555
28      0.26555
29      0.26555
         ...   
1370    0.00000
1371    0.00000
1372    0.00000
1373    0.00000
1374    0.00000
1375    0.00000
1376    0.00000
1377    0.00000
1378    0.00000
1379    0.00000
1380    0.00000
1381    0.00000
1382    0.00000
1383    0.00000
1384    0.00000
1385    0.00000
1386    0.00000
1387    0.00000
1388    0.00000
1389    0.00000
1390    0.00000
1391    0.00000
1392    0.00000
1393    0.00000
1394    0.00000
1395    0.00000
1396    0.00000
1397    0.00000
1398    0.00000
1399    0.00000
Name: 7616\tENSG00000164

In [147]:
values[values ==-1] # looks right... the skipped exon is 47nt long

697   -1.0
698   -1.0
699   -1.0
700   -1.0
701   -1.0
702   -1.0
Name: 7616\tENSG00000164877.14\tMICALL2\tchr7\t-\t1474736\t1474783\t1474001\t1474308\t1476377\t1476492\t7616\t148,73\t3,0\t66,42\t8,6\t146\t100\t0.000151433567852\t0.0261605473558\t0.971,1.0\t0.85,0.827\t0.147, dtype: float64

In [148]:
# make sure the -1s are being removed properly, and they are:
cleaned = clean(dfi)
cleanedvalues = cleaned.ix[line_in_question]
cleanedvalues[cleanedvalues ==-1] # so we know that the -1 are removed...

Series([], Name: 7616\tENSG00000164877.14\tMICALL2\tchr7\t-\t1474736\t1474783\t1474001\t1474308\t1476377\t1476492\t7616\t148,73\t3,0\t66,42\t8,6\t146\t100\t0.000151433567852\t0.0261605473558\t0.971,1.0\t0.85,0.827\t0.147, dtype: float64)

In [149]:
# this matches what eric sees in b
# pseudocount to add: 0.23389345358
pdf = calculate_pdf(dfi, 0.265550504785, 0)
pdfvalues = pdf.ix[line_in_question]
pdfvalues.head()

0    0.000895
1    0.000895
2    0.000895
3    0.000895
4    0.000895
Name: 7616\tENSG00000164877.14\tMICALL2\tchr7\t-\t1474736\t1474783\t1474001\t1474308\t1476377\t1476492\t7616\t148,73\t3,0\t66,42\t8,6\t146\t100\t0.000151433567852\t0.0261605473558\t0.971,1.0\t0.85,0.827\t0.147, dtype: float64

In [154]:
# Step 1: add min read number to cleaned values.
cleanedvalues = cleaned.ix[line_in_question]
added = cleanedvalues + 0.265550504785

In [155]:
# Step 2: Get the sum of these values
added.sum()

593.50537238695597

In [163]:
# Step 3: divide each value by this sum
added/(added.sum())

0       0.000895
1       0.000895
2       0.000895
3       0.000895
4       0.000895
5       0.000895
6       0.000895
7       0.000895
8       0.000895
9       0.000895
10      0.000895
11      0.000447
12      0.000447
13      0.000447
14      0.000447
15      0.000447
16      0.000447
17      0.000447
18      0.000447
19      0.000447
20      0.000895
21      0.000895
22      0.000895
23      0.000895
24      0.000895
25      0.000895
26      0.000895
27      0.000895
28      0.000895
29      0.000895
          ...   
1370    0.000447
1371    0.000447
1372    0.000447
1373    0.000447
1374    0.000447
1375    0.000447
1376    0.000447
1377    0.000447
1378    0.000447
1379    0.000447
1380    0.000447
1381    0.000447
1382    0.000447
1383    0.000447
1384    0.000447
1385    0.000447
1386    0.000447
1387    0.000447
1388    0.000447
1389    0.000447
1390    0.000447
1391    0.000447
1392    0.000447
1393    0.000447
1394    0.000447
1395    0.000447
1396    0.000447
1397    0.0004

# Looks okay...

In [195]:
eric = pd.read_table('/home/elvannostrand/data/clip/CLIPseq_analysis/scripts/splicing_maps/del_raw_input',sep='\t',
                    names=range(0,4))
values = pd.Series(eric.ix[3][3].split(';')).fillna(0)
values = values.astype(float)

In [197]:
pseudocount = values + .265550504785
pseudocount.sum()

593.50517167028988

In [200]:
values/values.sum()

0       0.001189
1       0.001189
2       0.001189
3       0.001189
4       0.001189
5       0.001189
6       0.001189
7       0.001189
8       0.001189
9       0.001189
10      0.001189
11      0.000000
12      0.000000
13      0.000000
14      0.000000
15      0.000000
16      0.000000
17      0.000000
18      0.000000
19      0.000000
20      0.001189
21      0.001189
22      0.001189
23      0.001189
24      0.001189
25      0.001189
26      0.001189
27      0.001189
28      0.001189
29      0.001189
          ...   
1370    0.000000
1371    0.000000
1372    0.000000
1373    0.000000
1374    0.000000
1375    0.000000
1376    0.000000
1377    0.000000
1378    0.000000
1379    0.000000
1380    0.000000
1381    0.000000
1382    0.000000
1383    0.000000
1384    0.000000
1385    0.000000
1386    0.000000
1387    0.000000
1388    0.000000
1389    0.000000
1390    0.000000
1391    0.000000
1392    0.000000
1393    0.000000
1394    0.000000
1395    0.000000
1396    0.000000
1397    0.0000