In [7]:
%load_ext memory_profiler

import pandas as pd
import numpy as np
import logging
import os
import memory_profiler
outdir = '/home/bay001/projects/maps_20160420/analysis/tests/'
logger = logging.getLogger('plot_features')
logger.setLevel(logging.INFO)
ih = logging.FileHandler(os.path.join(outdir,'log.txt'))
eh = logging.FileHandler(os.path.join(outdir,'log.err'))
ih.setLevel(logging.INFO)
eh.setLevel(logging.ERROR)
logger.addHandler(ih)
logger.addHandler(eh)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ih.setFormatter(formatter)
eh.setFormatter(formatter)
logger.info("starting program")

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


INFO:plot_features:starting program


In [8]:
'''
Created on Sep 21, 2016

@author: brian
'''
import pybedtools as bt

class Feature():
    '''
    classdocs
    '''


    def __init__(self, annotation, source):
        '''
        Constructor
        '''
        self.annotation = annotation.rstrip()
        self.source = source

    def get_bedtool(self):
        if(self.source == 'bed'):
            chrom, start, end, name, score, strand = self.annotation.split('\t')
        return bt.create_interval_from_list([chrom,
                                             start,
                                             end,
                                             name,
                                             score,
                                             strand])

class SkippedExonFeature():
    def __init__(self, annotation, source):
        self.source = source
        self.annotation = annotation.rstrip()
    def get_bedtools(self):
        if(self.source == 'miso'):
            event = self.annotation.split('\t')[0]
            up, se, down = event.split('@')
            
            chrom, start, stop, strand = up.split(':')
            up = bt.create_interval_from_list([chrom, int(start)-1, stop, '0', '0', strand])
            
            chrom, start, stop, strand = se.split(':')
            se = bt.create_interval_from_list([chrom, int(start)-1, stop, '0', '0', strand])
            
            chrom, start, stop, strand = down.split(':')
            down = bt.create_interval_from_list([chrom, int(start)-1, stop, '0', '0', strand])
        elif(self.source == 'hta2_0'):
            pass
        elif(self.source == 'xintao'):
            pass
        elif(self.source == 'eric'):
            name, se = self.annotation.split(';')
            xintao, ericleft, ericright = se.split('||')
            upstream_es = 1
            downstream_es = 250000000
            if("Not_found") not in ericleft:
                upstream_es = ericleft.split(':')[2].split('-')[0]
            if("Not_found") not in ericright:
                downstream_ee = ericright.split(':')[2].split('-')[1]
            
            event, chrom, upstream, downstream, strand = xintao.split(':')
            upstream_ee, skipped_es = upstream.split('-')
            skipped_ee, downstream_es = downstream.split('-')
            
            se = bt.create_interval_from_list([chrom, skipped_es, skipped_ee, '0', '0', strand])
            if(strand == '+'):
                up = bt.create_interval_from_list([chrom, upstream_es, upstream_ee, '0', '0', strand])
                down = bt.create_interval_from_list([chrom, downstream_es, downstream_ee, '0', '0', strand])
            elif(strand == '-'):
                up = bt.create_interval_from_list([chrom, downstream_es, downstream_ee, '0', '0', strand])
                down = bt.create_interval_from_list([chrom, upstream_es, upstream_ee, '0', '0', strand])
        elif(self.source == 'rmats'):
            id, GeneID, geneSymbol, chrom, strand, \
            exonStart_0base, exonEnd, \
            upstreamES, upstreamEE, \
            downstreamES, downstreamEE, \
            ID1, IJC_SAMPLE_1, SJC_SAMPLE_1, \
            IJC_SAMPLE_2, SJC_SAMPLE_2, \
            IncFormLen, SkipFormLen, PValue, \
            FDR, IncLevel1, IncLevel2, IncLevelDifference = self.annotation.split('\t')
            
            se = bt.create_interval_from_list([chrom, exonStart_0base, exonEnd, '0', '0', strand])
            if(strand == '+'):
                up = bt.create_interval_from_list([chrom, upstreamES, upstreamEE, '0', '0', strand])
                down = bt.create_interval_from_list([chrom, downstreamES, downstreamEE, '0', '0', strand])
            elif(strand == '-'):
                down = bt.create_interval_from_list([chrom, upstreamES, upstreamEE, '0', '0', strand])
                up = bt.create_interval_from_list([chrom, downstreamES, downstreamEE, '0', '0', strand])
            else:
                print("Warning, strand not correct!")
                return -1
        return up, se, down

In [9]:

def five_prime_site(rbp,                # type: ReadDensity
                    upstream_interval,  # type: BedTools.Interval
                    interval,           # type: BedTools.Interval
                    exon_offset,        # type: int
                    intron_offset,      # type: int
                    trunc = True):      # type: boolean
    # type: (...) -> (int, list, int)
    '''
    Given an upstream exon and a focus exon, return a list of density 
    values of the surrounding 5' intron/exon boundary given 
    exon_offset and intron_offset parameters. Also returns the 
    list of padded values which can be appended to either end of
    the returned list in order to conform to a uniform length. 
    
    Args:
        rbp: ReadDensity object containing *.pos and *.neg bigwig files
        upstream interval: Interval describing an exon/feature upstream of
            the current feature.
        interval: The focus interval/exon.
        exon_offset: the number of nt from the 5' Exon boundary into the exon.
        intron_offset: the number of nt from the 5' Exon boundary into the intron.
        trunc: if trunc is True, then consider instances where 
            exon_offset > length of the exon.
    Returns: 
        fivep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of upstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                fivep_pad = 3: NNN1111111
        wiggle: list of densities given a region.
        threep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of downstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                threep_pad = 3: 1111111NNN
    '''
    exon = exon_offset
    intron = intron_offset
    
    fivep_pad = 0
    threep_pad = 0
    # [    ]-----|-----[2  |  |  8]-----|----[10   15]
    if interval.strand == "+":
        if(trunc == True):
            if interval.start + exon_offset > interval.end:
                # middle = int((interval.end + interval.start)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                threep_pad = exon - exon_offset
            if interval.start - intron_offset < upstream_interval.end:
                intron_offset = interval.start - upstream_interval.end
                # middle = int((interval.start + upstream_interval.end)/2)
                # intron_offset = interval.start - middle
                fivep_pad = intron - intron_offset
        wiggle = rbp.values(interval.chrom, (interval.start - intron_offset), (interval.start + exon_offset), interval.strand)
    elif interval.strand == "-":
        if(trunc == True):
            if interval.end - exon_offset < interval.start:
                # middle = int((interval.start + interval.end)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                threep_pad = exon - exon_offset
            if interval.end + intron_offset > upstream_interval.start:
                intron_offset = upstream_interval.start - interval.end
                # middle = int((upstream_interval.start + interval.end)/2)
                # intron_offset = upstream_interval.start - middle
                fivep_pad = intron - intron_offset
                
        wiggle = rbp.values(interval.chrom, (interval.end - exon_offset), (interval.end + intron_offset), interval.strand)
    return fivep_pad, wiggle, threep_pad

def three_prime_site(rbp,                   # type: ReadDensity
                     downstream_interval,   # type: BedTools.Interval
                     interval,              # type: BedTools.Interval
                     exon_offset,           # type: int
                     intron_offset,         # type: int
                     trunc = True):         # type: Boolean
    # [      ]-----|-----[   |   ]-----|----[   ]
    # type: (...) -> (int, list, int)
    '''
    Given an downstream exon and a focus exon, return a list of density 
    values of the surrounding 3' intron/exon boundary given 
    exon_offset and intron_offset parameters. Also returns the 
    list of padded values which can be appended to either end of
    the returned list in order to conform to a uniform length. 
    
    Args:
        rbp: ReadDensity object containing *.pos and *.neg bigwig files
        upstream interval: Interval describing an exon/feature upstream of
            the current feature.
        interval: The focus interval/exon.
        exon_offset: the number of nt from the 5' Exon boundary into the exon.
        intron_offset: the number of nt from the 5' Exon boundary into the intron.
        trunc: if trunc is True, then consider instances where 
            exon_offset > length of the exon.
    Returns: 
        fivep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of upstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                fivep_pad = 3: NNN1111111
        wiggle: list of densities given a region.
        threep_pad: if the desired wiggle length is X but the returned wiggle 
            does not span the entire length, return N where N is the number
            of downstream positions that will need to be filled for len(wiggle)=X.
            E.G. exon_offset+intron_offset = 10.
                threep_pad = 3: 1111111NNN
    '''
    exon = exon_offset
    intron = intron_offset
    
    fivep_pad = 0
    threep_pad = 0
    
    if interval.strand == "+":
        if(trunc == True):
            if interval.end - exon_offset < interval.start:
                # middle = int((interval.start + interval.end)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                fivep_pad = exon - exon_offset
            if interval.end + intron_offset > downstream_interval.start:
                # middle = int((interval.end + downstream_interval.start)/2)
                # intron_offset = downstream_interval.start - middle
                intron_offset = downstream_interval.start - interval.end
                threep_pad = intron - intron_offset
        wiggle = rbp.values(interval.chrom, interval.end - exon_offset, interval.end + intron_offset, interval.strand)
    elif interval.strand == "-":
        if(trunc == True):
            if interval.start + exon_offset > interval.end:
                # middle = int((interval.start + interval.end)/2)
                # exon_offset = interval.end - middle
                exon_offset = interval.end - interval.start
                fivep_pad = exon - exon_offset
            if interval.start - intron_offset < downstream_interval.end:
                # middle = int((interval.start + downstream_interval.end)/2)
                # intron_offset = interval.start - middle
                intron_offset = interval.start - downstream_interval.end
                threep_pad = intron - intron_offset
        wiggle = rbp.values(interval.chrom, interval.start - intron_offset, interval.start + exon_offset, interval.strand)
    return fivep_pad, wiggle, threep_pad

In [10]:
def clean(density):
    """
    These functions expect a dataframe with density values (columns)
    across a number of regions (rows). These dataframes may also contain
    information regarding premature boundaries for each region (marked as -1)
    and no-density regions (marked by nan). This cleans the dataframe.
    """
    density = density.fillna(0) # NaNs are regions which contain zero density
    return density.replace(-1, np.nan) # -1 are regions which should not be counted at all

def remove_outliers(rbpdataframe, conf = 0.95):
    logger.info("Removing outliers (keep {})".format(conf))
    means = list()
    sems = list()
    for key, value in rbpdataframe.iteritems():
        df = rbpdataframe[key].dropna()
        
        nums = len(df)
        droppercent = (1-conf)/2.0
        dropnum = int(nums*(droppercent))
        df = df.sort_values()
        if(dropnum>0):
            df = df[dropnum:-dropnum]
        
        means.append(df.mean())
        sems.append(df.sem())
    logger.info("Finished removing outliers (keep {})".format(conf))
    return means, sems

def normalize_and_per_region_subtract(density, input_density, 
                                      pseudocount, ipseudocount, 
                                      min_density_threshold = 0):
    """
    Normalizes ip matrix of m x n (where m is the row of each event in a feature,
    and n is the column relating to nucleotide position). 
    """
    logger.info("Starting normalization (per region subtraction)")
    df_indices = density.index
    dfi_indices = input_density.index
    missing = set(df_indices) - set(dfi_indices)
    
    input_density = input_density.append(input_density.ix[missing])
    
    pdf = calculate_pdf(density, pseudocount, min_density_threshold)
    # pdf.to_csv('/Users/brianyee/git/encode/encode/rbpmaps/testfiles/rbfox2/outputs/ip_pdf.csv')
    pdfi = calculate_pdf(input_density, ipseudocount, min_density_threshold)
    # pdfi.to_csv('/Users/brianyee/git/encode/encode/rbpmaps/testfiles/rbfox2/outputs/input_pdf.csv')
    subtracted = pdf.sub(pdfi)
    logger.info("Starting normalization (per region subtraction)")
    return subtracted

def calculate_pdf(density, pseudocount = None, min_density_threshold = 0):
    """
    Calculates the PDF of a density matrix.
    Logic:
    
    Args: 
        density (pandas.DataFrame) : r x c matrix of densities. May contain
            NaN corresponding to values in which no density was returned.
            These values should be counted.
            May contain -1 corresponding to values in which a particular
            region is shorter than the full DataFrame length. These 
            values should not be counted.
        min_density_threshold (integer) : minimum total density across
            a row. (Deprecated - may be removed in the future)
    
    Returns:
        pdf (pandas.DataFrame) : r x c matrix of densities normalized
            across each respective (r)ow as a probability density func.
    """
    df = clean(density)
    min_read = pseudocount if pseudocount else min([item for item in df.unstack().values if item > 0])

    df = df + min_read
    pdf = df.div(df.sum(axis=1), axis=0)
    return pdf # , mean, sem

In [34]:
def create_se_matrix(annotation, density, exon_offset, intron_offset, is_scaled, combine_regions=True, annotation_type="rmats"):
    """
    Creates an r x c pandas dataframe of r events for a skipped
    exon feature. An SE matrix will contain four distinct regions: 
    
    |_]----||----[__||__]----||----[_|
    
    - the [..exon_offset]--intron_offset--... 3' site of an upstream exon
    - the ...--intron_offset--[exon_offset..] 5' site of the upstream skipped exon
    - the [..exon_offset]--intron_offset--... 3' site of the downstream skipped exon
    - the ..--intron_offset--[exon_offset..] 5' site of the downstream exon
    Args:
        annotation (string) : path of file containing the annotation
        density (ReadDensity) : object containing positive and negative BigWig files
        exon_offset (integer) : how far into the exon boundary to plot
        intron_offset (integer) : how far after the exon boundary to plot
        is_scaled (boolean) : if all features are of different length, this must be true
            to resize all features to fit on a 0-100% scale.
        combine_regions (boolean) : if False, return four DataFrames instead of one.
        annotation_type (string) : may be rmats format or any additional defined format in Feature
    
    Returns:
        pandas.DataFrame : a dataframe of r events for an SE feature.
    """
    logger.info("Starting SE matrix creation [ANNOTATION:{},DENSITY:{},UP:{},DOWN:{},SCALED:{},TYPE:{}".format(
                                                                                                            annotation,
                                                                                                            density.name,
                                                                                                            exon_offset,
                                                                                                            intron_offset,
                                                                                                            is_scaled,
                                                                                                            annotation_type))
    three_upstream = {}
    five_skipped = {}
    three_skipped = {}
    five_downstream = {}
    
    with open(annotation) as f:
        for line in f:
            if not line.startswith('event_name') and not line.startswith('ID'):
                event = line.rstrip()
                upstream_interval, interval, downstream_interval = SkippedExonFeature(event,annotation_type).get_bedtools()
                
                """three prime upstream region"""
                left_pad, wiggle, right_pad = three_prime_site(density, 
                                                                        interval,
                                                                        upstream_interval,
                                                                        exon_offset,
                                                                        intron_offset)
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
        
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle) 

                three_upstream[event] = wiggle
                """five prime site of skipped region"""
                left_pad, wiggle, right_pad = five_prime_site(density, 
                                                                        upstream_interval,
                                                                        interval,
                                                                        exon_offset,
                                                                        intron_offset)
                
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle)
                five_skipped[event] = wiggle
                """three prime site of skipped region"""
                left_pad, wiggle, right_pad = three_prime_site(density, 
                                                                         downstream_interval,
                                                                         interval,
                                                                         exon_offset,
                                                                         intron_offset)
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle) #
                three_skipped[event] = wiggle
                """five prime site of downstream region"""
                left_pad, wiggle, right_pad = five_prime_site(density, 
                                                                        interval,
                                                                        downstream_interval,
                                                                        exon_offset,
                                                                        intron_offset)
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle) # convert all nans to 0
                five_downstream[event] = wiggle

        three_upstream = pd.DataFrame(three_upstream).T
        five_skipped = pd.DataFrame(five_skipped).T
        three_skipped = pd.DataFrame(three_skipped).T
        five_downstream = pd.DataFrame(five_downstream).T
    logger.info("Finished matrix creation: {}, {}, {}, {}".format(three_upstream.shape[0],
                                                                  five_skipped.shape[0],
                                                                  three_skipped.shape[0],
                                                                  five_downstream.shape[0]))
    if combine_regions == False:
        return three_upstream, five_skipped, three_skipped, five_downstream
    else:
        ra = pd.concat([three_upstream,five_skipped,three_skipped,five_downstream],axis=1)
        ra.columns = range(0,ra.shape[1])
        return ra

'\n        three_upstream = pd.DataFrame(three_upstream).T\n        five_skipped = pd.DataFrame(five_skipped).T\n        three_skipped = pd.DataFrame(three_skipped).T\n        five_downstream = pd.DataFrame(five_downstream).T\n    logger.info("Finished matrix creation: {}, {}, {}, {}".format(three_upstream.shape[0],\n                                                                  five_skipped.shape[0],\n                                                                  three_skipped.shape[0],\n                                                                  five_downstream.shape[0]))\n    if combine_regions == False:\n        return three_upstream, five_skipped, three_skipped, five_downstream\n    else:\n        ra = pd.concat([three_upstream,five_skipped,three_skipped,five_downstream],axis=1)\n        ra.columns = range(0,ra.shape[1])\n        return ra'

In [32]:
'''
Created on Jun 27, 2016

@author: brianyee
'''

# import matrix_functions as mtx
# import normalization_functions as norm
import os
import logging
import pandas as pd

class Map():
    """
    Map class
    
    Attributes:
        self.output_file (string) : output file 
            (deprecated - we use this to just get output_base instead).
        self.log_file (string) : log file 
        self.name (string) : name of the Map object 
            (deprecated - will be removed later).
        self.is_scaled (boolean) : if regions need to be scaled - 
            if features are of different length, scale them from 0-100%
        self.annotation (string) : annotation file - can be rmats, miso, or
            any file whose line is defined in Feature
        self.annotation_type (string) : annotation source - can be 
            'rmats' 'miso' or any filetype defined in Feature
        self.left (integer) : left offset 
            (deprecated - will be removed or renamed 'upstream' later).
        self.right (integer) : right offset 
            (deprecated - will be removed or renamed 'downstream' later). 
        self.exon_offset (integer) : given an exon boundary how many 
            bases 'into' the exon to plot 
            (eg. exon_offset = 3: ------[-----|---]------ if '-'=1nt)
        self.intron_offset (integer) : given an intron boundary how many 
            bases 'outside' the exon to plot 
            (eg. intron_offset = 4: ------[--------]----|-- if '-'=1nt)
        self.density (dictionary{'feature':pandas.DataFrame}) : a dictionary of  
            Pandas.DataFrames representing normed or unnormed m x n 
            matrices where m is the each event within a given feature 
            and n is the length in nucleotides.
    """
    def __init__(self, output_file,
                 name, is_scaled = False, 
                 annotation = None,
                 annotation_type = "miso",
                 left = 0, right = 0,
                 exon_offset = 50, intron_offset = 300):
        '''
        Constructor
        '''
        self.output_file = output_file
        self.output_base = os.path.splitext(output_file)[0]
        self.name = name
        self.is_scaled = is_scaled
        self.annotation = annotation
        self.annotation_type = annotation_type
        self.left = left
        self.right = right
        self.exon_offset = exon_offset
        self.intron_offset = intron_offset
        self.density = {}
        
    def normalize(self, df):
        """
        Sets the Matrix for a Map
        """
        self.density = df
    
    def get_density(self):
        """
        Returns the Matrix for a Map
        """
        return self.density

class ClipWithInput(Map):
    """
    Clip class. Represents a Clip w/ Input Map
    Attributes:
        self.ip (ReadDensity.ReadDensity) : ReadDensity of the IP 
        self.inp (ReadDenstiy.ReadDensity) : ReadDensity of the Input
        self.ip_raw_density (dictionary{'feature':pandas.DataFrame}) : a dictionary of  
            Pandas.DataFrames representing UNNORMED IP m x n 
            matrices where m is the each event within a given feature 
            and n is the length in nucleotides. Each matrix may contain more than one
            'feature', for example, one might plot both '3_UTRs' and 'Prox_Introns'
            in the same map.
        self.inp_raw_density (dictionary{'feature':pandas.DataFrame}) : a dictionary of  
            Pandas.DataFrames representing UNNORMED INPUT m x n 
            matrices where m is the each event within a given feature 
            and n is the length in nucleotides. Each matrix may contain more than one
            'feature', for example, one might plot both '3_UTRs' and 'Prox_Introns'
            in the same map.
        self.density (dictionary{'feature':pandas.DataFrame}) : a dictionary of  
            Pandas.DataFrames representing NORMED m x n 
            matrices (IP over INPUT) where m is the each event within a given feature 
            and n is the length in nucleotides. Note: Each feature will be normalized
            independently.
    """

    def __init__(self, ReadDensity, InputReadDensity, output_file,
                 name, is_scaled = False, 
                 annotation = None,
                 annotation_type = "miso",
                 left = 0, right = 0,
                 exon_offset = 50, intron_offset = 300):
        '''
        Constructor
        '''
        
        Map.__init__(self, output_file,
                     name, is_scaled, 
                     annotation,
                     annotation_type,
                     left, right,
                     exon_offset, intron_offset)
        
        self.ip = ReadDensity
        self.inp = InputReadDensity
        
        self.ip_raw_density = {}
        self.input_raw_density = {}
        
        self.maptype = ""
        
        self.density = {}
        
        self.means = list()
        self.sems = list()
        
        self.logger = logging.getLogger('plot_features.Map.ClipWithInput')
        self.logger.info('creating an instance of ClipWithDensity')
        
    def get_means(self):
        """
        Returns the mean densities as Series
        """
        return pd.Series(self.means)
    
    def get_sems(self):
        """
        Returns standard error as Series
        """
        return pd.Series(self.sems)
    
    def set_annotation(self,annotation_file):
        """
        Sets the annotation source file
        Args:
            annotation_file (string) : MISO, RMATS, or any formatted file 
                defined in Feature.py. 
        """
        self.annotation = annotation_file
    
    def reset_matrix(self):
        """
        Resets all matrices (both raw and normed ip/input) to empty dictionaries
        """
        self.ip_raw_density = {}
        self.input_raw_density = {}
        self.density = {}
            
    def normalize(self, normfunc = None, min_density_sum = 0, label = ""):
        """
        For each feature in the matrix, perform normalization
        
        Args:
            normfunc (function) : a function(pandas.DataFrame, pandas.DataFrame, min_density_sum) 
                that takes normalizes a Map's ip_raw_density DataFrame, 
                containing its IP densities, over its input_raw_density 
                DataFrame, containing its INPUT densities.
            min_density_sum (integer) : density sum cutoff for each event to be counted, 
                passed to normalization function
            label (string) : an intermediate file of this normalized matrix is created for each 
                feature in the matrix. This provides an optional secondary label, useful for
                distinguishing 'included', 'excluded', and 'background' matrices, for example.
        Writes:
            *.normed_matrix.csv : for each key (feature) in a map's density dictionary, 
                write the full contents of the normalized density matrix.
        """
        
        for feature in self.ip_raw_density:
            # print("starting normalization for key {} {} {}".format(key, label, datetime.datetime.now().time()))
            self.density[feature] = normfunc(self.ip_raw_density[feature],
                                             self.input_raw_density[feature], 
                                             self.ip.pseudocount(),
                                             self.inp.pseudocount(),
                                             min_density_sum)
            self.density[feature].to_csv("{}.{}.{}.normed_matrix.csv".format(self.output_base, label, feature))
            # print("finished normalization for key {} {} {}".format(key, label, datetime.datetime.now().time()))
    
    def set_means_and_sems(self, feature, conf = 0.95):
        """
        Sets the means and standard error values after outlier
        removal. Replaces remove_outliers.
        
        Args:
            feature (string) : the feature 
            conf (float) : keep {conf}% of densities
        
        """
        means = list()
        sems = list()
        for key, value in self.density[feature].iteritems():
            df = self.density[feature][key].dropna()
            
            nums = len(df)
            droppercent = (1-conf)/2.0
            dropnum = int(nums*(droppercent))
            df = df.sort_values()
            if(dropnum>0):
                df = df[dropnum:-dropnum]
            
            means.append(df.mean())
            sems.append(df.sem())
        self.means = means
        self.sems = sems
                
    def create_matrices(self, label="", is_scaled=True):
        densities = [self.ip_raw_density, self.input_raw_density]
        rbps = [self.ip, self.inp]
        self.logger.info("Start creating the Matrix - {}".format(self.name))
        for i in range(0,len(densities)):
            densities[i]['feature'] = create_matrix(annotation = self.annotation, 
                                                       density = rbps[i], 
                                                       upstream_offset = 0, 
                                                       downstream_offset = 0, 
                                                       is_scaled = False,
                                                       annotation_type = self.annotation_type)
        self.logger.info("Finished creating the Matrix - {}".format(self.name))
        self.ip_raw_density['feature'].to_csv("{}.ip.{}_raw_density_matrix.csv".format(self.output_base,label))
        self.input_raw_density['feature'].to_csv("{}.input.{}_raw_density_matrix.csv".format(self.output_base,label))
        self.maptype = label
    def create_a3ss_matrices(self, label=""):
        densities = [self.ip_raw_density, self.input_raw_density]
        rbps = [self.ip, self.inp]
        self.logger.info("Start creating the A3SS Matrix - {}".format(self.name))
        for i in range(0,len(densities)):
            densities[i]['feature'] = create_a3ss_matrix(annotation = self.annotation, 
                                                               annotation_type = self.annotation_type,
                                                               density = rbps[i], 
                                                               exon_offset = self.exon_offset, 
                                                               intron_offset = self.intron_offset, 
                                                               is_scaled = self.is_scaled,
                                                               combine_regions = True)
        self.logger.info("Finished creating the A3SS Matrix - {}".format(self.name))
        self.ip_raw_density['feature'].to_csv("{}.ip.{}.{}.a3ss.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.input_raw_density['feature'].to_csv("{}.input.{}.{}.a3ss.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.maptype = 'a3ss'
    def create_a5ss_matrices(self, label=""):
        densities = [self.ip_raw_density, self.input_raw_density]
        rbps = [self.ip, self.inp]
        self.logger.info("Start creating the A5SS Matrix - {}".format(self.name))
        for i in range(0,len(densities)):
            densities[i]['feature'] = create_a5ss_matrix(annotation = self.annotation, 
                                                               annotation_type = self.annotation_type,
                                                               density = rbps[i], 
                                                               exon_offset = self.exon_offset, 
                                                               intron_offset = self.intron_offset, 
                                                               is_scaled = self.is_scaled,
                                                               combine_regions = True)
        self.logger.info("Finished creating the A5SS Matrix - {}".format(self.name))
        self.ip_raw_density['feature'].to_csv("{}.ip.{}.{}.a5ss.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.input_raw_density['feature'].to_csv("{}.input.{}.{}.a5ss.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.maptype = 'a5ss'
    def create_mxe_matrices(self, label=""):
        densities = [self.ip_raw_density, self.input_raw_density]
        rbps = [self.ip, self.inp]
        self.logger.info("Start creating the MXE Matrix - {}".format(self.name))
        for i in range(0,len(densities)):
            
            densities[i]['feature'] = create_mxe_matrix(annotation = self.annotation, 
                                                               annotation_type = self.annotation_type,
                                                               density = rbps[i], 
                                                               exon_offset = self.exon_offset, 
                                                               intron_offset = self.intron_offset, 
                                                               is_scaled = self.is_scaled,
                                                               combine_regions = True)
        self.logger.info("Start creating the MXE Matrix - {}".format(self.name))
        self.ip_raw_density['feature'].to_csv("{}.ip.{}.{}.mxe.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.input_raw_density['feature'].to_csv("{}.input.{}.{}.mxe.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.maptype = 'mxe'
    def create_se_matrices(self, label=""):
        densities = [self.ip_raw_density, self.input_raw_density]
        rbps = [self.ip, self.inp]
        self.logger.info("Start creating the SE Matrix - {}".format(self.name))
        for i in range(0,len(densities)):
            densities[i]['feature'] = create_se_matrix(annotation = self.annotation, 
                                                             annotation_type = self.annotation_type,
                                                             density = rbps[i], 
                                                             exon_offset = self.exon_offset, 
                                                             intron_offset = self.intron_offset, 
                                                             is_scaled = self.is_scaled,
                                                             combine_regions = True)
        self.logger.info("Finished creating the SE Matrix - {}".format(self.name))
        self.ip_raw_density['feature'].to_csv("{}.ip.{}.{}.se.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.input_raw_density['feature'].to_csv("{}.input.{}.{}.se.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.maptype = 'se'
    def create_ri_matrices(self, label=""):
        densities = [self.ip_raw_density, self.input_raw_density]
        rbps = [self.ip, self.inp]
        self.logger.info("Start creating the RI Matrix - {}".format(self.name))
        for i in range(0,len(densities)):
            densities[i]['feature'] = create_ri_matrix(annotation = self.annotation, 
                                                             annotation_type = self.annotation_type,
                                                             density = rbps[i], 
                                                             exon_offset = self.exon_offset, 
                                                             intron_offset = self.intron_offset, 
                                                             is_scaled = self.is_scaled,
                                                             combine_regions = True)
        self.logger.info("Finished creating the SE Matrix - {}".format(self.name))
        self.ip_raw_density['feature'].to_csv("{}.ip.{}.{}.ri.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.input_raw_density['feature'].to_csv("{}.input.{}.{}.ri.raw_density_matrix.csv".format(self.output_base, label, 'feature'))
        self.maptype = 'ri'


In [13]:
'''
Created on May 3, 2016

@author: Gabe
'''
import pyBigWig
import pysam
import numpy as np

class ReadDensity():
    """
    ReadDensity class
    Attributes:
        self.pos(positive *.bw file)
        self.neg(negative *.bw file)
    """
    def __init__(self, pos, neg, name = None, bam = None):
        try:
            self.pos = pyBigWig.open(pos)
            self.neg = pyBigWig.open(neg)
            self.name = name if name is not None else pos.replace('pos','*').replace('neg','*')
            print(bam)
            self.bam = pysam.AlignmentFile(bam)
        except Exception as e:
            print("couldn't open the bigwig files!")
            print(e)
            return 1
    
    def pseudocount(self):
        return 1000000.0/self.bam.count()
        
    def values(self, chrom, start, end, strand):
        """
        Given a chromosome coordinate, return a list of values
        pertaining to the rbpmaps over each nucleotide position.
        Reverse the list if going in the negative strand.
        
        Args:
            chrom (str): (eg. chr1)
            start (int): 0-based start (first position in chromosome is 0)
            end (int): 1-based end (last position is not included)
            strand (char): either '+' or '-'
        """
        try:
            if strand == "+":
                return self.pos.values(chrom, start, end)
            elif strand == "-":
                return list(reversed(self.neg.values(chrom, start, end)))
            else:
                raise("Strand neither + or -")
        except RuntimeError:
            # usually occurs when no chromosome exists in the bigwig file
            return [np.NaN]*abs(start-end)
        

In [14]:
annotationfile = '/projects/ps-yeolab3/bay001/maps/annotations-0.05-0.1-0.05/RBFOX2-HepG2-SE.txt'
pos = '/projects/ps-yeolab3/encode/analysis/encode_v12/204_01_RBFOX2.merged.r2.norm.neg.bw'
neg = '/projects/ps-yeolab3/encode/analysis/encode_v12/204_01_RBFOX2.merged.r2.norm.pos.bw'
bam = '/projects/ps-yeolab3/encode/analysis/encode_v12/204_01_RBFOX2.merged.r2.bam'

ipos = '/projects/ps-yeolab3/encode/analysis/encode_v12/RBFOX2-204-INPUT_S2_R1.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.neg.bw'
ineg = '/projects/ps-yeolab3/encode/analysis/encode_v12/RBFOX2-204-INPUT_S2_R1.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.pos.bw'
ibam = '/projects/ps-yeolab3/encode/analysis/encode_v12/RBFOX2-204-INPUT_S2_R1.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.bam'


clip = ReadDensity(pos,neg,'rbfox2',bam)
inputclip = ReadDensity(ipos,ineg,'rbfox2input',ibam)
outputfile = '/home/bay001/projects/maps_20160420/analysis/tests/rbfox2test.svg'

clipexperiment = ClipWithInput(clip, inputclip, outputfile, 'rbfox2', annotation=annotationfile, annotation_type='rmats')

INFO:plot_features.Map.ClipWithInput:creating an instance of ClipWithDensity


/projects/ps-yeolab3/encode/analysis/encode_v12/204_01_RBFOX2.merged.r2.bam
/projects/ps-yeolab3/encode/analysis/encode_v12/RBFOX2-204-INPUT_S2_R1.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.bam


In [33]:
%memit create_se_matrix(annotationfile, clip, exon_offset=50, intron_offset=350, is_scaled=False, combine_regions=True, annotation_type="rmats")

INFO:plot_features:Starting SE matrix creation [ANNOTATION:/projects/ps-yeolab3/bay001/maps/annotations-0.05-0.1-0.05/RBFOX2-HepG2-SE.txt,DENSITY:rbfox2,UP:50,DOWN:350,SCALED:False,TYPE:rmats
INFO:plot_features:Finished matrix creation: 32823, 32823, 32823, 32823


peak memory: 2244.82 MiB, increment: 709.50 MiB


In [35]:
%memit create_se_matrix(annotationfile, clip, exon_offset=50, intron_offset=350, is_scaled=False, combine_regions=True, annotation_type="rmats")

INFO:plot_features:Starting SE matrix creation [ANNOTATION:/projects/ps-yeolab3/bay001/maps/annotations-0.05-0.1-0.05/RBFOX2-HepG2-SE.txt,DENSITY:rbfox2,UP:50,DOWN:350,SCALED:False,TYPE:rmats


peak memory: 1544.46 MiB, increment: 8.57 MiB


In [59]:
def create_se_matrix2(annotation, density, exon_offset, intron_offset, is_scaled, combine_regions=True, annotation_type="rmats"):
    """
    Creates an r x c pandas dataframe of r events for a skipped
    exon feature. An SE matrix will contain four distinct regions: 
    
    |_]----||----[__||__]----||----[_|
    
    - the [..exon_offset]--intron_offset--... 3' site of an upstream exon
    - the ...--intron_offset--[exon_offset..] 5' site of the upstream skipped exon
    - the [..exon_offset]--intron_offset--... 3' site of the downstream skipped exon
    - the ..--intron_offset--[exon_offset..] 5' site of the downstream exon
    Args:
        annotation (string) : path of file containing the annotation
        density (ReadDensity) : object containing positive and negative BigWig files
        exon_offset (integer) : how far into the exon boundary to plot
        intron_offset (integer) : how far after the exon boundary to plot
        is_scaled (boolean) : if all features are of different length, this must be true
            to resize all features to fit on a 0-100% scale.
        combine_regions (boolean) : if False, return four DataFrames instead of one.
        annotation_type (string) : may be rmats format or any additional defined format in Feature
    
    Returns:
        pandas.DataFrame : a dataframe of r events for an SE feature.
    """
    logger.info("Starting SE matrix creation [ANNOTATION:{},DENSITY:{},UP:{},DOWN:{},SCALED:{},TYPE:{}".format(
                                                                                                            annotation,
                                                                                                            density.name,
                                                                                                            exon_offset,
                                                                                                            intron_offset,
                                                                                                            is_scaled,
                                                                                                            annotation_type))
    three_upstream = pd.DataFrame(columns=range(0,(exon_offset+intron_offset)))
    five_skipped = pd.DataFrame(columns=range(0,(exon_offset+intron_offset)))
    three_skipped = pd.DataFrame(columns=range(0,(exon_offset+intron_offset)))
    five_downstream = pd.DataFrame(columns=range(0,(exon_offset+intron_offset)))
    
    with open(annotation) as f:
        for line in f:
            if not line.startswith('event_name') and not line.startswith('ID'):
                event = line.rstrip()
                upstream_interval, interval, downstream_interval = SkippedExonFeature(event,annotation_type).get_bedtools()
                
                """three prime upstream region"""
                left_pad, wiggle, right_pad = three_prime_site(density, 
                                                                        interval,
                                                                        upstream_interval,
                                                                        exon_offset,
                                                                        intron_offset)
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
        
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle) 

                three_upstream.ix[event] = wiggle
                """five prime site of skipped region"""
                left_pad, wiggle, right_pad = five_prime_site(density, 
                                                                        upstream_interval,
                                                                        interval,
                                                                        exon_offset,
                                                                        intron_offset)
                
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle)
                five_skipped.ix[event] = wiggle
                """three prime site of skipped region"""
                left_pad, wiggle, right_pad = three_prime_site(density, 
                                                                         downstream_interval,
                                                                         interval,
                                                                         exon_offset,
                                                                         intron_offset)
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle) #
                three_skipped.ix[event] = wiggle
                """five prime site of downstream region"""
                left_pad, wiggle, right_pad = five_prime_site(density, 
                                                                        interval,
                                                                        downstream_interval,
                                                                        exon_offset,
                                                                        intron_offset)
                wiggle = pd.Series(wiggle)
                wiggle = abs(wiggle) # convert all values to positive
                wiggle = np.pad(wiggle,(left_pad,right_pad),'constant',constant_values=(-1))
                wiggle = np.nan_to_num(wiggle) # convert all nans to 0
                five_downstream.ix[event] = wiggle
"""
        three_upstream = pd.DataFrame(three_upstream).T
        five_skipped = pd.DataFrame(five_skipped).T
        three_skipped = pd.DataFrame(three_skipped).T
        five_downstream = pd.DataFrame(five_downstream).T
    logger.info("Finished matrix creation: {}, {}, {}, {}".format(three_upstream.shape[0],
                                                                  five_skipped.shape[0],
                                                                  three_skipped.shape[0],
                                                                  five_downstream.shape[0]))
    if combine_regions == False:
        return three_upstream, five_skipped, three_skipped, five_downstream
    else:
        ra = pd.concat([three_upstream,five_skipped,three_skipped,five_downstream],axis=1)
        ra.columns = range(0,ra.shape[1])
        return ra
"""

'\n        three_upstream = pd.DataFrame(three_upstream).T\n        five_skipped = pd.DataFrame(five_skipped).T\n        three_skipped = pd.DataFrame(three_skipped).T\n        five_downstream = pd.DataFrame(five_downstream).T\n    logger.info("Finished matrix creation: {}, {}, {}, {}".format(three_upstream.shape[0],\n                                                                  five_skipped.shape[0],\n                                                                  three_skipped.shape[0],\n                                                                  five_downstream.shape[0]))\n    if combine_regions == False:\n        return three_upstream, five_skipped, three_skipped, five_downstream\n    else:\n        ra = pd.concat([three_upstream,five_skipped,three_skipped,five_downstream],axis=1)\n        ra.columns = range(0,ra.shape[1])\n        return ra\n'

In [None]:
%memit create_se_matrix2(annotationfile, clip, exon_offset=50, intron_offset=300, is_scaled=False, combine_regions=True, annotation_type="rmats")

In [None]:
%memit create_se_matrix(annotationfile, clip, exon_offset=50, intron_offset=300, is_scaled=False, combine_regions=True, annotation_type="rmats")

In [None]:
%timeit create_se_matrix2(annotationfile, clip, exon_offset=50, intron_offset=300, is_scaled=False, combine_regions=True, annotation_type="rmats")

In [None]:
%timeit create_se_matrix(annotationfile, clip, exon_offset=50, intron_offset=300, is_scaled=False, combine_regions=True, annotation_type="rmats")