# Creates non-overlapping rmats skipped exon regions from an RMATS file.
- the important stuff is that it subsets each overlapping exon

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tnrange, tqdm_notebook
import pybedtools

pd.set_option('display.max_columns', 500)


# get_average_of_overlapping_cassettes
- merge bedtool
- get intersection between bedfile and each nonoverlapping region from merge
- break each intersected group into parts using intersect and groupby
- overlapping regions will have the average dpsi value of the overlaps
- nonoverlapping regions will retain their dpsi value

# Test functions
- these aren't really bedtools, these all return dataframes because they're visible. 

In [3]:
def fake_completely_overlapping_bedtool():
    intervals = []
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '500', '2500', 'big', '0.75', '+'
            ]))
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '1000', '2000', 'small', '0.5', '+'
            ]))
    bedtool = pybedtools.BedTool(intervals).to_dataframe()
    bedtool.columns = ['chr','exonStart_0base','exonEnd','geneSymbol','IncLevelDifference','strand']
    return bedtool

def fake_partially_overlapping_bedtool():
    intervals = []
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '500', '1000', 'genex', '0.75', '+'
            ]))
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '750', '1500', 'genex', '0.5', '+'
            ]))
    bedtool = pybedtools.BedTool(intervals).to_dataframe()
    bedtool.columns = ['chr','exonStart_0base','exonEnd','geneSymbol','IncLevelDifference','strand']
    return bedtool

def fake_share_start_exon_bedtool():
    intervals = []
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '500', '1000', 'genex', '0.75', '+'
            ]))
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '500', '1500', 'genex', '0.5', '+'
            ]))
    bedtool = pybedtools.BedTool(intervals).to_dataframe()
    bedtool.columns = ['chr','exonStart_0base','exonEnd','geneSymbol','IncLevelDifference','strand']
    return bedtool

def fake_share_end_exon_bedtool():
    intervals = []
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '500', '1500', 'genex', '0.75', '+'
            ]))
    intervals.append(pybedtools.create_interval_from_list([
                'chr1', '750', '1500', 'genex', '0.5', '+'
            ]))
    bedtool = pybedtools.BedTool(intervals).to_dataframe()
    bedtool.columns = ['chr','exonStart_0base','exonEnd','geneSymbol','IncLevelDifference','strand']
    return bedtool

# The actual functions:

In [4]:
def make_rmats_bedtool(df):
    """
    Uses the skipped exon start and end to create a bedtool
    """
    df = df[['chr','exonStart_0base','exonEnd','geneSymbol','IncLevelDifference','strand']]
    bt = pybedtools.BedTool.from_dataframe(df)
    bt = bt.sort()
    return bt

def make_bedtool(df):
    """
    I can't figure out why the BedTool() function isn't working...
    Probably has something to do with turning positions into floats,
    but this function is works just the same...
    """
    intervals = []
    
    for col, row in df.iterrows():
        intervals.append(
            pybedtools.create_interval_from_list(
                [str(row['chrom']), str(row['start']), 
                 str(row['end']), str(row['name']), 
                 str(row['score']), str(row['strand'])]
            )
        )
    return pybedtools.BedTool(intervals)

def redefine_regions(df):
    """
    turns overlapping regions into distinct nonoverlapping regions.
    """
    positions = []
    intervals = []
    for col, row in df.iterrows():
        chrom = row['chrom']
        strand = row['strand']
        positions.append(row['start'])
        positions.append(row['end'])
    positions = sorted(set(positions))
    for p in range(0, len(positions[:-1])):
        intervals.append(pybedtools.create_interval_from_list(
            [chrom, str(positions[p]), str(positions[p+1]), 'name', '0', strand]
        ))
    return pybedtools.BedTool(intervals)

def rescore(to_split):
    """
    Takes a dataframe of overlapping intervals, 
    and returns nonoverlapping regions, scored by 
    either taking the average of the original overlapping region,
    or by taking the single score over the nonoverlapping
    regions. 
    """
    name = to_split['name'].value_counts()[0] # just take the first name, i don't really care about the name part anyway
    final_split = pd.DataFrame(
        make_bedtool(to_split).intersect(
            redefine_regions(to_split)).to_dataframe().groupby(
            ['chrom','start','end','strand'])['score'].mean()
        ).reset_index()
    final_split['name'] = name
    final_split = final_split[['chrom','start','end','name','score','strand']]
    return final_split

def full_wrapper(df):
    """
    Takes a dataframe from an RMATS file and turns it into a BedTool.
    
    Calls 'pybedtools.cluster().to_dataframe()', which groups overlapping 
    regions using the 'thickStart' column.
    
    For each group, if there is only one region within the group, do nothing
    (concat to merged). If there is more than one region, this means we have
    overlapping intervals. Then it must call rescore() to split these regions
    into nonoverlapping intervals. 
    """
    dfx = make_rmats_bedtool(df)
    dfy = dfx.cluster().to_dataframe()
    merged = pd.DataFrame(columns=['chrom','start','end','name','score','strand','thickStart'])
    groups = set(dfy['thickStart'])
    progress = tnrange(len(groups))
    for g in groups:
        dft = dfy[dfy['thickStart']==g] # get all overlapping regions
        if dft.shape[0] > 1:
            merged = pd.concat([merged, rescore(dft)])
        else:
            merged = pd.concat([merged, dft])
        progress.update(1)
    merged = merged[['chrom','start','end','name','score','strand']]
    return merged

# Test functions

# region1 encompasses region2 fully

In [19]:
df = fake_completely_overlapping_bedtool()
df

Unnamed: 0,chr,exonStart_0base,exonEnd,geneSymbol,IncLevelDifference,strand
0,chr1,500,2500,big,0.75,+
1,chr1,1000,2000,small,0.5,+


In [20]:
full_wrapper(df)




Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,500,1000,1,0.75,+
1,chr1,1000,2000,1,0.625,+
2,chr1,2000,2500,1,0.75,+


# exon1 start < exon2 start, exon1 end < exon2 end

In [8]:
df = fake_partially_overlapping_bedtool()
df

Unnamed: 0,chr,exonStart_0base,exonEnd,geneSymbol,IncLevelDifference,strand
0,chr1,500,1000,genex,0.75,+
1,chr1,750,1500,genex,0.5,+


In [9]:
full_wrapper(df)




Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,500,750,2,0.75,+
1,chr1,750,1000,2,0.625,+
2,chr1,1000,1500,2,0.5,+


# Share start exon

In [10]:
df = fake_share_start_exon_bedtool()
df

Unnamed: 0,chr,exonStart_0base,exonEnd,geneSymbol,IncLevelDifference,strand
0,chr1,500,1000,genex,0.75,+
1,chr1,500,1500,genex,0.5,+


In [11]:
full_wrapper(df)




Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,500,1000,2,0.625,+
1,chr1,1000,1500,2,0.5,+


# Share end exon

In [12]:
df = fake_share_end_exon_bedtool()
df

Unnamed: 0,chr,exonStart_0base,exonEnd,geneSymbol,IncLevelDifference,strand
0,chr1,500,1500,genex,0.75,+
1,chr1,750,1500,genex,0.5,+


In [13]:
full_wrapper(df)




Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,500,750,2,0.75,+
1,chr1,750,1500,2,0.625,+
