# To fix the redundancy of the background controls, Eric went back and added junction counts to each of them. This notebook looks at the format of each of his controls.

In [1]:
import pandas as pd
import numpy as np
import os
import pybedtools as bt

In [2]:
def get_bedtools(row, x, event='se'):
    chrom, strand, _, _, _ = row['annotation'].split('|')
    if event == 'se' or event == 'mxe' or event == 'ri':
        low_start, low_end = [int(ex) for ex in row['low_exon'].split('-')]
        hi_start, hi_end = [int(ex) for ex in row['hi_exon'].split('-')]
        interval = bt.create_interval_from_list(
            [chrom, low_end, hi_start, row['annotation'], '0',
             strand])
    elif event == 'a3ss':
        flank_start, flank_end = [int(ex) for ex in row['upstream_exon'].split('-')]
        long_start, long_end = [int(ex) for ex in row['long_exon'].split('-')]
        short_start, short_end = [int(ex) for ex in row['short_exon'].split('-')]
    
        if strand == '+':
            interval = bt.create_interval_from_list(
                [chrom, flank_end, short_start, row['annotation'], '0',
                 strand])
        else:
            interval = bt.create_interval_from_list(
                [chrom, short_end, flank_start, row['annotation'], '0',
                 strand]
            )
    elif event == 'a5ss':
        flank_start, flank_end = [int(ex) for ex in row['downstream_exon'].split('-')]
        long_start, long_end = [int(ex) for ex in row['long_exon'].split('-')]
        short_start, short_end = [int(ex) for ex in row['short_exon'].split('-')]
    
        if strand == '+':
            interval = bt.create_interval_from_list(
                [chrom, short_end, flank_start, row['annotation'], '0',
                 strand])
        else:
            interval = bt.create_interval_from_list(
                [chrom, flank_end, short_start, row['annotation'], '0',
                 strand]
            )
    return interval

# SE

In [3]:
# Column names applied to the SE control table when it is loaded.
se_all_names = [
    'annotation',
    'low_exon',
    'skipped_exon',
    'hi_exon',
    'incl',
    'excl',
]

se_all_path = '/projects/ps-yeolab3/encode/hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.strict_CE_all_20170805'
df = pd.read_table(se_all_path, names=se_all_names)
# Peek at the last rows to check the column layout.
df.tail()

Unnamed: 0,annotation,low_exon,skipped_exon,hi_exon,incl,excl
18862,chr13|-|46627826-46627878|46627933-46629896|46...,46627321-46627826,46627878-46627933,46629896-46629984,255540,1909800
18863,chr13|-|45911688-45912794|45912911-45913631|45...,45911312-45911688,45912794-45912911,45913631-45913668,2307880,18917970
18864,chr13|-|113851557-113852504|113852578-11385474...,113851332-113851557,113852504-113852578,113854740-113854830,83110,693640
18865,chr13|-|48547527-48562675|48562838-48563016|48...,48547398-48547527,48562675-48562838,48563016-48563116,55830,435550
18866,chr13|-|75901962-75911056|75911176-75915260|75...,75901887-75901962,75911056-75911176,75915260-75915371,64760,509600


In [4]:
# Pilot get_bedtools on every row of the SE table.
bedtools = [get_bedtools(record, '.', 'se') for _, record in df.iterrows()]

# A3SS

In [5]:
# Column names applied to the A3SS control table when it is loaded.
a3ss_all_names = [
    'annotation',
    'upstream_exon',
    'long_exon',
    'short_exon',
    'incl',
    'excl',
]

a3ss_all_path = '/projects/ps-yeolab3/encode/hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.aA3SScenter_0.1_20170805'
df = pd.read_table(a3ss_all_path, names=a3ss_all_names)
# Peek at the last rows to check the column layout.
df.tail()

Unnamed: 0,annotation,upstream_exon,long_exon,short_exon,incl,excl
234,chr17|-|47784326|47783671|47783696,47784326-47784430,47783565-47783696,47783565-47783671,2686947,2493410140
235,chr17|-|38083736|38080473|38080478,38083736-38083854,38080282-38080478,38080282-38080473,42471725,4009515388
236,chr13|+|76408523|76409368|76409287,76408361-76408523,76409287-76409475,76409368-76409475,929758,103237972
237,chr13|-|46541638|46539564|46539567,46541638-46542163,46539414-46539567,46539414-46539564,19061225,161029307
238,chr13|-|79929354|79928696|79928705,79929354-79929519,79928573-79928705,79928573-79928696,1012538,85224031


In [6]:
# Pilot get_bedtools on every row of the A3SS table.
bedtools = [get_bedtools(record, '.', 'a3ss') for _, record in df.iterrows()]

# A5SS

In [7]:
# Column names applied to the A5SS control table when it is loaded.
a5ss_all_names = [
    'annotation',
    'short_exon',
    'long_exon',
    'downstream_exon',
    'incl',
    'excl',
]

a5ss_all_path = '/projects/ps-yeolab3/encode/hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.aA5SScenter_0.1_20170805'
df = pd.read_table(a5ss_all_path, names=a5ss_all_names)
# Peek at the first rows to check the column layout.
df.head()

Unnamed: 0,annotation,short_exon,long_exon,downstream_exon,incl,excl
0,chr3|+|100455548|100455560|100463676,100455419-100455548,100455419-100455560,100463676-100463775,2368923,203358278
1,chr3|+|142740224|142740227|142740314,142740191-142740224,142740191-142740227,142740314-142740397,14151625,1302514640
2,chr3|-|128890497|128890476|128890381,128890497-128890614,128890476-128890614,128890288-128890381,61614932,5584550000
3,chr3|-|129155303|129155285|129152979,129155303-129156151,129155285-129156151,129152904-129152979,9571934,629912824
4,chr18|+|46284776|46284782|46287760,46284289-46284776,46284289-46284782,46287760-46288060,533669,45255454


In [8]:
# Pilot get_bedtools on every row of the A5SS table.
bedtools = [get_bedtools(record, '.', 'a5ss') for _, record in df.iterrows()]


# Now that we have a piloted function that returns proper bedtools intervals, we can run the subset_rmats_junctioncountonly.py script on each file to return the non-redundant set of exons.

In [9]:
import glob
from tqdm import tnrange, tqdm_notebook

In [16]:
input_dir = '/projects/ps-yeolab3/encode/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/'


prog = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/subset_rmats_junctioncountonly.py'

nSE_annotations = glob.glob(os.path.join(input_dir, '*nSE*805'))
aSE_annotations = glob.glob(os.path.join(input_dir, '*aSE*805'))
CE_annotations = glob.glob(os.path.join(input_dir, '*CE*805'))
se_annotations = nSE_annotations + aSE_annotations + CE_annotations

progress = tnrange(len(se_annotations))

for se_annotation in se_annotations:
    if '.NR' not in se_annotation:
        se_nr_annotation = se_annotation + '.NR'
        cmd = 'python {} '.format(prog)
        cmd = cmd + '-i {} '.format(se_annotation)
        cmd = cmd + '-o {} '.format(se_nr_annotation)
        cmd = cmd + '-e {} '.format('se')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)

In [17]:
a3ss_annotations = glob.glob(os.path.join(wd, '*A3SS*805'))
progress = tnrange(len(a3ss_annotations))

for annotation in a3ss_annotations:
    if '.NR' not in annotation:
        nr_annotation = annotation + '.NR'
        cmd = 'python {} '.format(prog)
        cmd = cmd + '-i {} '.format(annotation)
        cmd = cmd + '-o {} '.format(nr_annotation)
        cmd = cmd + '-e {} '.format('a3ss')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)




In [18]:
a5ss_annotations = glob.glob(os.path.join(wd, '*A5SS*805'))
progress = tnrange(len(a5ss_annotations))

for annotation in a5ss_annotations:
    if '.NR' not in annotation:
        nr_annotation = annotation + '.NR'
        cmd = 'python {} '.format(prog)
        cmd = cmd + '-i {} '.format(annotation)
        cmd = cmd + '-o {} '.format(nr_annotation)
        cmd = cmd + '-e {} '.format('a5ss')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)

In [7]:
# Look for RI control files in the same directory; the recorded output of
# this cell is an empty list.
wd = '/projects/ps-yeolab3/encode/'

ri_pattern = os.path.join(wd, '*RI*805')
ri_annotations = glob.glob(ri_pattern)
ri_annotations

[]