# makes the 'big trackhub' for the washU Genomebrowser
- uses the non-overlapping junctionCountsOnly annotations that we're using for the rbp-maps

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from rnaseq import rmats_inclevel_analysis as rmats
from encode import manifest_helpers
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tnrange, tqdm_notebook
import pybedtools

pd.set_option('display.max_columns', 500)


In [2]:
pos_annotations = glob.glob('/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.positive.nr.txt')
neg_annotations = glob.glob('/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.negative.nr.txt')
all_annotations = glob.glob('/projects/ps-yeolab3/bay001/maps/current_normed_annotations/se/*significant.nr.txt')

## make merged incleveldifference matrix
- merges junction regions and get incleveldifference for each RBP
- performs outer join on junction regions
- returns full matrix of RBPs

# Loop over all RMATS files and return merged matrix
- merge ALL intervals -> get unique regions that contain all spliced events. Each region will be the 'compared' region and have an ID associated.
- intersect each rmats file with the ALL intervals file and return the associated ID.

In [3]:
import pybedtools

def rmats2bedtool_all(annotations):
    """
    Returns merged intervals that contain all regions from all 
    junction counts files. This ensures that regions will be non-overlapping
    and can be used as 'keys' to regions on the epigenome browser.
    """
    intervals = []
    progress = tnrange(len(annotations))
    for pos in annotations:
        nice_name = os.path.basename(pos)
        nice_name = nice_name.replace('-SE.MATS.JunctionCountOnly','')
        nice_name = nice_name.replace('nr.txt','')
        # nice_name = os.path.basename(pos).replace('-SE.MATS.JunctionCountOnly.positive.nr.txt','')
        # nice_name = nice_name.replace('-SE.MATS.JunctionCountOnly.negative.nr.txt','')
        
        df = pd.read_table(pos)
        for col, row in df.iterrows():
            intervals.append(
                pybedtools.create_interval_from_list(
                    [row['chr'], str(row['exonStart_0base']), str(row['exonEnd']), nice_name, str(row['IncLevelDifference']), row['strand']])
            )
        progress.update(1)
    bedtool_all_intervals = pybedtools.BedTool(intervals)
    return bedtool_all_intervals.sort().merge()

def rmats2bedtool(annotation):
    """
    Returns a bedtool object from an rmats annotation
    Uses the exonStart_0base and exonEnd as coordinates.
    """
    intervals = []
    df = pd.read_table(annotation)
    # progress = tnrange(df.shape[0], leave=False)
    for _, row in df.iterrows():
        intervals.append(
            pybedtools.create_interval_from_list(
                [
                    row['chr'], str(row['exonStart_0base']), 
                    str(row['exonEnd']), row['GeneID'], 
                    str(row['IncLevelDifference']), row['strand']
                ]
            )
        )
        # progress.update(1)
    
    return pybedtools.BedTool(intervals)

def transform_individual_rmats_positions(rmats_file, big_merged_bedtool):
    """
    turns individual rmats exon positions into something 
    common to those found in the big merged bedtool file.
    
    Parameters:
    rmats_file : string
        rmats JunctionCountsOnly.txt
    big_merged_bedtool : pybedtools.BedTool
        intervals containing all regions for all annotations being compared.
    
    Returns: dataframe
    """
    
    # just get the "nice name" (RBP_CELL without the extra stuff)
    nice_name = os.path.basename(rmats_file)
    
    # nice_name = nice_name.replace('-SE.MATS.JunctionCountOnly..nr.txt','')
    # nice_name = nice_name.replace('-SE.MATS.JunctionCountOnly.negative.nr.txt','')
    nice_name = nice_name.split('-')
    nice_name = '{}_{}'.format(nice_name[0], nice_name[2])
    
    # for each rmats file, intersect with the merged bedtool to bin regions into those that are common amongst all
    individual_rmats_bedtool = rmats2bedtool(rmats_file).sort()
    intersected = individual_rmats_bedtool.intersect(big_merged_bedtool, wb=True).to_dataframe()
    
    # thickStart, thickEnd, itemRgb actually contain the 'key' common regions from the big_merged_bedtool intersection.
    intersected['chrom'] = intersected['thickStart']
    intersected['start'] = intersected['thickEnd']
    intersected['end'] = intersected['itemRgb']
    
    # re-format so that it's a proper dataframe, and re-name the 'score' column to be that of the name of the RBP. 
    intersected = intersected[['chrom','start','end','name','score','strand']]
    intersected.columns = ['chrom','start','end','name','{}'.format(nice_name),'strand']
    intersected.set_index(['chrom','start','end','name','strand'], inplace=True)
    return intersected

def merge_all_rmats_transformed(all_annotations, big_merged_bedtool):
    """
    merge all dpsi for common regions (as described in big_merged_bedtool) into one dataframe. 
    """
    progress = tnrange(len(all_annotations))
    
    # do this once to easily/automatically populate the index.
    merged = transform_individual_rmats_positions(all_annotations[0], big_merged_bedtool)
    progress.update(1)
    
    # foreach subsequent file, merge (outer join to not miss any) files into merged.
    for annotation in all_annotations[1:]:
        df = transform_individual_rmats_positions(annotation, big_merged_bedtool)
        merged = pd.merge(merged, df, how='outer', left_index=True, right_index=True)
        progress.update(1)
    return merged

In [4]:
# get the full list of conjoined exon intervals
big_merged_bedtool = rmats2bedtool_all(all_annotations)

          449/|/100%|| 449/449 [00:40<00:00, 19.03it/s]

In [5]:
merged = merge_all_rmats_transformed(all_annotations, big_merged_bedtool)




In [6]:
merged.to_csv('/home/bay001/projects/encode/analysis/rnaseq_trackhub_attempt2/merged_from_rmats_nonredundant.txt', sep='\t')

In [7]:
merged.loc['chr1', 1191424, 1191561, 'ENSG00000160087.16', '-'].dropna()

MBNL1_K562       -0.070
HNRNPA0_K562     -0.091
PUS1_K562        -0.216
PABPN1_K562      -0.097
WDR3_K562        -0.092
AGGF1_K562       -0.143
DDX3X_K562       -0.107
HNRNPA2B1_K562   -0.127
TRIM56_K562      -0.161
SSB_K562         -0.081
ZC3H8_K562       -0.064
KIF1C_K562       -0.098
ZRANB2_K562      -0.065
AKAP1_K562       -0.096
G3BP1_K562       -0.125
SF1_K562         -0.090
SMNDC1_K562      -0.103
TARDBP_K562      -0.067
CPSF6_K562       -0.080
AKAP8L_K562      -0.145
RBM25_K562       -0.061
UTP18_K562       -0.100
EWSR1_K562       -0.079
HLTF_K562        -0.093
SRSF5_K562       -0.077
GPKOW_K562       -0.060
SF3B1_HepG2      -0.112
ASCC1_K562       -0.083
RBM15_K562       -0.115
GTF2F1_K562      -0.074
ILF3_K562        -0.089
NSUN2_K562       -0.101
RBM25_HepG2      -0.099
TRIP6_K562       -0.079
TUFM_K562        -0.075
SUGP2_K562       -0.183
RBM39_K562       -0.121
IGF2BP2_K562     -0.103
U2AF1_HepG2      -0.143
NAA15_K562       -0.111
SUPT6H_K562      -0.069
U2AF1_K562      

In [10]:
merged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,TIAL1_K562,CEBPZ_K562,PARN_K562,DDX24_K562,MBNL1_K562,ILF3_HepG2,EIF4G1_K562,KRR1_HepG2,PES1_HepG2,RPS10_HepG2,SRSF9_HepG2,PNPT1_HepG2,LARP4_K562,RPS3A_HepG2,WDR43_K562,NONO_K562,RPS10_K562,DNAJC21_HepG2,HLTF_HepG2,PCBP2_HepG2,HNRNPA0_K562,RPS5_HepG2,CIRBP_K562,PUS1_K562,PABPN1_K562,HNRNPC_HepG2,WDR3_K562,RBM17_K562,TFIP11_K562,AGGF1_K562,SUB1_K562,PSIP1_K562,XRN2_HepG2,TIA1_HepG2,MTPAP_HepG2,DDX3X_K562,ILF2_HepG2,SLBP_K562,UPF1_K562,ETF1_HepG2,EIF2S1_K562,SUB1_HepG2,GRWD1_HepG2,PRPF6_K562,LSM11_HepG2,NUFIP2_K562,EXOSC9_HepG2,EIF4G2_HepG2,GNB2L1_HepG2,NUFIP2_HepG2,MATR3_K562,CUGBP1_HepG2,CALR_HepG2,DDX28_K562,EIF3H_K562,KIF1C_HepG2,GRWD1_K562,KHSRP_K562,PKM2_K562,DDX1_K562,DDX27_K562,GTF2F1_HepG2,HNRNPM_K562,STAU1_K562,EIF2S2_HepG2,AKAP1_HepG2,UCHL5_K562,NCBP2_K562,SUCLG1_K562,HNRNPA2B1_K562,DNAJC2_K562,TRIM56_K562,PABPC1_K562,SSB_K562,EIF4A3_HepG2,APOBEC3C_K562,PABPC4_HepG2,MTPAP_K562,ESF1_K562,BOP1_HepG2,NFX1_K562,EIF4G1_HepG2,NPM1_HepG2,HNRNPF_K562,PPIL4_HepG2,AQR_K562,HNRNPC_K562,FUBP3_HepG2,EIF4B_K562,FUS_HepG2,PHF6_K562,PRPF8_K562,MAGOH_K562,SF3B1_K562,SUPV3L1_HepG2,TROVE2_K562,CEBPZ_HepG2,EFTUD2_HepG2,HNRNPAB_K562,DDX19B_K562,PPP1R8_K562,ESF1_HepG2,GRSF1_HepG2,UTP18_HepG2,RBFOX2_K562,ZC3H8_K562,ASCC1_HepG2,PHF6_HepG2,PCBP1_HepG2,EIF2C1_HepG2,EIF2C1_K562,PRPF6_HepG2,BCCIP_HepG2,EIF2S1_HepG2,CSTF2_K562,TIA1_K562,PCBP2_K562,UCHL5_HepG2,GPKOW_HepG2,PUF60_HepG2,RNASEN50_K562,HNRNPU_K562,TIAL1_HepG2,RPS2_HepG2,FXR1_K562,TAF15_K562,IGF2BP2_HepG2,KIF1C_K562,PUM2_K562,CSTF2T_K562,SFPQ_K562,HNRNPL_K562,GLRX3_K562,CPEB4_K562,QKI_HepG2,SART3_K562,HNRNPD_HepG2,NSUN2_HepG2,CDC40_HepG2,ILF2_K562,ZRANB2_K562,LIN28B_K562,METAP2_HepG2,SAFB2_HepG2,SND1_K562,AARS_K562,AKAP1_K562,RBM39_HepG2,HNRNPU_HepG2,RBM5_HepG2,DDX28_HepG2,KIAA1967_HepG2,RAVER1_K562,CSTF2_HepG2,G3BP1_K562,PCBP1_K562,CNOT7_K562,PA2G4_K562,SF3A3_K562,BCLAF1_HepG2,SRP68_HepG2,RPS19_K562,IGF2BP3_HepG2,CCAR1_HepG2,BUD13_HepG2,FIP1L1_K562,SF1_K562,NUSAP1_K562,UBE2L3_K562,TRA2A_K562,DDX3X_HepG2,SMNDC1_K562,TARDBP_K562,HNRNPA1_HepG2,SRFBP1_HepG2,TARDBP_HepG2,IGF2BP1_K562,AKAP8L_HepG2,FTO_K562,DAZAP1_HepG2,DDX52_K562,CPSF6_K562,HDGF_HepG2,EIF4A3_K562,HNRNPAB_HepG2,SLTM_HepG2,DHX30_HepG2,NCBP2_HepG2,EIF3A_K562,DNAJC2_HepG2,EIF2S2_K562,FUS_K562,SNRNP70_HepG2,AKAP8L_K562,RTF1_HepG2,BCLAF1_K562,WRN_K562,RBM25_K562,NONO_HepG2,RPS3_K562,XPO5_HepG2,LARP7_K562,KHDRBS1_K562,HNRNPA0_HepG2,XRCC5_K562,PKM2_HepG2,DDX59_HepG2,FMR1_HepG2,SRSF7_HepG2,TUFM_HepG2,SSRP1_K562,RTF1_K562,PTBP1_HepG2,PABPC1_HepG2,CCDC124_K562,DAZAP1_K562,NIP7_HepG2,SRSF1_K562,HNRNPL_HepG2,HNRNPK_K562,RRP9_K562,SUGP2_HepG2,SRP68_K562,AATF_HepG2,SRSF5_HepG2,FASTKD2_K562,KHSRP_HepG2,ZNF622_HepG2,DKC1_K562,RRP9_HepG2,SF1_HepG2,SSB_HepG2,G3BP2_HepG2,RPLP0_HepG2,DDX21_HepG2,EFTUD2_K562,FIP1L1_HepG2,SF3A3_HepG2,PA2G4_HepG2,HNRNPM_HepG2,SSRP1_HepG2,FAM120A_K562,KHDRBS1_HepG2,HSPD1_K562,UTP18_K562,KRR1_K562,NPM1_K562,EWSR1_K562,SLTM_K562,TRA2A_HepG2,SRSF7_K562,HNRNPA1_K562,LARP4_HepG2,HLTF_K562,EIF4G2_K562,PUM2_HepG2,PARN_HepG2,EIF4B_HepG2,MSI2_HepG2,CIRBP_HepG2,SRPK2_K562,DDX21_K562,FUBP3_K562,TROVE2_HepG2,TBRG4_HepG2,SFPQ_HepG2,SUCLG1_HepG2,NOL12_K562,CPSF6_HepG2,SRSF5_K562,CPSF7_K562,DKC1_HepG2,GPKOW_K562,FTO_HepG2,SF3B1_HepG2,RCC2_K562,BCCIP_K562,CSDA_HepG2,G3BP1_HepG2,RDBP_K562,DNAJC21_K562,RBFOX2_HepG2,DDX6_HepG2,PRPF8_HepG2,CUGBP1_K562,FXR2_K562,HNRNPF_HepG2,STIP1_HepG2,XRN2_K562,DDX52_HepG2,EXOSC9_K562,ASCC1_K562,METAP2_K562,RBM15_K562,MATR3_HepG2,CPSF7_HepG2,RBM22_HepG2,PUS1_HepG2,HSPD1_HepG2,TFIP11_HepG2,NOL12_HepG2,EEF2_K562,PUM1_HepG2,HNRNPA2B1_HepG2,EEF2_HepG2,APOBE_HepG2,SBDS_HepG2,AKAP8_K562,HNRNPUL1_K562,ATP5C1_K562,UTP3_HepG2,DHX30_K562,LARP7_HepG2,RBM22_K562,NKRF_HepG2,GEMIN5_K562,UPF1_HepG2,MARK2_HepG2,GTF2F1_K562,UPF2_HepG2,ILF3_K562,CKAP4_HepG2,NAA15_HepG2,EIF3G_K562,GEMIN5_HepG2,DDX19B_HepG2,DDX5_HepG2,G3BP2_K562,HNRNPUL1_HepG2,FXR1_HepG2,NSUN2_K562,SNRNP200_HepG2,RBM25_HepG2,SRSF9_K562,RBM34_HepG2,DDX1_HepG2,TBRG4_K562,SNRNP200_K562,SUPV3L1_K562,SART3_HepG2,ACO1_HepG2,SBDS_K562,IGF2BP3_K562,RAVER1_HepG2,FKBP4_K562,DROSHA_HepG2,TRIP6_K562,PPIL4_K562,AKAP8_HepG2,XRCC5_HepG2,PUM1_K562,TUFM_K562,SRSF1_HepG2,PSIP1_HepG2,SUGP2_K562,AATF_K562,RBM39_K562,RBM17_HepG2,IGF2BP2_K562,SMN1_HepG2,DDX6_K562,QKI_K562,SRSF3_HepG2,PUF60_K562,STIP1_K562,EWSR1_HepG2,YTHDC2_K562,RECQL_HepG2,DDX27_HepG2,CNOT7_HepG2,AUH_HepG2,NUSAP1_HepG2,RBM34_K562,RPS3A_K562,KIAA1967_K562,PNPT1_K562,EIF3D_HepG2,SERBP1_K562,RBM47_HepG2,XPO5_K562,RPS19_HepG2,MAGOH_HepG2,RBM27_HepG2,PES1_K562,PTBP1_K562,U2AF1_HepG2,NUP35_HepG2,TRIM56_HepG2,LIN28B_HepG2,SND1_HepG2,CSTF2T_HepG2,NAA15_K562,DDX55_HepG2,SUPT6H_K562,XRCC6_HepG2,BOP1_K562,STAU1_HepG2,U2AF1_K562,HNRNPK_HepG2,DDX51_K562,UBE2L3_HepG2,IGF2BP1_HepG2,RPL23A_HepG2,ADAR_HepG2,ZRANB2_HepG2,FASTKD2_HepG2,DDX47_HepG2,DDX47_K562,SUPT6H_HepG2,EIF3D_K562,SRFBP1_K562,SERBP1_HepG2,ADAR_K562,SMNDC1_HepG2,CSDA_K562,MSI2_K562,ABCF1_HepG2,PABPC4_K562,SF3B4_HepG2,UPF2_K562,BUD13_K562,GRSF1_K562,RCC2_HepG2,DDX24_HepG2,PPIG_K562,TAF15_HepG2,U2AF2_HepG2,RECQL_K562,ABCF1_K562,SMN1_K562,CCAR1_K562,SF3B4_K562,FASTKD1_HepG2,SLBP_HepG2,PPIG_HepG2,ATP5C1_HepG2,RBM15_HepG2,FAM120A_HepG2,SAFB2_K562,DDX55_K562,XRCC6_K562,EIF3G_HepG2,SRSF4_K562,MAK16_K562,RDBP_HepG2,AARS_HepG2,HNRNPLL_HepG2,U2AF2_K562,FMR1_K562
chrom,start,end,name,strand,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 288_level_1,Unnamed: 289_level_1,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1,Unnamed: 299_level_1,Unnamed: 300_level_1,Unnamed: 301_level_1,Unnamed: 302_level_1,Unnamed: 303_level_1,Unnamed: 304_level_1,Unnamed: 305_level_1,Unnamed: 306_level_1,Unnamed: 307_level_1,Unnamed: 308_level_1,Unnamed: 309_level_1,Unnamed: 310_level_1,Unnamed: 311_level_1,Unnamed: 312_level_1,Unnamed: 313_level_1,Unnamed: 314_level_1,Unnamed: 315_level_1,Unnamed: 316_level_1,Unnamed: 317_level_1,Unnamed: 318_level_1,Unnamed: 319_level_1,Unnamed: 320_level_1,Unnamed: 321_level_1,Unnamed: 322_level_1,Unnamed: 323_level_1,Unnamed: 324_level_1,Unnamed: 325_level_1,Unnamed: 326_level_1,Unnamed: 327_level_1,Unnamed: 328_level_1,Unnamed: 329_level_1,Unnamed: 330_level_1,Unnamed: 331_level_1,Unnamed: 332_level_1,Unnamed: 333_level_1,Unnamed: 334_level_1,Unnamed: 335_level_1,Unnamed: 336_level_1,Unnamed: 337_level_1,Unnamed: 338_level_1,Unnamed: 339_level_1,Unnamed: 340_level_1,Unnamed: 341_level_1,Unnamed: 342_level_1,Unnamed: 343_level_1,Unnamed: 344_level_1,Unnamed: 345_level_1,Unnamed: 346_level_1,Unnamed: 347_level_1,Unnamed: 348_level_1,Unnamed: 349_level_1,Unnamed: 350_level_1,Unnamed: 351_level_1,Unnamed: 352_level_1,Unnamed: 353_level_1,Unnamed: 354_level_1,Unnamed: 355_level_1,Unnamed: 356_level_1,Unnamed: 357_level_1,Unnamed: 358_level_1,Unnamed: 359_level_1,Unnamed: 360_level_1,Unnamed: 361_level_1,Unnamed: 362_level_1,Unnamed: 363_level_1,Unnamed: 364_level_1,Unnamed: 365_level_1,Unnamed: 366_level_1,Unnamed: 367_level_1,Unnamed: 368_level_1,Unnamed: 369_level_1,Unnamed: 370_level_1,Unnamed: 371_level_1,Unnamed: 372_level_1,Unnamed: 373_level_1,Unnamed: 374_level_1,Unnamed: 375_level_1,Unnamed: 376_level_1,Unnamed: 377_level_1,Unnamed: 378_level_1,Unnamed: 379_level_1,Unnamed: 380_level_1,Unnamed: 381_level_1,Unnamed: 382_level_1,Unnamed: 383_level_1,Unnamed: 384_level_1,Unnamed: 385_level_1,Unnamed: 386_level_1,Unnamed: 387_level_1,Unnamed: 388_level_1,Unnamed: 389_level_1,Unnamed: 390_level_1,Unnamed: 391_level_1,Unnamed: 392_level_1,Unnamed: 393_level_1,Unnamed: 394_level_1,Unnamed: 395_level_1,Unnamed: 396_level_1,Unnamed: 397_level_1,Unnamed: 398_level_1,Unnamed: 399_level_1,Unnamed: 400_level_1,Unnamed: 401_level_1,Unnamed: 402_level_1,Unnamed: 403_level_1,Unnamed: 404_level_1,Unnamed: 405_level_1,Unnamed: 406_level_1,Unnamed: 407_level_1,Unnamed: 408_level_1,Unnamed: 409_level_1,Unnamed: 410_level_1,Unnamed: 411_level_1,Unnamed: 412_level_1,Unnamed: 413_level_1,Unnamed: 414_level_1,Unnamed: 415_level_1,Unnamed: 416_level_1,Unnamed: 417_level_1,Unnamed: 418_level_1,Unnamed: 419_level_1,Unnamed: 420_level_1,Unnamed: 421_level_1,Unnamed: 422_level_1,Unnamed: 423_level_1,Unnamed: 424_level_1,Unnamed: 425_level_1,Unnamed: 426_level_1,Unnamed: 427_level_1,Unnamed: 428_level_1,Unnamed: 429_level_1,Unnamed: 430_level_1,Unnamed: 431_level_1,Unnamed: 432_level_1,Unnamed: 433_level_1,Unnamed: 434_level_1,Unnamed: 435_level_1,Unnamed: 436_level_1,Unnamed: 437_level_1,Unnamed: 438_level_1,Unnamed: 439_level_1,Unnamed: 440_level_1,Unnamed: 441_level_1,Unnamed: 442_level_1,Unnamed: 443_level_1,Unnamed: 444_level_1,Unnamed: 445_level_1,Unnamed: 446_level_1,Unnamed: 447_level_1,Unnamed: 448_level_1,Unnamed: 449_level_1,Unnamed: 450_level_1,Unnamed: 451_level_1,Unnamed: 452_level_1,Unnamed: 453_level_1
chr1,231300,231385,ENSG00000228463.4,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.355,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.283,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.148,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.13,,,,,,,,,
chr1,713663,713839,ENSG00000228327.2,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.318,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
chr1,783033,783186,ENSG00000228794.4,+,,,,,,,,,,,,,,,,,,,,,,,,-0.263,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.315,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
chr1,883510,883612,ENSG00000188976.6,-,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.077,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.514,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
chr1,889383,889462,ENSG00000188976.6,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.587,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
from collections import defaultdict

def format_df_to_trackhub(df, qcat_dict, out_file):
    """
    kind of a messy way to re-format the dataframe
    """
    with open(out_file, 'w') as o:
        count = 1
        progress = tnrange(df.shape[0])
        for col, row in df.iterrows():
            row_str = "{}\t{}\t{}\t".format(row['chrom'], row['start'], row['end'])
            row_str = row_str + 'id:{},qcat:'.format(count)
            qcat_str = ""
            row.sort_values(inplace=True)
            for i in row.index:
                if i in qcat_dict.keys() and not pd.isnull(row[i]): # there is a value
                    
                    qcat_str = qcat_str + '[{},{}], '.format(row[i], qcat_dict[i][0])

            qcat_str = qcat_str[:-2]
            if qcat_str != '':
                o.write(row_str + '[ {} ]\n'.format(qcat_str))
            count += 1
            progress.update(1)
    return 0

def rbp_to_qcat(json_like):
    """
    turns this json like file into a dictionary
    with rbp names as keys and category ID, color as values
    """
    categories = defaultdict(list)
    with open(json_like, 'r') as f:
        for line in f:
            if line.startswith('\t'):
                try:
                    line = line.replace('\'','')
                    category, rbp = line.replace('[','').replace(']','').split(':')
                    rbpname, rbpcolor, _ = rbp.split(',')
                    categories[rbpname] = [int(category.replace('\t','')), rbpcolor]
                except ValueError:
                    print(line)
    return categories

def return_json_id_from_merged_column(column):
    """
    only difference between this and jxc function in the junctioncountsonly notebook is the - and _
    """
    rbp_name, rbp_cell = column.split('_')
    return "{}_{}_01".format(rbp_name, rbp_cell) # we don't care about replicates; rmats is one file per 2 reps



def merged_column_to_qcat_elements(column, qcat_dict):
    current = len(qcat_dict.keys())
    # print(return_json_id_from_merged_column(column))
    values = qcat_dict[
        return_json_id_from_merged_column(column)
    ]
    if values != []:
        return values #[-1, "#0000FF"] # values
    else:
        return [-1, "#0000FF"]
    
json_file = '/home/bay001/projects/encode/analysis/rnaseq_trackhub/combined_10bpfull.datahub.pos'
qcat_dict = rbp_to_qcat(json_file)

	},



In [16]:
# this ensures that a unique identifier will be assigned to any shRNA rnaseq expt not already assigned in clip data.
from collections import defaultdict

colors = sns.color_palette("husl", len(qcat_dict)).as_hex()

new_qcat_dict = defaultdict(list)
counter = len(qcat_dict)
print('total IDs already assigned: {}'.format(counter))
for column in merged.columns:
    if 'HepG2' in column or 'K562' in column:
        exists, qcat_id_color = merged_column_to_qcat_elements(column, qcat_dict)
        if exists == -1:
            counter += 1
            new_qcat_dict[column] = counter, colors[counter-len(qcat_dict)].upper()
        else:
            new_qcat_dict[column] = exists, qcat_id_color

total IDs already assigned: 656


In [17]:
qcat_df = pd.DataFrame(new_qcat_dict).T.reset_index().sort_values(by=0)
qcat_df.head()

Unnamed: 0,index,0,1
170,HNRNPC_HepG2,1,#000000
294,RBFOX2_HepG2,3,#FFFF00
188,IGF2BP1_HepG2,5,#1CE6FF
175,HNRNPK_HepG2,7,#FF34FF
375,SRSF7_HepG2,9,#FF4A46


In [18]:
out_file = '/home/bay001/projects/encode/analysis/rnaseq_trackhub_attempt2/trackhub_merged_from_rmats_nonredundant.jsonlike'
format_df_to_trackhub(merged.reset_index(), new_qcat_dict, out_file)

0

In [19]:
datahub_file = '/home/bay001/projects/encode/analysis/rnaseq_trackhub_attempt2/trackhub_merged_from_rmats_nonredundant.datahub.txt'
with open(datahub_file, 'w') as f:
    f.write('[\n')
    f.write('{\n')
    f.write('type:\'quantitativeCategorySeries\',\n')
    f.write('name:\'test_hub_please_ignore\',\n')
    f.write('height:500,\n')
    f.write('url:\"https://s3-us-west-1.amazonaws.com/washington-university-epigenome-browser-trackhub-test-2/trackhub_merged_from_rmats_nonredundant.sorted.jsonlike.gz\",\n')
    f.write('backgroundcolor:\'#FFFFFF\',\n')
    f.write('mode:\'show\',\n')
    f.write('categories:{\n')
    ### write the actual stuff
    for _, row in qcat_df.iterrows():
        f.write('\t\'{}\':[\'{}\',\'{}\'],\n'.format(
            row[0], row['index'], row[1]
            ))
    f.write('\t},\n')
    f.write('},\n')
    f.write(']')
    

# Track preparation
- http://wiki.wubrowse.org/QuantitativeCategorySeries

In [23]:
sorted_out_file = '/home/bay001/projects/encode/analysis/rnaseq_trackhub_attempt2/trackhub_merged_from_rmats_nonredundant.sorted.jsonlike'
! sort -k1,1 -k2,2n $out_file > $sorted_out_file

In [24]:
! bgzip -f $sorted_out_file

In [25]:
gz = '{}.gz'.format(sorted_out_file)
! tabix -p bed $gz