# This notebook creates the 'cross-correlation of binding' figure for all ENCODE submitted RBPs
- as shown on the website and for supplementary figures
- 'use max' means that when comparing RBP A to RBP B, we are using the max value between the two (if A overlaps B by 50%, but B overlaps A by 25%, then we use 50% to make the A-B comparison). 
- 'use all' means that when comparing RBP A to RBP B, we are using both values without masking the lower overlap.

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
from collections import OrderedDict
import seaborn as sns
import pandas as pd
import os
import glob
from tqdm import tnrange, tqdm_notebook


In [7]:
wd = '/home/bay001/projects/encode/analysis/overlapping_peaks_files/use_max_reiterate'
files = glob.glob(os.path.join(wd,'*.csv'))
print("number of files: ",len(files))

master_table = pd.read_table(
    '/home/bay001/projects/encode/analysis/overlapping_peaks_files/master_list/ENCODE_CLIPperv2_20161120_peaksALLvsALLoverlap.csv',
    index_col=0
)


img_dir = '/home/bay001/projects/encode/analysis/overlapping_peaks_files/use_max_reiterate'

('number of files: ', 0)


# Split eric's table into individual cols

In [8]:
def get_max(df, s1, s2):
    """
    Gets max of A overlapping % B or B overlapping A
    """
    return max(df.loc[s1,s2],df.loc[s2,s1])

def make_individual_files(wd, master_table):
    """
    Creates individual files for each rbp A 
    Containing the max overlap of either:
    A peaks overlapping B peaks, or B peaks overlapping A peaks.
    
    Params:
    Master list from eric containing all overlaps for peaks in all expts.
    
    Returns: 
    Tabbed file
    """
    progress = tnrange(len(master_table.columns))
    for col in master_table.columns: # for each column
        samples = OrderedDict()
        sample = OrderedDict()
        for idx in master_table.loc[col].index: # for each index
            sample[idx] = get_max(master_table, col, idx)
        samples[col] = sample
        df = pd.DataFrame(samples)
        df.to_csv(os.path.join(wd,'{}.csv'.format(col)), sep='\t')
        progress.update(1)
        
def make_individual_files_without_taking_max(wd, master_table):
    """
    Creates individual files for each rbp A 
    Containing the overlap of rbp A peaks to all other rbp peaks
    
    Params:
    Master list from eric containing all overlaps for peaks in all expts.
    
    Returns: 
    Tabbed file 
    """
    for col in master_table.columns: # for each column
        df = pd.DataFrame(master_table[col])
        df.to_csv(os.path.join(wd,'{}.csv'.format(col)), sep='\t')

make_individual_files(wd, master_table)

          362/|/100%|| 362/362 [00:30<00:00, 26.31it/s]

# Plot the individual cols as barcharts using the files made from above.

In [23]:
def get_rep(row):
    return rep(row['intersecting_rbp'])
    
def rep(string):
    print(string)
    return string.split('_')[-1]

def rbp(string):
    parts = string.split('_')
    parts = '{}_{}'.format(parts[2],parts[3])
    return parts

def get_rbp(row): 
    return rbp(row['intersecting_rbp'])

def add_rep_and_rbp_info(df):
    df.columns = ['intersecting_rbp','Rep']
    df['rep'] = df.apply(get_rep,axis=1)
    df['rbp'] = df.apply(get_rbp,axis=1)
    return df

def split_and_groupby_rep(df):
    rep1 = df.groupby(['rep']).get_group('01')
    rep2 = df.groupby(['rep']).get_group('02')
    merged = pd.merge(rep1,rep2,how='left', on='rbp')
    merged.set_index('rbp',inplace=True)
    return merged

def purge_same_rep(df,col):
    """
    Remove the rbp values overlapping with itself 
    (will always be 1 regardless)
    """
    df.loc[df['Unnamed: 0']==col,col] = 0
    return df

def get_avg(row):
    """
    Returns the average between the two reps
    """
    return (row['Rep_x'] + row['Rep_y'])/2.0

def iterate_over_files_and_make_cross_corr_barh_draft1(files):
    """
    Iterates over all files created from either:
        make_individual_files()
        make_individual_files_without_taking_max()
    
    plots the barchart, may be deprecated
    """
    for f in files:
        name = os.path.splitext(os.path.basename(f))[0]
        table = pd.read_table(f)
        df = pd.DataFrame(table)
        return df
        # df = purge_same_rep(df, df.columns[1])
        df = add_rep_and_rbp_info(df)
        df = split_and_groupby_rep(df)
        df['avg'] = df.apply(get_avg,axis=1)
        df.rename(columns={'Rep_x':"Rep 1", "Rep_y":"Rep 2"},inplace=True)
        same_rep_values = df.ix[(rbp(name))]
        df.sort_values(by=['avg'],inplace=True,ascending=False)
        df.drop(rbp(name),inplace=True)
        df.head()
        df2 = pd.DataFrame(same_rep_values).T

        dfx = pd.concat([df2,df])
        # same_rep_values"""
        # df2 = pd.DataFrame(same_rep_values).T
        # df2

        if int(rep(name)) == 1:
            dfx.loc[rbp(name),'Rep 1'] = 0
        else:
            dfx.loc[rbp(name),'Rep 2'] = 0

        dfy = dfx[['Rep 1','Rep 2']].head(25).iloc[::-1]
        dfy.plot(kind="barh", figsize=(10,5), rot=0)
        plt.ylabel('Top 25 concordantly bound RBPs')
        plt.xlabel('Fraction of overlapping peaks')
        # plt.ylim(0,1)
        plt.tight_layout()
        plt.savefig(os.path.join(img_dir,'{}.png'.format(name)))

In [10]:
dm = ['604_01_DEAF1','215_02_TIA1','204_01_RBFOX2']
# rep_master[rep_master['file']=='204_01_RBFOX2'].sort_values(by=['fold-enrichment'], ascending=False).head()

# Amend the barchart

####### I'm realizing now that we did opposite things with the replicates - for example for 204/RBFOX2, my bars are for the RBFOX2 replicates (red = 01, orange = 02); I think you did bars for the individual dataset replicates (your blue = other RBP rep1, your green = other RBP rep2)... in the interest of not making this confusing I'd actually like to try a version that combines both, so basically 
- have 4 bars for 204 vs RBP B: 
    - red = 204_01 vs B_01, 
    - orange = 204_01 vs B_02, 
    - light blue = 204_02 vs B_01, 
    - dark blue = 204_02 vs B_02 
    
    (does this make sense? So individual values are all the same, but now sort Y axis by the mean across all 4 replicate pair comparisons?)


In [15]:
files = glob.glob(os.path.join(wd, '*.csv'))
rbps = []  # the name of all rbps (no reps). Should be 362/2 = 181 total
for f in files:
    name = os.path.splitext(os.path.basename(f))[0]
    rbps.append(rbp(name))
rbps = list(set(rbps))
print("number of rbps: {}".format(len(rbps)))

number of rbps: 181


In [24]:
def concat_reps(df):
    """
    reformat the dataframe so that for one replicate, 
    """
    # df = purge_same_rep(df, df.columns[1])
    df = add_rep_and_rbp_info(df)
    df = split_and_groupby_rep(df)
    df['avg'] = df.apply(get_avg,axis=1)
    df.rename(columns={'Rep_x':"Rep 1", "Rep_y":"Rep 2"},inplace=True)
    df.sort_values(by=['avg'],inplace=True,ascending=False)
    """
    same_rep_values = df.ix[(rbp(name))]
    df.drop(rbp(name),inplace=True)
    df2 = pd.DataFrame(same_rep_values).T

    dfx = pd.concat([df2,df])
    if int(rep(name)) == 1:
        dfx.loc[rbp(name),'Rep 1'] = 0
    else:
        dfx.loc[rbp(name),'Rep 2'] = 0

    return dfx"""
    return df

def join_reps(r1, r2):
    """
    Returns a merged table with left-merge = rep1, right-merge = rep2
    Merges on index names (rbp name)
    """
    name1 = os.path.splitext(os.path.basename(r1))[0]
    name2 = os.path.splitext(os.path.basename(r2))[0]
    df1 = pd.read_table(r1)
    df2 = pd.read_table(r2)
    df1 = concat_reps(df1)
    df2 = concat_reps(df2)
    df1.columns = ["Rep 1"+'-'+col for col in df1.columns]
    df2.columns = ["Rep 2"+'-'+col for col in df2.columns]
    merged = pd.merge(df1, df2, how='left', left_index=True, right_index=True)
    return merged

def get_all_avg(row):
    """
    Returns the average of all relevant rows.
    """
    sum_of_relevant_rows = row['Rep 1-Rep 1']+row['Rep 1-Rep 2']+row['Rep 2-Rep 1']+row['Rep 2-Rep 2']
    return sum_of_relevant_rows/4.0

def plot_barchart(wd, img_dir, rbps):
    """
    Iterate foreach rbp in the folder, plot the barchart
    """
    progress = tnrange(len(rbps))
    for rbp_name in rbps:

        f, ax = plt.subplots()
        # ax.set_xlabel('X LABEL')    

        cols = sns.color_palette("hls", 8)
        # use the rbp name to glob both replicates
        repfiles = glob.glob(os.path.join(wd,'*{}*.csv'.format(rbp_name)))  # get both reps
        repfiles = (sorted(repfiles)) # so the '_01' is always before '_02'
        assert len(repfiles) == 2
        r1 = repfiles[0]
        r2 = repfiles[1]
        r_all = join_reps(r1,r2) # join two reps
        r_all['avg'] = r_all.apply(get_all_avg,axis=1) # get avg
        r_all.sort_values(by=['avg'],inplace=True,ascending=False) # sort by highest average
        r_all_head = r_all.head(25) # get the top 25
        r_all_head = r_all_head[['Rep 1-Rep 1','Rep 1-Rep 2','Rep 2-Rep 1','Rep 2-Rep 2']].iloc[::-1] # subset just these columns
        r_all_head.columns = [rbp_name + " " + c for c in r_all_head.columns] # rename '1' in 1vs2 columns
        r_all_head.columns = [c.replace('-',' - (B) ') for c in r_all_head.columns] # rename the '2' in 1vs2 columns
        r_all_head.index.name = r_all_head.index.name + " (B)" # rename the label to be more clear
        r_all_head['{} Rep 1 - (B) Rep 1'.format(rbp_name)].ix[rbp_name] = 0
        r_all_head['{} Rep 2 - (B) Rep 2'.format(rbp_name)].ix[rbp_name] = 0
        r_all_head['{} Rep 2 - (B) Rep 1'.format(rbp_name)].ix[rbp_name] = 0
        r_all_head.plot(
            kind="barh", 
            figsize=(10,15), 
            rot=0, 
            color=[cols[0],cols[1],cols[4],cols[5]],
            fontsize=12,
            legend=False,
            ax=ax
        ) # color as in the above specs
        ax.set_xlim(0,1)
        vals = ax.get_xticks()
        ax.set_xlabel('Fraction overlap')    
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top') 
        ax.set_xticklabels(['{:3.2f}%'.format(x*100) for x in vals])
        plt.legend(fontsize=14,loc=4)
        plt.tight_layout()
        plt.savefig(os.path.join(img_dir,'{}_RBPconcordancy.png'.format(rbp_name)))
        progress.update(1)

In [None]:
plot_barchart(wd, img_dir, rbps)

203_01_HNRNPC_HepG2_01
203_02_HNRNPC_HepG2_02
204_01_RBFOX2_HepG2_01
204_02_RBFOX2_HepG2_02
205_01_IGF2BP1_HepG2_01
205_02_IGF2BP1_HepG2_02
206_01_HNRNPK_HepG2_01
206_02_HNRNPK_HepG2_02
209_01_SRSF7_HepG2_01
209_02_SRSF7_HepG2_02
211_01_IGF2BP3_HepG2_01
211_02_IGF2BP3_HepG2_02
215_01_TIA1_HepG2_01
215_02_TIA1_HepG2_02
216_01_SRSF9_HepG2_01
216_02_SRSF9_HepG2_02
218_01_TRA2A_HepG2_01
218_02_TRA2A_HepG2_02
220_01_IGF2BP1_K562_01
220_02_IGF2BP1_K562_02
222_01_HNRNPM_HepG2_01
222_02_HNRNPM_HepG2_02
223_01_FKBP4_HepG2_01
223_02_FKBP4_HepG2_02
224_01_HNRNPM_K562_01
224_02_HNRNPM_K562_02
226_01_IGF2BP2_K562_01
226_02_IGF2BP2_K562_02
227_01_SLTM_HepG2_01
227_02_SLTM_HepG2_02
228_01_SF3B4_HepG2_01
228_02_SF3B4_HepG2_02
230_01_BCCIP_HepG2_01
230_02_BCCIP_HepG2_02
235x4000_01_XRN2_HepG2_01
235x4000_02_XRN2_HepG2_02
236_01_FMR1_K562_01
236_02_FMR1_K562_02
237_01_FXR2_K562_01
237_02_FXR2_K562_02
240_01_TRA2A_K562_01
240_02_TRA2A_K562_02
241_01_DDX42_K562_01
241_02_DDX42_K562_02
242_01_U2AF2_K562_01

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


203_01_HNRNPC_HepG2_01
203_02_HNRNPC_HepG2_02
204_01_RBFOX2_HepG2_01
204_02_RBFOX2_HepG2_02
205_01_IGF2BP1_HepG2_01
205_02_IGF2BP1_HepG2_02
206_01_HNRNPK_HepG2_01
206_02_HNRNPK_HepG2_02
209_01_SRSF7_HepG2_01
209_02_SRSF7_HepG2_02
211_01_IGF2BP3_HepG2_01
211_02_IGF2BP3_HepG2_02
215_01_TIA1_HepG2_01
215_02_TIA1_HepG2_02
216_01_SRSF9_HepG2_01
216_02_SRSF9_HepG2_02
218_01_TRA2A_HepG2_01
218_02_TRA2A_HepG2_02
220_01_IGF2BP1_K562_01
220_02_IGF2BP1_K562_02
222_01_HNRNPM_HepG2_01
222_02_HNRNPM_HepG2_02
223_01_FKBP4_HepG2_01
223_02_FKBP4_HepG2_02
224_01_HNRNPM_K562_01
224_02_HNRNPM_K562_02
226_01_IGF2BP2_K562_01
226_02_IGF2BP2_K562_02
227_01_SLTM_HepG2_01
227_02_SLTM_HepG2_02
228_01_SF3B4_HepG2_01
228_02_SF3B4_HepG2_02
230_01_BCCIP_HepG2_01
230_02_BCCIP_HepG2_02
235x4000_01_XRN2_HepG2_01
235x4000_02_XRN2_HepG2_02
236_01_FMR1_K562_01
236_02_FMR1_K562_02
237_01_FXR2_K562_01
237_02_FXR2_K562_02
240_01_TRA2A_K562_01
240_02_TRA2A_K562_02
241_01_DDX42_K562_01
241_02_DDX42_K562_02
242_01_U2AF2_K562_01

272_01_U2AF2_HepG2_01
272_02_U2AF2_HepG2_02
275_01_CSTF2T_K562_01
275_02_CSTF2T_K562_02
278_01_PRPF8_K562_01
278_02_PRPF8_K562_02
279_01_FAM120A_K562_01
279_02_FAM120A_K562_02
280_01_AUH_HepG2_01
280_02_AUH_HepG2_02
281_01_HNRNPU_HepG2_01
281_02_HNRNPU_HepG2_02
282_01_U2AF1_HepG2_01
282_02_U2AF1_HepG2_02
283_01_HNRNPA1_HepG2_01
283_02_HNRNPA1_HepG2_02
285_01_TIA1_K562_01
285_02_TIA1_K562_02
289_01_HNRNPUL1_K562_01
289_02_HNRNPUL1_K562_02
291_01_PRPF8_HepG2_01
291_02_PRPF8_HepG2_02
292_01_CSTF2T_HepG2_01
292_02_CSTF2T_HepG2_02
297_01_FAM120A_HepG2_01
297_02_FAM120A_HepG2_02
298_01_LARP7_HepG2_01
298_02_LARP7_HepG2_02
301_01_SRSF1_HepG2_01
301_02_SRSF1_HepG2_02
302_01_SND1_HepG2_01
302_02_SND1_HepG2_02
311_01_XRCC6_HepG2_01
311_02_XRCC6_HepG2_02
312_01_SRSF1_K562_01
312_02_SRSF1_K562_02
315_01_FXR1_K562_01
315_02_FXR1_K562_02
316_01_SND1_K562_01
316_02_SND1_K562_02
321_01_HNRNPUL1_HepG2_01
321_02_HNRNPUL1_HepG2_02
325_01_LIN28B_K562_01
325_02_LIN28B_K562_02
326_01_HNRNPK_K562_01
326_02_H

617_02_TBRG4_HepG2_02
624_01_AARS_K562_01
624_02_AARS_K562_02
625_01_EIF3D_HepG2_01
625_02_EIF3D_HepG2_02
626_01_EIF4B_HepG2_01
626_02_EIF4B_HepG2_02
628_01_GRWD1_HepG2_01
628_02_GRWD1_HepG2_02
629_01_UCHL5_HepG2_01
629_02_UCHL5_HepG2_02
631_01_EFTUD2_HepG2_01
631_02_EFTUD2_HepG2_02
632x_01_SUB1_HepG2_01
632x_02_SUB1_HepG2_02
641_01_ZC3H11A_HepG2_01
641_02_ZC3H11A_HepG2_02
649_01_FASTKD2_HepG2_01
649_02_FASTKD2_HepG2_02
650_01_SUPV3L1_HepG2_01
650_02_SUPV3L1_HepG2_02
652_01_SUGP2_HepG2_01
652_02_SUGP2_HepG2_02
654_01_DHX30_HepG2_01
654_02_DHX30_HepG2_02
655_01_XPO5_HepG2_01
655_02_XPO5_HepG2_02
658_01_RPS5_HepG2_01
658_02_RPS5_HepG2_02
668_01_PUS1_K562_01
668_02_PUS1_K562_02
676_01_RBFOX2_K562_01
676_02_RBFOX2_K562_02
677_01_GEMIN5_K562_01
677_02_GEMIN5_K562_02
678_01_HNRNPL_HepG2_01
678_02_HNRNPL_HepG2_02
679_01_RPL23A_HepG2_01
679_02_RPL23A_HepG2_02
682_01_DHX30_K562_01
682_02_DHX30_K562_02
684_01_ZC3H11A_K562_01
684_02_ZC3H11A_K562_02
693_01_SERBP1_K562_01
693_02_SERBP1_K562_02
695_

595_01_DDX55_HepG2_01
595_02_DDX55_HepG2_02
596_01_GRSF1_HepG2_01
596_02_GRSF1_HepG2_02
603_01_DDX59_HepG2_01
603_02_DDX59_HepG2_02
610_01_UCHL5_K562_01
610_02_UCHL5_K562_02
614_01_RPS11_K562_01
614_02_RPS11_K562_02
617_01_TBRG4_HepG2_01
617_02_TBRG4_HepG2_02
624_01_AARS_K562_01
624_02_AARS_K562_02
625_01_EIF3D_HepG2_01
625_02_EIF3D_HepG2_02
626_01_EIF4B_HepG2_01
626_02_EIF4B_HepG2_02
628_01_GRWD1_HepG2_01
628_02_GRWD1_HepG2_02
629_01_UCHL5_HepG2_01
629_02_UCHL5_HepG2_02
631_01_EFTUD2_HepG2_01
631_02_EFTUD2_HepG2_02
632x_01_SUB1_HepG2_01
632x_02_SUB1_HepG2_02
641_01_ZC3H11A_HepG2_01
641_02_ZC3H11A_HepG2_02
649_01_FASTKD2_HepG2_01
649_02_FASTKD2_HepG2_02
650_01_SUPV3L1_HepG2_01
650_02_SUPV3L1_HepG2_02
652_01_SUGP2_HepG2_01
652_02_SUGP2_HepG2_02
654_01_DHX30_HepG2_01
654_02_DHX30_HepG2_02
655_01_XPO5_HepG2_01
655_02_XPO5_HepG2_02
658_01_RPS5_HepG2_01
658_02_RPS5_HepG2_02
668_01_PUS1_K562_01
668_02_PUS1_K562_02
676_01_RBFOX2_K562_01
676_02_RBFOX2_K562_02
677_01_GEMIN5_K562_01
677_02_GEMIN

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
    return f(*args, **kwargs)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/IPython/core/ultratb.py", line 358, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/inspect.py", line 1048, in getinnerframes
    framelist.append((tb.tb_frame,) + getframeinfo(tb, context))
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/inspect.py", line 1012, in getframeinfo
    lines, lnum = findsource(frame)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/IPython/core/ultratb.py", line 182, in find

IndexError: string index out of range

ERROR:tornado.general:Uncaught exception, closing connection.
Traceback (most recent call last):
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/bay001/anaconda2/envs/brian/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_ex

In [None]:
plt.xticks([x for x in plt.xticks()])
# '{:3.2f}%'.format(x*100)

In [None]:
col = '492_01_DDX3X_HepG2_01'
row = '626_02_EIF4B_HepG2_02'
print(master_table[col].ix[row])
print(master_table[row].ix[col])

In [None]:
def print_all_columns(substr, table=master_table):
    for col in table.columns:
        if substr in col:
            print(col)
            
print_all_columns('SBDS')

In [None]:
igf2bp1_test = rep_master[
    (rep_master['file']=='220_01_IGF2BP1')#  |
    # (rep_master['file']=='220_02_IGF2BP1')
]

igf2bp1_test[['information content']].sort_values(by=['information content'],ascending=False)


In [None]:
master_table.ix['352_01_SBDS_K562_01']