# Calculates dpsi from the average psi of each experiment's replicates and the average psi of its matched control replicates (the code below computes control minus experiment).
- Alternative: maybe just use the rMATS outputs from Xintao instead.

In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import glob
from rnaseq import rmats_inclevel_analysis as rmats
import os
from tqdm import tnrange, tqdm_notebook

In [3]:
def get_reps_and_return_dpsi(psi_val_file_dict):
    """
    Run the whole psi -> dpsi pipeline for one experiment.

    psi_val_file_dict maps the keys 'control_rep1', 'control_rep2',
    'expt_rep1' and 'expt_rep2' to psi value files.

    Steps:
    1. read each of the four psi files (read_psi)
    2. average psi over the two replicates of each condition (get_avg_psi)
    3. combine control and expt averages into dpsi (get_dpsi, which
       computes control minus expt)

    Returns whatever get_dpsi returns.
    """
    # seven progress ticks: four reads, two averages, one dpsi
    progress = tnrange(7, desc='outer loop', leave=False)

    replicate_keys = ('control_rep1', 'control_rep2', 'expt_rep1', 'expt_rep2')
    frames = {}
    for key in replicate_keys:
        frames[key] = read_psi(psi_val_file_dict[key])
        progress.update(1)

    control_df = get_avg_psi(
        frames['control_rep1'], frames['control_rep2'], 'control'
    )
    progress.update(1)

    expt_df = get_avg_psi(frames['expt_rep1'], frames['expt_rep2'], 'expt')
    progress.update(1)

    dpsi = get_dpsi(control_df, expt_df)
    progress.update(1)
    return dpsi

# Column names assigned to the headerless psi value files.
DFHEAD = ['bam', 'psi', 'jxc', 'exon']


def read_psi(f):
    """
    Reads in a file generated from a hack of eric's script:
    see (in the 'from_eric/' folder):
    calculate_psi_for_gencode_exons_manifest2_justpsi_flip_loop.pl

    The file is read as a headerless tab-separated table with the columns
    named in DFHEAD. Fully identical rows are dropped. If any 'exon' value
    still occurs more than once, the first few offending rows and a
    warning are printed; the rows are kept.

    Returns the de-duplicated DataFrame.
    """
    df = pd.read_table(f, names=DFHEAD)
    df.drop_duplicates(inplace=True)

    # Exact duplicates are already gone, so df.duplicated() would be all
    # False here; flag rows that share an 'exon' value instead.
    shared_exon = df.duplicated('exon', keep=False)
    if shared_exon.any():
        print(df[shared_exon].head())
        print("Warning: {} has multiple exon triplets for a single jxc region".format(f))
    return df

def get_avg_psi(df_rep1, df_rep2, prefix='control'):
    """
    Average the psi of two replicate tables, joined on 'exon'.

    df_rep1 / df_rep2: four-column tables in (bam, psi, jxc, exon) order,
        as produced by read_psi. The inputs are not modified; relabelling
        happens on copies.
    prefix: label placed in front of every replicate column name
        (e.g. 'control' gives 'control_rep1psi', 'control_rep2psi', ...).

    The replicates are outer-joined on 'exon', so an exon present in only
    one replicate is kept; every missing value (psi, but also bam and jxc)
    is filled with 0 before averaging.

    Returns the merged table with an added '<prefix>_avg_psi' column.
    """
    def labelled(df, rep):
        # copy so the caller's frame keeps its original column names
        out = df.copy()
        out.columns = [
            '{}_rep{}bam'.format(prefix, rep),
            '{}_rep{}psi'.format(prefix, rep),
            '{}_rep{}jxc'.format(prefix, rep),
            'exon',
        ]
        return out

    rep1 = labelled(df_rep1, 1)
    rep2 = labelled(df_rep2, 2)

    # outer join on the exon; exons seen in only one replicate get 0s
    merged = pd.merge(rep1, rep2, how='outer', on='exon')
    merged = merged.fillna(0)

    merged['{}_avg_psi'.format(prefix)] = (
        merged['{}_rep1psi'.format(prefix)] +
        merged['{}_rep2psi'.format(prefix)]) / 2
    return merged

def get_dpsi(control_df, expt_df):
    """
    Compute dpsi = control average psi - expt (knockdown) average psi.

    (-) exon is more included in the knockdown expt
    (+) exon is more included in the control

    The join is a left join from the expt table, so the result is
    experiment-centric: exons that only appear in the control table are
    not carried over. Missing values after the join are filled with 0.

    Only the middle exon coordinates (second '|'-separated field of
    'exon', split on '-') are reported. Two different triplets can share
    a middle exon, so the output may contain the same exon more than once
    with different dpsi values.

    SEE: prioritize duplicates and drop function, keep only lowest dpsi

    Returns a table with the columns
    chrom, middle_exon_start, middle_exon_end, dpsi, strand.
    """
    progress = tnrange(7, desc='get dpsi', leave=False)

    joined = pd.merge(expt_df, control_df, how='left', on='exon')
    joined = joined.fillna(0)
    progress.update(1)

    joined['dpsi'] = joined['control_avg_psi'] - joined['expt_avg_psi']
    progress.update(1)

    # the middle exon is "start-end"; position 0 is start, 1 is end
    for column, position in (('middle_exon_start', 0), ('middle_exon_end', 1)):
        joined[column] = joined['exon'].apply(
            lambda exon, position=position: exon.split('|')[1].split('-')[position]
        )
        progress.update(1)

    joined['chrom'] = joined.apply(get_chrom, axis=1)
    progress.update(1)

    joined['strand'] = joined.apply(get_strand, axis=1)
    progress.update(1)

    output_columns = ['chrom', 'middle_exon_start', 'middle_exon_end', 'dpsi', 'strand']
    joined = joined[output_columns]
    progress.update(1)
    return joined

def get_strand(row):
    """
    Return the strand (second ':'-separated field) of the row's jxc value,
    which has the form chrom:strand:upstream:middle:downstream.

    'expt_rep1jxc' is used unless it equals 0, in which case
    'expt_rep2jxc' is tried. If both are 0 the row is printed and 0 is
    returned.
    """
    for key in ('expt_rep1jxc', 'expt_rep2jxc'):
        jxc = row[key]
        if jxc != 0:
            return jxc.split(':')[1]
    # neither replicate carries a jxc string
    print(row)
    return 0
    
def get_chrom(row):
    """
    Return the chrom (first ':'-separated field) of the row's jxc value,
    which has the form chrom:strand:upstream:middle:downstream.

    'expt_rep1jxc' is used unless it equals 0, in which case
    'expt_rep2jxc' is tried. If both are 0 the row is printed and 0 is
    returned.
    """
    for key in ('expt_rep1jxc', 'expt_rep2jxc'):
        jxc = row[key]
        if jxc != 0:
            return jxc.split(':')[0]
    # neither replicate carries a jxc string
    print(row)
    return 0

In [4]:
def get_rep_and_control(manifest_file, rbp_name):
    """
    given a manifest file name and an RBP name, 
    return the associated BAM files for control and expt (rep1 and 2).

    manifest_file: tab-separated table with a 'name' column (used as the
        index) and the columns control_rep1, control_rep2, expt_rep1,
        expt_rep2.
    rbp_name: value of 'name' to look up; a missing name raises KeyError.

    Returns a dict keyed by those four column names.
    """
    # read the file that was passed in rather than a same-named global
    df = pd.read_table(manifest_file, index_col='name')
    # label-based lookup; .loc replaces the removed .ix indexer
    row = df.loc[rbp_name]
    return {
        key: row[key]
        for key in ('control_rep1', 'control_rep2', 'expt_rep1', 'expt_rep2')
    }

def get_psi_value_files(manifest_file, psi_dir, rbp_name):
    """
    given a psi value directory and an RBP name,
    return the associated psi value files for control and expt (rep1 and 2).

    The BAM names come from get_rep_and_control(manifest_file, rbp_name);
    each psi file path is psi_dir joined with the BAM name in which '.bam'
    has been replaced by '.txt'.

    Returns a dict with the same keys as get_rep_and_control.
    """
    bam_files = get_rep_and_control(manifest_file, rbp_name)
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    return {
        key: os.path.join(psi_dir, bam_file.replace('.bam', '.txt'))
        for key, bam_file in bam_files.items()
    }

In [None]:
# Paths used by the cells below: the manifest passed to get_psi_value_files,
# the directory holding the psi value files, and the directory the per-RBP
# dpsi tables are written to.
manifest = '/projects/ps-yeolab3/encode/hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt'
psi_dir = '/projects/ps-yeolab3/bay001/tmp/psi/'
out_dir = '/projects/ps-yeolab3/bay001/tmp/dpsi_0/'
# every *.txt file found in psi_dir (not referenced again in the visible cells)
psi_vals = glob.glob(os.path.join(psi_dir, "*.txt"))

In [None]:
# Single-RBP run: look up the four psi value files for RBFOX2 and compute
# its dpsi table.
rbfox2 = get_psi_value_files(manifest, psi_dir, 'RBFOX2')
df = get_reps_and_return_dpsi(rbfox2)

In [None]:
# Compute and save a dpsi table for every name listed in the manifest.
manifest_df = pd.read_table(manifest)
all_rbps = manifest_df['name']
progress = tnrange(len(all_rbps))
for name in all_rbps:
    files = get_psi_value_files(manifest, psi_dir, name)
    df = get_reps_and_return_dpsi(files)
    # df.set_index(['chrom','middle_exon_start','middle_exon_end','strand'], inplace=True)
    # written without a header row; the row index is not suppressed, so it is
    # presumably the leading 'old_idx' column of DPSI_FILE_HEADER - confirm
    df.to_csv(os.path.join(out_dir, '{}.txt'.format(name)), sep='\t', header=False)
    progress.update(1)

# Join dataframes

In [5]:
# dpsi_dir = '/projects/ps-yeolab3/bay001/tmp/dpsi_0/'
# Collect every *.txt file in out_dir (the per-RBP dpsi tables are written there).
dpsi_files = glob.glob(os.path.join(out_dir, '*.txt'))
dpsi_files[0]  # bare expression: the notebook displays the first path

'/projects/ps-yeolab3/bay001/tmp/dpsi_0/SRFBP1.txt'

In [25]:
def concat_index(row):
    """
    Build the 'chrom:start:end:strand' identifier string for one row.
    """
    fields = ('chrom', 'start', 'end', 'strand')
    return ':'.join('{}'.format(row[name]) for name in fields)

def set_index_and_remove_cols(df):
    """
    Re-index a dpsi table by its 'chrom:start:end:strand' string.

    Works in place on df: drops 'old_idx', adds 'new_idx' (built per row
    by concat_index), drops the four coordinate columns and makes
    'new_idx' the index. The same (modified) df is returned.
    """
    df.drop('old_idx', axis=1, inplace=True)
    df['new_idx'] = df.apply(concat_index, axis=1)
    # the coordinates now live in new_idx, so the separate columns can go
    df.drop(['chrom', 'start', 'end', 'strand'], axis=1, inplace=True)
    df.set_index('new_idx', inplace=True)
    return df

def prioritize_duplicates_and_remove(df):
    """
    Keep one row per exon, preferring the lowest dpsi.

    Rows whose dpsi is exactly 0 are removed, the rest are sorted by
    ascending dpsi, and for rows sharing (chrom, start, end, strand) only
    the first (lowest dpsi) is kept.

    Returns a new DataFrame; the input is left untouched.
    """
    # Chained, non-inplace calls: modifying the filtered slice in place is
    # what raised pandas' "value is trying to be set on a copy" warning.
    nonzero = df[df['dpsi'] != 0]
    ordered = nonzero.sort_values(by=['dpsi'], ascending=True)
    return ordered.drop_duplicates(['chrom', 'start', 'end', 'strand'])

In [33]:
# Column names assigned to the headerless per-RBP dpsi files.
DPSI_FILE_HEADER = ['old_idx','chrom','start','end','dpsi','strand']


def join_all_dpsi_values(dpsi_files, cell_type):
    """
    Outer-join the dpsi values of every file into one wide table.

    Each file is read with DPSI_FILE_HEADER, reduced with
    prioritize_duplicates_and_remove, re-indexed with
    set_index_and_remove_cols, and its single remaining column is named
    after the file: the basename with '.txt' replaced by '_<cell_type>'.

    Returns the merged table (index: the per-exon id string, one column
    per input file). dpsi_files must contain at least one path.
    """
    progress = tnrange(len(dpsi_files))
    suffix = '_{}'.format(cell_type)

    def load_column(path):
        # one file -> single-column frame labelled with the rbp/cell type
        table = pd.read_table(path, names=DPSI_FILE_HEADER)
        table = prioritize_duplicates_and_remove(table)
        table = set_index_and_remove_cols(table)
        table.columns = [os.path.basename(path).replace('.txt', suffix)]
        return table

    # the first file seeds the table, the remaining ones are joined onto it
    merged = load_column(dpsi_files[0])
    progress.update(1)

    for dpsi_file in dpsi_files[1:]:
        merged = pd.merge(
            merged,
            load_column(dpsi_file),
            how='outer',
            left_index=True,
            right_index=True,
        )
        progress.update(1)
    return merged

# Join every per-RBP dpsi table into one wide table; columns get the '_HepG2' suffix.
merged = join_all_dpsi_values(dpsi_files, 'HepG2')

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [34]:
# Save the merged table into out_dir; `dpsi_dir` only exists in a
# commented-out line, so referencing it here raised NameError.
merged.to_csv(os.path.join(out_dir, 'MERGED_DPSI.tsv'), sep='\t')

# Use merged dataframe to create trackhub

In [89]:
from collections import defaultdict

def format_df_to_trackhub(df, qcat_dict, out_file):
    """
    Write df to out_file, one text line per row, in the form

        chrom<TAB>start<TAB>end<TAB>id:<n>,qcat:[ [cat,value], [cat,value] ]

    df: table with 'chrom', 'start' and 'end' columns plus value columns.
    qcat_dict: maps a value column name to a list whose first element is
        the category id written for that column.
    out_file: path of the file to (over)write.

    <n> is the running row number starting at 0. A [cat,value] pair is
    emitted for every column that is a key of qcat_dict and whose value
    in the row is not null; other columns are ignored.

    Returns 0.
    """
    # the context manager closes the file even if a row fails to format
    with open(out_file, 'w') as handle:
        progress = tnrange(df.shape[0])
        for count, (_, row) in enumerate(df.iterrows()):
            pairs = [
                '[{},{}]'.format(qcat_dict[column][0], row[column])
                for column in row.index
                if column in qcat_dict and not pd.isnull(row[column])
            ]
            handle.write('{}\t{}\t{}\tid:{},qcat:[ {} ]\n'.format(
                row['chrom'], row['start'], row['end'], count, ', '.join(pairs)
            ))
            progress.update(1)
    return 0

def rbp_to_qcat(json_like):
    """
    Parse the tab-indented lines of a json-like file into a
    defaultdict(list) mapping rbp name -> [category id (int), color].

    Only lines starting with a tab are considered. Single quotes and
    square brackets are stripped, the line is split into category and
    rbp part on ':', and the rbp part must split on ',' into exactly
    three pieces (name, color, remainder). Lines that do not fit this
    shape are printed (with quotes already stripped) and skipped.
    """
    categories = defaultdict(list)
    with open(json_like, 'r') as handle:
        for raw in handle:
            if not raw.startswith('\t'):
                continue
            line = raw.replace('\'', '')
            try:
                stripped = line.replace('[', '').replace(']', '')
                category, rbp = stripped.split(':')
                rbpname, rbpcolor, _ = rbp.split(',')
                category_id = int(category.replace('\t', ''))
                categories[rbpname] = [category_id, rbpcolor]
            except ValueError:
                # wrong number of fields or a non-integer category
                print(line)
    return categories

def return_json_id_from_merged_column(column):
    """
    Turn a merged-table column name '<rbp>_<cell>' into the json id
    '<rbp>_<cell>_01'.

    The column must contain exactly one '_'; otherwise the unpacking
    raises ValueError. Replicates are not distinguished (rmats is one
    file per 2 reps), hence the fixed '01'.

    only difference between this and jxc function in the junctioncountsonly notebook is the - and _
    """
    pieces = column.split('_')
    rbp_name, rbp_cell = pieces
    return '_'.join((rbp_name, rbp_cell, '01'))

def merged_column_to_qcat_elements(column, qcat_dict):
    """
    Look up the [category id, color] entry for a merged-table column.

    The column name is converted with return_json_id_from_merged_column
    and looked up in qcat_dict. A non-empty entry is returned together
    with the number of keys qcat_dict had before the lookup. An empty
    entry yields a new id (that key count + 1) with the default blue
    color, returned together with the new id.

    NOTE: the size is taken before the lookup. When qcat_dict is a
    defaultdict(list) (as returned by rbp_to_qcat), looking up an unknown
    id inserts it, so the next call sees a larger size.
    """
    known = len(qcat_dict.keys())
    json_id = return_json_id_from_merged_column(column)
    entry = qcat_dict[json_id]
    if entry == []:
        fresh_id = known + 1
        return [fresh_id, "#0000FF"], fresh_id # default blue
    return entry, known
    
# Load the rbp name -> [category id, color] mapping from the datahub file.
json_file = '/home/bay001/projects/encode/analysis/rnaseq_trackhub/combined_10bpfull.datahub.pos'
qcat_dict = rbp_to_qcat(json_file)
# format_df_to_trackhub(trunc, '/home/bay001/test.bed') # .to_csv('/home/bay001/test.bed', sep='\t', header=False)

	},



In [44]:
# Turn the 'new_idx' index back into a column and split its first three
# ':'-separated fields into chrom / start / end columns (kept as strings).
out_file = '/projects/ps-yeolab3/bay001/tmp/dpsi_0/TRACKHUB.ALL'
merged_reset_idx = merged.reset_index()
merged_reset_idx['chrom'] = merged_reset_idx['new_idx'].apply(lambda x: x.split(':')[0])
merged_reset_idx['start'] = merged_reset_idx['new_idx'].apply(lambda x: x.split(':')[1])
merged_reset_idx['end'] = merged_reset_idx['new_idx'].apply(lambda x: x.split(':')[2])

# format_df_to_trackhub(merged_reset_idx, out_file)

In [85]:
# this ensures that a unique identifier will be assigned to any shRNA rnaseq expt not already assigned in clip data.
new_qcat_dict = {}
count = len(qcat_dict)
for column in merged_reset_idx.columns:
    
    if 'HepG2' in column:
        qcat_id_color, count = merged_column_to_qcat_elements(column, qcat_dict)
        new_qcat_dict[column] = [qcat_id_color[0], qcat_id_color[1]]

In [90]:
# Write the trackhub text file to out_file (the call returns 0).
format_df_to_trackhub(merged_reset_idx, new_qcat_dict, out_file)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


0

# Write datahub

In [86]:
# bare expression: the notebook displays the column -> [category id, color] mapping
new_qcat_dict

{'AARS_HepG2': [499, '#0000FF'],
 'AATF_HepG2': [382, '#0000FF'],
 'ABCF1_HepG2': [496, '#0000FF'],
 'ACO1_HepG2': [504, '#0000FF'],
 'ADAR_HepG2': [500, '#0000FF'],
 'AGO1_HepG2': [425, '#0000FF'],
 'AKAP1_HepG2': [414, '#0000FF'],
 'AKAP8L_HepG2': [392, '#0000FF'],
 'AKAP8_HepG2': [416, '#0000FF'],
 'APOBEC3C_HepG2': [369, '#0000FF'],
 'ASCC1_HepG2': [495, '#0000FF'],
 'ATP5C1_HepG2': [374, '#0000FF'],
 'AUH_HepG2': [75, '#4A3B53'],
 'BCCIP_HepG2': [33, '#8FB0FF'],
 'BCLAF1_HepG2': [395, '#0000FF'],
 'BOP1_HepG2': [245, '#6367A9'],
 'BUD13_HepG2': [215, '#C8A1A1'],
 'CALR_HepG2': [372, '#0000FF'],
 'CCAR1_HepG2': [480, '#0000FF'],
 'CCAR2_HepG2': [389, '#0000FF'],
 'CDC40_HepG2': [247, '#549E79'],
 'CEBPZ_HepG2': [428, '#0000FF'],
 'CELF1_HepG2': [479, '#0000FF'],
 'CIRBP_HepG2': [403, '#0000FF'],
 'CKAP4_HepG2': [384, '#0000FF'],
 'CNOT7_HepG2': [376, '#0000FF'],
 'CPSF6_HepG2': [373, '#0000FF'],
 'CPSF7_HepG2': [408, '#0000FF'],
 'CSTF2T_HepG2': [87, '#FF90C9'],
 'CSTF2_HepG2': [32

In [92]:
# Write the datahub description: a fixed header followed by one
# tab-indented category line per entry of new_qcat_dict, in the form
#     '<category id>':['<column name>','<color>'],
datahub_file = '/projects/ps-yeolab3/bay001/tmp/dpsi_0/DATAHUB'
with open(datahub_file, 'w') as f:
    f.write('[\n')
    f.write('{\n')
    f.write('type:\'quantitativeCategorySeries\',\n')
    f.write('name:\'test_hub_please_ignore\',\n')
    f.write('height:500,\n')
    f.write('url:\"https://google.com\",\n')
    # the closing quote after the color value was missing
    f.write('backgroundcolor:\'#FFFFFF\',\n')
    f.write('mode:\'show\',\n')
    f.write('categories:{\n')
    ### write the actual stuff
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    for rbp_name, values in new_qcat_dict.items():
        # trailing comma separates the entries; rbp_to_qcat also expects
        # each category line to split on ',' into three pieces
        f.write('\t\'{}\':[\'{}\',\'{}\'],\n'.format(
            values[0], rbp_name, values[1]
            ))
    f.write('\t},\n')
    f.write('},\n')
    f.write(']')
    