# Check that we have downloaded shRNA BAM and rMATS files for each replicate

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tnrange, tqdm_notebook
from IPython.core.display import display, HTML
from collections import defaultdict

In [2]:
fns = glob.glob('/projects/ps-yeolab3/encode/rnaseq/differential_expression/DESeq_output/current/*DESeq_output.txt')
for fn in fns:
    if os.path.basename(fn).startswith('HepG2') or os.path.basename(fn).startswith('K562'):
        pass
    else:
        ! rm $fn

In [3]:
from_xintao_20180329 = pd.read_table(
    '/projects/ps-yeolab3/encode/rnaseq/HepG2_fileID_table_set39.20180329.txt'
)
print(from_xintao_20180329.columns)

from_xintao_20171211 = pd.read_table(
    '/projects/ps-yeolab3/encode/rnaseq/HepG2_fileID_table.20171211.txt',
    comment='#'
)
print(from_xintao_20171211.columns)

Index([u'Experiment_ID', u'RBP', u'Cell_line', u'Replicate', u'FASTQ_R1',
       u'FASTQ_R2', u'BAM', u'TSV'],
      dtype='object')
Index([u'Experiment_ID', u'RBP', u'Cell_line', u'Replicate', u'FASTQ_R1',
       u'FASTQ_R2', u'BAM', u'TSV'],
      dtype='object')


In [4]:
rnaseq_dir = '/projects/ps-yeolab3/encode/rnaseq/'
deseq_dir = '/projects/ps-yeolab3/encode/rnaseq/differential_expression/DESeq_output/current/'
output_dir = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown_graveley_tophat'


In [5]:
download_file = os.path.join(rnaseq_dir, 'HepG2_fileID_table.20171211.sh')
with open(download_file, 'w') as o:
    for bam in from_xintao_20171211['BAM']:
        target_file = os.path.join(output_dir, bam + '.bam')
        if not os.path.exists(target_file):
            o.write('wget https://www.encodeproject.org/files/{}/@@download/{}.bam -O {}\n'.format(bam, bam, target_file))

In [6]:
download_file = os.path.join(rnaseq_dir, 'HepG2_fileID_table_set39.20180329.sh')
with open(download_file, 'w') as o:
    for bam in from_xintao_20180329['BAM']:
        target_file = os.path.join(output_dir, bam + '.bam')
        if not os.path.exists(target_file):
            o.write('wget https://www.encodeproject.org/files/{}/@@download/{}.bam -O {}\n'.format(bam, bam, target_file))

# K562 now

In [7]:
from_xintao_20180329 = pd.read_table(
    os.path.join(rnaseq_dir, 'K562_fileID_table_set39.20180329.txt'),
    comment='#'
)
print(from_xintao_20180329.columns)

from_xintao_20171211 = pd.read_table(
    os.path.join(rnaseq_dir, 'K562_fileID_table.20171211.txt'),
    comment='#'
)
print(from_xintao_20171211.columns)

Index([u'Experiment_ID', u'RBP', u'Cell_line', u'Replicate', u'FASTQ_R1',
       u'FASTQ_R2', u'BAM', u'TSV'],
      dtype='object')
Index([u'Experiment_ID', u'RBP', u'Cell_line', u'Replicate', u'FASTQ_R1',
       u'FASTQ_R2', u'BAM', u'TSV'],
      dtype='object')


In [8]:
from_xintao_20171211[from_xintao_20171211['RBP']=='SRSF9']

Unnamed: 0,Experiment_ID,RBP,Cell_line,Replicate,FASTQ_R1,FASTQ_R2,BAM,TSV


In [9]:
download_file = os.path.join(rnaseq_dir, 'K562_fileID_table.20171211.sh')
with open(download_file, 'w') as o:
    for bam in from_xintao_20171211['BAM']:
        target_file = os.path.join(output_dir, bam + '.bam')
        if not os.path.exists(target_file):
            o.write('wget https://www.encodeproject.org/files/{}/@@download/{}.bam -O {}\n'.format(bam, bam, target_file))

In [10]:
download_file = os.path.join(rnaseq_dir, 'K562_fileID_table_set39.20180329.sh')
with open(download_file, 'w') as o:
    for bam in from_xintao_20180329['BAM']:
        target_file = os.path.join(output_dir, bam + '.bam')
        if not os.path.exists(target_file):
            o.write('wget https://www.encodeproject.org/files/{}/@@download/{}.bam -O {}\n'.format(bam, bam, target_file))

# Check md5sums according to ENCODE

In [11]:
hepg2_from_xintao_20180329 = pd.read_table(
    os.path.join(rnaseq_dir, 'HepG2_fileID_table_set39.20180329.txt')
)
hepg2_from_xintao_20171211 = pd.read_table(
    os.path.join(rnaseq_dir, 'HepG2_fileID_table.20171211.txt'),
    comment='#'
)
k562_from_xintao_20180329 = pd.read_table(
    os.path.join(rnaseq_dir, 'K562_fileID_table_set39.20180329.txt')
)
k562_from_xintao_20171211 = pd.read_table(
    os.path.join(rnaseq_dir, 'K562_fileID_table.20171211.txt'),
    comment='#'
)

In [12]:
import urllib
import json
import requests
experiments = "https://www.encodeproject.org/experiments/"

def write_md5sums(df, out_file):
    o = open(out_file, 'w') 
    progress = tnrange(len(set(df['BAM'])))
    for h in set(df['BAM']): # queries every experiment
        url = experiments+h+"/?format=json"
        response = urllib.urlopen(url)
        data = json.loads(response.read())
        if 'code' in data.keys():
            next
        else:
            o.write('{}\t{}\n'.format(h, data[u'md5sum']))
        progress.update(1)
    o.close()

In [13]:
# we only need to grab the md5sums once

# write_md5sums(hepg2_from_xintao_20180329, os.path.join(rnaseq_dir, 'HepG2_fileID_table_set39.20180329.encode_md5sums'))
# write_md5sums(hepg2_from_xintao_20171211, os.path.join(rnaseq_dir, 'HepG2_fileID_table.20171211.encode_md5sums'))
# write_md5sums(k562_from_xintao_20180329,  os.path.join(rnaseq_dir, 'K562_fileID_table_set39.20180329.encode_md5sums'))
# write_md5sums(k562_from_xintao_20171211, os.path.join(rnaseq_dir, 'K562_fileID_table.20171211.encode_md5sums'))


# Check md5sums according to local
- check md5sums for every bam file we've downloaded (may have more bam files than end up being used)

In [14]:

# ! md5sum /projects/ps-yeolab3/encode/rnaseq/shrna_knockdown_graveley_tophat/*.bam > md5sums-20180330

# Compare the md5sums reported on encodeproject.org with the md5sums we got from our local dir

In [15]:
# just reading one to see what the format is
local = pd.read_table(
    os.path.join(output_dir, 'md5sums-20180330'),
    names=['md5sum_local','bam'], sep='  '
)
local.head()

  after removing the cwd from sys.path.


Unnamed: 0,md5sum_local,bam
0,f975e267cf67083d4fafe1e2ad94e347,ENCFF005EIM.bam
1,1d2d6469725a3610afb8c391f66cde71,ENCFF011BRT.bam
2,96c0821c39a0a1d3fb4811b7fa044d86,ENCFF011SYE.bam
3,f138fe9ca9d2108f09264609541afd59,ENCFF012GYQ.bam
4,86f7cc4b04fbea00f311a67bf27235b5,ENCFF012WHT.bam


In [16]:
# just reading one to see what the format is
remote = pd.read_table(
    os.path.join(rnaseq_dir, 'HepG2_fileID_table_set39.20180329.encode_md5sums'),
    names=['bam','md5sum_remote'], sep='\t'
)
remote['bam'] = remote['bam'] + '.bam'
remote.head()

Unnamed: 0,bam,md5sum_remote
0,ENCFF265NOS.bam,fa7fe7949c178398afaa9255a1065ead
1,ENCFF996LFJ.bam,9d92b876914f36372be667f985e126ab
2,ENCFF654TTY.bam,98930ca94aa2e2343c6646a5a1bf92c5
3,ENCFF644TFQ.bam,6c3c67853a57abdc76abbe8301160ac4
4,ENCFF444CHN.bam,03a9b5e6a9bb79662cbd6eb1d408cc5b


In [17]:
def compare_local_and_remote(local_file, remote_file):
    """ Compares the two md5sum files and returns bam files which are different """
    local = pd.read_table(
        local_file,
        names=['md5sum_local','bam'], sep='  '
    )
    remote = pd.read_table(
        remote_file,
        names=['bam','md5sum_remote'], sep='\t'
    )
    remote['bam'] = remote['bam'] + '.bam'
    merged = pd.merge(remote, local, how='left', left_on='bam', right_on='bam')
    return merged[merged['md5sum_remote']!=merged['md5sum_local']]

In [18]:
# for each file generated from expected md5sums, compare against the md5sums we got
# shape should be 0 for all
for remote in [
    os.path.join(rnaseq_dir, 'HepG2_fileID_table_set39.20180329.encode_md5sums'),
    os.path.join(rnaseq_dir, 'HepG2_fileID_table.20171211.encode_md5sums'),
    os.path.join(rnaseq_dir, 'K562_fileID_table_set39.20180329.encode_md5sums'),
    os.path.join(rnaseq_dir, 'K562_fileID_table.20171211.encode_md5sums')
]:
    merged = compare_local_and_remote(
        os.path.join(output_dir, 'md5sums-20180330'),
        remote
    )
    print(display(HTML(merged.to_html())))

  """


Unnamed: 0,bam,md5sum_remote,md5sum_local


None


Unnamed: 0,bam,md5sum_remote,md5sum_local


None


Unnamed: 0,bam,md5sum_remote,md5sum_local


None


Unnamed: 0,bam,md5sum_remote,md5sum_local


None


# There is one different bam file (ENCFF742IJE.bam)
- double check to make sure the md5sum reported is accurate
- redownload if necessary
- actually this link doesn't exist, instead shown as ENCFF893QAU.bam
- downloaded ENCFF893QAU.bam manually and confirmed it matches remote md5sum
- MANUALLY change /projects/ps-yeolab3/encode/rnaseq/HepG2_fileID_table.20171211
- rerun this notebook to make sure everything matches now

In [19]:
# manually check this md5sum
! md5sum /projects/ps-yeolab3/encode/rnaseq/shrna_knockdown_graveley_tophat/ENCFF893QAU.bam

94cf6b7e3be9c6c3828af1ec3b0731d7  /projects/ps-yeolab3/encode/rnaseq/shrna_knockdown_graveley_tophat/ENCFF893QAU.bam


In [20]:
# manually lookup md5sum of remote file
experiments = "https://www.encodeproject.org/experiments/"

url = experiments+"ENCFF893QAU"+"/?format=json"
response = urllib.urlopen(url)
data = json.loads(response.read())
if 'code' in data.keys():
    next
else:
    print(data[u'md5sum'])

94cf6b7e3be9c6c3828af1ec3b0731d7


# (re)move any BAM files that don't belong to these lists
The following bam files have been removed (20180402):
- ENCFF394JPI.bam (RBM15, removed/archived file on encodeproject.org, accessed 20180402)
- ENCFF839HZQ.bam (RBM15, removed/archived file on encodeproject.org, accessed 20180402)
- ENCFF992OHI.bam (U2AF2, removed per email 20171211)
- ENCFF374PNQ.bam (PES1, removed/archived file on encodeproject.org, accessed 20180402)
- ENCFF742IJE.bam (Control, confirmed as ENCFF893QAU.bam so removed from directory)
- ENCFF959LXB.bam (PES1, removed/archived file on encodeproject.org, accessed 20180402)
- ENCFF057UPR.bam (U2AF2, removed per email 20181211)
- ENCFF985GAR.bam (RPS3, removed per email 20180330)
- ENCFF403FGR.bam (RPS3, removed per email 20180330)
- ENCFF152BOQ.bam (SRSF4, removed per email 20180330)
- ENCFF631XDT.bam (SRSF4, removed per email 20180330)
- ENCFF792MXQ.bam (SRSF9, removed per email 20180330)
- ENCFF990JSY.bam (SRSF9, removed per email 20180330)

In [21]:
# get a list of all bam files specified in each manifest

list_of_all_bam_files_used = []

for bam in set(hepg2_from_xintao_20171211['BAM']):
    list_of_all_bam_files_used.append(os.path.join(output_dir, bam + '.bam'))
for bam in set(k562_from_xintao_20171211['BAM']):
    list_of_all_bam_files_used.append(os.path.join(output_dir, bam + '.bam'))
for bam in set(hepg2_from_xintao_20180329['BAM']):
    list_of_all_bam_files_used.append(os.path.join(output_dir, bam + '.bam'))
for bam in set(k562_from_xintao_20180329['BAM']):
    list_of_all_bam_files_used.append(os.path.join(output_dir, bam + '.bam'))
    
print("Total number of bam files (hepg2 20171211): {}".format(len(set(hepg2_from_xintao_20171211['BAM']))))
print("Total number of bam files (k562 20171211): {}".format(len(set(k562_from_xintao_20171211['BAM']))))
print("Total number of bam files (hepg2 20180329): {}".format(len(set(hepg2_from_xintao_20180329['BAM']))))
print("Total number of bam files (k562 20180329): {}".format(len(set(k562_from_xintao_20180329['BAM']))))

print("Total number of bam files: {}".format(len(set(list_of_all_bam_files_used))))

Total number of bam files (hepg2 20171211): 508
Total number of bam files (k562 20171211): 492
Total number of bam files (hepg2 20180329): 24
Total number of bam files (k562 20180329): 36
Total number of bam files: 1060


In [22]:
# get a list of all bam files downloaded so far

list_of_all_bam_files_downloaded = glob.glob(
    os.path.join(output_dir, '*.bam')
)

len(list_of_all_bam_files_downloaded)

1060

In [23]:
# differences should be any manual corrections made post-manifest
# bam files that were downloaded previously but are not being used are moved to: path/to/output/deprecated_manually_removed
print(
    "Number of used bams that have not been downloaded: ",
    len(set(list_of_all_bam_files_used).difference(set(list_of_all_bam_files_downloaded)))
)
print(
    "Number of downloaded bams that are not used: ",
    len(set(list_of_all_bam_files_downloaded).difference(set(list_of_all_bam_files_used)))
)

for bam in set(list_of_all_bam_files_downloaded).difference(set(list_of_all_bam_files_used)):
    prefix = os.path.splitext(bam)[0]
    ! mv $prefix* $output_dir/deprecated_manually_removed/
    print(bam)

('Number of used bams that have not been downloaded: ', 0)
('Number of downloaded bams that are not used: ', 0)


# Make sure we have all the RMATS files too

In [24]:
rmats_dir = '/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current'


# Softlink known differences between APOBEC3C and APOBE accessions
- per email 20171218

In [25]:
! ln -s $rmats_dir/APOBE-BGHLV22-HepG2.set22.SE.MATS.JunctionCountOnly.txt $rmats_dir/APOBEC3C-BGHLV22-HepG2.set22.SE.MATS.JunctionCountOnly.txt
! ln -s $rmats_dir/APOBE-BGHLV22-HepG2.set22.A3SS.MATS.JunctionCountOnly.txt $rmats_dir/APOBEC3C-BGHLV22-HepG2.set22.A3SS.MATS.JunctionCountOnly.txt
! ln -s $rmats_dir/APOBE-BGHLV22-HepG2.set22.A5SS.MATS.JunctionCountOnly.txt $rmats_dir/APOBEC3C-BGHLV22-HepG2.set22.A5SS.MATS.JunctionCountOnly.txt
! ln -s $rmats_dir/APOBE-BGHLV22-HepG2.set22.MXE.MATS.JunctionCountOnly.txt $rmats_dir/APOBEC3C-BGHLV22-HepG2.set22.MXE.MATS.JunctionCountOnly.txt
! ln -s $rmats_dir/APOBE-BGHLV22-HepG2.set22.RI.MATS.JunctionCountOnly.txt $rmats_dir/APOBEC3C-BGHLV22-HepG2.set22.RI.MATS.JunctionCountOnly.txt

ln: creating symbolic link `/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/APOBEC3C-BGHLV22-HepG2.set22.SE.MATS.JunctionCountOnly.txt': File exists
ln: creating symbolic link `/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/APOBEC3C-BGHLV22-HepG2.set22.A3SS.MATS.JunctionCountOnly.txt': File exists
ln: creating symbolic link `/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/APOBEC3C-BGHLV22-HepG2.set22.A5SS.MATS.JunctionCountOnly.txt': File exists
ln: creating symbolic link `/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/APOBEC3C-BGHLV22-HepG2.set22.MXE.MATS.JunctionCountOnly.txt': File exists
ln: creating symbolic link `/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/APOBEC3C-BGHLV22-HepG2.set22.RI.MATS.JunctionCountOnly.txt': File exists


In [26]:
se_rmats_files = glob.glob(os.path.join(rmats_dir, '*SE.MATS.JunctionCountOnly.txt'))
a3ss_rmats_files = glob.glob(os.path.join(rmats_dir, '*A3SS.MATS.JunctionCountOnly.txt'))
a5ss_rmats_files = glob.glob(os.path.join(rmats_dir, '*A5SS.MATS.JunctionCountOnly.txt'))
mxe_rmats_files = glob.glob(os.path.join(rmats_dir, '*MXE.MATS.JunctionCountOnly.txt'))
ri_rmats_files = glob.glob(os.path.join(rmats_dir, '*RI.MATS.JunctionCountOnly.txt'))

for f in [
    se_rmats_files,
    a3ss_rmats_files,
    a5ss_rmats_files,
    mxe_rmats_files,
    ri_rmats_files
]:
    print(len(f))

473
473
473
473
473


# Check any duplicated rmats files
- make sure we have only one rmats file per rbp/cell combo, and resolve here if necessary

In [27]:
rbps = defaultdict(list)
for se in se_rmats_files:
    kdset = os.path.basename(se).split('.')[1]
    rbp, uid, cell = os.path.basename(se).split('.')[0].split('-')
    key = '{}-{}'.format(rbp, cell)
    rbps[key].append({'RBP':rbp, 'uID':uid, 'cell':cell, 'kdset':kdset, 'fullpath':se})

for key, vals in rbps.iteritems():
    if len(vals) > 1:
        for val in vals:
            print(key, val)

In [28]:
rbps['ASCC-HepG2']

[]

# Manually move duplicated rmats files to deprecated_unused
- move the non-set39 ones per email 20180330

In [29]:
! mv $rmats_dir/SRSF9-LV11-K562.set11.*.MATS.JunctionCountOnly.txt $rmats_dir/deprecated_manually_removed
! mv $rmats_dir/SRSF4-BGKLV13-K562.set13.*.MATS.JunctionCountOnly.txt $rmats_dir/deprecated_manually_removed

mv: cannot stat `/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/SRSF9-LV11-K562.set11.*.MATS.JunctionCountOnly.txt': No such file or directory
mv: cannot stat `/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/SRSF4-BGKLV13-K562.set13.*.MATS.JunctionCountOnly.txt': No such file or directory


# Concat all separate manifests together 
- makes it easier to combine/integrate other assay/files together

In [30]:
merged = hepg2_from_xintao_20171211

for df in [
    k562_from_xintao_20171211,
    hepg2_from_xintao_20180329,
    k562_from_xintao_20180329
]:
    merged = pd.concat([
        merged,
        df
    ], axis=0)
merged.columns = ['RNASEQ_'+c for c in merged.columns]
print(merged.shape)
merged.head()

(1060, 8)


Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP
1,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_2,ENCFF499YCP,ENCFF950IJN,ENCFF255RNW,ENCFF530DUY
2,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-3,ENCFF094WZU,ENCFF434WFA,ENCFF159JHT,ENCFF894KMA
3,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-4,ENCFF495VXI,ENCFF695STL,ENCFF800WEJ,ENCFF204TFW
4,ENCSR610VTA,ABCF1,HepG2,ABCF1_BGHLV30-3,ENCFF067AQF,ENCFF390FTP,ENCFF015TIY,ENCFF724GZU


# Add DESeq2 results

In [31]:
# get batch_id.. no these accession numbers are not consistent w/ each other.
# def isolate_batchid_from_replicate_id(row):
#     try:
#         return row['RNASEQ_Replicate'].split('_')[1].split('-')[0]
#    except IndexError:
#         return row['RNASEQ_Replicate'].split('-')[1]
#     
# merged['RNASEQ_Batch_ID'] = merged.apply(isolate_batchid_from_replicate_id, axis=1)
# merged.head()

In [32]:
deseq_dict = defaultdict(list)

all_deseq_files = glob.glob(
    os.path.join(deseq_dir, '*DESeq_output.txt')
)
print(len(all_deseq_files))

# for every deseq2 file, get the name and teh cell type from the filename
for deseq_file in all_deseq_files:
    parts = os.path.basename(deseq_file).split('_')
    cell_line = parts[0]
    rbp = parts[1]
    batch_id = parts[2]
    deseq_dict['{}-{}'.format(cell_line, rbp)].append(os.path.basename(deseq_file))
print(len(deseq_dict.keys()))

def get_deseq_file_from_row_info(row, deseq_dict=deseq_dict):
    """
    Using the row's cell line and RBP columns, return the deseq2 file associated with it.
    Prints any RBP/cell line that has more than 1 associated file
    Returns - for any RBP/cell line that's missing (should only apply to nontarget controls)
    """
    try:
        matching_files = deseq_dict['{}-{}'.format(row['RNASEQ_Cell_line'], row['RNASEQ_RBP'])]
        if len(matching_files) == 1:
            return matching_files[0]
        elif len(matching_files) == 0:
            if row['RNASEQ_RBP'] != 'non-target':
                print(row['RNASEQ_RBP'])
            return '-'
        else:
            print(row['RNASEQ_RBP'], row['RNASEQ_Cell_line'])
            return matching_files[0]
    except KeyError:
        return '-'
    
merged['RNASEQ_DESeq2'] = merged.apply(get_deseq_file_from_row_info, axis=1)
merged.head()

472
472


Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt
1,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_2,ENCFF499YCP,ENCFF950IJN,ENCFF255RNW,ENCFF530DUY,HepG2_AARS_BGHLV17_DESeq_output.txt
2,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-3,ENCFF094WZU,ENCFF434WFA,ENCFF159JHT,ENCFF894KMA,HepG2_AATF_BGHLV14_DESeq_output.txt
3,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-4,ENCFF495VXI,ENCFF695STL,ENCFF800WEJ,ENCFF204TFW,HepG2_AATF_BGHLV14_DESeq_output.txt
4,ENCSR610VTA,ABCF1,HepG2,ABCF1_BGHLV30-3,ENCFF067AQF,ENCFF390FTP,ENCFF015TIY,ENCFF724GZU,HepG2_ABCF1_BGHLV30_DESeq_output.txt


In [33]:
merged[(merged['RNASEQ_DESeq2']=='-') & (merged['RNASEQ_RBP']!='non-target')]

Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2


# Pair KD expts with their controls

In [34]:
exp2ctrl = {}

with open(os.path.join(rnaseq_dir, 'HepG2_expID_table.20171211.txt'), 'r') as f:
    f.readline() # skip header
    for line in f:
        line = line.rstrip().split('\t')
        exp2ctrl[line[3]] = line[4]

with open(os.path.join(rnaseq_dir, 'K562_expID_table.20171211.txt'), 'r') as f:
    f.readline() # skip header
    for line in f:
        line = line.rstrip().split('\t')
        exp2ctrl[line[3]] = line[4]

with open(os.path.join(rnaseq_dir, 'HepG2_expID_table_set39.20180329.txt'), 'r') as f:
    f.readline() # skip header
    for line in f:
        line = line.rstrip().split('\t')
        exp2ctrl[line[3]] = line[4]

with open(os.path.join(rnaseq_dir, 'K562_expID_table_set39.20180329.txt'), 'r') as f:
    f.readline() # skip header
    for line in f:
        line = line.rstrip().split('\t')
        exp2ctrl[line[3]] = line[4]
        
print("Total number of paired expts", len(exp2ctrl.keys()))

('Total number of paired expts', 475)


# Add replicate info
- This is perhaps not the true replicate number, but per a previous email the lower number should be rep1 while the higher number is rep2

In [35]:
exp2ctrl['ENCSR715QWM']

'ENCSR521WAI'

In [36]:
def assign_rep_number(row):
    bam = row['RNASEQ_BAM']
    url = experiments+bam+"/?format=json"
    response = urllib.urlopen(url)
    data = json.loads(response.read())
    if 'code' in data.keys():
        next
    else:
        print('.'),
        return data[u'biological_replicates'][0]
    
merged['Rep'] = merged.apply(assign_rep_number, axis=1)
merged.head()

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2,Rep
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt,1
1,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_2,ENCFF499YCP,ENCFF950IJN,ENCFF255RNW,ENCFF530DUY,HepG2_AARS_BGHLV17_DESeq_output.txt,2
2,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-3,ENCFF094WZU,ENCFF434WFA,ENCFF159JHT,ENCFF894KMA,HepG2_AATF_BGHLV14_DESeq_output.txt,1
3,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-4,ENCFF495VXI,ENCFF695STL,ENCFF800WEJ,ENCFF204TFW,HepG2_AATF_BGHLV14_DESeq_output.txt,2
4,ENCSR610VTA,ABCF1,HepG2,ABCF1_BGHLV30-3,ENCFF067AQF,ENCFF390FTP,ENCFF015TIY,ENCFF724GZU,HepG2_ABCF1_BGHLV30_DESeq_output.txt,1





# separate the controls and the expts

In [37]:
expt = merged[merged['RNASEQ_RBP']!='non-target']
ctrl = merged[merged['RNASEQ_RBP']=='non-target']
print("Total number of expt bam files", expt.shape[0])
print("Total number of ctrl bam files", ctrl.shape[0])

('Total number of expt bam files', 944)
('Total number of ctrl bam files', 116)


In [38]:
def return_ctrl(row, exp2ctrl=exp2ctrl):
    try:
        return exp2ctrl[row['RNASEQ_Experiment_ID']]
    except KeyError: 
        return '-'
    
expt['RNASEQ_Control_Experiment_ID'] = expt.apply(return_ctrl, axis=1)
expt.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2,Rep,RNASEQ_Control_Experiment_ID
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV
1,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_2,ENCFF499YCP,ENCFF950IJN,ENCFF255RNW,ENCFF530DUY,HepG2_AARS_BGHLV17_DESeq_output.txt,2,ENCSR856ZRV
2,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-3,ENCFF094WZU,ENCFF434WFA,ENCFF159JHT,ENCFF894KMA,HepG2_AATF_BGHLV14_DESeq_output.txt,1,ENCSR003EKR
3,ENCSR424YSV,AATF,HepG2,AATF_BGHLV14-4,ENCFF495VXI,ENCFF695STL,ENCFF800WEJ,ENCFF204TFW,HepG2_AATF_BGHLV14_DESeq_output.txt,2,ENCSR003EKR
4,ENCSR610VTA,ABCF1,HepG2,ABCF1_BGHLV30-3,ENCFF067AQF,ENCFF390FTP,ENCFF015TIY,ENCFF724GZU,HepG2_ABCF1_BGHLV30_DESeq_output.txt,1,ENCSR067GHD


In [39]:
ctrl.columns = [c.replace('RNASEQ_','CONTROL_') for c in ctrl.columns]
merged = pd.merge(
    expt, 
    ctrl, 
    how='outer', 
    left_on=['RNASEQ_Control_Experiment_ID','Rep'],
    right_on=['CONTROL_Experiment_ID', 'Rep']
)
merged.head()

Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2,Rep,RNASEQ_Control_Experiment_ID,CONTROL_Experiment_ID,CONTROL_RBP,CONTROL_Cell_line,CONTROL_Replicate,CONTROL_FASTQ_R1,CONTROL_FASTQ_R2,CONTROL_BAM,CONTROL_TSV,CONTROL_DESeq2
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-
1,ENCSR570CWH,BCCIP,HepG2,BCCIP_BGHLV17_45,ENCFF291SQT,ENCFF225QRN,ENCFF022PCS,ENCFF938GXB,HepG2_BCCIP_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-
2,ENCSR481AYC,BCLAF1,HepG2,BCLAF1_BGHLV17_5,ENCFF182RAD,ENCFF863SFE,ENCFF775APS,ENCFF670VTV,HepG2_BCLAF1_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-
3,ENCSR815JDY,CSTF2,HepG2,CSTF2_BGHLV17_10,ENCFF041YGN,ENCFF909LAP,ENCFF116TYA,ENCFF568ZNU,HepG2_CSTF2_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-
4,ENCSR210RWL,DDX27,HepG2,DDX27_BGHLV17_53,ENCFF265RYO,ENCFF554LIG,ENCFF495SBM,ENCFF311UBJ,HepG2_DDX27_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-


# Append rmats files
- combine using RBP name and cell line
- if there are any missing rmats files, this block will print what they are

In [40]:
def get_junctioncountonly(row, rbps=rbps):
    """
    Returns the basename of the rmats file associated with the row, given its RBP name and cell line.
    Returns - if an exception was found.
    """
    key = '{}-{}'.format(row['RNASEQ_RBP'], row['RNASEQ_Cell_line'])
    try:
        return os.path.basename(rbps[key][0]['fullpath'])
    except Exception as e:
        if row['RNASEQ_RBP'] != 'non-target':
            print(e, row['RNASEQ_RBP'], row['RNASEQ_Cell_line']) # print missing rmats file
        return '-'
    
merged['SE_jxc_file'] = merged.apply(get_junctioncountonly, axis=1)
merged.head()

Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2,Rep,...,CONTROL_Experiment_ID,CONTROL_RBP,CONTROL_Cell_line,CONTROL_Replicate,CONTROL_FASTQ_R1,CONTROL_FASTQ_R2,CONTROL_BAM,CONTROL_TSV,CONTROL_DESeq2,SE_jxc_file
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,AARS-BGHLV17-HepG2.set17.SE.MATS.JunctionCount...
1,ENCSR570CWH,BCCIP,HepG2,BCCIP_BGHLV17_45,ENCFF291SQT,ENCFF225QRN,ENCFF022PCS,ENCFF938GXB,HepG2_BCCIP_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,BCCIP-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
2,ENCSR481AYC,BCLAF1,HepG2,BCLAF1_BGHLV17_5,ENCFF182RAD,ENCFF863SFE,ENCFF775APS,ENCFF670VTV,HepG2_BCLAF1_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,BCLAF1-BGHLV17-HepG2.set17.SE.MATS.JunctionCou...
3,ENCSR815JDY,CSTF2,HepG2,CSTF2_BGHLV17_10,ENCFF041YGN,ENCFF909LAP,ENCFF116TYA,ENCFF568ZNU,HepG2_CSTF2_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,CSTF2-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
4,ENCSR210RWL,DDX27,HepG2,DDX27_BGHLV17_53,ENCFF265RYO,ENCFF554LIG,ENCFF495SBM,ENCFF311UBJ,HepG2_DDX27_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,DDX27-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...


In [41]:
merged

Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2,Rep,...,CONTROL_Experiment_ID,CONTROL_RBP,CONTROL_Cell_line,CONTROL_Replicate,CONTROL_FASTQ_R1,CONTROL_FASTQ_R2,CONTROL_BAM,CONTROL_TSV,CONTROL_DESeq2,SE_jxc_file
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,AARS-BGHLV17-HepG2.set17.SE.MATS.JunctionCount...
1,ENCSR570CWH,BCCIP,HepG2,BCCIP_BGHLV17_45,ENCFF291SQT,ENCFF225QRN,ENCFF022PCS,ENCFF938GXB,HepG2_BCCIP_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,BCCIP-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
2,ENCSR481AYC,BCLAF1,HepG2,BCLAF1_BGHLV17_5,ENCFF182RAD,ENCFF863SFE,ENCFF775APS,ENCFF670VTV,HepG2_BCLAF1_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,BCLAF1-BGHLV17-HepG2.set17.SE.MATS.JunctionCou...
3,ENCSR815JDY,CSTF2,HepG2,CSTF2_BGHLV17_10,ENCFF041YGN,ENCFF909LAP,ENCFF116TYA,ENCFF568ZNU,HepG2_CSTF2_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,CSTF2-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
4,ENCSR210RWL,DDX27,HepG2,DDX27_BGHLV17_53,ENCFF265RYO,ENCFF554LIG,ENCFF495SBM,ENCFF311UBJ,HepG2_DDX27_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,DDX27-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
5,ENCSR222CSF,DDX28,HepG2,DDX28_BGHLV17_55,ENCFF030DXO,ENCFF449SRE,ENCFF088AZS,ENCFF293UWH,HepG2_DDX28_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,DDX28-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
6,ENCSR964YTW,DDX55,HepG2,DDX55_BGHLV17_59,ENCFF867MZR,ENCFF584HLK,ENCFF799DYJ,ENCFF946DGA,HepG2_DDX55_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,DDX55-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
7,ENCSR639LKS,FKBP4,HepG2,FKBP4_BGHLV17_15,ENCFF796SHO,ENCFF107HUY,ENCFF635BOE,ENCFF590SVM,HepG2_FKBP4_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,FKBP4-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
8,ENCSR647NYX,NONO,HepG2,NONO_BGHLV17_61,ENCFF781GYO,ENCFF748HNR,ENCFF905RMN,ENCFF201WEV,HepG2_NONO_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,NONO-BGHLV17-HepG2.set17.SE.MATS.JunctionCount...
9,ENCSR998MZP,PRPF8,HepG2,PRPF8_BGHLV17_23,ENCFF285CHT,ENCFF198XKY,ENCFF556JQE,ENCFF879UCI,HepG2_PRPF8_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,-,PRPF8-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...


# Create master manifest file (including eCLIP files)

In [42]:
final_submitted_eclip_manifest = pd.read_table(
    '/projects/ps-yeolab3/bay001/reference_data/misc_ENCODE/eCLIP_finalstatus_20180404_ENCODE.tsv' # should be 2018, but same file so oh well
)
final_submitted_eclip_manifest.columns = ['eCLIP_'+c.replace(' ','_') for c in final_submitted_eclip_manifest.columns]
final_submitted_eclip_manifest[['eCLIP_uID']] = final_submitted_eclip_manifest[['eCLIP_uID']].astype('string')
final_submitted_eclip_manifest.head()

Unnamed: 0,eCLIP_uID,eCLIP_Official_Gene_Symbol,eCLIP_Final_internal_accession,eCLIP_Submission_status,eCLIP_Antibody_Cat_#,eCLIP_Antibody_Lot_#,eCLIP_Cell_Line
0,284,DDX21,284x4000fix,DONE - 2 reps submitted 11/02/17,RN090PW,1,K562
1,406,PABPC4,406,DONE - 2 reps submitted 11/02/17,A301-466A,1,K562
2,416,WRN,416,DONE - 2 reps submitted 11/02/17,GTX101081,39812,K562
3,218,TRA2A,218,DONE - 2 reps submitted 6/28/15,A303-779A,1,HepG2
4,220,IGF2BP1,220,DONE - 2 reps submitted 6/28/15,RN007P,3,K562


In [43]:
final_submitted_eclip_manifest

Unnamed: 0,eCLIP_uID,eCLIP_Official_Gene_Symbol,eCLIP_Final_internal_accession,eCLIP_Submission_status,eCLIP_Antibody_Cat_#,eCLIP_Antibody_Lot_#,eCLIP_Cell_Line
0,284,DDX21,284x4000fix,DONE - 2 reps submitted 11/02/17,RN090PW,001,K562
1,406,PABPC4,406,DONE - 2 reps submitted 11/02/17,A301-466A,001,K562
2,416,WRN,416,DONE - 2 reps submitted 11/02/17,GTX101081,39812,K562
3,218,TRA2A,218,DONE - 2 reps submitted 6/28/15,A303-779A,001,HepG2
4,220,IGF2BP1,220,DONE - 2 reps submitted 6/28/15,RN007P,003,K562
5,222,HNRNPM,222,DONE - 2 reps submitted 6/28/15,SC-20001,D1304,HepG2
6,223,FKBP4,223,DONE - 2 reps submitted 6/28/15,A301-427A,001,HepG2
7,224,HNRNPM,224,DONE - 2 reps submitted 6/28/15,SC-20001,D1304,K562
8,226,IGF2BP2,226,DONE - 2 reps submitted 6/28/15,RN008P,002,K562
9,230,BCCIP,230,DONE - 2 reps submitted 6/28/15,A302-196A,001,HepG2


In [44]:
del merged['CONTROL_DESeq2']

# Amend official RBP names vs Xintao's names

In [45]:
merged.head()

Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2,Rep,RNASEQ_Control_Experiment_ID,CONTROL_Experiment_ID,CONTROL_RBP,CONTROL_Cell_line,CONTROL_Replicate,CONTROL_FASTQ_R1,CONTROL_FASTQ_R2,CONTROL_BAM,CONTROL_TSV,SE_jxc_file
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,AARS-BGHLV17-HepG2.set17.SE.MATS.JunctionCount...
1,ENCSR570CWH,BCCIP,HepG2,BCCIP_BGHLV17_45,ENCFF291SQT,ENCFF225QRN,ENCFF022PCS,ENCFF938GXB,HepG2_BCCIP_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,BCCIP-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
2,ENCSR481AYC,BCLAF1,HepG2,BCLAF1_BGHLV17_5,ENCFF182RAD,ENCFF863SFE,ENCFF775APS,ENCFF670VTV,HepG2_BCLAF1_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,BCLAF1-BGHLV17-HepG2.set17.SE.MATS.JunctionCou...
3,ENCSR815JDY,CSTF2,HepG2,CSTF2_BGHLV17_10,ENCFF041YGN,ENCFF909LAP,ENCFF116TYA,ENCFF568ZNU,HepG2_CSTF2_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,CSTF2-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...
4,ENCSR210RWL,DDX27,HepG2,DDX27_BGHLV17_53,ENCFF265RYO,ENCFF554LIG,ENCFF495SBM,ENCFF311UBJ,HepG2_DDX27_BGHLV17_DESeq_output.txt,1,ENCSR856ZRV,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,DDX27-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...


In [46]:
x2official = {
    'CSDA':'YBX3',
    'RNASEN50':'DROSHA',
    'EIF2C1':'AGO1',
    'CUGBP1':'CELF1',
    'PKM2':'PKM',
    'KIAA1967':'CCAR2',
    'RDBP':'NELFE',
#     'GNB2L1':'RACK1',
}
def convert2official(row, x2official=x2official):
    try:
        return x2official[row['RNASEQ_RBP']]
    except KeyError:
        return row['RNASEQ_RBP']

merged['RNASEQ_Official_RBP'] = merged.apply(convert2official, axis=1)
merged.head()

Unnamed: 0,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,RNASEQ_Replicate,RNASEQ_FASTQ_R1,RNASEQ_FASTQ_R2,RNASEQ_BAM,RNASEQ_TSV,RNASEQ_DESeq2,Rep,...,CONTROL_Experiment_ID,CONTROL_RBP,CONTROL_Cell_line,CONTROL_Replicate,CONTROL_FASTQ_R1,CONTROL_FASTQ_R2,CONTROL_BAM,CONTROL_TSV,SE_jxc_file,RNASEQ_Official_RBP
0,ENCSR547NWD,AARS,HepG2,AARS_BGHLV17_1,ENCFF488SPX,ENCFF464UID,ENCFF604SNM,ENCFF739CVP,HepG2_AARS_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,AARS-BGHLV17-HepG2.set17.SE.MATS.JunctionCount...,AARS
1,ENCSR570CWH,BCCIP,HepG2,BCCIP_BGHLV17_45,ENCFF291SQT,ENCFF225QRN,ENCFF022PCS,ENCFF938GXB,HepG2_BCCIP_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,BCCIP-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...,BCCIP
2,ENCSR481AYC,BCLAF1,HepG2,BCLAF1_BGHLV17_5,ENCFF182RAD,ENCFF863SFE,ENCFF775APS,ENCFF670VTV,HepG2_BCLAF1_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,BCLAF1-BGHLV17-HepG2.set17.SE.MATS.JunctionCou...,BCLAF1
3,ENCSR815JDY,CSTF2,HepG2,CSTF2_BGHLV17_10,ENCFF041YGN,ENCFF909LAP,ENCFF116TYA,ENCFF568ZNU,HepG2_CSTF2_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,CSTF2-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...,CSTF2
4,ENCSR210RWL,DDX27,HepG2,DDX27_BGHLV17_53,ENCFF265RYO,ENCFF554LIG,ENCFF495SBM,ENCFF311UBJ,HepG2_DDX27_BGHLV17_DESeq_output.txt,1,...,ENCSR856ZRV,non-target,HepG2,NT_BGHLV17_63,ENCFF385GEX,ENCFF922CDR,ENCFF705LNP,ENCFF231SUG,DDX27-BGHLV17-HepG2.set17.SE.MATS.JunctionCoun...,DDX27


In [47]:
merged = pd.merge(
    final_submitted_eclip_manifest, 
    merged, 
    how='outer',
    left_on=['eCLIP_Official_Gene_Symbol', 'eCLIP_Cell_Line'],
    right_on=['RNASEQ_Official_RBP', 'RNASEQ_Cell_line']
)

merged.to_csv(
    '/projects/ps-yeolab3/bay001/reference_data/misc_ENCODE/eCLIP_finalstatus_20180406_ENCODE_combined_RNASEQ.tsv',
    sep='\t',
    index=False,
    header=True
)

In [48]:
rep1 = merged[merged['Rep']==1]
rep2 = merged[merged['Rep']==2]
others = merged[(merged['Rep']!=1) & (merged['Rep']!=2)]
rep1 = pd.concat([rep1, others], axis=0)
rep1.head()

Unnamed: 0,eCLIP_uID,eCLIP_Official_Gene_Symbol,eCLIP_Final_internal_accession,eCLIP_Submission_status,eCLIP_Antibody_Cat_#,eCLIP_Antibody_Lot_#,eCLIP_Cell_Line,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,...,CONTROL_Experiment_ID,CONTROL_RBP,CONTROL_Cell_line,CONTROL_Replicate,CONTROL_FASTQ_R1,CONTROL_FASTQ_R2,CONTROL_BAM,CONTROL_TSV,SE_jxc_file,RNASEQ_Official_RBP
0,284,DDX21,284x4000fix,DONE - 2 reps submitted 11/02/17,RN090PW,1,K562,ENCSR384LKC,DDX21,K562,...,ENCSR163JUC,non-target,K562,NT_BGKcLV02-1,ENCFF076EAI,ENCFF478ILB,ENCFF157TKC,ENCFF412XQF,DDX21-BGKcLV02-K562.BGKcLV02.SE.MATS.JunctionC...,DDX21
2,406,PABPC4,406,DONE - 2 reps submitted 11/02/17,A301-466A,1,K562,ENCSR047EEG,PABPC4,K562,...,ENCSR344XID,non-target,K562,Nontarget-LV08-1_00003531,ENCFF759IRT,ENCFF963MPW,ENCFF092XAP,ENCFF115OOZ,PABPC4-LV08-K562.set10.SE.MATS.JunctionCountOn...,PABPC4
4,416,WRN,416,DONE - 2 reps submitted 11/02/17,GTX101081,39812,K562,ENCSR165BCF,WRN,K562,...,ENCSR718EWL,non-target,K562,NT_BGKLV38-1,ENCFF409FXM,ENCFF698ZGA,ENCFF995IVQ,ENCFF699ZGX,WRN-BGKLV38-K562.set38.SE.MATS.JunctionCountOn...,WRN
6,218,TRA2A,218,DONE - 2 reps submitted 6/28/15,A303-779A,1,HepG2,ENCSR030GZQ,TRA2A,HepG2,...,ENCSR491FOC,non-target,HepG2,NT_BGHLV14-1,ENCFF510FRX,ENCFF409JQD,ENCFF499UUZ,ENCFF043ORB,TRA2A-BGHLV14-HepG2.set14.SE.MATS.JunctionCoun...,TRA2A
8,220,IGF2BP1,220,DONE - 2 reps submitted 6/28/15,RN007P,3,K562,ENCSR543LCG,IGF2BP1,K562,...,ENCSR163JUC,non-target,K562,NT_BGKcLV02-1,ENCFF076EAI,ENCFF478ILB,ENCFF157TKC,ENCFF412XQF,IGF2BP1-BGKcLV02-K562.BGKcLV02.SE.MATS.Junctio...,IGF2BP1


In [49]:
merged_with_reps = pd.merge(
    rep1, 
    rep2, 
    how='outer', 
    left_on=[
        'eCLIP_uID','eCLIP_Official_Gene_Symbol','eCLIP_Final_internal_accession','eCLIP_Submission_status',
        'eCLIP_Antibody_Cat_#','eCLIP_Antibody_Lot_#','eCLIP_Cell_Line','RNASEQ_Experiment_ID',
        'RNASEQ_RBP','RNASEQ_Official_RBP','RNASEQ_Cell_line',
        'RNASEQ_DESeq2','RNASEQ_Control_Experiment_ID','CONTROL_Experiment_ID','CONTROL_RBP',
        'CONTROL_Cell_line','SE_jxc_file'
    ],
    right_on=[
        'eCLIP_uID','eCLIP_Official_Gene_Symbol','eCLIP_Final_internal_accession','eCLIP_Submission_status',
        'eCLIP_Antibody_Cat_#','eCLIP_Antibody_Lot_#','eCLIP_Cell_Line','RNASEQ_Experiment_ID',
        'RNASEQ_RBP','RNASEQ_Official_RBP','RNASEQ_Cell_line',
        'RNASEQ_DESeq2','RNASEQ_Control_Experiment_ID','CONTROL_Experiment_ID','CONTROL_RBP',
        'CONTROL_Cell_line','SE_jxc_file'
    ]
)

In [50]:
merged_with_reps.columns = [c.replace('_x','_rep1').replace('_y','_rep2') for c in merged_with_reps.columns]

In [51]:
merged_with_reps

Unnamed: 0,eCLIP_uID,eCLIP_Official_Gene_Symbol,eCLIP_Final_internal_accession,eCLIP_Submission_status,eCLIP_Antibody_Cat_#,eCLIP_Antibody_Lot_#,eCLIP_Cell_Line,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Cell_line,...,RNASEQ_FASTQ_R1_rep2,RNASEQ_FASTQ_R2_rep2,RNASEQ_BAM_rep2,RNASEQ_TSV_rep2,Rep_rep2,CONTROL_Replicate_rep2,CONTROL_FASTQ_R1_rep2,CONTROL_FASTQ_R2_rep2,CONTROL_BAM_rep2,CONTROL_TSV_rep2
0,284,DDX21,284x4000fix,DONE - 2 reps submitted 11/02/17,RN090PW,001,K562,ENCSR384LKC,DDX21,K562,...,ENCFF704MDH,ENCFF305QYC,ENCFF788HMQ,ENCFF025TLM,2.0,NT_BGKcLV02-2,ENCFF102NCM,ENCFF875FRI,ENCFF739JDP,ENCFF564TZO
1,406,PABPC4,406,DONE - 2 reps submitted 11/02/17,A301-466A,001,K562,ENCSR047EEG,PABPC4,K562,...,ENCFF457DTV,ENCFF708OQJ,ENCFF710FGW,ENCFF144LFK,2.0,nontarget-LV08-2_00003531,ENCFF799FEY,ENCFF686ZOZ,ENCFF265ZZB,ENCFF040ZLI
2,416,WRN,416,DONE - 2 reps submitted 11/02/17,GTX101081,39812,K562,ENCSR165BCF,WRN,K562,...,ENCFF931PKC,ENCFF490GIG,ENCFF864ZFK,ENCFF699WMK,2.0,NT_BGKLV38-2,ENCFF357AKH,ENCFF113MXN,ENCFF713RDR,ENCFF266DWI
3,218,TRA2A,218,DONE - 2 reps submitted 6/28/15,A303-779A,001,HepG2,ENCSR030GZQ,TRA2A,HepG2,...,ENCFF808UZU,ENCFF307NOS,ENCFF578EXH,ENCFF459BTW,2.0,NT_BGHLV14-2,ENCFF813XBJ,ENCFF276KEO,ENCFF326XKY,ENCFF243JZQ
4,220,IGF2BP1,220,DONE - 2 reps submitted 6/28/15,RN007P,003,K562,ENCSR543LCG,IGF2BP1,K562,...,ENCFF772HHV,ENCFF731HPX,ENCFF117XXW,ENCFF365YEY,2.0,NT_BGKcLV02-2,ENCFF102NCM,ENCFF875FRI,ENCFF739JDP,ENCFF564TZO
5,222,HNRNPM,222,DONE - 2 reps submitted 6/28/15,SC-20001,D1304,HepG2,ENCSR995JMS,HNRNPM,HepG2,...,ENCFF592ZZR,ENCFF306TVJ,ENCFF303DFN,ENCFF168RBP,2.0,NT_BGHLV30-2,ENCFF773HLT,ENCFF768JSC,ENCFF774FAK,ENCFF990KGR
6,223,FKBP4,223,DONE - 2 reps submitted 6/28/15,A301-427A,001,HepG2,ENCSR639LKS,FKBP4,HepG2,...,ENCFF663YGQ,ENCFF071HGB,ENCFF377GZT,ENCFF333XHH,2.0,NT_BGHLV17_64,ENCFF403CZA,ENCFF278TEH,ENCFF038LUH,ENCFF533PXL
7,224,HNRNPM,224,DONE - 2 reps submitted 6/28/15,SC-20001,D1304,K562,ENCSR746NIM,HNRNPM,K562,...,ENCFF160TRI,ENCFF795GAH,ENCFF548EFX,ENCFF129QBF,2.0,NT_BGKLV29-2,ENCFF352SRA,ENCFF852RBO,ENCFF083EEO,ENCFF809LLU
8,226,IGF2BP2,226,DONE - 2 reps submitted 6/28/15,RN008P,002,K562,ENCSR952RRH,IGF2BP2,K562,...,ENCFF255GDS,ENCFF394UEG,ENCFF342OBS,ENCFF028NMK,2.0,nontarget-LV11-2,ENCFF891EGO,ENCFF667EXM,ENCFF970PYT,ENCFF170EPB
9,230,BCCIP,230,DONE - 2 reps submitted 6/28/15,A302-196A,001,HepG2,ENCSR570CWH,BCCIP,HepG2,...,ENCFF135KIN,ENCFF426CCB,ENCFF596JEB,ENCFF311BCU,2.0,NT_BGHLV17_64,ENCFF403CZA,ENCFF278TEH,ENCFF038LUH,ENCFF533PXL


In [52]:
merged_with_reps = merged_with_reps[[
    'eCLIP_uID','eCLIP_Official_Gene_Symbol','eCLIP_Final_internal_accession',
    'eCLIP_Submission_status','eCLIP_Antibody_Cat_#','eCLIP_Antibody_Lot_#',
    'eCLIP_Cell_Line',
    'RNASEQ_Experiment_ID','RNASEQ_RBP','RNASEQ_Official_RBP','RNASEQ_Cell_line',
    'RNASEQ_Control_Experiment_ID','CONTROL_RBP','CONTROL_Cell_line',
    'RNASEQ_Replicate_rep1','RNASEQ_FASTQ_R1_rep1','RNASEQ_FASTQ_R2_rep1',
    'RNASEQ_BAM_rep1','RNASEQ_TSV_rep1','Rep_rep1',
    'CONTROL_Replicate_rep1','CONTROL_FASTQ_R1_rep1','CONTROL_FASTQ_R2_rep1',
    'CONTROL_BAM_rep1','CONTROL_TSV_rep1',
    'RNASEQ_Replicate_rep2','RNASEQ_FASTQ_R1_rep2','RNASEQ_FASTQ_R2_rep2',
    'RNASEQ_BAM_rep2','RNASEQ_TSV_rep2','Rep_rep2',
    'CONTROL_Replicate_rep2','CONTROL_FASTQ_R1_rep2','CONTROL_FASTQ_R2_rep2',
    'CONTROL_BAM_rep2','CONTROL_TSV_rep2',
    'SE_jxc_file','RNASEQ_DESeq2',
]]
print(len(merged_with_reps.columns))

38


In [53]:
merged_with_reps.columns

Index([u'eCLIP_uID', u'eCLIP_Official_Gene_Symbol',
       u'eCLIP_Final_internal_accession', u'eCLIP_Submission_status',
       u'eCLIP_Antibody_Cat_#', u'eCLIP_Antibody_Lot_#', u'eCLIP_Cell_Line',
       u'RNASEQ_Experiment_ID', u'RNASEQ_RBP', u'RNASEQ_Official_RBP',
       u'RNASEQ_Cell_line', u'RNASEQ_Control_Experiment_ID', u'CONTROL_RBP',
       u'CONTROL_Cell_line', u'RNASEQ_Replicate_rep1', u'RNASEQ_FASTQ_R1_rep1',
       u'RNASEQ_FASTQ_R2_rep1', u'RNASEQ_BAM_rep1', u'RNASEQ_TSV_rep1',
       u'Rep_rep1', u'CONTROL_Replicate_rep1', u'CONTROL_FASTQ_R1_rep1',
       u'CONTROL_FASTQ_R2_rep1', u'CONTROL_BAM_rep1', u'CONTROL_TSV_rep1',
       u'RNASEQ_Replicate_rep2', u'RNASEQ_FASTQ_R1_rep2',
       u'RNASEQ_FASTQ_R2_rep2', u'RNASEQ_BAM_rep2', u'RNASEQ_TSV_rep2',
       u'Rep_rep2', u'CONTROL_Replicate_rep2', u'CONTROL_FASTQ_R1_rep2',
       u'CONTROL_FASTQ_R2_rep2', u'CONTROL_BAM_rep2', u'CONTROL_TSV_rep2',
       u'SE_jxc_file', u'RNASEQ_DESeq2'],
      dtype='object')

In [54]:
merged_with_reps.to_csv(
    '/projects/ps-yeolab3/bay001/reference_data/misc_ENCODE/eCLIP_finalstatus_20180406_ENCODE_combined_RNASEQ.tsv',
    sep='\t',
    index=False,
    header=True
)

In [55]:
merged_with_reps[merged_with_reps['RNASEQ_Official_RBP']=='RACK1']

Unnamed: 0,eCLIP_uID,eCLIP_Official_Gene_Symbol,eCLIP_Final_internal_accession,eCLIP_Submission_status,eCLIP_Antibody_Cat_#,eCLIP_Antibody_Lot_#,eCLIP_Cell_Line,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Official_RBP,...,RNASEQ_BAM_rep2,RNASEQ_TSV_rep2,Rep_rep2,CONTROL_Replicate_rep2,CONTROL_FASTQ_R1_rep2,CONTROL_FASTQ_R2_rep2,CONTROL_BAM_rep2,CONTROL_TSV_rep2,SE_jxc_file,RNASEQ_DESeq2


In [56]:
final_hepg2 = merged_with_reps[merged_with_reps[u'eCLIP_Cell_Line']=='HepG2']
final_hepg2.dropna(subset=['RNASEQ_RBP']).shape

(92, 38)

In [57]:
final_k562 = merged_with_reps[merged_with_reps[u'eCLIP_Cell_Line']=='K562']
final_k562.dropna(subset=['RNASEQ_RBP']).shape

(111, 38)