# Basic script for downloading bams from ENCODE.
- Current stats as of 2-28:
    - 526 Hg19 bams total (254 HepG2, 272 K562).  
    - 474 non-control Hg19 bams (230 HepG2, 244 K562)
    - 52 control Hg19 bams (24 HepG2, 28 K562)
    - 2 missing possible controls (TIA1 K562 and PPIL4 K562)
    

In [1]:
import pandas as pd
import urllib
import json
import requests
import os
from collections import defaultdict
import qtools
# Let pandas display up to 500 columns.
pd.set_option("display.max_columns",500)
# Base URL of the ENCODE portal; get_bams_from_expt_id prepends it to file hrefs.
host = 'https://www.encodeproject.org'
# Experiments endpoint (get_bams_from_expt_id defines its own local copy of this URL).
experiments = "https://www.encodeproject.org/experiments/"
# Date stamp substituted into the ENCODE metadata manifest filename.
date = '3-1-2017'
# RBP annotation table, read with pd.read_table(..., skiprows=5).
annotated_manifest = '/home/bay001/projects/encode/permanent_data/RBP_annotations_MASTER_AllDatasets_20170221.csv'
# The two directories searched for bams (downloaded_dir first, then other_dir)
# by exists() and exists_and_return_fullpath().
downloaded_dir = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown/'
other_dir = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown/aug_16-jan_17/'

In [2]:
# Genome build used to filter the ENCODE metadata below.
assembly = 'hg19'

# Annotation sheet (first 5 rows skipped); blanks become 0 before the == 1 tests.
df = pd.read_table(annotated_manifest, skiprows=5).fillna(0)

# RBPs flagged with 1 in both the RNA-seq and the eCLIP column of a cell line.
_annotated_hepg2 = df[df['RNAseq HepG2'].eq(1) & df['eCLIP HepG2'].eq(1)]
_annotated_k562 = df[df['RNAseq K562'].eq(1) & df['eCLIP K562'].eq(1)]

# '-human' is appended so the names can be matched against 'Experiment target'.
hepg2_rbps = _annotated_hepg2['RBP name'] + '-human'
k562_rbps = _annotated_k562['RBP name'] + '-human'

# ENCODE metadata: keep only pipeline-produced alignments on the chosen assembly.
encode_manifest = '/home/bay001/projects/encode/permanent_data/metadata_from_encodeprojectorg_{}.tsv'.format(date)
encode_manifest_df = pd.read_table(encode_manifest)
encode_manifest_df = encode_manifest_df[
    encode_manifest_df['Output type'].eq('alignments')
    & encode_manifest_df['Lab'].eq('ENCODE Processing Pipeline')
    & encode_manifest_df['Assembly'].eq(assembly)
]

# Split by cell line, then keep the rows whose target is an annotated RBP.
encodeorg_hepg2 = encode_manifest_df[encode_manifest_df['Biosample term name'].eq('HepG2')]
encodeorg_k562 = encode_manifest_df[encode_manifest_df['Biosample term name'].eq('K562')]

encodeorg_intersect_annotations_hepg2 = encodeorg_hepg2[
    encodeorg_hepg2['Experiment target'].isin(hepg2_rbps)
]
encodeorg_intersect_annotations_k562 = encodeorg_k562[
    encodeorg_k562['Experiment target'].isin(k562_rbps)
]

print("number of hepg2 bams on encode: {}".format(len(encodeorg_hepg2)))
print("number of k562 bams on encode: {}".format(len(encodeorg_k562)))
print("number of intersecting Hepg2 rbps: {}".format(len(_annotated_hepg2)))
print("number of intersecting K562 rbps: {}".format(len(_annotated_k562)))
print(
    "number of encodeorg hepg2 bams associated with intersecting annotated rbps: {}".format(
        len(encodeorg_intersect_annotations_hepg2)
    )
)
print(
    "number of encodeorg k562 bams associated with intersecting annotated rbps: {}".format(
        len(encodeorg_intersect_annotations_k562)
    )
)

number of hepg2 bams on encode: 952
number of k562 bams on encode: 1020
number of intersecting Hepg2 rbps: 75
number of intersecting K562 rbps: 84
number of encodeorg hepg2 bams associated with intersecting annotated rbps: 316
number of encodeorg k562 bams associated with intersecting annotated rbps: 392


# Do some other cursory checks since the numbers don't add up...
- There are doubly-counted RBPs which have both shRNA and CRISPR knockouts, we should just use one.

In [3]:
# number of distinct rbps among the intersecting HepG2 bams
len(set(encodeorg_intersect_annotations_hepg2['Experiment target']))
# print every rbp whose number of bams is not 4
for rbp in set(encodeorg_intersect_annotations_hepg2['Experiment target']):
    if (encodeorg_intersect_annotations_hepg2['Experiment target'] == rbp).sum() != 4:
        # these end up being rbps that contain both CRISPR knockouts and RNASEQ knockouts
        print(rbp)

SRSF7-human
DDX59-human
IGF2BP1-human
U2AF2-human


# Actually let's just use the gsheet from xintao, which contains expt information
- 73 eclip + shrna (hepg2) (CDC40, RBM5 missing)
- 84 eclip + shrna (k562)
- 

In [4]:
# Load the experiment sheet (first column as index), blank cells -> 0,
# and drop the row labelled '# on DCC'.
gsheet = pd.read_table(
    '/home/bay001/projects/encode/permanent_data/submitted_exp_20170110_gsheet_from_xintao_2-28-2017.tsv',
    index_col=0
).fillna(0).drop(['# on DCC'])

# rows that have an RNA-seq experiment for the cell line (overwriting above df)
annotated_hepg2 = gsheet[gsheet['RNA-Seq on HepG2'].ne(0)]
annotated_k562 = gsheet[gsheet['RNA-Seq on K562'].ne(0)]

# manually adding them in since the CRISPR expts don't intersect with eCLIP expts
intersected_hepg2 = annotated_hepg2[
    annotated_hepg2['eCLIP on HepG2'].ne(0)
    | annotated_hepg2['RNA-Seq on HepG2'].isin(['ENCSR606PVX', 'ENCSR278NFF'])
]
intersected_k562 = annotated_k562[annotated_k562['eCLIP on K562'].ne(0)]

print("number of eclip + shrna/crisper knockdowns (HepG2): {}".format(len(intersected_hepg2)))
print("number of eclip + shrna/crisper knockdowns (K562): {}".format(len(intersected_k562)))
# RBPs annotated earlier but absent from the gsheet intersection
# (presumably CDC40 and RBM5 were the two unaccounted for before the manual additions).
set(_annotated_hepg2['RBP name']) - set(intersected_hepg2.index)

number of eclip + shrna/crisper knockdowns (HepG2): 75
number of eclip + shrna/crisper knockdowns (K562): 84


set()

In [5]:
intersected_hepg2.shape

(75, 13)

In [6]:
def split_row(row):
    """
    placeholder: ignores row and returns None.
    """
    return None

def star_or_tophat(link):
    """
    classifies a link by its filename (directory part ignored): 'STAR' if the
    lowercase token 'star' occurs in it, else 'TOPHAT' if 'tophat' occurs,
    else 'ambiguous'. The match is case-sensitive and 'star' takes precedence.
    """
    filename = os.path.basename(link)
    for token, aligner in (('star', 'STAR'), ('tophat', 'TOPHAT')):
        if token in filename:
            return aligner
    return 'ambiguous'

def exists(fpath, current, other):
    """
    returns whether or not the file named by fpath exists in either of two
    directories.

    fpath : file name or path; trailing whitespace and any directory part are
        dropped, so only the bare file name is looked up.
    current : directory checked first.
    other : directory checked when the file is not in current.

    returns True if the file is found in current or other, False otherwise.
    """
    # Look up the normalized name: the previous code computed it but then
    # joined the raw fpath, so a trailing newline or a directory prefix
    # made an existing file look missing.
    base = os.path.basename(fpath.rstrip())
    return any(
        os.path.exists(os.path.join(directory, base))
        for directory in (current, other)
    )
    
def exists_and_return_fullpath(fpath, current, other):
    """
    returns the full path of the file named by fpath, looking in two
    directories.

    fpath : file name or path; trailing whitespace and any directory part are
        dropped, so only the bare file name is looked up.
    current : directory checked first.
    other : directory checked when the file is not in current.

    returns the joined path inside the first directory that contains the
    file, or False when neither directory contains it.
    """
    # Look up the normalized name: the previous code computed it but then
    # joined the raw fpath, so a trailing newline or a directory prefix
    # made an existing file look missing.
    base = os.path.basename(fpath.rstrip())
    for directory in (current, other):
        candidate = os.path.join(directory, base)
        if os.path.exists(candidate):
            return candidate
    # False (not None/exception) is kept because callers test the result.
    return False
    
def get_bams_from_expt_id(expt_id, assembly='hg19'):
    """
    fetches the JSON record of an ENCODE experiment and collects its
    pipeline-produced STAR alignment bams.

    expt_id : experiment accession, inserted into
        https://www.encodeproject.org/experiments/<expt_id>/?format=json
    assembly : only files whose 'assembly' field equals this value are kept.

    returns a 3-tuple:
        - data['target']['label'] of the experiment,
        - a list of single-entry dicts {bam file name: md5sum}, one per file
          that ends in 'bam', has output_type 'alignments', comes from lab
          'encode-processing-pipeline', has 'star' in its submitted file
          name, and matches assembly,
        - a list holding the accession of the first possible control
          (empty when the experiment lists none).

    raises KeyError when the response is an error document (it carries a
    'code' key) or when an expected field is missing from the record.
    """
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    url = "https://www.encodeproject.org/experiments/" + expt_id + "/?format=json"
    response = urlopen(url)
    try:
        data = json.loads(response.read().decode('utf-8'))
    finally:
        # the connection was previously left open
        response.close()

    # The old `next` statement was a no-op, so an error document fell through
    # to an opaque KeyError on data['target']; fail with a clear message instead.
    if 'code' in data:
        raise KeyError(
            "ENCODE returned an error for experiment {}: {}".format(
                expt_id, data.get('description', data['code'])
            )
        )

    sample_bams = []
    for entry in data['files']:
        # `and` short-circuits, so later fields are only read for candidates
        # that passed the earlier tests (the original bitwise `&` read them all).
        if (
            entry['href'].endswith('bam') and
            entry['output_type'] == u'alignments' and
            entry['lab'][u'name'] == 'encode-processing-pipeline' and
            star_or_tophat(entry['submitted_file_name']) == 'STAR' and
            entry['assembly'] == assembly
        ):
            sample_bams.append(
                {os.path.basename(entry['href']): entry['md5sum']}
            )

    possible_controls = data['possible_controls']
    # Only the first control is returned, as before.
    control_expts = [possible_controls[0]['accession']] if possible_controls else []
    # Count what the record lists: control_expts never holds more than one
    # entry, so testing its length could never trigger the warning.
    if len(possible_controls) > 1:
        print("Warning, this expt {} has more than 1 associated control expt".format(expt_id))
    return data['target']['label'], sample_bams, control_expts

In [None]:
# Build one row per HepG2 RNA-seq experiment: target name plus the bam file
# name and md5sum of every experiment replicate and its matching control.
X = intersected_hepg2
samples = defaultdict(dict)
links = []
error = []
assembly = 'hg19'
for expt_id in X['RNA-Seq on HepG2'].dropna():
    # a cell may hold several comma-separated accessions
    expt_id = expt_id.split(',') if ',' in expt_id else [expt_id]
    print(expt_id),
    for h in expt_id:
        sname, s, control_expt = get_bams_from_expt_id(h)
        cname, c, _ = get_bams_from_expt_id(control_expt[0])
        for rep, expt_bam in enumerate(s, 1):
            # samples[h] is only touched here, so experiments without bams
            # never get an (empty) row
            record = samples[h]
            record['name'] = sname
            record['expt_rep{}'.format(rep)] = expt_bam.keys()[0]
            control_bam = c[rep - 1]
            record['control_rep{}'.format(rep)] = control_bam.keys()[0]
            record['expt_rep{}_md5sum'.format(rep)] = expt_bam.values()[0]
            record['control_rep{}_md5sum'.format(rep)] = control_bam.values()[0]
hepg2_bams = pd.DataFrame(samples).T
hepg2_bams.head()

In [None]:
# Build one row per K562 RNA-seq experiment: target name plus the bam file
# name and md5sum of every experiment replicate and its matching control.
X = intersected_k562
samples = defaultdict(dict)
links = []
error = []
assembly = 'hg19'
for expt_id in X['RNA-Seq on K562'].dropna():
    # a cell may hold several comma-separated accessions
    expt_id = expt_id.split(',') if ',' in expt_id else [expt_id]
    print(expt_id),
    for h in expt_id:
        sname, s, control_expt = get_bams_from_expt_id(h)
        cname, c, _ = get_bams_from_expt_id(control_expt[0])
        for rep, expt_bam in enumerate(s, 1):
            # samples[h] is only touched here, so experiments without bams
            # never get an (empty) row
            record = samples[h]
            record['name'] = sname
            record['expt_rep{}'.format(rep)] = expt_bam.keys()[0]
            control_bam = c[rep - 1]
            record['control_rep{}'.format(rep)] = control_bam.keys()[0]
            record['expt_rep{}_md5sum'.format(rep)] = expt_bam.values()[0]
            record['control_rep{}_md5sum'.format(rep)] = control_bam.values()[0]
k562_bams = pd.DataFrame(samples).T
k562_bams.head()

In [None]:
# Write both bam tables as tab-separated text.
for _bam_table, _out_path in (
    (hepg2_bams, '/projects/ps-yeolab3/encode/hepg2_bams_for_integrated_analysis.txt'),
    (k562_bams, '/projects/ps-yeolab3/encode/k562_bams_for_integrated_analysis.txt'),
):
    _bam_table.to_csv(_out_path, sep='\t')

In [None]:
def is_md5sum_equal(row, rep):
    """
    compares the md5sum recorded for a bam with the md5sum of the local file.

    row : mapping/Series holding the bam file name under rep and the expected
        checksum under '<rep>_md5sum'.
    rep : column name of the bam to check (e.g. 'expt_rep1').

    The file is located with exists_and_return_fullpath in downloaded_dir,
    then other_dir. Returns True when the checksums match; prints an
    'in trouble' line and returns False when they differ or when the file is
    found in neither directory.
    """
    # Hash in-process instead of shelling out through IPython's `! md5sum`:
    # this works outside a notebook and is safe for paths containing spaces
    # or shell metacharacters.
    import hashlib

    website_md5sum = row['{}_md5sum'.format(rep)]
    filepath = exists_and_return_fullpath(row[rep], downloaded_dir, other_dir)
    if not filepath:
        # exists_and_return_fullpath signals "not found" with False
        print('in trouble: {}, {}, {}'.format(filepath, website_md5sum, None))
        return False

    digest = hashlib.md5()
    with open(filepath, 'rb') as handle:
        # read in 1 MiB chunks so large bams are never held in memory at once
        for chunk in iter(lambda: handle.read(1024 * 1024), b''):
            digest.update(chunk)
    calculated_md5sum = digest.hexdigest()

    if website_md5sum != calculated_md5sum:
        print('in trouble: {}, {}, {}'.format(filepath, website_md5sum, calculated_md5sum))
        return False
    return True
    
def check_all_bams(bams, check_md5sum_too=False):
    """
    walks every row of bams and, for each of the four replicate columns,
    prints a message when the named file is found in neither downloaded_dir
    nor other_dir. When check_md5sum_too is true, files that do exist are
    additionally passed to is_md5sum_equal. Returns None.
    """
    rep_columns = ('expt_rep1', 'expt_rep2', 'control_rep1', 'control_rep2')
    for _, row in bams.iterrows():
        for column in rep_columns:
            if exists(row[column], downloaded_dir, other_dir):
                if check_md5sum_too:
                    is_md5sum_equal(row, column)
                continue
            print(row[column], " doesn't exist.")

In [None]:
# Report every HepG2 bam found in neither download directory (md5sum check stays off by default).
check_all_bams(hepg2_bams)

In [None]:
# Report every K562 bam found in neither download directory (md5sum check stays off by default).
check_all_bams(k562_bams)

# Run FeatureCounts

In [None]:
# Unique bam file names per cell line, pooled over both control and both
# experiment replicate columns.
# pd.concat replaces the nested Series.append calls (Series.append was removed
# in pandas 2.0); the columns are concatenated in the same order as before.
all_k562_bams = set(
    pd.concat([
        k562_bams['control_rep1'],
        k562_bams['control_rep2'],
        k562_bams['expt_rep1'],
        k562_bams['expt_rep2'],
    ])
)
all_hepg2_bams = set(
    pd.concat([
        hepg2_bams['control_rep1'],
        hepg2_bams['control_rep2'],
        hepg2_bams['expt_rep1'],
        hepg2_bams['expt_rep2'],
    ])
)
print(len(all_k562_bams))
print(len(all_hepg2_bams))

In [None]:
annotation = '/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf'
k562_outfile = '/home/bay001/projects/encode/permanent_data/k562_hg19_featurecounts_intersect_CLIP_RNASEQ.txt'
hepg2_outfile = '/home/bay001/projects/encode/permanent_data/hepg2_hg19_featurecounts_intersect_CLIP_RNASEQ.txt'


def _featurecounts_cmd(outfile, bams):
    """
    builds the featureCounts command line that counts bams against
    annotation and writes the result to outfile.

    Each bam name is resolved with exists_and_return_fullpath (downloaded_dir
    first, then other_dir). Raises ValueError listing the names that are in
    neither directory, so a job is never submitted with a bogus input.
    """
    bam_paths = []
    missing = []
    for bam in bams:
        full_path = exists_and_return_fullpath(bam, downloaded_dir, other_dir)
        if full_path:
            bam_paths.append(full_path)
        else:
            # exists_and_return_fullpath returns False for a missing file;
            # previously that value could end up inside the command string
            missing.append(bam)
    if missing:
        raise ValueError(
            "bams not found in {} or {}: {}".format(downloaded_dir, other_dir, missing)
        )
    # The space after the output file matters: without it the output path and
    # the first bam path were fused into a single argument.
    return "featureCounts -T 8 -a {} -s 2 -p -o {} {}".format(
        annotation, outfile, ' '.join(bam_paths)
    )


cmd = _featurecounts_cmd(k562_outfile, all_k562_bams)
qtools.Submitter(
    cmd, 'k562_featureCounts', array=False, nodes=1, ppn=8, walltime='48:00:00', submit=True, queue='home-scrm'
)

cmd = _featurecounts_cmd(hepg2_outfile, all_hepg2_bams)
qtools.Submitter(
    cmd, 'hepg2_featureCounts', array=False, nodes=1, ppn=8, walltime='48:00:00', submit=True, queue='home-scrm'
)

In [None]:
# Rebuild the K562 featureCounts command (nothing is submitted here) so it can
# be inspected. A space now separates the output file from the first bam;
# without it the two paths were fused into one argument.
# A bam found in neither directory shows up as the literal 'False', which
# keeps missing inputs visible in the displayed command.
cmd = "featureCounts -T 8 -a {} -s 2 -p -o {} {}".format(
    annotation,
    k562_outfile,
    ' '.join(
        str(exists_and_return_fullpath(bam, downloaded_dir, other_dir))
        for bam in all_k562_bams
    )
)
cmd