# Notebook demonstrating how to access encodeproject.org metadata for RBP eCLIP and RNA-seq data

In [1]:
import pandas as pd
import urllib
import json
import requests
import os
import glob
from collections import defaultdict
import qtools
from tqdm import tnrange, tqdm_notebook

# Let pandas render up to 500 columns before truncating the display.
pd.options.display.max_columns = 500

# Root of the ENCODE portal, and the experiments endpoint derived from it.
host = 'https://www.encodeproject.org'
experiments = host + "/experiments/"

In [166]:
def _fetch_experiment_json(expt_id):
    """Download and decode the ENCODE JSON record of one experiment."""
    # urllib.urlopen only exists on Python 2; import whichever flavour is
    # available so the notebook runs on both major versions.
    try:
        from urllib.request import urlopen
        from urllib.error import HTTPError
    except ImportError:
        from urllib2 import urlopen, HTTPError
    from contextlib import closing

    url = "https://www.encodeproject.org/experiments/" + expt_id + "/?format=json"
    try:
        # closing() guarantees the connection is released after reading.
        with closing(urlopen(url)) as response:
            return json.loads(response.read().decode('utf-8'))
    except HTTPError as err:
        # Hand back the same shape as an error payload (a dict with a
        # 'code' key) so the caller has a single failure path.
        return {'code': err.code}


def get_bams_from_expt_id(
    expt_id, assembly, lab, data=None
):
    """
    Summarise the BAM alignments, controls, target and cell line of an
    ENCODE experiment.

    params:

    expt_id: string
        ie. "ENCSR767LLP"
    assembly: string
        ie. "hg19"
    lab: string
        ie. "brenton-graveley"
    data: dict, optional
        Already-decoded experiment JSON. When given, no HTTP request is
        made and expt_id is only used in the warning message.

    returns:

    result: tuple
        (target, sample_bams, control_expts, cell_line) where:
        target is data['target']['label'], or None when the record has no
        such entry (the original code treats that as a control experiment),
        sample_bams is a list of dicts with keys 'filename' and 'md5sum',
        one per '.bam' file whose output_type is 'alignments' and whose lab
        name and assembly match the arguments. Each bigWig of the same
        assembly whose first 'derived_from' entry names that BAM adds a key
        equal to the first word of its output_type (e.g. 'plus', 'minus')
        holding the bigWig file name,
        control_expts is a list with the accession of every entry in
        data['possible_controls'] (empty when there are none),
        cell_line is the biosample_summary found under a file's replicate
        (the last file that has one wins), or None when no file has one.

        (None, [], [], None) is returned when the record carries a 'code'
        key or the download fails with an HTTP error status.
    """
    if data is None:
        data = _fetch_experiment_json(expt_id)
    if 'code' in data:
        return None, [], [], None

    files = data.get('files', [])
    sample_bams = []
    cell_line = None

    # First pass: collect the matching BAM files and pick up the cell line.
    for file_info in files:
        try:
            cell_line = file_info['replicate']['experiment']['biosample_summary']
        except (KeyError, TypeError):
            pass  # this file carries no replicate information
        if (
            file_info['href'].endswith('bam')
            and file_info['output_type'] == 'alignments'
            and file_info['lab']['name'] == lab
            and file_info.get('assembly') == assembly
        ):
            sample_bams.append(
                {
                    'filename': os.path.basename(file_info['href']),
                    'md5sum': file_info['md5sum'],
                }
            )

    # Second pass (needs the BAM list): attach each bigWig to its source BAM.
    for file_info in files:
        if not (
            file_info['href'].endswith('bigWig')
            and file_info.get('assembly') == assembly
        ):
            continue
        parents = file_info.get('derived_from') or []
        if not parents:
            continue
        # '/files/<accession>/' -> '<accession>.bam'
        bam_from = os.path.basename(os.path.split(parents[0])[0]) + '.bam'
        strand = file_info['output_type'].split(' ')[0]
        for bam in sample_bams:
            if bam['filename'] == bam_from:
                bam[strand] = os.path.basename(file_info['href'])

    # Keep every listed control so the warning below can actually trigger;
    # element 0 is the same one that used to be the only entry.
    control_expts = [
        control['accession'] for control in data.get('possible_controls', [])
    ]

    target_info = data.get('target')
    target = target_info.get('label') if isinstance(target_info, dict) else None

    if len(control_expts) > 1:
        print("Warning, this expt {} has more than 1 associated control expt".format(expt_id))
    return target, sample_bams, control_expts, cell_line


# To pull metadata for RNA-seq knockdown studies (Graveley lab):

In [5]:
# get_bams_from_expt_id returns four values (target, BAM records, control
# experiment ids, cell line); unpacking only three raises ValueError.
RBP, bams, controls, cell_line = get_bams_from_expt_id(
    expt_id='ENCSR767LLP',
    assembly='hg19',
    lab='brenton-graveley'
)

# If you want to pull metadata for eCLIP studies (Yeo lab):

In [167]:
# Arguments are passed in signature order: experiment id, assembly, lab.
RBP, bams, controls, cell_line = get_bams_from_expt_id('ENCSR550DVK', 'hg19', 'gene-yeo')
# Display the control experiment ids found for this experiment.
controls

[]

In [3]:
# Tab-separated table with no header row, so columns get integer labels.
df = pd.read_csv(
    '~/projects/allbigwig.tsv',
    sep='\t',
    header=None,
)

In [4]:
# Distinct values of column 6; each one is later passed on as an experiment id.
all_experiment = pd.unique(df[6])

In [134]:
# One row per experiment id; columns are added as the metadata comes in.
all_data_id = pd.DataFrame(index = all_experiment)
# Ids whose lookup or table fill-in failed for any reason (no control listed,
# missing strand bigWig, failed request, ...).
control = []
for exp_id in all_experiment:
    try:
        RBP, bams, controls, cell_line = get_bams_from_expt_id(
            expt_id=exp_id,
            assembly='GRCh38',
            lab='gene-yeo'
        )
        for i, bam in enumerate(bams):
            all_data_id.loc[exp_id, 'bam_'+str(i)] = bam['filename']
            all_data_id.loc[exp_id, 'minus_'+str(i)] = bam['minus']
            all_data_id.loc[exp_id, 'plus_'+str(i)] = bam['plus']

        # Raises IndexError when no control is listed, which sends the id
        # to `control` below.
        all_data_id.loc[exp_id, 'control experiment'] = controls[0]
        all_data_id.loc[exp_id, 'RBP'] = RBP
        all_data_id.loc[exp_id, 'cell_line'] = cell_line
    except Exception:
        # Deliberately broad best-effort, but unlike a bare `except:` this
        # lets KeyboardInterrupt/SystemExit stop the loop.
        control.append(exp_id)
# Drop every row that still has a missing value in any column.
all_data_id.dropna(inplace = True)

# Look up the BAM and bigWig file names of each experiment's control

In [169]:
# For every remaining experiment, query its control experiment and record the
# file names of the control's first BAM and of that BAM's strand bigWigs.
for exp_id, control_id in all_data_id['control experiment'].items():
    RBP, bams, controls, cell_line = get_bams_from_expt_id(control_id, 'GRCh38', 'gene-yeo')
    first_bam = bams[0]
    all_data_id.loc[exp_id, 'bam_control'] = first_bam['filename']
    all_data_id.loc[exp_id, 'minus_control'] = first_bam['minus']
    all_data_id.loc[exp_id, 'plus_control'] = first_bam['plus']

In [171]:
# Persist the assembled table as a pickle file.
pd.to_pickle(all_data_id, '~/projects/eclip_encode_id.pickle')