# Notebook demonstrating how to access encodeproject.org metadata for RBP CLIP and RNA-seq data

In [1]:
import pandas as pd
import urllib
import json
import requests
import os
import glob
from collections import defaultdict
import qtools
from tqdm import tnrange, tqdm_notebook

# Let pandas show up to 500 columns when a DataFrame is displayed.
pd.set_option("display.max_columns",500)
# Site root; file 'href' values in experiment records are appended to it.
host = 'https://www.encodeproject.org'
# Base URL under which individual experiment records are addressed.
experiments = "https://www.encodeproject.org/experiments/"

In [2]:
def get_bams_from_expt_id(
    expt_id, assembly, lab
):
    """
    Given an expt id, return (rbp name, matching alignment bams, control expt ids).

    The experiment's JSON record is fetched from encodeproject.org and its
    'files' entries are filtered down to BAM alignments that were produced
    by the requested lab on the requested assembly.

    params:

    expt_id: string
        ie. "ENCSR767LLP"
    assembly: string
        ie. "hg19"
    lab: string
        ie. "brenton-graveley"

    returns:

    result: tuple
        (rbp_name, sample_bams, control_expts) where:
        rbp_name is the 'label' of the experiment's 'target',
        sample_bams is a list of dicts with keys 'filename' and 'md5sum',
        one per matching file, and
        control_expts is a list of the accessions listed under
        'possible_controls' (empty if there are none, e.g. when expt_id
        is itself a control).

    raises:

    KeyError
        if the response carries a 'code' field (treated as an error
        response) or the record has no 'target'.
    """
    experiments = "https://www.encodeproject.org/experiments/"
    url = experiments + expt_id + "/?format=json"
    # requests behaves the same on Python 2 and 3 (urllib.urlopen is
    # Python 2 only) and releases the connection once the body is read.
    data = requests.get(url).json()

    if 'code' in data:
        # A 'code' key marks a response that is not an experiment record.
        # Fail here with a descriptive message instead of falling through
        # to an opaque KeyError('target') at the return statement.
        raise KeyError(
            "could not retrieve expt {}: server returned code {}".format(
                expt_id, data['code']
            )
        )

    sample_bams = []
    for entry in data['files']:
        href = entry.get('href', '')
        # `and` short-circuits, so entries that are not BAM alignments are
        # skipped without touching fields they may not carry.
        if (
            href.endswith('bam') and
            entry.get('output_type') == 'alignments' and
            entry['lab']['name'] == lab and
            entry.get('assembly') == assembly
        ):
            sample_bams.append(
                {
                    'filename': os.path.basename(href),
                    'md5sum': entry['md5sum'],
                }
            )

    # Collect every listed control so the warning below can actually fire;
    # the first element is the same one that was returned previously.
    control_expts = [
        control['accession']
        for control in data.get('possible_controls', [])
    ]
    if len(control_expts) > 1:
        print("Warning, this expt {} has more than 1 associated control expt".format(expt_id))
    return data['target']['label'], sample_bams, control_expts


# If you want to pull metadata for RNA-seq knockdown studies (Graveley lab):

In [5]:
# Look up one experiment: RBP receives the target label, bams the matching
# BAM file records (filename + md5sum) and controls the control accessions.
RBP, bams, controls = get_bams_from_expt_id(
    expt_id='ENCSR767LLP', 
    assembly='hg19', 
    lab='brenton-graveley'
)

# If you want to pull metadata for eCLIP studies (Yeo lab):

In [7]:
# Same lookup with a different experiment accession and lab name.
RBP, bams, controls = get_bams_from_expt_id(
    expt_id='ENCSR828ZID', 
    assembly='hg19', 
    lab='gene-yeo'
)
# Bare expression so the notebook displays the control accession list.
controls

[u'ENCSR354KAS']