# Basic download from encode script.
- Formerly: download_encode_data
- Now we have new annotations from Xintao
- Current stats as of 2-28:
    - 526 Hg19 bams total (254 HepG2, 272 K562).  
    - 474 experiment (non-control) Hg19 bams (230 HepG2, 244 K562)
    - 52 control Hg19 bams (24 HepG2, 28 K562)
    - 2 missing possible controls (TIA1 K562 and PPIL4 K562)
    

In [1]:
import pandas as pd
import urllib
import json
import requests
import os
import glob
from collections import defaultdict
import qtools
from tqdm import tnrange, tqdm_notebook

# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns",500)
# Base URL of the ENCODE portal; get_bams_from_expt_id prepends it to file hrefs.
host = 'https://www.encodeproject.org'
# Experiments endpoint (get_bams_from_expt_id defines its own local copy of this URL).
experiments = "https://www.encodeproject.org/experiments/"
# Date stamp embedded in the names of the to-download lists built in check_all_bams.
date = '3-9-2017'
# Annotated experiment manifests, one per cell line (read into hepg2_df / k562_df below).
annotated_manifest_hepg2 = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_HepG2.csv'
annotated_manifest_k562 = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_K562.csv'
# Directory that is searched for already-downloaded bams.
downloaded_dir = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown_graveley_tophat/'

# Do some checks on how many samples we have from the two lists.

In [2]:
# NOTE(review): the manifests are named .csv but are read with read_table — confirm they are tab-delimited.
hepg2_df = pd.read_table(annotated_manifest_hepg2)
k562_df = pd.read_table(annotated_manifest_k562)

# Per cell line, print the row count, the number of distinct RBPs and the number of
# distinct experiment accessions so they can be compared by eye.
print("number of k562 rnaseq: ",k562_df.shape[0])
print("number of RBPs in K562",len(set(k562_df['Official_RBP'])))
print("number of Expts in K562",len(set(k562_df['RNA-Seq on K562'])))
print("number of Expts in Hepg2",len(set(hepg2_df['RNA-Seq on HepG2'])))
print("number of hepg2 rnaseq: ",hepg2_df.shape[0])
print("number of RBPs in Hepg2",len(set(hepg2_df['Official_RBP'])))
k562_df.head()

('number of k562 rnaseq: ', 224)
('number of RBPs in K562', 224)
('number of Expts in K562', 224)
('number of Expts in Hepg2', 228)
('number of hepg2 rnaseq: ', 228)
('number of RBPs in Hepg2', 228)


Unnamed: 0,Official_RBP,GENCODE_v19_ID,target,RBP,RNA-Seq on K562,EXP
0,AARS,ENSG00000090861.11,AARS-human,AARS,ENCSR599UDS,AARS-BGKLV21-K562
1,AATF,ENSG00000108270.6,AATF-human,AATF,ENCSR973QSV,AATF-BGKLV13-K562
2,ABCF1,ENSG00000204574.8,ABCF1-human,ABCF1,ENCSR721MXZ,ABCF1-BGKLV29-K562
3,ADAR,ENSG00000160710.11,ADAR-human,ADAR,ENCSR164TLB,ADAR-BGKLV24-K562
4,AGGF1,ENSG00000164252.8,AGGF1-human,AGGF1,ENCSR812TLY,AGGF1-BGKLV32-K562


In [39]:
def get_expt_from_rbp(manifest_df, rbp, official=False, cell='HepG2'):
    """
    Look up the experiment id for an RBP name in the manifest generated by xintao.

    Parameters
    ----------
    manifest_df : pandas.DataFrame
        manifest with 'Official_RBP' / 'RBP' columns and an 'RNA-Seq on <cell>' column
    rbp : string
        RBP name to search for
    official : bool
        match against 'Official_RBP' when True, otherwise against 'RBP'
    cell : string
        'HepG2' reads the 'RNA-Seq on HepG2' column; any other value reads
        'RNA-Seq on K562'

    returns the matching entries rendered by Series.to_string(index=False)
    """
    if official:
        name_column = 'Official_RBP'
    else:
        name_column = 'RBP'
    if cell == 'HepG2':
        accession_column = 'RNA-Seq on HepG2'
    else:
        accession_column = 'RNA-Seq on K562'
    is_match = manifest_df[name_column] == rbp
    return manifest_df.loc[is_match, accession_column].to_string(index=False)
    
def star_or_tophat(link):
    """
    Classify a link by the aligner named in its file name.

    Only the basename of the link is inspected and the match is case-sensitive:
    returns 'STAR' if it contains 'star', otherwise 'TOPHAT' if it contains
    'tophat', otherwise 'ambiguous'.
    """
    filename = os.path.basename(link)
    for token, label in (('star', 'STAR'), ('tophat', 'TOPHAT')):
        if token in filename:
            return label
    return 'ambiguous'

def exists(fpath, current):
    """
    returns whether or not the file (fpath) exists in the directory (current)

    fpath is joined onto current with os.path.join, so a relative fpath is
    looked up inside current while an absolute fpath is checked as-is.

    returns True or False.
    """
    return os.path.exists(os.path.join(current, fpath))
    
def exists_and_return_fullpath(fpath, current):
    """
    returns the full path of the file (fpath) inside the directory (current)
    if it exists, otherwise False.

    fpath is joined onto current with os.path.join, so an absolute fpath is
    checked (and returned) as-is. Callers can use the result directly in a
    truth test.
    """
    fullpath = os.path.join(current, fpath)
    if os.path.exists(fullpath):
        return fullpath
    return False
    
def _is_requested_bam(file_record, assembly, aligner, lab):
    """Return True if an ENCODE file record is an alignments bam from the given lab, aligner and assembly."""
    # `and` short-circuits, so records that are not bams are never asked for
    # keys they may not carry.
    return (
        file_record.get('href', '').endswith('bam') and
        file_record.get('output_type') == u'alignments' and
        file_record.get('lab', {}).get(u'name') == lab and
        star_or_tophat(file_record.get('submitted_file_name', '')) == aligner and
        file_record.get('assembly') == assembly
    )


def get_bams_from_expt_id(
    expt_id, assembly, aligner, lab
):
    """
    Given an expt id, query the ENCODE portal and return a tuple:
    (target label, sample_bams, control_expts)

    Parameters
    ----------
    expt_id : string
        ENCODE experiment accession
    assembly : string
        assembly the bam must be aligned to (e.g. 'hg19')
    aligner : string
        value star_or_tophat() must return for the submitted file name
    lab : string
        name of the lab that produced the file

    Returns
    -------
    target label : string
        data['target']['label'] of the experiment
    sample_bams : list of dict
        one {'filename', 'md5sum', 'rep'} dict per matching bam
    control_expts : list of string
        accessions listed under 'possible_controls'; empty if the experiment
        lists none (e.g. it is itself a control).

    Raises KeyError if the portal answers with an error document or the
    experiment has no target.
    """
    url = "https://www.encodeproject.org/experiments/" + expt_id + "/?format=json"
    data = requests.get(url).json()
    if 'code' in data:
        # error documents carry a 'code' key and none of the experiment fields
        raise KeyError(
            "ENCODE returned an error for expt {}: {}".format(
                expt_id, data.get('description', data['code'])
            )
        )

    sample_bams = [
        {
            'filename': os.path.basename(file_record['href']),
            'md5sum': file_record['md5sum'],
            'rep': int(file_record['replicate']['biological_replicate_number'])
        }
        for file_record in data.get('files', [])
        if _is_requested_bam(file_record, assembly, aligner, lab)
    ]
    # collect every listed control so the warning below can actually fire;
    # callers that only want one keep using control_expts[0].
    control_expts = [
        control['accession'] for control in data.get('possible_controls', [])
    ]
    if len(control_expts) > 1:
        print("Warning, this expt {} has more than 1 associated control expt".format(expt_id))
    return data['target']['label'], sample_bams, control_expts

def make_expt_dataframe_from_expt_list(expt_list, assembly='hg19', aligner='STAR', lab='encode-processing-pipeline'):
    """
    Given a list of expt ids, return the reps and controls (and md5sums in case we need to check)

    Parameters
    ----------
    expt_list : list
        ENCODE experiment accessions
    assembly, aligner, lab : string
        filters passed through to get_bams_from_expt_id

    Returns
    -------
    pandas.DataFrame
        one row per expt id that has at least one matching bam, with columns
        'name', 'expt_rep<N>', 'expt_rep<N>_md5sum', 'control_rep<N>' and
        'control_rep<N>_md5sum'. Control entries are empty when the experiment
        has no control expt or the control has no bam for that replicate.
    """
    progress = tnrange(len(expt_list))
    samples = defaultdict(dict)
    for expt_id in expt_list:
        progress.update(1)
        sname, sample_files, control_expts = get_bams_from_expt_id(
            expt_id, assembly, aligner, lab
        ) # gets the sample name, sample bams, and control expt ids

        # index the control bams by replicate number so each sample replicate
        # is paired with the control of the same replicate rather than with
        # whatever happens to sit at the same list position.
        controls_by_rep = {}
        if control_expts:
            _, control_files, _ = get_bams_from_expt_id(
                control_expts[0], assembly, aligner, lab
            )
            for control_file in control_files:
                controls_by_rep.setdefault(control_file['rep'], control_file)
        else:
            print("Warning, this expt {} has no associated control expt".format(expt_id))

        for sample_file in sample_files:
            rep = sample_file['rep']
            control_file = controls_by_rep.get(rep)
            if control_file is None:
                print("Warning, this expt {} has no control bam for rep {}".format(expt_id, rep))
                control_file = {}
            row = samples[expt_id]
            row['name'] = sname
            row['expt_rep{}'.format(rep)] = sample_file['filename']
            row['control_rep{}'.format(rep)] = control_file.get('filename')
            row['expt_rep{}_md5sum'.format(rep)] = sample_file['md5sum']
            row['control_rep{}_md5sum'.format(rep)] = control_file.get('md5sum')
    progress.close()
    bams = pd.DataFrame(samples).T
    return bams

# Make a list of expt IDs 

In [40]:
# De-duplicate the experiment accessions of each manifest; going through set() means
# the order of the resulting lists is arbitrary.
hepg2_expts = list(set(hepg2_df['RNA-Seq on HepG2']))
k562_expts = list(set(k562_df['RNA-Seq on K562']))

In [41]:
# Filters handed to get_bams_from_expt_id: only bams from this lab whose submitted
# file name classifies as this aligner are kept.
aligner = 'ambiguous' # graveley lab pipeline doesn't adhere to the same file name structure as encode processing pipeline, but we trust that it's all tophat-aligned.
lab = 'brenton-graveley' # or 'encode-processing-pipeline'

# Build one table per cell line (one row per experiment) and print their shapes.
hepg2_bams = make_expt_dataframe_from_expt_list(hepg2_expts, assembly='hg19', aligner=aligner, lab=lab)
k562_bams = make_expt_dataframe_from_expt_list(k562_expts, assembly='hg19', aligner=aligner, lab=lab)
print(hepg2_bams.shape)
print(k562_bams.shape)

(228, 9)
(224, 9)


In [42]:
k562_bams

Unnamed: 0,control_rep1,control_rep1_md5sum,control_rep2,control_rep2_md5sum,expt_rep1,expt_rep1_md5sum,expt_rep2,expt_rep2_md5sum,name
ENCSR000KYM,ENCFF309OOI.bam,006834b1e534abfc0a766464ce4f60d9,ENCFF209PLP.bam,b50208a4553de637758e93ceaa099569,ENCFF354JPP.bam,aa41d43a1f51b87c0691d0526f4f3ac5,ENCFF243AMZ.bam,dea667416a4d4c3c582637c2f9ad0f19,DDX3X
ENCSR000YYN,ENCFF695KEA.bam,14e5ef8ccef0d53859e5f67375f4c13f,ENCFF848JJM.bam,c04240081b7bb137e69d298837baa648,ENCFF065KHB.bam,84a492145ce5bf8f806aa2b7f9b30765,ENCFF640HPA.bam,7f4e51a76b95e8699789d5bb36c5df85,AKAP8
ENCSR004RGI,ENCFF098BEA.bam,d3be652a5058aacf2e9d95311bcba657,ENCFF178NNK.bam,7b1be9be800298f9f8958b2adc14090c,ENCFF459YMO.bam,22e6c3066888702577d180c635d0280b,ENCFF922IOC.bam,a2588d2a5a56edeb4f96ea6f4c17d4d9,RPS10
ENCSR007XKL,ENCFF985IXD.bam,2e7f51c61c5c545dfba6906772ac5716,ENCFF461TSD.bam,9e3ce9878e3352b44855f4a2d1c66d37,ENCFF878JKR.bam,4ee27925a7f0f3ce95964686da0a8927,ENCFF466OZP.bam,c69b58950871a86055b9f24744add8c6,NFX1
ENCSR023HWI,ENCFF804JHE.bam,f14e222eebd2030a3da0d086acd48bf7,ENCFF819ORB.bam,14d4a76c9237f92eff0e40f19ef4ffcb,ENCFF656QEN.bam,f409a646dd82e6b47c88869cd1921a55,ENCFF738SSD.bam,83fe79368dde05a9405107b333b0eca3,KHDRBS1
ENCSR029LGJ,ENCFF742XEQ.bam,5739adf7b6b6165e9a2bb9a17c3a5310,ENCFF362TIW.bam,2135657d1727c78ac2aba8176d6323f2,ENCFF402FXS.bam,868de7091e2b4ab872cb07180b6c219f,ENCFF772MUC.bam,b9d28c8eddbe97c0db19681eb2cef1c5,DDX51
ENCSR034VBA,ENCFF490QNF.bam,43ae4f234533b09969780ec26dd956ef,ENCFF669VSB.bam,997fae2a9785ac356107efc36f662d84,ENCFF836HER.bam,259850b04aa20c46a845e7e745076957,ENCFF552SMO.bam,621654e8353221c1022db8aa72f66a85,HNRNPUL1
ENCSR040FSN,ENCFF341FFK.bam,71bd5d702b7df954831b2b664461f1cd,ENCFF256JTT.bam,0173b3b380fe0cbcfa75a0068cdfbed9,ENCFF617NHT.bam,0e7c5274649aa448cbae6a437d0c6c83,ENCFF240PBV.bam,d45cbba792a437e4c1322bbc80e6b8ff,EIF4G2
ENCSR047AJA,ENCFF985IXD.bam,2e7f51c61c5c545dfba6906772ac5716,ENCFF461TSD.bam,9e3ce9878e3352b44855f4a2d1c66d37,ENCFF979CLD.bam,0e0e7f4c3cfbd19da90b98e607ed7f0a,ENCFF252MOZ.bam,08df33b474e55992dbc6fb89ce3daba2,SUB1
ENCSR047EEG,ENCFF092XAP.bam,b02348cbc3ecb4852ca8266a7f519b89,ENCFF265ZZB.bam,3e64c51b808dcc83bc54eafe7e92d930,ENCFF199ZBI.bam,60bc0e4296a168560c15d4bd668d603b,ENCFF710FGW.bam,09011dd591c7fe9e2ad6466e3d60ac9d,PABPC4


In [43]:
# Save both tables as tab-separated text; the lab and aligner filters are embedded
# in the file names.
hepg2_bams.to_csv(
    '/projects/ps-yeolab3/encode/hepg2_{}_{}_bams_for_integrated_analysis.txt'.format(lab, aligner),sep='\t'
)
k562_bams.to_csv(
    '/projects/ps-yeolab3/encode/k562_{}_{}_bams_for_integrated_analysis.txt'.format(lab, aligner),sep='\t'
)

# Check md5sum and file existence.
- If I've done this once, I don't need to do it again. 

In [None]:
def already_checked_md5sum(filepath, md5sum_checked_file):
    """
    given a filepath, check the md5sum_checked_file to see if it has already been checked.

    Parameters
    ----------
    filepath : string
        full path of the file to look up
    md5sum_checked_file : string
        text file with one already-verified file path per line

    returns True if a line of md5sum_checked_file equals filepath, False
    otherwise (including when md5sum_checked_file does not exist yet).
    """
    # Read the record in Python rather than shelling out to grep: grep's
    # "No such file" message for a missing record used to be taken for a hit,
    # which marked every file as already checked on the first run.
    try:
        with open(md5sum_checked_file) as checked:
            return any(line.strip() == filepath for line in checked)
    except IOError:
        # no readable record yet, so nothing has been checked
        return False

def is_md5sum_equal(row, rep, downloaded_dir):
    """
    Takes a row in a bams dataframe and the rep prefix and determines
    whether or not the md5sum is equal to what has been downloaded.

    Parameters
    ----------
    row : pandas.Series or dict
        must hold the file name under `rep` and the expected md5sum under
        '<rep>_md5sum'
    rep : string
        column prefix, e.g. 'expt_rep1'
    downloaded_dir : string
        directory the file was downloaded to

    returns True if the md5sum computed from the downloaded file equals the
    expected one; prints a TROUBLE line and returns False if it differs or the
    file does not exist.
    """
    import hashlib  # local import: hashlib is not among the notebook's top-level imports

    website_md5sum = row['{}_md5sum'.format(rep)]
    filepath = os.path.join(downloaded_dir, row[rep])
    calculated_md5sum = None
    if os.path.isfile(filepath):
        digest = hashlib.md5()
        with open(filepath, 'rb') as handle:
            # hash in 1 MB chunks so large bams are never held in memory at once
            for chunk in iter(lambda: handle.read(1024 * 1024), b''):
                digest.update(chunk)
        calculated_md5sum = digest.hexdigest()
    if website_md5sum != calculated_md5sum:
        print('TROUBLE: {}, {}, {}'.format(filepath, website_md5sum, calculated_md5sum))
        return False
    return True
    
def check_all_bams_for_existance(bams, downloaded_dir, to_download_file='download.sh'):
    """
    Writes and prints the files that don't exist to a file

    Parameters
    ----------
    bams : pandas.DataFrame
        dataframe of bams with columns defined in cols.
    downloaded_dir : string
        directory that is searched for each bam
    to_download_file : string
        file to write (download.sh); it is overwritten and receives one ENCODE
        download url per missing bam.

    """
    encodeproject_prefix = 'https://www.encodeproject.org/files/'
    cols = ['expt_rep1','expt_rep2','control_rep1','control_rep2']
    # the with-block closes the list even if a row raises part-way through
    with open(to_download_file,'w') as o:
        for unused,row in bams.iterrows():
            for c in cols:
                if exists_and_return_fullpath(row[c],downloaded_dir):
                    continue
                print(row[c], " doesn't exist in {}.".format(downloaded_dir))
                # url pattern: <prefix><file accession>/@@download/<file name>
                o.write(
                    encodeproject_prefix + "{}/@@download/{}\n".format(
                        os.path.splitext(row[c])[0],
                        row[c]
                    )
                )
    
def check_md5sums(bams, downloaded_dir, md5sums_checked_file):
    """
    Verify the md5sum of every downloaded bam listed in the bams dataframe.

    For each of the four replicate columns of each row: a bam that is missing
    from downloaded_dir, or whose md5sum does not match, is reported as needing
    another download; a bam already recorded in md5sums_checked_file is
    skipped; a bam that verifies is appended to md5sums_checked_file.
    """
    replicate_columns = ['expt_rep1','expt_rep2','control_rep1','control_rep2']
    row_progress = tnrange(bams.shape[0])

    for _, row in bams.iterrows(): # one row per RBP
        row_progress.update(1)
        for position in tnrange(len(replicate_columns), leave=False): # rep1, rep2, ctrl1, ctrl2
            column = replicate_columns[position]
            if not exists_and_return_fullpath(row[column], downloaded_dir):
                print(row[column], "needs to be downloaded again.")
                continue
            if already_checked_md5sum(
                os.path.join(downloaded_dir,row[column]),
                md5sums_checked_file
            ):
                continue # verified on an earlier run
            if not is_md5sum_equal(row, column, downloaded_dir):
                print(row[column], "needs to be downloaded again.")
                continue
            with open(md5sums_checked_file,'a') as record:
                record.write(os.path.join(downloaded_dir,row[column]) + '\n')

# Check for existence of file, otherwise create a list to download.

In [None]:
# use: xargs -n 1 curl -O -L < FILE to download these...
def check_all_bams():
    """
    Write the to-download lists for the HepG2 and K562 bam tables.

    Reads the notebook globals aligner, lab, date, downloaded_dir, hepg2_bams
    and k562_bams; each list is named after the aligner, cell line, lab and date.
    """
    hepg2_list = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown/to_download_{}_hepg2_{}_{}'.format(aligner,lab,date)
    k562_list = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown/to_download_{}_k562_{}_{}'.format(aligner,lab,date)
    for table, target in ((hepg2_bams, hepg2_list), (k562_bams, k562_list)):
        check_all_bams_for_existance(table, downloaded_dir, target)

In [None]:
check_all_bams()

# Check md5sum of the ones we need to download (or just all of them)

In [None]:
# check_md5sums shows tnrange progress bars: one over the rows and a transient one over the four replicate columns.
# Only the HepG2 table is checked here; point `bams` at k562_bams to check the K562 table.
bams = hepg2_bams
md5sums_checked_file = '/projects/ps-yeolab3/encode/rnaseq/shrna_knockdown/md5sums.graveley-tophat.checked'
check_md5sums(bams, downloaded_dir, md5sums_checked_file)

In [None]:
bams

# Run FeatureCounts

In [None]:
# Distinct bam file names per cell line (controls are shared between experiments).
# pd.concat is used instead of chained Series.append, which newer pandas no longer provides.
all_k562_bams = set(
    pd.concat(
        [k562_bams['control_rep1'], k562_bams['control_rep2'], k562_bams['expt_rep1'], k562_bams['expt_rep2']]
    )
)
all_hepg2_bams = set(
    pd.concat(
        [hepg2_bams['control_rep1'], hepg2_bams['control_rep2'], hepg2_bams['expt_rep1'], hepg2_bams['expt_rep2']]
    )
)
print(len(all_k562_bams))
print(len(all_hepg2_bams))

In [None]:
# Submit one featureCounts array job per cell line, with one command per experiment
# (both controls followed by both knockdown replicates).
jobs = ['_k562','_hepg2']
# featureCounts binary, working/output directory and annotation are the same for every command
p = '/home/bay001/software/subread-1.5.1-Linux-x86_64/bin/featureCounts'
s = '/home/bay001/projects/encode/analysis/featureCounts/individual_expts/'
a = '/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf'
for job, bams in zip(jobs, [k562_bams, hepg2_bams]):
    cmds = []
    for unused,row in bams.iterrows():
        o = '/home/bay001/projects/encode/analysis/featureCounts/individual_expts/{}{}.counts.txt'.format(row['name'],job)
        bam_paths = [
            exists_and_return_fullpath(row[c],downloaded_dir)
            for c in ('control_rep1','control_rep2','expt_rep1','expt_rep2')
        ]
        if not all(bam_paths):
            # a missing bam comes back as False and would otherwise be written
            # into the command line as the literal word "False"
            print("Skipping {}{}: not all bams exist in {}".format(row['name'], job, downloaded_dir))
            continue
        cmd = '{} -s 2 --tmpDir {} -p -a {} -o {} {} {} {} {}'.format(
            p, s, a, o, *bam_paths
        )
        cmds.append(cmd)
    qtools.Submitter(
        cmds, job, array=True, nodes=1, ppn=1, walltime='4:00:00', submit=True, queue='home-scrm', sh=s+'{}.sh'.format(job)
    )


# Create normalized bedgraph files from RNASEQ bams
- Temporarily do this here, eventually pull these functions out into a script or something

In [4]:
# RBPs whose RNA-seq bams are turned into bedgraphs in the cells below.
rbp_list = ['HNRNPC','HNRNPK','SRSF1','U2AF2','U2AF1','PUF60','EIF4A3','MAGOH','PTBP1','MATR3']

In [5]:
# Experiment id of every RBP in rbp_list, looked up by its (non-official) RBP name
# in the HepG2 manifest.
expt_list = [get_expt_from_rbp(hepg2_df,rbp,official=False) for rbp in rbp_list]

In [18]:
# Bam table for the selected experiments, using the same lab/aligner filters as the
# full tables; every row is tagged HepG2 because the ids came from the HepG2 manifest.
df = make_expt_dataframe_from_expt_list(expt_list, assembly='hg19', aligner='ambiguous', lab='brenton-graveley')
df['Cell line'] = 'HepG2'

In [7]:
# Distinct bam file names across all four replicate columns (the set drops controls
# that are shared by several experiments).
list_of_bams_to_transform = set(
    pd.concat(
        [df['expt_rep1'], df['expt_rep2'], df['control_rep1'], df['control_rep2']]
    )
)

In [8]:
# Chromosome sizes file passed to bedtools genomecov via -g.
chrom_sizes = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'
# Directory that receives the bedgraphs (and, further down, the bam softlinks).
output_temp_bedgraph_dir = '/home/bay001/projects/encode/analysis/rnaseq_bedgraphs/'

In [None]:
# Build one `bedtools genomecov` command per bam and submit them as a single array job.
cmds = []
for bam in list_of_bams_to_transform:
    bam_fullpath = exists_and_return_fullpath(bam,downloaded_dir)
    if not bam_fullpath:
        # a missing bam comes back as False and would otherwise become "-ibam False"
        print("Skipping {}: not found in {}".format(bam, downloaded_dir))
        continue
    output_file = os.path.join(output_temp_bedgraph_dir, '{}.bg'.format(os.path.splitext(bam)[0]))
    cmds.append(
        'bedtools genomecov -ibam {} -bg -g {} > {}'.format(bam_fullpath, chrom_sizes, output_file)
    )
qtools.Submitter(
    cmds, 'make_bedgraphs_rnaseq', array=True, nodes=1, ppn=1, walltime='4:00:00', submit=True, queue='home-scrm', sh=output_temp_bedgraph_dir+'make_bedgraphs.sh'
)

In [12]:
# Normalize every bedgraph in output_temp_bedgraph_dir against its source bam.
# NOTE(review): "*.bg" also matches the "*.norm.bg" files this loop writes, so a
# second run would pick up already-normalized files — confirm this is acceptable.
unnormalized_bedgraphs = glob.glob(output_temp_bedgraph_dir + "*.bg")
progress = tnrange(len(unnormalized_bedgraphs))

for bg in unnormalized_bedgraphs:
    # the source bam shares the bedgraph's base name and lives in downloaded_dir
    bam = os.path.join(downloaded_dir, os.path.splitext(os.path.basename(bg))[0] + '.bam')
    output_file = os.path.splitext(bg)[0] + '.norm.bg'
    cmd = 'normalize_bedGraph.py '
    cmd = cmd + '--bg {} '.format(bg)
    cmd = cmd + '--bam {} '.format(bam)
    cmd = cmd + '> {}'.format(output_file)
    # IPython shell escapes: index the bam, then run the normalization command
    ! samtools index $bam
    ! $cmd
    progress.update(1)

In [14]:
# Softlink every bam into the bedgraph directory (IPython shell escape to `ln -s`).
# NOTE(review): exists_and_return_fullpath returns False for a missing bam, which
# would be passed to ln as the literal "False".
for bam in list_of_bams_to_transform:
    bam_fullpath = exists_and_return_fullpath(bam,downloaded_dir)
    bam_softlink = os.path.join(output_temp_bedgraph_dir, os.path.basename(bam))
    ! ln -s $bam_fullpath $bam_softlink


In [15]:
# Softlink the matching .bam.bai index of every bam into the bedgraph directory.
# NOTE(review): exists_and_return_fullpath returns False for a missing bam, in which
# case the .replace() call below fails.
for bam in list_of_bams_to_transform:
    bam_fullpath = exists_and_return_fullpath(bam,downloaded_dir).replace('.bam','.bam.bai')
    bam_softlink = os.path.join(output_temp_bedgraph_dir, os.path.basename(bam)).replace('.bam','.bam.bai')
    ! ln -s $bam_fullpath $bam_softlink


In [32]:
bedgraph_dir = '/home/bay001/projects/encode/analysis/rnaseq_bedgraphs'


def add_bedgraph_dir_r1(row):
    """Return the row's 'expt_rep1' file name joined onto bedgraph_dir."""
    return os.path.join(bedgraph_dir, row['expt_rep1'])


def add_bedgraph_dir_input(row):
    """Return the row's 'control_rep1' file name joined onto bedgraph_dir."""
    return os.path.join(bedgraph_dir, row['control_rep1'])


# One manifest row per experiment: the experiment id (taken from the index), name,
# cell line and the replicate-1 experiment/control files located in bedgraph_dir.
dfx = df[['name', 'Cell line', 'expt_rep1', 'control_rep1']].reset_index()
dfx['CLIP'] = dfx.apply(add_bedgraph_dir_r1, axis=1)
dfx['INPUT'] = dfx.apply(add_bedgraph_dir_input, axis=1)
dfx = dfx.drop(['expt_rep1', 'control_rep1'], axis=1)
dfx.columns = ['uID','RBP','Cell line', 'CLIP','INPUT']
manifest_path = os.path.join(bedgraph_dir + "/input_normish_manifest.tsv")
dfx.to_csv(manifest_path, sep='\t', index=None)

In [19]:
df

Unnamed: 0,control_rep1,control_rep1_md5sum,control_rep2,control_rep2_md5sum,expt_rep1,expt_rep1_md5sum,expt_rep2,expt_rep2_md5sum,name,Cell line
ENCSR052IYH,ENCFF460AAE.bam,b691efe23fdd3873e7b5d5e6a9b20ba4,ENCFF058QDT.bam,d8f3f5606b74eccd282f560e20b8c893,ENCFF031QZG.bam,eddbeb519d034412d304f5ef2aa687d2,ENCFF531AUK.bam,9c3f52480237457fe97298e7cb0b30be,HNRNPC,HepG2
ENCSR064DXG,ENCFF571ARP.bam,ca8e470d0d42e42ee79162a245c647ec,ENCFF755GKJ.bam,d05681b0af8f612e9a16b07aa8a0e492,ENCFF155FIK.bam,7fe0ac1e954e9153f75bb15364f9e99d,ENCFF527FPF.bam,fd05926a2b33db5c2312a562a2a28715,PTBP1,HepG2
ENCSR094KBY,ENCFF571ARP.bam,ca8e470d0d42e42ee79162a245c647ec,ENCFF755GKJ.bam,d05681b0af8f612e9a16b07aa8a0e492,ENCFF707GZQ.bam,36790eca0dbdb238f3be857942435a7e,ENCFF223XHU.bam,94dfa343e8fe2590612ea958fcdd90dc,SRSF1,HepG2
ENCSR372UWV,ENCFF124DWE.bam,edd78ccc3f9bdc9d399b7c109f0daac6,ENCFF774FAK.bam,77ca88c3500f177ce1d5b5803714c41f,ENCFF882SMH.bam,7abc08fbcc2c8702465bd69ea715403a,ENCFF811FWH.bam,86b24fd4889e3f99b456bad33669f33b,U2AF1,HepG2
ENCSR492UFS,ENCFF052HTH.bam,e6d81c60dffc32e98f491f1ee02236b8,ENCFF857QSU.bam,2dd2e67ec047cb195b6da4207d0cc26d,ENCFF387CQS.bam,da9d045cf89512ac4acb030671964a26,ENCFF878ELZ.bam,6cf17c1e12dc069e4bd99f6a11a7fcbc,MATR3,HepG2
ENCSR622MCX,ENCFF499UUZ.bam,b0fdad0ee704046b511099cf04e7eb5e,ENCFF326XKY.bam,ecd206b9d417019fa0c50604afef3094,ENCFF992OHI.bam,8a9fbdba7091cfe89469aeca3f728e2b,ENCFF057UPR.bam,5f81c5e73450692b20ca0fdebd7d2555,U2AF2,HepG2
ENCSR648BSC,ENCFF829TUN.bam,95f6b60d575d3dd430860c48488129c7,ENCFF350VVR.bam,b9b9274301c8a890e860b19e59ee66ea,ENCFF450QLW.bam,838145dc05a19d800ec951f02311aae7,ENCFF584LUS.bam,9b00a546db42aaff9d5299ee03b9b39d,PUF60,HepG2
ENCSR746EKS,ENCFF052HTH.bam,e6d81c60dffc32e98f491f1ee02236b8,ENCFF857QSU.bam,2dd2e67ec047cb195b6da4207d0cc26d,ENCFF809RUA.bam,8d7288d20dd32eaaaa69a324d2b90b63,ENCFF629OTS.bam,7f5288bc43dbb1ebab149051ab1318fe,MAGOH,HepG2
ENCSR853ZJS,ENCFF052HTH.bam,e6d81c60dffc32e98f491f1ee02236b8,ENCFF857QSU.bam,2dd2e67ec047cb195b6da4207d0cc26d,ENCFF589WIS.bam,43f010e32ab0923fc8c21bd551c1a5bb,ENCFF033EZX.bam,61391a78e41d707880d6f78b80226db6,HNRNPK,HepG2
ENCSR957EEG,ENCFF052HTH.bam,e6d81c60dffc32e98f491f1ee02236b8,ENCFF857QSU.bam,2dd2e67ec047cb195b6da4207d0cc26d,ENCFF224ALJ.bam,34753ae6cad39fa00f40a133a0e49f5e,ENCFF037BCP.bam,768fe465ee8ae4ce498f2cbba2a37f63,EIF4A3,HepG2


# Fix rep1 and rep2

In [None]:
# NOTE(review): this path ends in '/', i.e. it names a directory rather than a table
# file — the cell looks unfinished; fill in the intended file before running it.
df = pd.read_table('/projects/ps-yeolab3/encode/')