In [1]:
import os 
import pandas as pd
import numpy as np

**Filtered dataset**
https://www.encodeproject.org/search/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&biosample_ontology.term_name=HepG2&assay_title=TF+ChIP-seq&biosample_ontology.classification=cell+line&perturbed=false&target.investigated_as=transcription+factor&assembly=GRCh38&audit.ERROR.category!=extremely+low+read+depth&audit.NOT_COMPLIANT.category!=insufficient+read+depth&audit.NOT_COMPLIANT.category!=insufficient+read+length&audit.NOT_COMPLIANT.category!=poor+library+complexity&audit.ERROR.category!=missing+compliant+biosample+characterization&audit.ERROR.category!=not+compliant+biosample+characterization&audit.NOT_COMPLIANT.category!=insufficient+replicate+concordance&audit.NOT_COMPLIANT.category!=severe+bottlenecking&audit.NOT_COMPLIANT.category!=partially+characterized+antibody&audit.WARNING.category!=low+read+depth&audit.WARNING.category!=low+read+length&audit.WARNING.category!=moderate+library+complexity&audit.WARNING.category!=borderline+replicate+concordance&audit.WARNING.category!=missing+compliant+biosample+characterization&audit.WARNING.category!=antibody+characterized+with+exemption&audit.WARNING.category!=improper+control_type+of+control+experiment&audit.WARNING.category!=missing+biosample+characterization&audit.WARNING.category!=inconsistent+platforms&audit.WARNING.category!=control+low+read+depth&audit.WARNING.category!=missing+controlled_by&audit.WARNING.category!=inconsistent+control+read+length&audit.WARNING.category!=missing+genetic+modification+characterization&audit.WARNING.category!=inconsistent+control+run_type&audit.WARNING.category!=mixed+read+lengths

**CTCF file** https://www.encodeproject.org/search/?type=Experiment&status=released&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&biosample_ontology.term_name=HepG2&assay_title=TF+ChIP-seq&biosample_ontology.classification=cell+line&perturbed=false&target.investigated_as=transcription+factor&assembly=GRCh38&audit.ERROR.category!=extremely+low+read+depth&target.label=CTCF&audit.WARNING.category!=low+read+length

In [16]:
def make_directory(path):
    """Create the directory ``path`` if it does not exist yet and return it.

    Missing parent directories are created as well, so nested paths such as
    ``a/b/c`` work even when ``a`` does not exist.

    Parameters
    ----------
    path : str
        Directory to create.

    Returns
    -------
    str
        The same ``path`` that was passed in.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    if os.path.isdir(path):
        print("Directory already exists!")
    else:
        # exist_ok=True keeps this safe if the directory appears between the
        # isdir() check above and the creation call.
        os.makedirs(path, exist_ok=True)
        print("Making directory: " + path)
    return path

In [17]:
# Directory layout, resolved relative to the current working directory.
base_dir = '.'
files_dir, met_dir = (os.path.join(base_dir, sub) for sub in ('files', 'metadata'))

In [18]:
# Collect the tab-separated metadata tables found in `met_dir`.
# - Sorted, so the load (and later concatenation) order does not depend on the
#   arbitrary order returned by os.listdir().
# - 'selected_metadata.tsv' is this notebook's own output, written into the
#   same directory at the end of the notebook; skip it so that a re-run does
#   not read the derived file back in as if it were raw input.
met_files = sorted(
    os.path.join(met_dir, m)
    for m in os.listdir(met_dir)
    if m.endswith('.tsv') and m != 'selected_metadata.tsv'
)
# One DataFrame per metadata file.
metadata_list = [pd.read_csv(met_file, sep='\t') for met_file in met_files]

In [19]:
# Stack the per-file tables row-wise into a single frame. Each file keeps its
# own row labels, so index values may repeat in the result.
metadata = pd.concat(objs=metadata_list, axis=0)

In [20]:
selected = []
# Every (experiment, target) pair must provide all of these output types and
# at least one file whose file type is 'bed'.
required_outputs = ('alignments', 'fold change over control', 'signal p-value')
for _, exp_files in metadata.groupby(['Experiment accession', 'Experiment target']):
    output_types = exp_files['Output type'].values
    has_all_outputs = all(required in output_types for required in required_outputs)
    has_bed_file = 'bed' in exp_files['File type'].values
    assert has_all_outputs and has_bed_file, 'Incomplete dataset detected'

In [28]:
# Candidate targets: unique and sorted.
# Sorting is what makes the seeded draw below repeatable. The iteration order
# of a set of strings changes from one Python process to the next (string
# hashes are randomized per process), so seeding NumPy alone would still give a
# different selection after every kernel restart.
# Missing targets are dropped: they cannot be sorted together with strings and
# would never match the equality filter used when picking experiments later.
all_tfs = sorted(metadata['Experiment target'].dropna().unique())
np.random.seed(42)
# Draw 100 distinct targets (sampling without replacement).
# NOTE(review): the original comment observed that 'CTCF-human' happened to be
# among the drawn targets. Nothing here guarantees that -- confirm it is still
# selected if later steps depend on it.
tf_selection = list(np.random.choice(all_tfs, 100, replace=False))


In [29]:
# For every selected target keep the rows of a single experiment: the first
# group yielded when that target's rows are grouped by experiment accession.
select_exp = []
for tf in tf_selection:
    target_rows = metadata.loc[metadata['Experiment target'] == tf]
    groups_by_accession = list(target_rows.groupby('Experiment accession'))
    _, first_experiment = groups_by_accession[0]
    select_exp.append(first_experiment)
    

In [30]:
# Merge the per-target selections into one table and check that exactly 100
# distinct targets are present before saving.
select_df = pd.concat(select_exp)
selected_targets = set(select_df['Experiment target'].values)
assert len(selected_targets) == 100, 'Incorrect number of experiments selected!'
# NOTE(review): to_csv is called with its default arguments, so the file is
# comma-delimited and includes the index column despite the '.tsv' extension,
# and it is written into the same directory the raw metadata is read from --
# confirm that downstream readers expect this.
selected_path = os.path.join(met_dir, 'selected_metadata.tsv')
select_df.to_csv(selected_path)