# FRiP Filter Pipeline (JupyterNB Version)
This notebook runs every step of `FRiP_filter.py` one cell at a time, so that the intermediate results of each stage can be inspected.

In [None]:
# 1. Setup: imports and parameters
import json
import os
import subprocess
import threading
import time
from pathlib import Path

import pandas as pd
import requests

# --- User parameters ---
# Path to your ENCODE TSV. The default is an absolute path on the original
# author's machine; set the FRIP_INPUT_FILE environment variable to point the
# notebook at a different TSV without editing this cell.
input_file = os.environ.get(
    'FRIP_INPUT_FILE',
    '/mnt/disk/home/akang/projects/manuscript/data/input/TF_files.tsv',
)
min_peaks = 1000    # Minimum reproducible peaks threshold (files below it are skipped)
parallel = 8    # Number of parallel downloads (thread-pool size for the BED step)


## 2. Download Metadata JSONs
Downloads the metadata JSON for every file accession listed in the ENCODE TSV.

In [4]:
# Create the metadata directory and remove files left over from a previous
# run, so the filtering step only sees metadata fetched by this run.
os.makedirs('../data/metadata', exist_ok=True)
for filename in os.listdir('../data/metadata'):
    file_path = os.path.join('../data/metadata', filename)
    if os.path.isfile(file_path):
        os.remove(file_path)

# Read ENCODE TSV. skiprows=1 drops the first line of the file, so the second
# line is used as the header row.
tsv = pd.read_csv(input_file, sep='\t', skiprows=1)
accessions = tsv['Accession'].tolist()

# Seconds to wait for the server before giving up on one accession; without a
# timeout a single stalled connection would hang the whole cell.
metadata_timeout = 30

# Download JSON metadata for each accession. A failure for one accession is
# reported and skipped so the remaining accessions are still fetched.
with requests.Session() as session:
    session.headers.update({'Accept': 'application/json'})
    for acc in accessions:
        url = f'https://www.encodeproject.org/files/{acc}/?format=json'
        try:
            res = session.get(url, timeout=metadata_timeout)
            # Parse before opening the output file: a body that is not valid
            # JSON must not leave an empty .json file behind.
            metadata = res.json() if res.ok else None
        except (requests.RequestException, ValueError) as exc:
            print(f'Failed to download metadata for {acc}: {exc}')
            continue
        if metadata is None:
            print(f'Failed to download metadata for {acc}')
            continue
        with open(f'../data/metadata/{acc}.json', 'w') as f:
            json.dump(metadata, f)


## 3. Read and Filter Metadata
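Extract the FRiP score, dataset, target, and reproducible-peak count from each metadata file; drop files with fewer than `min_peaks` reproducible peaks, then keep the file with the highest FRiP score for each target.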

In [5]:
import glob

# Prepare storage. The paths are sorted so the processing order, and with it
# the tie-breaking between equal FRiP scores below, is the same on every run.
bed_json_files = sorted(glob.glob('../data/metadata/*.json'))
records = []
record_columns = ['accession', 'target', 'dataset', 'frip', 'reproducible_peaks']

for json_path in bed_json_files:
    with open(json_path) as f:
        data = json.load(f)
    # FRiP / peak-count extraction: take the first value of each key found
    # while scanning the quality-metric entries in order.
    qm = data.get('quality_metrics', [])
    frip = None
    reproducible_peaks = None
    for entry in qm:
        if not isinstance(entry, dict):
            # A non-dict entry cannot carry the metric keys; skip it rather
            # than substring-test and index it.
            continue
        if frip is None and 'frip' in entry:
            frip = entry['frip']
        if reproducible_peaks is None and 'reproducible_peaks' in entry:
            reproducible_peaks = entry['reproducible_peaks']
    # Skip files with no reproducible-peak count or one below the threshold
    if reproducible_peaks is None or reproducible_peaks < min_peaks:
        continue
    # Dataset and target
    # NOTE(review): assumes 'target' is an embedded object (dict) with a
    # 'label' key when present; confirm against the ENCODE response.
    dataset = data.get('dataset', '')
    target = data.get('target', {}).get('label', '')
    # The metadata file is named after its accession, so the stem recovers it
    accession = Path(json_path).stem
    records.append({
        'accession': accession,
        'target': target,
        'dataset': dataset,
        'frip': frip,
        'reproducible_peaks': reproducible_peaks
    })

# Build DataFrame. Passing the column names explicitly keeps the frame
# well-formed when no file passes the threshold; otherwise the sort below
# would fail with a KeyError on the missing 'frip' column.
df = pd.DataFrame(records, columns=record_columns)
# Sort by FRiP descending (stable sort, missing FRiP values last) and keep the
# first, i.e. highest-FRiP, row per target
df = (
    df.sort_values(by='frip', ascending=False, kind='mergesort')
    .drop_duplicates(subset='target', keep='first')
    .reset_index(drop=True)
)
df

Unnamed: 0,accession,target,dataset,frip,reproducible_peaks
0,ENCFF535TAL,POLR2A,/experiments/ENCSR000BGD/,0.355083,44729
1,ENCFF085TTI,CTCF,/experiments/ENCSR000DKV/,0.293318,33127
2,ENCFF978JKI,BHLHE40,/experiments/ENCSR987MTA/,0.280499,48122
3,ENCFF511QGY,PKNOX1,/experiments/ENCSR711XNY/,0.222362,29824
4,ENCFF081FJM,RUNX3,/experiments/ENCSR000BRI/,0.206538,61704
...,...,...,...,...,...
143,ENCFF339YTO,EZH2,/experiments/ENCSR000ARD/,0.001665,1913
144,ENCFF581FMI,HDAC2,/experiments/ENCSR330OEO/,0.001475,1496
145,ENCFF488IGF,YBX1,/experiments/ENCSR205SKQ/,0.001335,1269
146,ENCFF014PHS,UBTF,/experiments/ENCSR459FTB/,0.001286,1216


## 4. Download BED Files in Parallel
This cell inlines the download helper logic.

In [6]:
from concurrent.futures import ThreadPoolExecutor

def download_bed(target, accession):
    """Download one ENCODE BED file with wget and report the outcome.

    The file for ``accession`` is fetched from encodeproject.org and written
    to ``../data/bed/{target}.bed.gz`` (the directory is created if needed).
    wget is started with an argument list rather than through a shell, so
    characters in ``target`` or ``accession`` are never interpreted as shell
    syntax.

    Parameters
    ----------
    target : str
        Target label; used as the output file name.
    accession : str
        ENCODE file accession; used to build the download URL.

    Returns
    -------
    None
        The outcome is reported with a single printed line: ``Downloaded ...``
        on success, ``Failed to download ...`` otherwise.
    """
    url = f"https://www.encodeproject.org/files/{accession}/@@download/{accession}.bed.gz"
    out_path = f"../data/bed/{target}.bed.gz"
    os.makedirs('../data/bed', exist_ok=True)
    try:
        returncode = subprocess.run(
            ['wget', '-q', '-O', out_path, url], check=False
        ).returncode
    except OSError as exc:
        # wget could not be started at all (for example, it is not installed).
        print(f"Failed to download {target}.bed.gz: {exc}")
        return
    if returncode != 0:
        # wget -O creates/truncates the output file before the transfer, so a
        # failed run leaves an empty or partial file; remove it so later steps
        # never mistake it for a real BED file.
        if os.path.exists(out_path):
            os.remove(out_path)
        print(f"Failed to download {target}.bed.gz (wget exit code {returncode})")
        return
    print(f"Downloaded {target}.bed.gz")

# Run downloads: one task per selected file, at most `parallel` at a time.
with ThreadPoolExecutor(max_workers=parallel) as executor:
    futures = {
        executor.submit(download_bed, target, accession): target
        for target, accession in zip(df['target'], df['accession'])
    }
    # An exception raised inside a worker is stored on its future and is never
    # shown unless asked for; report it here instead of dropping it silently.
    # future.exception() waits for that task to finish.
    for future, target in futures.items():
        error = future.exception()
        if error is not None:
            print(f"Download task for {target} raised {error!r}")


Downloaded CTCF.bed.gz
Downloaded PKNOX1.bed.gz
Downloaded RUNX3.bed.gz
Downloaded POLR2AphosphoS5.bed.gz
Downloaded BHLHE40.bed.gz
Downloaded IKZF1.bed.gz
Downloaded RAD21.bed.gz
Downloaded POLR2A.bed.gz
Downloaded EBF1.bed.gz
Downloaded NFYB.bed.gz
Downloaded YY1.bed.gz
Downloaded ZBTB40.bed.gz
Downloaded BATF.bed.gz
Downloaded ELF1.bed.gz
Downloaded SPI1.bed.gz
Downloaded TBX21.bed.gz
Downloaded MEF2B.bed.gz
Downloaded CREB1.bed.gz
Downloaded IKZF2.bed.gz
Downloaded ATF7.bed.gz
Downloaded SMC3.bed.gz
Downloaded MLLT1.bed.gz
Downloaded RELB.bed.gz
Downloaded CREM.bed.gz
Downloaded NFIC.bed.gz
Downloaded EED.bed.gz
Downloaded NR2F1.bed.gz
Downloaded MTA2.bed.gz
Downloaded JUNB.bed.gz
Downloaded DPF2.bed.gz
Downloaded GABPA.bed.gz
Downloaded MEF2A.bed.gz
Downloaded SP1.bed.gz
Downloaded TCF12.bed.gz
Downloaded PAX5.bed.gz
Downloaded ATF2.bed.gz
Downloaded ETV6.bed.gz
Downloaded TRIM22.bed.gz
Downloaded GATAD2B.bed.gz
Downloaded TBP.bed.gz
Downloaded NBN.bed.gz
Downloaded EP300.bed.gz
D

## 5. Logging
Write out a tab-separated log of the selected files: accession, target, dataset, and FRiP score, one file per line.

In [7]:
# Make sure the log directory exists before choosing a file name
log_dir = Path('../data/log')
log_dir.mkdir(parents=True, exist_ok=True)

# Use log.txt if it is free; otherwise walk log_1.txt, log_2.txt, ... and stop
# at the first name that does not exist yet, so earlier logs are never
# overwritten.
log_file = log_dir / 'log.txt'
i = 1
while log_file.exists():
    log_file, i = log_dir / f'log_{i}.txt', i + 1

# One tab-separated line per row of df: accession, target, dataset, FRiP
log_rows = zip(df['accession'], df['target'], df['dataset'], df['frip'])
with log_file.open('w') as f:
    f.writelines(
        f"{accession}\t{target}\t{dataset}\t{frip}\n"
        for accession, target, dataset, frip in log_rows
    )
print(f"Log file created at {log_file}")


Log file created at ../data/log/log.txt


In [2]:
# Reset step. The triple-quoted string below is a bare expression used as a
# note to the reader; it has no runtime effect.
"""
If running a new analysis with different sets of features, make sure to erase all BED files to use
"""
# Runs Reset_bed.py through the system shell with whichever `python` is first
# on PATH, relative to the kernel's current working directory. os.system
# returns the command's status code, and because the call is the cell's last
# expression that status is what the cell displays.
# NOTE(review): presumably Reset_bed.py erases the downloaded BED files - the
# script is not visible from this notebook; confirm. This cell sits after the
# download step, so a top-to-bottom "Run All" would invoke it once the
# downloads have finished.
os.system('python Reset_bed.py')

0