# Imports & Constants

In [10]:
import copy
import os
import urllib
import urllib.request

import pandas as pd

pd.set_option('display.max_colwidth', None)

In [15]:
# NOTE(review): TARGET_CELL_TYPE is not referenced by any cell visible in this
# notebook section — confirm it is still needed.
TARGET_CELL_TYPE = 'Hepg2'
# Relative path: it is resolved against the current working directory, so the
# %cd below must run before any cell that reads from or writes to DATA_DIR.
DATA_DIR = "../dat/deepsea"
%cd ~/dev/an1lam/deepmr/src

/home/ubuntu/dev/an1lam/deepmr/src


# Load data and sanity check it

In [17]:
# DeepSEA column metadata (data source, cell type, feature, treatment, positive proportion).
cols_df = pd.read_csv(os.path.join(DATA_DIR, 'deepsea_cols.tsv'), sep='\t')

# Peak-file URLs. header=0 together with names=[...] replaces the file's own
# header row with the single column name "url".
# Selecting the column (instead of squeezing the whole frame) guarantees a 1-D
# array even when the file lists a single URL; a squeezed one-row frame would
# be a 0-d array, which cannot be iterated.
urls = pd.read_csv(
    os.path.join(DATA_DIR, "deepsea_peak_urls.tsv"),
    sep="\t",
    header=0,
    names=["url"],
)["url"].to_numpy()

# Columns of interest; the frame is displayed (de-duplicated) in a later cell.
target_tfs_df = pd.read_csv(os.path.join(DATA_DIR, "encode_hepg2_deepsea_cols.csv"))

In [18]:
# Sanity check: preview the first rows of the column metadata.
cols_df.head()

Unnamed: 0,Data Source,Cell Type,TF/DNase/HistoneMark,Treatment,Positive Proportion
0,ENCODE,8988T,DNase,,0.039105
1,ENCODE,AoSMC,DNase,,0.039264
2,ENCODE,Chorion,DNase,,0.033475
3,ENCODE,CLL,DNase,,0.023512
4,ENCODE,Fibrobl,DNase,,0.060983


In [19]:
# Display the target columns with one row per
# (data source, cell type, feature, treatment) combination.
# The result is only shown here; target_tfs_df itself is left unchanged.
target_tfs_df.drop_duplicates(
    subset=['Data Source', 'Cell Type', 'TF/DNase/HistoneMark', 'Treatment'],
    keep='first',
)

Unnamed: 0,Data Source,Cell Type,TF/DNase/HistoneMark,Treatment,Positive Proportion
0,ENCODE,HepG2,CTCF,,0.024789
4,ENCODE,HepG2,EZH2,,0.004818
5,ENCODE,HepG2,ATF3,,0.001461
6,ENCODE,HepG2,BHLHE40,,0.001074
8,ENCODE,HepG2,CEBPB,,0.006658
9,ENCODE,HepG2,CEBPB,forskolin,0.007623
11,ENCODE,HepG2,CEBPD,,0.006112
12,ENCODE,HepG2,ELF1,,0.008831
13,ENCODE,HepG2,FOSL2,,0.011383
14,ENCODE,HepG2,FOXA1,,0.020508


In [20]:
# Keep the first row for each (data source, cell type, feature, treatment)
# combination so every combination is matched against the URLs only once.
cols_df = cols_df.drop_duplicates(
    subset=['Data Source', 'Cell Type', 'TF/DNase/HistoneMark', 'Treatment'],
    keep='first',
)

In [21]:
def url_matches(url, row):
    """Return True when `url` mentions both the row's cell type and its feature.

    The comparison is a case-insensitive substring test: the lowercased
    'Cell Type' and 'TF/DNase/HistoneMark' values of `row` must each occur
    somewhere in the lowercased `url`. The 'Treatment' value is not consulted.

    Args:
        url: URL string to test.
        row: Mapping-like object (e.g. a pandas Series) with string values
            under the keys 'Cell Type' and 'TF/DNase/HistoneMark'.

    Returns:
        bool: True if both names occur in the URL, False otherwise.
    """
    lowered_url = url.lower()
    # Guard clause: without the cell type there is no need to check the feature.
    if row['Cell Type'].lower() not in lowered_url:
        return False
    return row['TF/DNase/HistoneMark'].lower() in lowered_url

def find_urls(row):
    """Collect every URL in the module-level `urls` that matches `row`.

    Args:
        row: Row passed through to `url_matches` for each candidate URL.

    Returns:
        list: The matching URLs, in the order they appear in `urls`
        (empty when nothing matches).
    """
    return [candidate for candidate in urls if url_matches(candidate, row)]


# Expand the column metadata to one row per (column, matching URL) pair.
# Each output row is a copy of the metadata row with an added 'url' entry, so a
# column matched by several URLs appears several times under its original
# index label, and a column with no matching URL is dropped.
cols_with_urls = []
for _, row in cols_df.iterrows():
    for url in find_urls(row):
        row_with_url = row.copy()
        row_with_url['url'] = url
        cols_with_urls.append(row_with_url)

# Passing the columns explicitly keeps the expected schema even when no URL
# matched at all; an empty list alone would produce a frame with no columns.
cols_urls_df = pd.DataFrame(cols_with_urls, columns=[*cols_df.columns, 'url'])

In [22]:
# Sanity check: list every URL that was matched to the HepG2 CTCF column.
cols_urls_df[(cols_urls_df['Cell Type'] == 'HepG2') & (cols_urls_df['TF/DNase/HistoneMark'] == 'CTCF')]

Unnamed: 0,Data Source,Cell Type,TF/DNase/HistoneMark,Treatment,Positive Proportion,url
137,ENCODE,HepG2,CTCF,,0.024789,http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgTfbsUniform/wgEncodeAwgTfbsBroadHepg2CtcfUniPk.narrowPeak.gz
137,ENCODE,HepG2,CTCF,,0.024789,http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgTfbsUniform/wgEncodeAwgTfbsHaibHepg2Ctcfsc5916V0416101UniPk.narrowPeak.gz
137,ENCODE,HepG2,CTCF,,0.024789,http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgTfbsUniform/wgEncodeAwgTfbsUtaHepg2CtcfUniPk.narrowPeak.gz
137,ENCODE,HepG2,CTCF,,0.024789,http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeAwgTfbsUniform/wgEncodeAwgTfbsUwHepg2CtcfUniPk.narrowPeak.gz


The target TFs DataFrame (`target_tfs_df`) is generated by the `filter_deepsea_cols.py` script. **TODO:** eventually I should probably combine that script with this notebook, but for now, given the time crunch, I'm going to leave things as-is.

In [23]:
# Attach the matched URLs to the target columns. The inner join keeps only
# targets that have at least one URL, and yields one row per (target, URL) pair.
final_tfs_df = pd.merge(
    target_tfs_df,
    cols_urls_df,
    how='inner',
    on=['Data Source', 'Cell Type', 'TF/DNase/HistoneMark', 'Treatment'],
)

In [24]:
# Collapse to a single row per (data source, cell type, feature, treatment).
# When several URLs matched, only the first one listed is kept.
final_tfs_df = final_tfs_df.drop_duplicates(
    subset=['Data Source', 'Cell Type', 'TF/DNase/HistoneMark', 'Treatment'],
    keep='first',
)

In [25]:
# Number of rows in final_tfs_df (each row carries one 'url' to download).
len(final_tfs_df)

41

In [26]:
# Download the peak file for every selected column and print the feature names
# as a space-separated string.
print("TFs to pass to mutagenesis")
feature_names = []
peak_targets = zip(
    final_tfs_df['Cell Type'],
    final_tfs_df['TF/DNase/HistoneMark'],
    final_tfs_df['url'],
)
for cell_type, feature, url in peak_targets:
    fpath = os.path.join(DATA_DIR, f"{cell_type}_{feature}.gz")
    # Skip files that are already on disk so re-running the notebook is cheap.
    # Rows that share a cell type and feature map to the same file name, so
    # this also avoids fetching that file twice. Delete a file to force a
    # fresh download.
    if not os.path.exists(fpath):
        # Download to a temporary name and rename once complete, so an
        # interrupted transfer never leaves a truncated file at `fpath`.
        partial_path = fpath + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, fpath)
    feature_names.append(feature)

# Every name is followed by a single space (including the last one).
tfs_str = "".join(f"{feature} " for feature in feature_names)
print(tfs_str)

TFs to pass to mutagenesis
CTCF EZH2 ATF3 BHLHE40 CEBPB CEBPB CEBPD ELF1 FOSL2 FOXA1 FOXA2 HDAC2 HNF4A HNF4A HNF4G MBD4 MYBL2 NFIC RXRA SP1 SP2 SRF TAF1 TCF12 TEAD4 USF1 YY1 ZBTB33 ZBTB7A ARID3A BRCA1 CHD2 HSF1 IRF3 MAZ RFX5 SMC3 TBP TCF7L2 USF2 ZNF274 
