In [None]:
import pandas as pd
from tqdm import tqdm

#### DHSs

In [None]:
# Load the raw DHS table (tab-separated, with a header row). The first column
# is read as the index and is used below as the chromosome name.
df_dhs = pd.read_csv('Data/Raw/GRCh38_ENCFF503GCK.bed', index_col=0, sep='\t', low_memory=False)

# Move the chromosome name out of the index into an explicit 'chr' column and
# strip a leading 'chr' prefix (e.g. 'chr1' -> '1').
df_dhs['chr'] = df_dhs.index
df_dhs = df_dhs.reset_index(drop=True)
df_dhs['chr'] = df_dhs['chr'].astype(str).str.replace('^chr', '', regex=True)

# Select autosomal chromosomes. An explicit whitelist (1-22) is used instead of
# only excluding X/Y, so that any other sequence name (mitochondrial, unplaced
# contigs, ...) cannot slip through. `.copy()` makes the filtered frame
# independent, so the column assignment below does not write into a slice.
autosomes = [str(i) for i in range(1, 23)]
df_dhs = df_dhs[df_dhs['chr'].isin(autosomes)].copy()

# Integer midpoint of each site (the fractional part of odd-length sums is truncated).
df_dhs['mid'] = ((df_dhs['start'] + df_dhs['end']) / 2).astype('int64')

df_dhs = df_dhs[['chr', 'mid', 'start', 'end']]
df_dhs

In [None]:
# Write the cleaned DHS table to disk with to_csv defaults (comma-separated, row index included).
df_dhs.to_csv(path_or_buf='Data/dhs_ENCFF503GCK_GRCh38_clean_2.txt')

#### CHROMHMM -- ENHANCERS + PROMOTERS

In [None]:
# Load the tab-separated ChromHMM (Epilogos 15-state, hg38) table.
df_chromhmm = pd.read_csv(
    filepath_or_buffer='Data/Raw/Epilogos_FullDataDownload_15-state_hg38.txt',
    sep='\t',
)

In [None]:
# Filter ChromHMM data: keep autosomal loci whose top-scoring state is one of
# state1-state7, and record that state together with its score.

# Strip a leading 'chr' prefix only when it is really there, so re-running this
# cell on already-cleaned names does not truncate them.
chrom = df_chromhmm['Chrom'].astype(str).str.replace('^chr', '', regex=True)

# Select autosomal chromosomes with an explicit whitelist (1-22), so that X, Y
# and any other sequence name are all excluded. `assign` returns a new frame,
# so nothing is written into a filtered slice.
autosomes = [str(i) for i in range(1, 23)]
df_chromhmm = df_chromhmm.assign(Chrom=chrom).loc[chrom.isin(autosomes)]

cols_to_keep = ['state{}'.format(i) for i in range(1, 16)]
state_scores = df_chromhmm[cols_to_keep]

# Both summaries are computed from the purely numeric score columns. Taking
# `.max(axis=1)` after the string 'state' column has been added mixes str and
# float values, which raises a TypeError on pandas >= 2.0.
df_chromhmm_states = state_scores.assign(
    state=state_scores.idxmax(axis=1),
    max_val=state_scores.max(axis=1),
)

# Put the coordinates and the dominant-state summary side by side (aligned on the row index).
df_chromhmm_filt = pd.concat(
    [df_chromhmm[['Chrom', 'start', 'end']], df_chromhmm_states[['state', 'max_val']]],
    axis=1,
)

# Discard loci whose dominant state is state8-state15.
drop_values = ['state{}'.format(i) for i in range(8, 16)]
df_chromhmm_filt = df_chromhmm_filt[~df_chromhmm_filt['state'].isin(drop_values)].reset_index(drop=True)


In [None]:
# Based on the dominant ChromHMM state, label loci as promoters or enhancers:
# state1-state3 -> 'promoter', every other state -> 'enhancer'.
# This is done as one vectorized column operation; assigning one row at a time
# through `.loc` is extremely slow on a table of this kind.
promoter_states = ['state1', 'state2', 'state3']
is_promoter = df_chromhmm_filt['state'].isin(promoter_states)
df_chromhmm_filt['final_state'] = is_promoter.map({True: 'promoter', False: 'enhancer'})
df_chromhmm_filt

In [None]:
# Midpoint of every locus, stored back on the filtered table (true division, so it stays a float).
df_chromhmm_filt['mid'] = df_chromhmm_filt['start'].add(df_chromhmm_filt['end']).div(2)

# Final export layout: pick and order the columns, exposing 'Chrom' as 'chr'.
final_columns = ['Chrom', 'mid', 'final_state', 'max_val', 'start', 'end', 'state']
df_chromhmm_final = df_chromhmm_filt[final_columns].rename(columns={'Chrom': 'chr'})

In [None]:
# Display the final enhancer/promoter table for a visual check before exporting it.
df_chromhmm_final

In [None]:
# Write the enhancer/promoter loci to disk with to_csv defaults (comma-separated, row index included).
df_chromhmm_final.to_csv(path_or_buf='Data/chromhmm_epilogos15statehg38_clean.txt')

#### CTCFbs

In [None]:
# Load the CTCF binding-site table (tab-separated, no header row). The first
# column is read as the index and is used below as the chromosome name; the
# remaining columns are named 'start' and 'end'.
df_ctcf = pd.read_csv('Data/Raw/GRCh38_ctcf_locs_liftover_OUTPUT.bed', names=['start', 'end'], index_col=0, sep='\t')

# Move the chromosome name out of the index into an explicit 'chr' column.
df_ctcf['chr'] = df_ctcf.index
df_ctcf = df_ctcf.reset_index(drop=True)

# Drop incomplete rows BEFORE any string handling or integer cast: a missing
# value would otherwise make the int64 cast of 'mid' fail, so a dropna placed
# after the cast can never take effect.
df_ctcf = df_ctcf.dropna(subset=['chr', 'start', 'end'])

# Strip a leading 'chr' prefix (e.g. 'chr1' -> '1').
df_ctcf['chr'] = df_ctcf['chr'].astype(str).str.replace('^chr', '', regex=True)

# Select autosomal chromosomes. An explicit whitelist (1-22) is used instead of
# only excluding X/Y, so that any other sequence name cannot slip through.
# `.copy()` keeps the assignment below from writing into a slice.
autosomes = [str(i) for i in range(1, 23)]
df_ctcf = df_ctcf[df_ctcf['chr'].isin(autosomes)].copy()

# Integer midpoint of each site (the fractional part of odd-length sums is truncated).
df_ctcf['mid'] = ((df_ctcf['start'] + df_ctcf['end']) / 2).astype('int64')

df_ctcf = df_ctcf[['chr', 'mid', 'start', 'end']]
df_ctcf

In [None]:
# Write the cleaned CTCF binding sites to disk with to_csv defaults (comma-separated, row index included).
df_ctcf.to_csv(path_or_buf='Data/ctcf_loci_GRCh38_clean_2.txt')

In [None]:
## Filter CTCFs using insulation scores to identify strong CTCFs

# Read the insulation-score table (first CSV column becomes the index) and
# collect one (start, end) pair per row.
# NOTE: the pairs carry no chromosome information, so `ranges` pools the
# windows of every chromosome present in the table.
df_i_score = pd.read_csv('Data/Raw/insulation_scores.csv', index_col=0)
ranges = list(zip(df_i_score['start'].to_numpy(), df_i_score['end'].to_numpy()))
len(ranges)

In [None]:
## Filter CTCFs using insulation scores to identify strong CTCFs
#
# A CTCF site is kept when its midpoint lies inside (bounds inclusive) at least
# one insulation window located on the SAME chromosome. Testing against the
# windows of all chromosomes at once would let a site match a window that only
# exists on a different chromosome.

selected_per_chr = []  # one filtered frame per chromosome, concatenated once at the end
for chr_id in range(1, 23):
    chr_id = str(chr_id)
    print('Going for chr {}'.format(chr_id))

    # Insulation windows of this chromosome only.
    # Assumes the 'Chr' column holds integer chromosome numbers - TODO confirm.
    df_i_score_chr = df_i_score[df_i_score['Chr'] == int(chr_id)]
    lowers = df_i_score_chr['start'].to_numpy()
    uppers = df_i_score_chr['end'].to_numpy()

    # CTCF sites of this chromosome ('chr' is stored as a string such as '1').
    df_ctcf_chr = df_ctcf[df_ctcf['chr'] == chr_id].reset_index(drop=True)

    # For each midpoint, compare against all windows of the chromosome at once.
    hits = [
        bool(((lowers <= mid_point_ctcf) & (mid_point_ctcf <= uppers)).any())
        for mid_point_ctcf in tqdm(df_ctcf_chr['mid'])
    ]
    # A boolean Series (rather than a plain list) keeps this a row filter even
    # when the chromosome has no sites; an empty list would select zero columns.
    mask = pd.Series(hits, index=df_ctcf_chr.index, dtype=bool)

    selected_per_chr.append(df_ctcf_chr[mask].reset_index(drop=True))

# Single concatenation instead of growing the frame inside the loop.
# reset_index() (without drop) keeps each row's per-chromosome position in an
# 'index' column, which the following cell removes.
df_ctcf_selected = pd.concat(selected_per_chr).reset_index()

In [None]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution would otherwise fail with a KeyError.
df_ctcf_selected = df_ctcf_selected.drop(columns=['index'], errors='ignore')

# Make sure the midpoints are stored as 64-bit integers.
df_ctcf_selected['mid'] = df_ctcf_selected['mid'].astype('int64')

In [None]:
# Write the strong CTCF binding-site positions to disk with to_csv defaults (comma-separated, row index included).
df_ctcf_selected.to_csv(path_or_buf='Data/ctcf__STRONG_loci_GRCh38_clean_2.csv')