# Annotate 1 Mbp bins with large-scale covariates

This notebook runs different scripts to intersect bin coordinates with large-scale covariates, including: 
    
- Chromatin accessibility (DNase)
- Replication timing (Repli-seq)
- Gene expression (RNA-seq)

In [1]:
import os

import pandas as pd

In [2]:
# Header lines written at the top of every job map file. Each entry becomes
# one line; an entry that already ends in '\n' leaves a blank separator line
# after it, because the writer appends its own newline to every entry.
info = [
    # Resource requests for each job.
    *('[params]', 'cores=1', 'memory=4G\n'),
    # Commands run before the jobs: load conda and activate the environment.
    *(
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate pybigwig\n',
    ),
    # Marker after which the per-job commands are appended.
    '[jobs]',
]

In [3]:
# Project root prepended to every path in this notebook (f'{main_dir}/genomic_bins/...').
# NOTE(review): with the empty string all paths start at the filesystem root
# ('/genomic_bins/...'); the saved output of the DNase `map_file` cell shows a
# '/workspace/...' prefix, so set this to the project directory before running.
main_dir = ''

In [4]:
# Bin coordinate file passed to every job command via --bins_f.
bins_f = main_dir + '/genomic_bins/data/hg38_1000kb_bin_liftover_hg19.nodrivers.filtered.mappable_positions.autosomes.binids.txt'

# DNase

In [5]:
# DNase: job map file to generate, and the script each job line invokes.
map_file = main_dir + '/genomic_bins/code/dnase_ctype.map'
code_path = main_dir + '/genomic_bins/code/dnase_ctype.py'

In [6]:
# Inputs forwarded to the DNase script via --metadata_f and --cov_f.
# cov_f is also read in this notebook to obtain the list of cancer types.
metadata_f = main_dir + '/genomic_bins/code/Roadmap.metadata.consolidated.txt.csv'
cov_f = main_dir + '/genomic_bins/code/covariates_list.csv'

In [7]:
# Base URL handed to the DNase job commands via --url.
url = (
    'https://egg2.wustl.edu/roadmap/data/byFileType/signal/'
    'consolidated/macs2signal/foldChange/'
)

In [8]:
# Directory prefix for the per-cancer-type DNase output files.
output_path = main_dir + '/genomic_bins/data/large_scale_cov/dnase'

In [9]:
# Marks to generate DNase jobs for; each one appears in the output file name.
histone_marks = [
    'DNase',
]

In [10]:
# Read the tab-separated covariate table (first row is the header) and
# collect the CANCER_TYPE column as a plain list; one job is written per entry.
cancertypes_df = pd.read_csv(cov_f, delimiter='\t', header=0)
cancertypes = cancertypes_df.loc[:, 'CANCER_TYPE'].to_list()
cancertypes

['BLADDER_URI',
 'BRCA',
 'COADREAD',
 'ESOPHA_STOMACH',
 'LNM',
 'NSCLC',
 'PROSTATE',
 'SKCM']

In [11]:
# Write the DNase job map: the shared header from `info`, then one
# `python <code_path> ...` command per (cancer type, mark) pair.
with open(map_file, 'w') as ofd:
    # Header: one `info` entry per line.
    ofd.writelines(f'{header}\n' for header in info)
    for ctype in cancertypes:
        for mark in histone_marks:
            output_f = f'{output_path}/{ctype}_hg38_1000kb_bin.{mark}.filtered.bed.gz'
            # Assemble the command from its arguments, separated by single spaces.
            job = ' '.join([
                f'python {code_path}',
                f'--cancer-type {ctype}',
                f'--bins_f {bins_f}',
                f'--metadata_f {metadata_f}',
                f'--output_f {output_f}',
                f'--cov_f {cov_f}',
                f'--url {url}',
            ])
            ofd.write(f'{job}\n')

In [12]:
# Display the path of the job map file written by the previous cell.
map_file

'/workspace/projects/hartwig/hotspots/hotspotfinder/2023_01/genomic_bins/code/dnase_ctype.map'

# RNA

In [28]:
# RNA: job map file to generate, and the script each job line invokes.
map_file = main_dir + '/genomic_bins/code/rna_ctype.map'
code_path = main_dir + '/genomic_bins/code/rna_ctype.py'

In [29]:
# Inputs forwarded to the RNA script via --metadata_f and --cov_f.
# cov_f is also read in this notebook to obtain the list of cancer types.
metadata_f = main_dir + '/genomic_bins/code/epigenome_names.csv'
cov_f = main_dir + '/genomic_bins/code/covariates_list.csv'

In [30]:
# Base URL handed to the RNA job commands via --url.
url = (
    'https://egg2.wustl.edu/roadmap/data/byDataType/rna/signal/'
    'normalized_bigwig/stranded/'
)

In [31]:
# Directory prefix for the per-cancer-type RNA output files.
output_path = main_dir + '/genomic_bins/data/large_scale_cov/rna'

In [32]:
# Read the tab-separated covariate table (first row is the header) and
# collect the CANCER_TYPE column as a plain list; one job is written per entry.
cancertypes_df = pd.read_csv(cov_f, delimiter='\t', header=0)
cancertypes = cancertypes_df.loc[:, 'CANCER_TYPE'].to_list()
cancertypes

['BLADDER_URI',
 'BRCA',
 'COADREAD',
 'ESOPHA_STOMACH',
 'LNM',
 'NSCLC',
 'PROSTATE',
 'SKCM']

In [33]:
# Bin sizes to iterate over when writing the RNA job map (one job per entry).
bins = [1_000_000]

In [34]:
# Write the RNA job map: the shared header from `info`, then one
# `python <code_path> ...` command per (cancer type, bin size) pair.
with open(map_file, 'w') as ofd:
    # Header: one `info` entry per line.
    ofd.writelines(f'{header}\n' for header in info)
    for ctype in cancertypes:
        # NOTE(review): `binsize` only controls how many times each command is
        # written; it is not interpolated into the command, and the output file
        # name hard-codes "1000kb".
        for binsize in bins:
            output_f = f'{output_path}/{ctype}_hg38_1000kb_bin.rna.filtered.bed.gz'
            # Assemble the command from its arguments, separated by single spaces.
            job = ' '.join([
                f'python {code_path}',
                f'--cancer-type {ctype}',
                f'--bins_f {bins_f}',
                f'--metadata_f {metadata_f}',
                f'--output_f {output_f}',
                f'--cov_f {cov_f}',
                f'--url {url}',
            ])
            ofd.write(f'{job}\n')

# Replication timing

In [43]:
# Replication timing: job map file to generate, and the script each job line invokes.
map_file = main_dir + '/genomic_bins/code/replication_ctype.map'
code_path = main_dir + '/genomic_bins/code/replication_ctype.py'

In [44]:
# Directory prefix for the replication timing output files.
output_path = main_dir + '/genomic_bins/data/large_scale_cov/replication'

In [47]:
# Base URL handed to the replication timing job commands via --url.
url = (
    'http://hgdownload.cse.ucsc.edu/goldenPath/hg19/'
    'encodeDCC/wgEncodeUwRepliSeq/'
)

In [48]:
# Bin sizes and cancer-type groups to iterate over when writing the
# replication timing job map (one job per group and bin size).
bins = [1_000_000]
cancertypes = ['SOLID', 'NON_SOLID']

In [49]:
# Write the replication timing job map: the shared header from `info`, then
# one `python <code_path> ...` command per (cancer-type group, bin size) pair.
with open(map_file, 'w') as ofd:
    # Header: one `info` entry per line.
    ofd.writelines(f'{header}\n' for header in info)
    for ctype in cancertypes:
        # NOTE(review): `binsize` only controls how many times each command is
        # written; it is not interpolated into the command, and the output file
        # name hard-codes "1000kb".
        for binsize in bins:
            output_f = f'{output_path}/{ctype}_hg38_1000kb_bin.RepliSeq.filtered.bed.gz'
            # Assemble the command from its arguments, separated by single spaces.
            job = ' '.join([
                f'python {code_path}',
                f'--cancer-type {ctype}',
                f'--bins_f {bins_f}',
                f'--output_f {output_f}',
                f'--url {url}',
            ])
            ofd.write(f'{job}\n')