# This notebook goes through dataset creation for quantitative models. 
**!!!** If you are not running on bizon, or are not using the GRCh38 (`hg38`) or mm9 genomes, please download the corresponding FASTA files and change the paths in the `set_genome` function.

**required scripts for running this notebook:**
- /home/shush/profile/QuantPred/datasets/basenji_data.py
- /home/shush/profile/QuantPred/datasets/basenji_data_read.py
- /home/shush/profile/QuantPred/datasets/basenji_data_write.py
- /home/shush/profile/QuantPred/datasets/genome.py
- /home/shush/profile/QuantPred/datasets/util.py
- /home/shush/profile/QuantPred/datasets/dna_io.py

or just pull git@github.com:amberT15/QuantPred.git

In [10]:
import os
import pandas as pd
from glob import glob
import subprocess
import yaml, os, shutil, sys
import util

In [11]:
def set_genome(genome):
    """Look up the reference file paths registered for a genome build.

    Args:
        genome: name of the genome build; 'hg38' and 'mm9' are registered.

    Returns:
        dict with the keys 'size' (chromosome sizes file), 'fa' (fasta file)
        and 'unmap' (bed file of unmappable genome segments, optional for the
        pipeline).

    Raises:
        AssertionError: if the genome name is not registered.
    """
    hg38_files = dict(size='/home/shush/genomes/GRCh38_EBV.chrom.sizes.tsv',
                      fa='/home/shush/genomes/hg38.fa',
                      unmap='/home/shush/genomes/GRCh38_unmap.bed')
    mm9_files = dict(size='/home/shush/genomes/mm9.chrom.sizes',
                     fa='/home/shush/genomes/mm9.fa',
                     unmap='/home/shush/genomes/mm9-blacklist.bed')
    registry = {'hg38': hg38_files, 'mm9': mm9_files}

    # Fail early with a readable message instead of a bare KeyError.
    assert genome in registry, 'Unknown genome!'
    return registry[genome]

def write_basenji_samplefile(bigwig_filepaths, basenji_samplefile):
    """Write the tab-separated sample sheet that lists one bigwig track per row.

    The file has a header line and the columns
    index, identifier, file, sum_stat, description.

    Args:
        bigwig_filepaths: iterable of bigwig file paths; may be empty, in which
            case only the header line is written.
        basenji_samplefile: path of the sample sheet to create or overwrite.
    """
    columns = ['index', 'identifier', 'file', 'sum_stat', 'description']
    # Build all rows first and create the frame once; assigning df.loc[b] row
    # by row re-allocates the frame on every iteration.
    rows = [
        # identifier = file name up to the first '.b' (e.g. '.bw', '.bigwig');
        # every track uses the 'mean' summary statistic and an empty description
        [position, os.path.basename(filepath).split('.b')[0], filepath, 'mean', '']
        for position, filepath in enumerate(bigwig_filepaths)
    ]
    samples = pd.DataFrame(rows, columns=columns)

    # the sample sheet carries its own 'index' column, so skip the frame index
    samples.to_csv(basenji_samplefile, index=False, sep='\t')
    
def write_basset_samplefile(bed_filepaths, basset_samplefile):
    """Write the tab-separated, header-less sample sheet that lists the bed files.

    Each row holds two fields: identifier and file path.

    Args:
        bed_filepaths: iterable of bed file paths; may be empty, which yields
            an empty file.
        basset_samplefile: path of the sample sheet to create or overwrite.
    """
    print('Generating merged samplefile for the entire bedfile set')
    # Build all rows first and create the frame once; assigning df.loc[b] row
    # by row re-allocates the frame on every iteration.
    rows = [
        # identifier = file name up to the first '.b' (e.g. '.bed', '.bed.gz')
        [os.path.basename(filepath).split('.b')[0], filepath]
        for filepath in bed_filepaths
    ]
    samples = pd.DataFrame(rows, columns=['identifier', 'file'])

    # neither the frame index nor a header line belongs in this file
    samples.to_csv(basset_samplefile, index=False, header=False, sep='\t')


The most relevant options here are:

| Argument | Note |
|:---|:---|
| chroms_only | if 'all', creates train, validation and test sets; if specific chromosomes are given, creates a test set from only those |
| input_size | input sequence length in nucleotides |
| base_dir | the output directory for the tfr files |
| bigwig_paths_pattern | glob pattern that collects all the bigwig files |
| bigwig_filepaths | can be set directly as a list of all the bigwig files if you don't want to use glob |
| bedfile_paths_pattern | same, for the bed files of genomic regions to focus on, IF you want a peak-centered dataset |
| bed_filepaths | same as `bigwig_filepaths`, but for the bed files |
| pool_window | bin size, if set to 1 can bin later in the training |
| dilation_rate | fraction of data to include, can set to 0.1 to test pipelines|



In [23]:
# ---- main settings ----
input_size = 3072
base_dir = '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc'

# Coverage tracks: every glob match whose name ends in 'bigwig' or 'bw'.
bigwig_paths_pattern = '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/*/Bigwig/*'
bigwig_filepaths = [path for path in glob(bigwig_paths_pattern) if path.endswith(('bigwig', 'bw'))]

# Region files: every glob match whose name ends in 'bed' or 'gz'.
bedfile_paths_pattern = '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/*/Bed/*'
bed_filepaths = [path for path in glob(bedfile_paths_pattern) if path.endswith(('bed', 'gz'))]

# ---- optional arguments ----
pool_window = 1
dilation_rate = 0.1
valid_chr = 'chr9'
test_chr = 'chr8'
genome = 'mm9'
chroms_only = 'all'
norm = 'none'
threshold = 0  # threshold for train and val
test_threshold = 0  # test threshold

# Only change these if you want overlapping inputs.
step = 0
stride_test = 1  # e.g. set to 0.333333333333 to move by 1K if 3K input
padding = 'none'

# Create the output directory if it is not there already, then place both
# sample sheets inside it.
util.make_directory(base_dir)
basset_samplefile = os.path.join(base_dir, 'basset_samplefile.csv')  # change to random for thresholded
basenji_samplefile = os.path.join(base_dir, 'basenji_samplefile.csv')


Directory already exists!


In [24]:

# The bigwig sample sheet is written unconditionally.
write_basenji_samplefile(bigwig_filepaths, basenji_samplefile) # write pre-requisite file for the pipeline specifying bw paths

# 'random' is a sentinel value rather than a path: no bed sample sheet is written for it.
if basset_samplefile != 'random':
    write_basset_samplefile(bed_filepaths, basset_samplefile)# write pre-requisite file for the pipeline specifying bed paths

Generating merged samplefile for the entire bedfile set


In [25]:
# NOTE(review): config_path is not used by the visible cells; the dump cell
# writes to 'config.yaml' in the working directory instead.
config_path = os.path.join(base_dir, 'config.yaml')

# Gather every pipeline setting into one nested dict, ready to be dumped as YAML.
config = {
    'genomefile': set_genome(genome),
    'chroms': {'valid': valid_chr, 'test': test_chr, 'only': chroms_only},
    'input': {
        'downsample': dilation_rate,
        'size': input_size,
        'pool': pool_window,
        'norm': norm,
        'step': step,
        'padding': padding,
    },
    'samplefile': {'basset': basset_samplefile, 'basenji': basenji_samplefile},
    'threshold': threshold,
    'test_threshold': test_threshold,
    # the output prefix encodes input size and pooling window, e.g. i_3072_w_1
    'output': {'dir': base_dir, 'prefix': 'i_%i_w_%i' % (input_size, pool_window)},
    'stride_test': stride_test,
}

In [26]:
# Serialise the settings to config.yaml in the notebook's working directory.
# NOTE(review): presumably bw_to_tfr.sh reads the file from there — confirm in the script.
with open('config.yaml', 'w') as config_file:
    # yaml.dump returns None when it is handed a stream, so the result is not bound to a name.
    yaml.dump(config, config_file, default_flow_style=False)

In [27]:
# Run the conversion script located in the current working directory.
# NOTE(review): presumably it reads the config.yaml written by the previous cell — confirm in bw_to_tfr.sh.
! ./bw_to_tfr.sh

!!!
/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc
/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/basset_samplefile.csv
Peak centering
Generating bed region combined file for all TFs
Sorting bedfile and genome file
Generating bed file complementary to peak regions
Merging nonpeak and blacklisted regions
Running basenji data processing
/home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/basenji_samplefile.csv -g merged_avoid_regions.bed -l 3072 -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 -t chr8 -v chr9 -w 1 --local -d 0.1 --norm none --step 0 --padding none -p 19 --threshold 0 --test_threshold 0 --only_chroms all --stride_test 1
['/home/shush/genomes/mm9.fa', '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/basenji_samplefile.csv']
Using test set threshold of 0.0
stride_train 1 converted to 3072.000000
stride_test 1 converted to 3072.000000
CHR
Contigs divided into
 Tra

./basenji_data_write.py -s 1536 -e 1792 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/sequences.bed /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/seqs_cov /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/tfrecords/train-6.tfr train -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 1792 -e 2048 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/sequences.bed /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/seqs_cov /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/tfrecords/train-7.tfr train -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 204

*
*
*
*
*
256
*
*
*
*
*
*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
  values = values.flatten().tostring()
*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
./basenji_data_write.py -s 4864 -e 5120 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/sequences.bed /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/seqs_cov /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/tfrecords/train-19.tfr train -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 5120 -e 5376 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/

./basenji_data_write.py -s 8704 -e 8960 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/sequences.bed /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/seqs_cov /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/tfrecords/train-34.tfr train -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 8960 -e 9216 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/sequences.bed /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/seqs_cov /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/tfrecords/train-35.tfr train -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 9

./basenji_data_write.py -s 12544 -e 12800 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/sequences.bed /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/seqs_cov /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/tfrecords/train-49.tfr train -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py -s 12800 -e 13056 --umap_clip 1.000000 /home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/sequences.bed /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/seqs_cov /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/tfrecords/train-50.tfr train -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1 --threshold 0.000000 --test_threshold 0.000000
./basenji_data_write.py 

*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
89
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
256
*
*
*
*
*
  values = values.flatten().tostring()
*
*
*
*
*
843
*
*
*
*
*
  values = values.flatten().tostring()
~~~
~~~
~~~
~~~
~~~
~~~
~~~
~~~
~~~
~~~
/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset_pc/i_3072_w_1/statistics.json
~~~
~~~
~~~
~~~
~~~
~~~
~~~
~~~
~~~
~~~


In [30]:
# Show the first lines of a sample sheet.
# NOTE(review): the path is hard-coded to .../colab/tfr_dataset, not the base_dir set above (.../colab/tfr_dataset_pc).
# The saved output has the header-less, two-column layout produced by write_basset_samplefile,
# not the basenji one — confirm which file was intended and re-run.
! head /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset/basenji_samplefile.csv

BalbC_Virgin_27ac_q1e-03	/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/A/Bed/BalbC_Virgin_27ac_q1e-03.bed
BalbC_During_2ndPreg_27ac_q1e-03	/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/A/Bed/BalbC_During_2ndPreg_27ac_q1e-03.bed
BalbC_During_1stPreg_27ac_q1e-03	/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/A/Bed/BalbC_During_1stPreg_27ac_q1e-03.bed
BalbC_Parous_27a_q1e-03	/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/A/Bed/BalbC_Parous_27a_q1e-03.bed


In [85]:
# sanity check for dataset splits: count how often each label in column 4 of sequences.bed occurs
# NOTE(review): the path is hard-coded to .../colab/tfr_dataset, not the base_dir set above (.../colab/tfr_dataset_pc) — confirm.
! cut -f4 /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset/i_3072_w_1/sequences.bed | sort | uniq -c

  42787 test
 717678 train
  40239 valid


In [86]:
# summary of the configs corresponding to the new dataset
# NOTE(review): the path is hard-coded to .../colab/tfr_dataset, and the saved output below
# (downsample 1, threshold 2, test_threshold -1, basset 'random') differs from the settings
# in the cells above — it appears to come from a different run; confirm and re-run.
! cat /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset/i_3072_w_1/config.yaml

chroms:
  only: all
  test: chr8
  valid: chr9
genomefile:
  fa: /home/shush/genomes/mm9.fa
  size: /home/shush/genomes/mm9.chrom.sizes
  unmap: /home/shush/genomes/mm9-blacklist.bed
input:
  downsample: 1
  norm: none
  padding: none
  pool: 1
  size: 3072
  step: 0
output:
  dir: /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset
  prefix: i_3072_w_1
samplefile:
  basenji: /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/tfr_dataset/basenji_samplefile.csv
  basset: random
stride_test: 1
test_threshold: -1
threshold: 2


# Make just the test set with a 2K window size for evaluation

This is useful if you already have a dataset but want to make a smaller one from specific chromosomes only, potentially with other thresholds, peak files, etc.

In [74]:
# ---- main settings ----
input_size = 2048
base_dir = '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/chr8'

# Coverage tracks: every glob match whose name ends in 'bigwig' or 'bw'.
bigwig_paths_pattern = '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/*/Bigwig/*'
bigwig_filepaths = [path for path in glob(bigwig_paths_pattern) if path.endswith(('bigwig', 'bw'))]

# Region files: every glob match whose name ends in 'bed' or 'gz'.
bedfile_paths_pattern = '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/datasets/*/Bed/*'
bed_filepaths = [path for path in glob(bedfile_paths_pattern) if path.endswith(('bed', 'gz'))]

# ---- optional arguments ----
pool_window = 1
dilation_rate = 0.1
valid_chr = 'chr9'
test_chr = 'chr8'
genome = 'mm9'
# *********** <------ ********* change this to comma separated list of chr you want to include
chroms_only = 'chr8'
norm = 'none'
threshold = 0  # threshold for train and val
test_threshold = 0  # test threshold

# Only change these if you want overlapping inputs.
step = 0
stride_test = 1  # e.g. set to 0.333333333333 to move by 1K if 3K input
padding = 'none'

# Create the output directory if it is not there already.
util.make_directory(base_dir)
# 'random' instead of a bed sample sheet path; the path variant would be
# os.path.join(base_dir, 'basset_samplefile.csv')
basset_samplefile = 'random'
basenji_samplefile = os.path.join(base_dir, 'basenji_samplefile.csv')



Making directory: /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/chr8


In [75]:

# The bigwig sample sheet is written unconditionally.
write_basenji_samplefile(bigwig_filepaths, basenji_samplefile) # write pre-requisite file for the pipeline specifying bw paths

# 'random' is a sentinel value rather than a path: no bed sample sheet is written for it.
if basset_samplefile != 'random':
    write_basset_samplefile(bed_filepaths, basset_samplefile)# write pre-requisite file for the pipeline specifying bed paths

In [76]:

# NOTE(review): redundant cell — the next cell starts with these same two assignments
# before filling in config, so running this one has no additional effect.
config_path = os.path.join(base_dir, 'config.yaml')
config = {}

In [77]:
# NOTE(review): config_path is not used by the visible cells; the dump cell
# writes to 'config.yaml' in the working directory instead.
config_path = os.path.join(base_dir, 'config.yaml')

# Gather every pipeline setting into one nested dict, ready to be dumped as YAML.
config = {
    'genomefile': set_genome(genome),
    'chroms': {'valid': valid_chr, 'test': test_chr, 'only': chroms_only},
    'input': {
        'downsample': dilation_rate,
        'size': input_size,
        'pool': pool_window,
        'norm': norm,
        'step': step,
        'padding': padding,
    },
    'samplefile': {'basset': basset_samplefile, 'basenji': basenji_samplefile},
    'threshold': threshold,
    'test_threshold': test_threshold,
    # the output prefix encodes input size and pooling window, e.g. i_2048_w_1
    'output': {'dir': base_dir, 'prefix': 'i_%i_w_%i' % (input_size, pool_window)},
    'stride_test': stride_test,
}

In [78]:
# Serialise the settings to config.yaml in the notebook's working directory.
# NOTE(review): presumably bw_to_tfr.sh reads the file from there — confirm in the script.
with open('config.yaml', 'w') as config_file:
    # yaml.dump returns None when it is handed a stream, so the result is not bound to a name.
    yaml.dump(config, config_file, default_flow_style=False)

In [79]:
# Run the conversion script located in the current working directory.
# NOTE(review): presumably it reads the config.yaml written by the previous cell — confirm in bw_to_tfr.sh.
! ./bw_to_tfr.sh

!!!
/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/chr8
random
Chopping randomly
Running basenji data processing
/home/shush/genomes/mm9.fa /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/chr8/basenji_samplefile.csv -g /home/shush/genomes/mm9-blacklist.bed -l 2048 -o /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/chr8/i_2048_w_1 -t chr8 -v chr9 -w 1 --local -d 0.1 --norm none --step 0 --padding none -p 19 --threshold 0 --test_threshold 0 --only_chroms chr8 --stride_test 1
['/home/shush/genomes/mm9.fa', '/mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/chr8/basenji_samplefile.csv']
Using test set threshold of 0.0
stride_train 1 converted to 2048.000000
stride_test 1 converted to 2048.000000
chr8
CHR
Contigs divided into
 Train:     0 contigs,          0 nt (0.0000)
 Valid:     0 contigs,          0 nt (0.0000)
 Test:     18 contigs,  131470271 nt (1.0000)
<generator object divide_contigs_chr.<locals>.<genexpr> at 0x7fdadafe9228>
./basenji_data_read.py --crop 0 -w 1 -u mean -s 1.000000 

In [80]:
# Sanity check: count how often each label in column 4 of sequences.bed occurs.
# With chroms_only set to 'chr8' only 'test' entries are expected.
! cut -f4 /mnt/31dac31c-c4e2-4704-97bd-0788af37c5eb/colab/chr8/i_2048_w_1/sequences.bed | sort | uniq -c

   6418 test
