### Setup: environment variables and imports

In [1]:
# Notebook-wide environment variables, set in the first cell so they are in
# place before the libraries imported below are loaded.
# Comments are kept on their own lines: text after the value on a %env line
# would become part of the value.
# Turn off HDF5 file locking (presumably to avoid lock errors on shared or
# network filesystems -- TODO confirm the motivation).
%env HDF5_USE_FILE_LOCKING=FALSE
# Uncomment to keep wandb from syncing this run to the server.
# %env WANDB_MODE=dryrun
# Notebook name reported to wandb.
%env WANDB_NOTEBOOK_NAME=deepmr

env: HDF5_USE_FILE_LOCKING=FALSE
env: WANDB_NOTEBOOK_NAME=deepmr


In [2]:
import bpnet
from bpnet.cli.contrib import ContribFile
from bpnet.plot.tracks import plot_tracks, to_neg

import os
import uuid
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output, HTML
from pathlib import Path
import pandas as pd
import numpy as np
clear_output()

#### Optional: Setup wandb

In [3]:
import wandb

# Start a wandb run for this notebook under the "deepmr" project of the
# "an1lam" entity; the settings are named so they are easy to spot and edit.
wandb_run_settings = dict(project="deepmr", entity="an1lam")
wandb.init(**wandb_run_settings)

[34m[1mwandb[0m: Currently logged in as: [33man1lam[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [4]:
# --- Run configuration -------------------------------------------------------
# Number of models trained for the ensemble (one training run per seed).
n_reps = 5

# --- File paths ---------------------------------------------------------------
# Directory containing the gin / dataspec config files; the training cell
# changes into this directory before calling `bpnet train`.
config_dir = Path('./bpnet/')

model_config_fname = 'ChIP-nexus-default.gin'
data_config_fname = 'ChIP-nexus.dataspec.yml'

# A new timestamp is taken every time this cell runs, so each session writes to
# its own output directory.
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# Project root. Defaults to the original machine-specific location, but can be
# redirected by setting the DEEPMR_PROJECT_DIR environment variable, so the
# notebook can run on other machines without editing this cell.
# * It is made absolute because the training command runs after
#   `cd {config_dir}`, where a relative output path would resolve differently.
# * The trailing separator is preserved so that plain string concatenation such
#   as `{project_dir}dat/...` keeps working.
_default_project_dir = '/home/ubuntu/dev/an1lam/deepmr/'
project_dir = os.path.join(
    os.path.abspath(
        os.path.expanduser(
            # `or` also covers the variable being set to an empty string.
            os.environ.get('DEEPMR_PROJECT_DIR') or _default_project_dir
        )
    ),
    '',
)
data_dir = os.path.join(project_dir, 'dat')
output_dir = os.path.join(data_dir, f'res-bpnet-training-{timestamp}')
# Display the resolved output directory as the cell result.
output_dir

'/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-training-2022-06-18-14-11-05'

In [5]:
# Create the run directory and the folder that will hold one link per ensemble
# member; existing directories are left untouched.
for directory in (output_dir, os.path.join(output_dir, 'output_ensemble')):
    os.makedirs(directory, exist_ok=True)

In [6]:
# Print the dataspec file that is passed to `bpnet train` (per-task count tracks and peak files).
!cat {config_dir}/{data_config_fname}

task_specs:
  Oct4:
    tracks:
    - /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Oct4/counts.pos.bw
    - /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Oct4/counts.neg.bw
    peaks: /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Oct4/idr-optimal-set.summit.bed.gz
  Sox2:
    tracks:
    - /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Sox2/counts.pos.bw
    - /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Sox2/counts.neg.bw
    peaks: /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Sox2/idr-optimal-set.summit.bed.gz
  Nanog:
    tracks:
    - /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Nanog/counts.pos.bw
    - /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/Nanog/counts.neg.bw
    peaks: /home/ubuntu/dev/an1lam/deepmr/dat/bpnet-manuscript-data/data/chip-nexus/

### Data stats

In [7]:
# chromsomome names of differnet peaks
!zcat {project_dir}dat/bpnet-manuscript-data/data/chip-nexus/Sox2/idr-optimal-set.summit.bed.gz \
    | cut -f 1 | sort -u

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX
chrY


Each task (or TF) can specify a set of peaks associated with it. Here is the number of peaks per TF that we will use in this tutorial:

In [8]:
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']

# number of peaks per task
for task in tasks:
    print(task)
    !zcat {data_dir}/bpnet-manuscript-data/data/chip-nexus/{task}/idr-optimal-set.summit.bed.gz | wc -l

Oct4
25849
Sox2
10999
Nanog
56459
Klf4
57601


## 2. Train the model

Having specified `dataspec.yml`, we are now ready to train the model with 

```
bpnet train <dataspec.yml> <output dir> [optional flags]
```


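We will use a pre-made model [bpnet9](../bpnet/premade/bpnet9.gin) as a starting point and modify a few parameters through the gin config file printed in the next cell (`ChIP-nexus-default.gin`). The list below describes a reduced tutorial setup:
- train the model only on chromosomes 16-19
- evaluate the model on chromosome 2
- use only 3 layers of dilated convolutions 
- use an input sequence length of 200 bp and accordingly lower the augmentation shift to 100 bp

**Note:** the config printed in the next cell sets `n_dil_layers = 9` and `seq_width = 1000`, so the last two bullets do not describe this run; the chromosome split is not visible in that config either.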

In [9]:
# Print the gin config that the training cell passes to `bpnet train` via `--config`.
!cat {config_dir}/{model_config_fname} 
# NOTE: test_chr will also be excluded, similar to 'exclude_chr'

b_loss_weight = 0
c_loss_weight = 10
p_loss_weight = 1
filters = 64
tconv_kernel_size = 25
lr = 0.004
n_dil_layers = 9
train.batch_size = 128
merge_profile_reg = False
dataspec = 'ChIP-nexus.dataspec.yml'

batchnorm = False

padding = 'same'
seq_width = 1000

tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']


Have a look at the original gin file of bpnet9 here: https://github.com/kundajelab/bpnet/blob/master/bpnet/premade/bpnet9-ginspec.gin. For more information on using gin files see <https://github.com/google/gin-config>. 

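To track model training and evaluation, you can use [wandb](http://wandb.com/) by adding `--wandb=ENTITY/PROJECT` (for example `--wandb=avsec/bpnet-demo`) to `bpnet train`; the training progress is then available at the matching page, e.g. https://app.wandb.ai/avsec/bpnet-demo. Note that the training command below does not pass `--wandb`.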

Let's train!

In [10]:
# Path of ensemble member 0 (`output_ensemble/0`), used as the example model
# directory in the listing at the end of the notebook. The training cell
# creates this entry as a symlink to the corresponding run directory.
example_model_dir = os.path.join(output_dir, 'output_ensemble', '0')

In [None]:
# Train for at most 10 epochs
for i in range(n_reps):
    # setup a new run_id (could be done automatically, but then the output directory would change)
    run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_" + str(uuid.uuid4())
    !cd {config_dir} && bpnet train {data_config_fname} --premade=bpnet9 \
        --config={model_config_fname} {output_dir} \
        --run-id '{run_id}' \
        --override='train.epochs=20; train.seed={i}'
    # softlink the new output directory
    !rm -rf {output_dir}/output_ensemble/{i} && ln -srf {output_dir}/{run_id} {output_dir}/output_ensemble/{i}

Using TensorFlow backend.


2022-06-18 14:11:11,679 [INFO] NumExpr defaulting to 4 threads.
The mpl_toolkits.axes_grid1.colorbar module was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use matplotlib.colorbar instead.
  from mpl_toolkits.axes_grid1.colorbar import colorbar
[32mINFO[0m [44m[06-18 14:11:13][0m Using gpu: 0, memory fraction: 0.45[0m
2022-06-18 14:11:13.792797: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-06-18 14:11:13.801700: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300020000 Hz
2022-06-18 14:11:13.802015: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56205e4c82c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-18 14:11:13.802059: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-

Epoch 1/20
2022-06-18 14:11:33.118727: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2022-06-18 14:11:33.301675: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
[32mINFO[0m [44m[06-18 15:36:57][0m Evaluating dataset: valid-peaks[0m
229it [01:10,  3.25it/s]                                                        
  fracs = yt / yt.sum(axis=1, keepdims=True)
[32mINFO[0m [44m[06-18 15:40:00][0m Evaluating dataset: train-peaks[0m
711it [03:36,  3.29it/s]                                                        
[32mINFO[0m [44m[06-18 15:49:43][0m Saved metrics to /home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-training-2022-06-18-14-11-05/2022-06-18_14-11-06_10b79897-24d4-4e0d-8d3e-08837998b69a/evaluation.valid.json[0m
[32mINFO[0m [44m[06-18 15:49:43][0m Done![0m
----------------------------------------
Final metrics: 
{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.1842908885317406,
    "Oct4/profile/binsize=1/random_auprc": 0.0029172058898126905,
    "Oct4/profile/binsize=1/n_positives": 49497,
    "Oct4/profile/binsize=1/frac_ambigous": 0.07206144485858364,
   

Executing:  10%|███▏                           | 3/29 [00:05<00:46,  1.80s/cell]2022-06-18 15:49:49.304394: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-06-18 15:49:49.318914: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300020000 Hz
2022-06-18 15:49:49.319320: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55d5e20f6090 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-18 15:49:49.319373: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-06-18 15:49:49.322535: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-06-18 15:49:49.413484: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be 

Using TensorFlow backend.


2022-06-18 15:52:10,748 [INFO] NumExpr defaulting to 4 threads.
The mpl_toolkits.axes_grid1.colorbar module was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use matplotlib.colorbar instead.
  from mpl_toolkits.axes_grid1.colorbar import colorbar
[32mINFO[0m [44m[06-18 15:52:12][0m Using gpu: 0, memory fraction: 0.45[0m
2022-06-18 15:52:12.919423: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-06-18 15:52:12.928673: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300020000 Hz
2022-06-18 15:52:12.929016: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55bab5ca2b60 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-18 15:52:12.929064: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-

  .replace(" = ", ": "))
Epoch 1/20
2022-06-18 15:52:32.739476: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2022-06-18 15:52:32.923726: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
[32mINFO[0m [44m[06-18 17:16:03][0m Evaluating dataset: valid-peaks[0m
229it [01:08,  3.34it/s]                                                        
  fracs = yt / yt.sum(axis=1, keepdims=True)
[32mINFO[0m [44m[06-18 17:18:57][0m Evaluating dataset: train-peaks[0m
711it [03:27,  3.42it/s]                                                        
[32mINFO[0m [44m[06-18 17:28:10][0m Saved metrics to /home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-training-2022-06-18-14-11-05/2022-06-18_15-52-05_086e45a7-8659-42cf-add6-28c633600ab5/evaluation.valid.json[0m
[32mINFO[0m [44m[06-18 17:28:10][0m Done![0m
----------------------------------------
Final metrics: 
{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.16296390542983225,
    "Oct4/profile/binsize=1/random_auprc": 0.0029512151980041917,
    "Oct4/profile/binsize=1/n_positives": 49666,
    "Oct4/profile/b

Executing:  10%|███▏                           | 3/29 [00:05<00:45,  1.74s/cell]2022-06-18 17:28:16.411797: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-06-18 17:28:16.422817: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300020000 Hz
2022-06-18 17:28:16.423152: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x557d01947c50 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-18 17:28:16.423196: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-06-18 17:28:16.426254: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-06-18 17:28:16.490542: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be 

Using TensorFlow backend.


2022-06-18 17:30:33,042 [INFO] NumExpr defaulting to 4 threads.
The mpl_toolkits.axes_grid1.colorbar module was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use matplotlib.colorbar instead.
  from mpl_toolkits.axes_grid1.colorbar import colorbar
[32mINFO[0m [44m[06-18 17:30:34][0m Using gpu: 0, memory fraction: 0.45[0m
2022-06-18 17:30:34.963418: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-06-18 17:30:34.972330: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300020000 Hz
2022-06-18 17:30:34.972623: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56053f302cf0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-18 17:30:34.972668: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-

Epoch 1/20
2022-06-18 17:30:54.143310: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2022-06-18 17:30:54.335492: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[32mINFO[0m [44m[06-18 19:01:37][0m Evaluating dataset: valid-peaks[0m
229it [01:08,  3.33it/s]                                                        
  fracs = yt / yt.sum(axis=1, keepdims=True)
[32mINFO[0m [44m[06-18 19:04:31][0m Evaluating dataset: train-peaks[0m
711it [03:26,  3.44it/s]                                                        
[32mINFO[0m [44m[06-18 19:13:43][0m Saved metrics to /home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-training-2022-06-18-14-11-05/2022-06-18_17-30-27_95271244-2f5e-4427-8866-571e7fad1636/evaluation.valid.json[0m
[32mINFO[0m [44m[06-18 19:13:43][0m Done![0m
----------------------------------------
Final metrics: 
{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.18430455210194352,
    "Oct4/profile/binsize=1/random_auprc": 0.0029166269473294266,
    "Oct4/profile/binsize=1/n_positives": 49570,
    "Oct4/profile/binsize=1/fra

Executing:  10%|███▏                           | 3/29 [00:04<00:41,  1.61s/cell]2022-06-18 19:13:49.054585: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-06-18 19:13:49.065272: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300020000 Hz
2022-06-18 19:13:49.065710: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55f80d0198e0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-18 19:13:49.065737: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-06-18 19:13:49.068779: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-06-18 19:13:49.140202: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be 

Using TensorFlow backend.


2022-06-18 19:16:02,798 [INFO] NumExpr defaulting to 4 threads.
The mpl_toolkits.axes_grid1.colorbar module was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use matplotlib.colorbar instead.
  from mpl_toolkits.axes_grid1.colorbar import colorbar
[32mINFO[0m [44m[06-18 19:16:04][0m Using gpu: 0, memory fraction: 0.45[0m
2022-06-18 19:16:04.653344: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-06-18 19:16:04.662099: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300020000 Hz
2022-06-18 19:16:04.662409: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56395fd171c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-18 19:16:04.662474: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-

Epoch 1/20
2022-06-18 19:16:22.880637: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2022-06-18 19:16:23.064291: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

In [None]:
# Print the output directory of this session, a blank separator line, and the
# contents of ensemble member 0.
! echo {output_dir}
! echo
! ls {output_dir}/output_ensemble/0

In [None]:
# Long listing of the example model directory, including hidden files, sorted
# by modification time with the newest entries last.
!ls -latr {example_model_dir}/