### Install dependencies

In [None]:
# Environment variables that must be in place before the libraries below are imported.
# Turn off HDF5 file locking.
%env HDF5_USE_FILE_LOCKING=FALSE
# Put wandb into "dryrun" mode — presumably so that nothing is uploaded; confirm against the installed wandb version.
%env WANDB_MODE=dryrun
# Notebook name reported to wandb.
%env WANDB_NOTEBOOK_NAME=deepmr

In [None]:
import bpnet
from bpnet.cli.contrib import ContribFile
from bpnet.plot.tracks import plot_tracks, to_neg

import os
import uuid
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output, HTML
from pathlib import Path
import pandas as pd
import numpy as np
# Wipe whatever the imports above wrote to this cell's output area.
clear_output()

#### Optional: Setup wandb

In [None]:
import wandb

# Start a wandb run for this notebook. mode='disabled' turns wandb into a
# no-op, so nothing is logged or uploaded.
# NOTE(review): WANDB_MODE=dryrun is also set via %env at the top of the
# notebook — confirm which of the two modes is actually intended.
wandb.init(
    project='deepmr',
    entity='an1lam',
    mode='disabled',
)

In [None]:
# --- Run configuration ---
# Number of training replicates (one per seed) that make up the ensemble.
n_reps = 5

# --- File paths ---
# Directory containing the gin / dataspec configs, relative to the notebook's
# working directory.
config_dir = Path('./bpnet/')

model_config_fname = 'ChIP-nexus-default.gin'
data_config_fname = 'ChIP-nexus.dataspec.yml'

# Captured once, so that each execution of this cell yields a fresh output directory.
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# Project root. Defaults to the original checkout location; set the
# DEEPMR_PROJECT_DIR environment variable to run the notebook elsewhere.
# Joining with '' guarantees a trailing separator, so paths built by plain
# string concatenation onto project_dir stay valid.
project_dir = os.path.join(
    os.environ.get('DEEPMR_PROJECT_DIR') or '/home/ubuntu/dev/an1lam/deepmr/',
    '',
)
data_dir = os.path.join(project_dir, 'dat')
output_dir = os.path.join(data_dir, f'res-bpnet-training-{timestamp}')
# Display the resolved output directory as the cell result.
output_dir

In [None]:
# Create the run directory and the sub-directory that collects one entry per
# ensemble member; exist_ok keeps the cell safe to re-run.
for new_dir in (output_dir, os.path.join(output_dir, 'output_ensemble')):
    os.makedirs(new_dir, exist_ok=True)

In [None]:
# Print the data specification file that is passed to `bpnet train` further down.
!cat {config_dir}/{data_config_fname}

### Data stats

In [None]:
# chromsomome names of differnet peaks
!zcat {project_dir}dat/bpnet-manuscript-data/data/chip-nexus/Sox2/idr-optimal-set.summit.bed.gz \
    | cut -f 1 | sort -u

Each task (or TF) can specify a set of peaks associated with it. Here is the number of peaks per TF we will use in this tutorial:

In [None]:
# Tasks (TFs) covered in this notebook.
tasks = ['Oct4', 'Sox2', 'Nanog', 'Klf4']

# Number of peaks per task: print the task name, then the line count (wc -l)
# of that task's summit file.
for task in tasks:
    print(task)
    !zcat {data_dir}/bpnet-manuscript-data/data/chip-nexus/{task}/idr-optimal-set.summit.bed.gz | wc -l

## 2. Train the model

Having specified `dataspec.yml`, we are now ready to train the model with 

```
bpnet train <dataspec.yml> <output dir> [optional flags]
```


We will use a pre-made model [bpnet9](../bpnet/premade/bpnet9.gin) as a starting point and modify a few parameters specified in the config.gin file. Specifically, we will 
- train the model only on chromosomes 16-19
- evaluate the model on chromosome 2
- use only 3 layers of dilated convolutions 
- use an input sequence length of 200 bp and accordingly lower the augmentation shift to 100 bp

In [None]:
# Print the gin config that is passed to `bpnet train` via --config further down.
!cat {config_dir}/{model_config_fname} 
# NOTE: test_chr will also be excluded, similar to 'exclude_chr'.

Have a look at the original gin file of bpnet9 here: https://github.com/kundajelab/bpnet/blob/master/bpnet/premade/bpnet9-ginspec.gin. For more information on using gin files see <https://github.com/google/gin-config>. 

To track model training and evaluation, you can use [wandb](http://wandb.com/) by adding `--wandb=avsec/bpnet-demo` to `bpnet train`; you can then navigate to https://app.wandb.ai/avsec/bpnet-demo to see the training progress. Note that the training command below does not pass this flag.

Let's train!

In [None]:
# Ensemble member 0 doubles as the example model whose files are listed
# further down.
example_model_dir = os.path.join(
    os.path.join(output_dir, 'output_ensemble'),
    '0',
)

In [None]:
# Train `n_reps` replicates for at most 10 epochs each; replicate i uses
# train.seed=i, so the runs differ only in their seed.
for i in range(n_reps):
    # Set the run id explicitly (timestamp plus a random UUID4). It could be
    # generated automatically, but then the output directory would change.
    run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_" + str(uuid.uuid4())
    !cd {config_dir} && bpnet train {data_config_fname} --premade=bpnet9 \
        --config={model_config_fname} {output_dir} \
        --run-id '{run_id}' \
        --override='train.epochs=10; train.seed={i}'
    # Remove any existing output_ensemble/<i> entry, then create a relative
    # symlink (ln -s -r -f) from it to this run's directory.
    !rm -rf {output_dir}/output_ensemble/{i} && ln -srf {output_dir}/{run_id} {output_dir}/output_ensemble/{i}

In [None]:
# Print the output directory, a blank separator line, and the contents of
# ensemble member 0.
! echo {output_dir}
! echo
! ls {output_dir}/output_ensemble/0

In [None]:
# Detailed listing of the example model directory: long format, hidden files
# included, sorted by modification time with the oldest entry first.
!ls -latr {example_model_dir}/