In [1]:
%load_ext autoreload
%autoreload 2

import csv
from datetime import datetime
import math
import os
from pathlib import Path
import pickle

from IPython.display import clear_output
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from tqdm.notebook import tqdm
from utils import detect_device
import uncertainty_toolbox
import uncertainty_toolbox.data as udata
import uncertainty_toolbox.metrics as umetrics
from uncertainty_toolbox.metrics_calibration import (
    get_proportion_lists_vectorized,
)
import uncertainty_toolbox.viz as uviz
from uncertainty_toolbox.recalibration import iso_recal

import bpnet
from bpnet.datasets import StrandedProfile
from bpnet.dataspecs import DataSpec, TaskSpec
from bpnet.utils import create_tf_session
from bpnet.utils import read_json
from bpnet.seqmodel import SeqModel
from bpnet.plot.evaluate import plot_loss, regression_eval

from filter_instrument_candidates import filter_variants_by_score
from in_silico_mutagenesis import compute_summary_statistics, generate_wt_mut_batches, write_results

Using TensorFlow backend.






2022-06-24 01:35:37,574 [INFO] NumExpr defaulting to 4 threads.


In [2]:
# --- Run-size constants ---------------------------------------------------
n_seqs = 2000
n_reps = 5
alphabet_size = 4  # presumably the four nucleotides A/C/G/T — TODO confirm

# --- Filesystem locations -------------------------------------------------
# `output_dir` embeds a second-resolution timestamp taken when this cell runs.
timestamp = format(datetime.now(), '%Y-%m-%d-%H-%M-%S')
output_dir = f'/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-{timestamp}'
model_base_dir = "/home/ubuntu/dev/an1lam/deepmr/dat/res-bpnet-training-2022-01-29-22-58-12/output_ensemble"

# --- Factors and their ordered pairings -----------------------------------
factor_names = ['Oct4', 'Sox2', 'Nanog', 'Klf4']

# Every ordered (exposure, outcome) pair of two *different* factors,
# grouped by exposure in `factor_names` order.
exposure_outcome_pairs = [
    (exposure_name, outcome_name)
    for exposure_name in factor_names
    for outcome_name in (name for name in factor_names if name != exposure_name)
]

# One effect-size CSV file name per pair, in the same order as the pairs.
results_fnames = [
    f'{exposure_name}_{outcome_name}_effect_sizes.csv'
    for exposure_name, outcome_name in exposure_outcome_pairs
]

In [3]:
# Print the raw `evaluation.valid.json` stored in sub-directory `0` of
# `model_base_dir` (shell `cat`; `{model_base_dir}` is interpolated by IPython).
!cat {model_base_dir}/0/evaluation.valid.json

{
  "valid-peaks": {
    "Oct4/profile/binsize=1/auprc": 0.08467554491740614,
    "Oct4/profile/binsize=1/random_auprc": 0.0029037278008280194,
    "Oct4/profile/binsize=1/n_positives": 49453,
    "Oct4/profile/binsize=1/frac_ambigous": 0.07213581058862216,
    "Oct4/profile/binsize=1/imbalance": 0.002921060543502851,
    "Oct4/profile/binsize=10/auprc": 0.35810432332139863,
    "Oct4/profile/binsize=10/random_auprc": 0.03350862566740526,
    "Oct4/profile/binsize=10/n_positives": 39442,
    "Oct4/profile/binsize=10/frac_ambigous": 0.36082977090869234,
    "Oct4/profile/binsize=10/imbalance": 0.03382008694682867,
    "Oct4/counts/mse": 0.7959021925926208,
    "Oct4/counts/var_explained": -0.5864921808242798,
    "Oct4/counts/pearsonr": -0.13039048718437327,
    "Oct4/counts/spearmanr": -0.07904258291467911,
    "Oct4/counts/mad": 0.6649706363677979,
    "Sox2/profile/binsize=1/auprc": 0.1665607337788616,
    "Sox2/profile/binsize=1/random_auprc": 0.005742239420075821,

In [4]:
# Read the gin configuration saved in sub-directory `0` of `model_base_dir`;
# later cells look up the `bpnet_data.*` entries from this dict.
gin_config = read_json(os.path.join(model_base_dir, '0', 'config.gin.json'))
# Display the loaded configuration.
gin_config

{'Adam.amsgrad': False,
 'Adam.beta_1': 0.9,
 'Adam.beta_2': 0.999,
 'Adam.decay': 0.0,
 'Adam.epsilon': 'None',
 'Adam.lr': 0.004,
 'DeConv1D.batchnorm': False,
 'DeConv1D.filters': 64,
 'DeConv1D.n_hidden': 0,
 'DeConv1D.n_tasks': 2,
 'DeConv1D.padding': 'same',
 'DeConv1D.tconv_kernel_size': 25,
 'DilatedConv1D.add_pointwise': False,
 'DilatedConv1D.batchnorm': False,
 'DilatedConv1D.conv1_kernel_size': 25,
 'DilatedConv1D.filters': 64,
 'DilatedConv1D.n_dil_layers': 9,
 'DilatedConv1D.padding': 'same',
 'DilatedConv1D.skip_type': 'residual',
 'GlobalAvgPoolFCN.batchnorm': False,
 'GlobalAvgPoolFCN.dropout': 0,
 'GlobalAvgPoolFCN.dropout_hidden': 0,
 'GlobalAvgPoolFCN.hidden': 'None',
 'GlobalAvgPoolFCN.n_splines': 0,
 'GlobalAvgPoolFCN.n_tasks': 2,
 'MovingAverages.window_sizes': [1, 50],
 'PeakPredictionProfileMetric.binsizes': [1, 10],
 'PeakPredictionProfileMetric.neg_max_threshold': 0.005,
 'PeakPredictionProfileMetric.pos_min_threshold': 0.015,
 'PeakPredictionProfileMetric.re

In [5]:
# Load the data specification saved in sub-directory `0` of `model_base_dir`.
# Author's reminder was "remember to re-add 0" — presumably about the '0'
# path component, which is present here; TODO confirm and drop the reminder.
ds = DataSpec.load(os.path.join(model_base_dir, '0', 'dataspec.yml'))
# Task names, taken by iterating over the spec's `task_specs`.
tasks = list(ds.task_specs)
tasks

['Oct4', 'Sox2', 'Nanog', 'Klf4']

In [6]:
# Validation dataset: same data spec as the model, limited to the validation
# chromosomes, with peak/sequence widths taken from the saved gin config.
# Restricting to the motif-instance intervals was tried and is disabled:
#   intervals_file="/home/ubuntu/motif-instances.bed"
dl_valid = StrandedProfile(
    ds,
    incl_chromosomes=gin_config['bpnet_data.valid_chr'],
    peak_width=gin_config['bpnet_data.peak_width'],
    seq_width=gin_config['bpnet_data.seq_width'],
    shuffle=False,
)

In [7]:
# Load the entire validation dataset through `load_all`, in batches of 256
# with a single worker. Slow: the recorded run took ~4.5 minutes (115 batches).
valid = dl_valid.load_all(batch_size=256, num_workers=1)

100%|██████████| 115/115 [04:36<00:00,  2.40s/it]


In [8]:
# Sanity check: shape of the Oct4 counts targets (recorded run: (29277, 2)).
# The second axis is presumably one entry per strand — TODO confirm.
valid['targets']['Oct4/counts'].shape

(29277, 2)

In [9]:
# Distinct chromosomes that occur in the loaded validation ranges.
np.unique(valid['metadata']['range']['chr'])

array(['chr2', 'chr3', 'chr4'], dtype='<U4')

In [10]:
# Motif instances: a tab-separated file read without a header row, so the
# column names are assigned by hand below.
mfi_df = pd.read_csv('/home/ubuntu/motif-instances.bed', sep='\t', header=None)
# Assumes the file has exactly these eight columns in this order (a different
# column count makes this assignment raise) — TODO confirm against the file.
mfi_df.columns = ["chrom", "start", "end", "tf", "match-score", "strand", "contrib-score", "log-odds-score"]

In [11]:
# Preview the first 50 motif instances on chr2/chr3/chr4. The chromosome list
# is hard-coded; it matches the chromosomes found in `valid` in the recorded run.
mfi_df.query('chrom in ["chr2", "chr3", "chr4"]').head(50)

Unnamed: 0,chrom,start,end,tf,match-score,strand,contrib-score,log-odds-score
121154,chr2,3100228,3100244,Oct4-Sox2,0.612059,-,0.495667,0.495014
121155,chr2,3100228,3100238,Oct4,0.219802,-,0.961056,0.322772
121156,chr2,3100256,3100265,Nanog,0.451569,+,0.865339,0.403645
121157,chr2,3118160,3118170,Klf4,0.565084,-,0.537957,0.554911
121158,chr2,3143379,3143395,Oct4-Sox2,0.552605,+,0.105209,0.176218
121159,chr2,3143385,3143395,Oct4,0.335974,+,0.515512,0.079868
121160,chr2,3143588,3143604,Sox2,0.415792,+,0.050975,0.162419
121161,chr2,3195604,3195620,Sox2,0.333833,+,0.294853,0.109945
121162,chr2,3195651,3195660,Nanog,0.355383,+,0.727978,0.168073
121163,chr2,3195733,3195749,Oct4-Sox2,0.272016,+,0.02982,0.067189


In [13]:
# One row per validation example, built from its genomic range metadata and
# ordered by chromosome, then by sequence start. The original positional index
# is kept (not reset), so index labels still identify rows of `valid`.
valid_ranges = pd.DataFrame(
    {
        'chrom': valid['metadata']['range']['chr'],
        'sequence_start': valid['metadata']['range']['start'],
        'sequence_end': valid['metadata']['range']['end'],
        'strand': valid['metadata']['range']['strand'],
    }
).sort_values(by=['chrom', 'sequence_start'])

In [14]:
# Motif instances restricted to chr2/chr3/chr4 (hard-coded list), sorted by
# chromosome and then by motif start.
valid_mfi_df = mfi_df.query('chrom in ["chr2", "chr3", "chr4"]').sort_values(by=['chrom', 'start'])

In [26]:
%%time
mfi_idx = 0
tf_records = []
for idx, row in valid_ranges.head(1000).iterrows():
    mfi_rows = valid_mfi_df.query(
        f"start >= {row.sequence_start} and end <= {row.sequence_end} and chrom == '{row.chrom}'"
    )
    # mfi_rows = mfi_rows.query(f"strand == '{row.strand}'")
    for _, mfi_row in mfi_rows.iterrows():
        valid_ranges.loc[idx, f"{mfi_row.tf}_start"] = mfi_row.start
        valid_ranges.loc[idx, f"{mfi_row.tf}_end"] = mfi_row.end  
        valid_ranges.loc[idx, f"{mfi_row.tf}_match_score"] = mfi_row["match-score"]                

CPU times: user 10.4 s, sys: 0 ns, total: 10.4 s
Wall time: 10.1 s


In [30]:
# Transcription factors to compare pairwise below (same four names as
# `factor_names` in the configuration cell).
tfs = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
tfs

['Oct4', 'Sox2', 'Nanog', 'Klf4']

In [33]:
# For every unordered pair of distinct TFs, print:
#   tf1, tf2,
#   number of ranges where both TFs have a recorded motif start,
#   smallest and largest absolute difference between the two motif starts
#   (NaN rows, i.e. ranges missing either motif, are ignored).
for i, tf1 in enumerate(tfs):
    for tf2 in tfs[i:]:
        if tf1 == tf2:
            continue  # skip pairing a factor with itself

        tf1_starts = valid_ranges[f"{tf1}_start"]
        tf2_starts = valid_ranges[f"{tf2}_start"]

        n_both = (tf1_starts.notna() & tf2_starts.notna()).sum()
        start_gap = np.abs(tf1_starts - tf2_starts)

        print(tf1, tf2, n_both, np.nanmin(start_gap), np.nanmax(start_gap))

Oct4 Sox2 130 5.0 756.0
Oct4 Nanog 249 12.0 717.0
Oct4 Klf4 200 11.0 892.0
Sox2 Nanog 145 3.0 488.0
Sox2 Klf4 130 13.0 449.0
Nanog Klf4 208 7.0 841.0
