## Imports and model initialization

In [1]:
# !pip install kipoi
# !pip install kipoiseq
# !pip install pybedtools
# !pip uninstall -y kipoi_veff
# !pip install git+https://github.com/an1lam/kipoi-veff
!pip install pyvcf
import kipoi
import kipoiseq
from kipoi_veff import MutationMap
from kipoi_veff import MutationMapPlotter
import pandas as pd
import pybedtools
import vcf
import cyvcf2

from data_loader import BedPeaksDataset

from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'limit_output': 1000})




In [2]:
df = kipoi.list_models()
deepsea_models = df[df.model.str.contains("DeepSEA")]
deepsea_models.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat(pd_list)[pd_list[0].columns]


Unnamed: 0,source,model,version,authors,contributors,doc,type,inputs,targets,veff_score_variants,license,cite_as,trained_on,training_procedure,tags
4,kipoi,DeepSEA/beluga,0.1,"[Author(name='Jian Zhou', github=None, email=N...",[],This model (DeepSEA Beluga) is a part of the E...,,seq,TFBS_DHS_probs,False,Non-comercial,https://doi.org/10.1038/s41588-018-0160-6,Chromosome 8 and 9 were excluded from training...,,"[Histone modification, DNA binding, DNA access..."
5,kipoi,DeepSEA/variantEffects,0.94,"[Author(name='Jian Zhou', github=None, email=N...","[Author(name='Roman Kreuzhuber', github='krrom...",This CNN is based on the DeepSEA model from Zh...,pytorch,seq,TFBS_DHS_probs,True,CC-BY 3.0,https://doi.org/10.1038/nmeth.3547,Chromosome 8 and 9 were excluded from training...,https://www.nature.com/articles/nmeth.3547#met...,"[Histone modification, DNA binding, DNA access..."
6,kipoi,DeepSEA/predict,0.94,"[Author(name='Jian Zhou', github=None, email=N...","[Author(name='Roman Kreuzhuber', github='krrom...",This CNN is based on the DeepSEA model from Zh...,pytorch,seq,TFBS_DHS_probs,False,CC-BY 3.0,https://doi.org/10.1038/nmeth.3547,Chromosome 8 and 9 were excluded from training...,https://www.nature.com/articles/nmeth.3547#met...,"[Histone modification, DNA binding, DNA access..."


In [3]:
deepsea = kipoi.get_model("DeepSEA/variantEffects", source="kipoi")

Using downloaded and verified file: /home/jupyter/.kipoi/models/DeepSEA/variantEffects/downloaded/model_files/weights/35956ab9c28960b5a3693f470fe980c1


## Predictions and in-silico mutagenesis

In [4]:
CHROM_ACC_COL = 'A549_DNase_None'
TF_COL = 'A549_CTCF_None'


In [5]:
dl_kwargs = {'fasta_file': '../dat/hg19.fa'}
mmp_fname = "ChIPseq.A549.CTCF.1000.random.narrowPeak.hdf5"
random_seqs_fpath = "../dat/ChIPseq.A549.CTCF.1000.random.narrowPeak.gz"
mut_effects_fpath = "../dat/ChIPseq.A549.CTCF.1000.random.narrowPeak.mut.csv"

In [6]:
import math
import numpy as np

log_uniform_prop = math.log(.05/(1-.05))
def compute_normalized_prob(prob, train_prob):
    denom = 1+np.exp(-(np.log(prob/(1-prob))+log_uniform_prop-np.log(train_prob/(1-train_prob))))
    return 1 / denom

# Ratios and normalization formula drawn from here: http://deepsea.princeton.edu/media/help/posproportion.txt
tf_compute_normalized_prob = lambda prob: compute_normalized_prob(prob, .020029)
# ENCODE	A549	DNase	None	0.048136
chrom_acc_normalized_prob = lambda prob: compute_normalized_prob(prob, 0.048136)

In [7]:
predict_dl_kwargs = dl_kwargs = {'fasta_file': '../dat/hg19.fa', 'intervals_file': random_seqs_fpath}
predictions = deepsea.pipeline.predict(predict_dl_kwargs, batch_size=100)
df_cols = [f'{label}_{i}' for i, label in enumerate(deepsea.schema.targets.column_labels)]
chrom_acc_col = next(col for col in df_cols if 'A549_DNase_None' in col)
tf_cols = [col for col in df_cols if 'A549_CTCF_None' in col]

df = pd.DataFrame(predictions, columns=df_cols)
# tf_chrom_acc_preds_df = deepsea_predictions_df[[TF_COL, CHROM_ACC_COL]]
# ENCODE, A549, CTCF, None, 0.020029
# 1/(1+exp(-( log(P/(1-P))+log(5%/(1-5%))-log(c_train/(1-c_train ))) ))
df[tf_cols[0]] = df[tf_cols[0]].apply(tf_compute_normalized_prob)
df[tf_cols[1]] = df[tf_cols[1]].apply(tf_compute_normalized_prob)

(df[(df[tf_cols[0]] <= .1) & (df[tf_cols[1]] <= .1)][tf_cols]).count()

11it [00:02,  4.68it/s]


A549_CTCF_None_720    99
A549_CTCF_None_763    99
dtype: int64

In [8]:


# cm = ConfigManager().update('cell.metadata', {'limit_output': 10})

mm = MutationMap(deepsea, deepsea.default_dataloader, dataloader_args=dl_kwargs)
# print(deepsea.default_dataloader.postprocessing['variant_effects'].bed_input)
# print(deepsea.default_dataloader)
mmp = mm.query_bed(random_seqs_fpath, scores=['ref', 'diff'])
# score_kwargs=[{'rc_merging': 'max'}, {'rc_merging': 'max'}]);

mmp.save_to_file(mmp_fname)



11761it [08:50, 22.17it/s]
11764it [08:50, 22.20it/s]
11767it [08:51, 22.24it/s]
11770it [08:51, 22.25it/s]
11773it [08:51, 22.21it/s]
11776it [08:51, 22.23it/s]
11779it [08:51, 22.25it/s]
11782it [08:51, 22.25it/s]
11785it [08:51, 22.32it/s]
11788it [08:52, 22.17it/s]
11791it [08:52, 22.14it/s]
11794it [08:52, 22.19it/s]
11797it [08:52, 22.21it/s]
11800it [08:52, 22.20it/s]
11803it [08:52, 22.18it/s]
11806it [08:52, 22.12it/s]
11809it [08:53, 22.05it/s]
11812it [08:53, 21.91it/s]
11815it [08:53, 22.05it/s]
11818it [08:53, 22.18it/s]
11821it [08:53, 22.11it/s]
11824it [08:53, 22.11it/s]
11827it [08:53, 22.15it/s]
11830it [08:53, 22.19it/s]
11833it [08:54, 22.12it/s]
11836it [08:54, 22.10it/s]
11839it [08:54, 21.73it/s]
11842it [08:54, 21.91it/s]
11845it [08:54, 22.01it/s]
11848it [08:54, 21.99it/s]
11851it [08:54, 22.00it/s]
11854it [08:55, 22.01it/s]
11857it [08:55, 21.98it/s]
11860it [08:55, 21.97it/s]
11863it [08:55, 22.03it/s]
11866it [08:55, 22.03it/s]
11869it [08:55, 22.04it/s]


MemoryError: Unable to allocate array with shape (919, 102000) and data type float32

In [9]:
mmp

OSError: ``ChIPseq.A549.CTCF.1000.random.narrowPeak.hdf5`` does not exist

In [10]:
ctcf_mutation_maps = [
    mmp.mutation_map[i]['seq']['diff']['A549_CTCF_None_763']
    for i in range(1000)
]
ctcf_original_preds = [
    tf_compute_normalized_prob(mmp.mutation_map[i]['seq']['ref']['A549_CTCF_None_763'])
    for i in range(1000)
]

NameError: name 'mmp' is not defined

In [None]:
chrom_acc_mutation_maps = [
    mmp.mutation_map[i]['seq']['diff']['A549_DNase_None_52'],
    for i in range(1000)
]
chrom_acc_original_preds = [
    chrom_acc_normalized_prob(mmp.mutation_map[i]['seq']['ref']['A549_DNase_None_52'])
    for i in range(1000)
]

In [None]:
# mmp.plot_mutmap(0, 'seq', 'logit', 'A549_CTCF_None_720')
ctcf_mutation_maps

[0.         0.26206362 0.26206362 0.26206362]
[0.26622224 0.         0.26622224 0.26622224]
[0.17965873 0.         0.17965873 0.17965873]
[0.2005928 0.        0.2005928 0.2005928]
[0.20922153 0.         0.20922153 0.20922153]
[0.21954447 0.         0.21954447 0.21954447]
[0.         0.27018908 0.27018908 0.27018908]
[0.41303042 0.         0.41303042 0.41303042]
[0.23649934 0.23649934 0.         0.23649934]


In [None]:
IDX_TO_NT = 'ACGT'

def _convert_to_mutation(pos_nt_pair):
    return "%d%s" % (pos_nt_pair[1], IDX_TO_NT[pos_nt_pair[0]])

def find_best_mutation(mutation_map, sign=1):
    seq_len = mutation_map.shape[1]
    # loop over all positions changing to each position nucleotide
    # note everything is implicitly parallelized over the batch here
    best_mute = None
    best_mute_effect = 0
    second_best_mute = None
    second_best_mute_effect = 0
    for seq_idx in range(seq_len):  # iterate over sequence
        best_out_of_nts_mute = None
        best_out_of_nts_mute_effect = 0 
        for nt_idx in range(4):  # iterate over nucleotides
            current_effect = mutation_map[nt_idx, seq_idx]
            
            if sign * current_effect > sign * best_out_of_nts_mute_effect:
                best_out_of_nts_mute_effect = current_effect
                best_out_of_nts_mute = (nt_idx, seq_idx)

        # TODO(Stephen): the right way to do this is to have a heap, squish the 2D array into a 1D
        # array of (effect, (seq_idx, nt_idx)) tuples and then take the top-k from the heap, but
        # I'm too lazy to do this right now.
        if sign * best_out_of_nts_mute_effect > (sign * best_mute_effect):
            best_mute_effect = best_out_of_nts_mute_effect
            best_mute = best_out_of_nts_mute
        elif sign * best_out_of_nts_mute_effect > (sign * second_best_mute_effect):
            second_best_mute_effect = best_out_of_nts_mute_effect
            second_best_mute = best_out_of_nts_mute 
    print(best_mute, best_mute_effect, second_best_mute, second_best_mute_effect)
    return [best_mute, second_best_mute]


with open(mut_effects_fpath, 'w', newline="") as out_file:
    fieldnames = [
        "sequence",
        "initial X prediction",
        "new X prediction",
        "initial Y prediction",
        "new Y prediction",
        "mutation",
    ]
    writer = csv.DictWriter(f, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    
    for i, ctcf_mutation_map in enumerate(ctcf_mutation_maps):
        best_mut_idx, best_mut_effect = find_best_mutation(ctcf_mutation_map['mutation_map'], sign=1)
        nt_idx, seq_idx = best_mut_idx
        original_ctcf_pred = ctcf_original_preds[i][nt_idx, seq_idx]
        new_ctcf_pred = original_ctcf_pred + ctcf_mutation_map[nt_idx, seq_idx]
        original_chrom_acc_pred = chrom_acc_original_preds[i][nt_idx, seq_idx]
        new_chrom_acc_pred = original_chrom_acc_pred + chrom_acc_mutation_maps[i][nt_idx, seq_idx]
        writer.writerow(
            {
                "sequence": ctcf_original_preds[i]['ref'],
                "initial X prediction": original_ctcf_pred,
                "new X prediction": new_ctcf_pred,
                "initial Y prediction": original_chrom_acc_pred,
                "new Y prediction": new_chrom_acc_pred,
                "mutation": _convert_to_mutation(best_mut_idx),
            }
        )
        


In [None]:
from kipoi_veff import predict_snvs 
dl_kwargs = {'fasta_file': '../dat/hg19.fa', 'intervals_file': '../dat/ChIPseq.A549.CTCF.conservative.train.narrowPeak.gz'}
predict_snvs(deepsea, deepsea.default_dataloader, '../dat/A549__CTCF__100_random_seqs.vcf.gz', 101, dataloader_args=dl_kwargs)