In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (including the local `interpretation` and `models` packages used below)
# are reloaded before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import torch
import glob
import pickle
import matplotlib.pyplot as plt

from interpretation.interpret import compute_importance_score_c_type, compute_importance_score_bias, visualize_sequence_imp
from models.models import CATAC2, CATAC_w_bias

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Sample from peak sequences

In [6]:
# Read the pickled peak table from disk.
with open('../results/peaks_seq.pkl', 'rb') as handle:
    peaks = pickle.load(handle)

# Keep only the peaks located on chromosomes 6, 13 and 22 and retain their
# `sequence` column; downstream cells expect the result under the name `seq`.
selected_chr = ['6','13', '22']
seq = peaks.loc[peaks.chr.isin(selected_chr), 'sequence']
seq

Unnamed: 0_level_0,chr,start,end,middle_peak,sequence,GC_cont
peakID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13:18211588-18212586,13,18211588,18212586,18212087,AAGACTCTGAGAAAAACTGTAAATTAAACACAAATATGGTGGAAGC...,0.347412
13:18212943-18213672,13,18212943,18213672,18213308,GGTACATACAGACCATTGAATACTATGCAGCCACAAAAAAGAATAA...,0.295654
13:19587105-19588284,13,19587105,19588284,19587694,AAGGAAGTTATACAAAAAAACAATTTTGGGTGAGGTTGTTATCTAG...,0.452148
13:19597289-19598094,13,19597289,19598094,19597692,CAAACAGTCACCTGGAGCAGAGAGGTCAGTGTGTTTACGTGTTTCC...,0.466309
13:19601665-19602525,13,19601665,19602525,19602095,TTAGTAGAGAAGGGGTTTCACCATGTGTTGGTCAGGCTGGTCTCCA...,0.529541
...,...,...,...,...,...,...
6:170553912-170554797,6,170553912,170554797,170554354,TTCAATTTACAGCTCTTCCCTGTCAAGAGTCTTAAACAGAGCATCT...,0.453369
6:170574897-170575869,6,170574897,170575869,170575383,TGTAAAAGTACATCTTCAGCTGACTCAGGAATAAAATCAGAAAGGG...,0.375244
6:170584150-170585077,6,170584150,170585077,170584614,TGCTGGATAAATGTTGGCTACTATAATAAAATAAGCCTCTAAGATA...,0.483154
6:170596247-170597169,6,170596247,170597169,170596708,CAGGGGGTCCTCAGGATCCTCCTGGAATCTCTTCCTCAGCATCTGG...,0.416992


# Compute importance scores

In [None]:
# Paths to the trained model weights and to the Tn5 bias model handed to the
# attribution helper.
path_model = '../results/train_res/128_10_model.pkl'
path_model_bias = "../data/Tn5_NN_model.h5"

all_c_type = ['Immature', 'Mesenchymal', 'Myoblast', 'Myogenic', 'Neuroblast',
       'Neuronal', 'Somite']
time_point = ["D8", "D12", "D20", "D22"]  # not used in this cell; kept in case other cells rely on it

# Architecture hyperparameters, defined once so that the size computation
# below and the model constructor cannot drift apart.
seq_len = 4096  # presumably the input sequence length — TODO confirm
first_kernel = 21
rest_kernel = 3
nb_conv = 10
nb_filters = 128

# Size of the final convolution output: start from seq_len, subtract
# (first_kernel - 1), then subtract every entry of `cropped`
# (2**l for l in 0..nb_conv-2, the whole list repeated 2*(3-1) times).
# NOTE(review): the literal 3 presumably mirrors rest_kernel — confirm against
# models.models before tying the two together.
cropped = [2**l for l in range(nb_conv - 1)] * (2 * (3 - 1))
size_final_conv = seq_len - (first_kernel - 1) - sum(cropped)

# Build the model and load the trained weights (mapped to CPU at load time).
model = CATAC_w_bias(nb_conv=nb_conv, nb_filters=nb_filters, first_kernel=first_kernel,
                     rest_kernel=rest_kernel, out_pred_len=1024,
                     nb_pred=4, size_final_conv=size_final_conv)

model.load_state_dict(torch.load(path_model, map_location=torch.device('cpu')))
# NOTE(review): model.eval() is never called in this cell; confirm that
# compute_importance_score_bias switches the model to evaluation mode.

# Compute attribution scores for the "Myogenic" cell type.
# NOTE: this rebinds `seq` from the peak-sequence Series to the first value
# returned by the helper, so re-run the sequence-loading cell before
# re-running this one.
# NOTE(review): the meaning of the trailing positional `1` is defined by
# compute_importance_score_bias — confirm in interpretation.interpret.
seq, shap_scores, proj_scores = compute_importance_score_bias(model, path_model_bias, seq, device, "Myogenic", all_c_type, 1)

In [None]:
#Save encoded seq + scores
np.savez('../results/encod_seq.npz', seq[:,:4,:])
np.savez('../results/seq_scores.npz', shap_scores[:,:4,:], proj_scores[:,:4,:])

print("Shap scores saved!")

In [None]:
# Reload the arrays saved to disk so the cells below can run without
# recomputing the attribution scores. The archives are opened with context
# managers so the underlying file handles are closed once the arrays are read.
with np.load('../results/encod_seq.npz') as encoded_npz:
    seq = encoded_npz["arr_0"]

# arr_0 holds the SHAP scores and arr_1 the projected scores, matching the
# positional order used when seq_scores.npz was written.
with np.load('../results/seq_scores.npz') as scores_npz:
    shap_scores = scores_npz['arr_0']
    proj_scores = scores_npz['arr_1']

# Visualize a few examples

In [None]:
# Plot the projected importance scores for three hand-picked examples, one
# call per example. Indexing with a one-element list keeps the leading axis,
# so each call receives a single-example slice.
# NOTE(review): 0 and 4096 are presumably the start/end positions of the
# plotted window — confirm in interpretation.interpret.
example_indices = [73, 1266, 563]
for example_idx in example_indices:
    visualize_sequence_imp(proj_scores[[example_idx], :4, :], 0, 4096)

# Use TF-MoDISco to find TFBS
Following the tutorial at: https://github.com/jmschrei/tfmodisco-lite/blob/main/examples/ModiscoDemonstration.ipynb

In [None]:
# Run the `modisco motifs` command on the saved encoded sequences (-s) and
# score archive (-a) with -n 2000, writing the result to modisco_results.h5
# in the notebook's working directory.
# NOTE(review): seq_scores.npz contains two arrays (arr_0 = SHAP scores,
# arr_1 = projected scores); confirm which one the modisco CLI reads.
!modisco motifs -s  ../results/encod_seq.npz -a  ../results/seq_scores.npz -n 2000 -o modisco_results.h5

In [None]:
!modisco report -i modisco_results.h5 -o report/

In [None]:
from IPython.display import HTML

# Render the generated motif report inline. `filename=` makes a missing
# report raise an error instead of silently rendering the literal path
# string as HTML.
HTML(filename='report/motifs.html')

# Run TOMTOM on modisco results

In [None]:
# Re-generate the motif report, this time passing the JASPAR motif file (-m)
# for the TOMTOM comparison. The report is written to report/TOMTOM/ (-o) and
# the same path is used as the -s prefix.
!modisco report -i modisco_results.h5 -o report/TOMTOM/ -s report/TOMTOM/ -m ../data/JASPAR_motif.txt

In [None]:
from IPython.display import HTML

# Render the TOMTOM-annotated report written to report/TOMTOM/ by the cell
# above (not the earlier report in report/). `filename=` makes a missing
# report raise an error instead of rendering the literal path string.
HTML(filename='report/TOMTOM/motifs.html')