In [None]:
!git clone https://github.com/arnavmdas/epiphany.git

Cloning into 'epiphany'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 65 (delta 17), reused 37 (delta 8), pack-reused 6[K
Unpacking objects: 100% (65/65), done.


In [None]:
%%capture

##########################
#    Loading packages    #
##########################

# 1. Load packages

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch import randn
from torch.nn import MSELoss
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import DataLoader
import numpy as np
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import random
import pandas as pd
import seaborn as sns
import time
import pickle
from datetime import datetime
!pip install hickle
import hickle as hkl
from torch.autograd import Variable
import gzip
import sys
import os 
from sklearn.decomposition import TruncatedSVD, PCA
torch.set_default_tensor_type(torch.DoubleTensor)
!pip install pyBigWig
import pyBigWig

# 2. Load data - part 2

!wget https://s3.amazonaws.com/hicfiles.tc4ga.com/public/juicer/juicer_tools_1.22.01.jar
!wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.refGene.gtf.gz
!gunzip /content/hg38.refGene.gtf.gz

chrom_list = ["chr"+str(i) for i in range(1,23)] #for human hg38
length_list = [248956422,242193529,198295559,190214555,181538259,170805979,159345973,145138636,
               138394717,133797422,135086622,133275309,114364328,107043718,101991189,90338345,
               83257441,80373285,58617616,64444167,46709983,50818468]
chrom_len_dict = dict(zip(chrom_list,length_list))

## Load pre-trained model 

- Pretrained on the GM12878 cell line.
- Predicts Hi-C contact matrices at 10 kb resolution.

In [None]:
%cd /content/epiphany/downstream/utils
from model_architecture_util import * 

%cd /content/epiphany/
!mkdir pretrained
%cd pretrained
!wget -O pretrained_GM12878.pt_model https://wcm.box.com/shared/static/vv8xzxnurfk8ddjwuc9evkhapl6fj0tu.pt_model

#Load model 
wsize = 14000
net = Net(window_size=wsize)
restore(net,'/content/epiphany/pretrained/pretrained_GM12878.pt_model')
net.eval()

/content/epiphany/downstream/utils
/content/epiphany
/content/epiphany/pretrained
--2022-06-10 01:31:51--  https://wcm.box.com/shared/static/vv8xzxnurfk8ddjwuc9evkhapl6fj0tu.pt_model
Resolving wcm.box.com (wcm.box.com)... 107.152.29.197
Connecting to wcm.box.com (wcm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/vv8xzxnurfk8ddjwuc9evkhapl6fj0tu.pt_model [following]
--2022-06-10 01:31:51--  https://wcm.box.com/public/static/vv8xzxnurfk8ddjwuc9evkhapl6fj0tu.pt_model
Reusing existing connection to wcm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://wcm.app.box.com/public/static/vv8xzxnurfk8ddjwuc9evkhapl6fj0tu.pt_model [following]
--2022-06-10 01:31:51--  https://wcm.app.box.com/public/static/vv8xzxnurfk8ddjwuc9evkhapl6fj0tu.pt_model
Resolving wcm.app.box.com (wcm.app.box.com)... 107.152.29.201
Connecting to wcm.app.box.com (wcm.app.box.com)|107.152.29.201|:443.

Net(
  (conv1): ConvBlock(
    (conv): Conv1d(5, 70, kernel_size=(17,), stride=(1,))
    (act): ReLU()
    (pool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (do1): Dropout(p=0.1, inplace=False)
  (conv2): ConvBlock(
    (conv): Conv1d(70, 90, kernel_size=(7,), stride=(1,))
    (act): ReLU()
    (pool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (do2): Dropout(p=0.1, inplace=False)
  (conv3): ConvBlock(
    (conv): Conv1d(90, 70, kernel_size=(5,), stride=(1,))
    (act): ReLU()
    (pool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (do3): Dropout(p=0.1, inplace=False)
  (conv4): ConvBlock(
    (conv): Conv1d(70, 20, kernel_size=(5,), stride=(1,))
    (act): ReLU()
  )
  (pool): AdaptiveMaxPool1d(output_size=45)
  (do4): Dropout(p=0.1, inplace=False)
  (rnn1): LSTM(900, 1200, batch_first=True, bidirectional=True)
  (rnn2): LSTM(2400, 1200, batch_first=True, bidirectional=Tru

## Download data

### Download input epigenomic data for GM12878

In [None]:
%cd /content/epiphany 
!mkdir bigWig
%cd bigWig 
!wget -O GM12878_CTCF_merge.bigWig https://wcm.box.com/shared/static/d1hzwihi97o9eghcqp03ec5e6rf60gre.bigwig
!wget -O GM12878_DNaseI_merge.bigWig https://wcm.box.com/shared/static/hiowh46s1yyps7hx5sk9qt10f62ls0rw.bigwig 
!wget -O GM12878_H3K27ac_merge.bigWig https://wcm.box.com/shared/static/l60ucwsmbxczuikralm8y6swnex7516q.bigwig
!wget -O GM12878_H3K27me3_merge.bigWig https://wcm.box.com/shared/static/te5tx8ygg69q2my9tm8wqlalmtd4am67.bigwig
!wget -O GM12878_H3K4me3_merge.bigWig https://wcm.box.com/shared/static/mi9inmxxyhlpqhep2mtviroy02wtnloy.bigwig

### Download HiC-DC+ normalized obs/exp ground truth data

Normalized data for each chromosome can be downloaded from the [Box drive](https://wcm.box.com/s/q5k0lm1050lkzigxi1ynh7muv9x2b2mq)

In [None]:
%cd /content/epiphany 
!mkdir ground_truth
%cd ground_truth 

!wget -O chr3_ground_truth.txt https://wcm.box.com/shared/static/nood2xtxdak9ln50k30yu0kjjolc8r8g.txt 

## Generate predictions for a single chromosome

The `results_generation` function generates predicted contact maps (a 1 Mb distance band along the entire chromosome). Its arguments are:
- chrom: the chromosome to generate predictions for
- net: the loaded model used to generate the predictions
- cell_type: the cell type whose epigenomic bigWig files should be used
- bwfile_dir: the folder where the bigWig files for each epigenomic track are stored
- submatrix_location: location for saving the first intermediate file (submatrices along the chromosome)
- assemble_matrix_location: location for saving the second intermediate file (assembled predicted submatrices along the chromosome)
- ground_truth_file: location of the ground truth contact matrices (saved as lists of lists in pickle format)
- ground_truth_location: location for saving the subset of the ground truth whose coordinates match the predictions
- window_size: window size used in the model
- seq_length: length of the submatrix along the diagonal
- resolution_hic: resolution of the Hi-C contact maps (default is 10 kb)


In [None]:
%cd /content/epiphany/downstream/utils
from generate_predictions_util import *

#2. generate predictions for chromosomes

chrom = "chr3"
print(chrom,datetime.now())
results_generation(chrom = chrom, net=net, 
                    cell_type = "GM12878", 
                    bwfile_dir = "/content/epiphany/bigWig",
                    submatrix_location = "/content/intermediate_matrices.txt", assemble_matrix_location = "/content/assembled_chromosome.txt",
                    ground_truth_file = '/content/epiphany/ground_truth/chr3_ground_truth.txt', ground_truth_location = "/content/ground_truth_corresponding_location.txt", 
                    window_size = wsize) #normcounts, zvalue, zfull