In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

# Concatenate all data into a single object

In [2]:
# NOTE(review): this star-import is wrapped in a string literal, so the cell is a no-op.
# Later cells call preprocess_data, pseudo_bulk and fetch_sequence, which presumably come
# from data_preprocessing — confirm, and restore it as an explicit named import so the
# notebook survives Restart & Run All.
""" from data_preprocessing import * """

In [3]:
# Build the combined AnnData from the raw 10x outputs. The second path is presumably where
# the concatenated object is saved (the same file is read back further down) — TODO confirm.
adata = preprocess_data(
    '../data/initial_10x_outputs/',
    '../results/concat.h5ad',
)



In [10]:
# Aggregate by cell type. The result gets its own name: binding it to `pseudo_bulk` would
# overwrite the helper function and make every later `pseudo_bulk(...)` call fail on a
# fresh Restart & Run All.
adata_pseudo_bulk = pseudo_bulk(adata=adata, col='cell_type')
adata_pseudo_bulk

AnnData object with n_obs × n_vars = 9 × 1742749

# Create pseudo-bulk bigwig file 

In [33]:
# Index the D20_1 ATAC BAM with samtools; the second argument is the .bai index file to write.
!samtools index ../data/initial_10x_outputs/atac_peaks/D20_1_ATAC.bam ../data/initial_10x_outputs/atac_peaks/D20_1_ATAC.bam.bai

In [44]:
# Run the project script create_bw_cell_type.py (its source is not part of this notebook).
# NOTE(review): the captured output of this run reports a truncated BAM (missing EOF marker,
# failed BGZF block read) and a missing Mesenchymal.bam, so the run did not finish cleanly —
# re-check the input BAMs before trusting any generated files.
!python create_bw_cell_type.py

bamFilesList: ['../results/bam_cell_type/D20_1/Myogenic.bam']
binLength: 50
numberOfSamples: None
blackListFileName: None
skipZeroOverZero: False
bed_and_bin: False
genomeChunkSize: None
defaultFragmentLength: read length
numberOfProcessors: 4
verbose: False
region: None
bedFile: None
minMappingQuality: None
ignoreDuplicates: False
chrsToSkip: []
stepSize: 50
center_read: False
samFlag_include: None
samFlag_exclude: None
minFragmentLength: 0
maxFragmentLength: 0
zerosToNans: False
smoothLength: None
save_data: False
out_file_for_raw_data: None
maxPairedFragmentLength: 1000
[W::bam_hdr_read] EOF marker is absent. The input is probably truncated
[E::bgzf_read_block] Failed to read BGZF block data at offset 12081241563 expected 25580 bytes; hread returned 22386
[E::bgzf_read] Read block operation failed with error 4 after 0 of 4 bytes
samtools index: failed to create index for "../results/bam_cell_type/D20_1/Mesenchymal.bam": No such file or directory
The file '../results/bam_cell_type/D2

# Dataloader for discrete values

In [None]:
# Reload the concatenated object from disk and collapse it to one profile per cell type.
# NOTE(review): `anndata` is not imported in any visible cell — confirm it is in scope.
adata = pseudo_bulk(
    anndata.read_h5ad('../results/concat.h5ad'),
    'cell_type',
)
# The return value is discarded; presumably fetch_sequence annotates `adata` in place
# using the hg38 FASTA — TODO confirm against its definition.
fetch_sequence(adata, path_genome='../data/hg38.fa')

adata

# Create model class + functions

In [3]:
class BPNet(nn.Module):
    """BPNet-style architecture, following the paper referenced below.

    Parameters
    -----------
    seq_len: int (default 1000)
        length of the input DNA sequence (stored on the instance; the layers
        themselves do not depend on it)

    nb_conv: int (default 10)
        number of convolutional layers

    nb_filters: int (default 64)
        number of filters in the convolutional layers

    first_kernel: int (default 25)
        size of the kernel in the first convolutional layer

    rest_kernel: int (default 3)
        size of the kernel in all convolutional layers except the first one

    Model Architecture
    ------------------------

    - Body: sequence of convolutional layers with residual skip connections,
      dilated convolutions (dilation 2**i for layer i >= 1), and ReLU
      activation functions

    - Head:
        > Profile prediction head: a transposed convolution producing two
          channels per position. Intended to parameterise the multinomial
          probability of Tn5 insertion counts at each position; no softmax is
          applied inside ``forward``, so the values returned are unnormalised.
        > Total count prediction: global average pooling followed by a linear
          layer with two outputs (intended as the total count per strand).

    The predicted (expected) count at a specific position is the product of
    the predicted total counts and the multinomial probability at that position.

    -------------------------

    Reference: Avsec, Ž., Weilert, M., Shrikumar, A. et al. Base-resolution models of transcription-factor binding
    reveal soft motif syntax. Nat Genet 53, 354–366 (2021). https://doi.org/10.1038/s41588-021-00782-6
    """

    def __init__(self, seq_len=1000, nb_conv=10, nb_filters=64, first_kernel=25, rest_kernel=3):
        super().__init__()

        # Hyper-parameters, kept on the instance for later inspection.
        self.seq_len = seq_len
        self.nb_conv = nb_conv
        self.nb_filters = nb_filters
        self.first_kernel = first_kernel
        self.rest_kernel = rest_kernel

        # Convolutional body: one wide first layer on the 4 input channels,
        # followed by nb_conv - 1 dilated layers. 'same' padding keeps the
        # sequence length unchanged so the residual additions line up.
        self.convlayers = nn.ModuleList()

        self.convlayers.append(nn.Conv1d(in_channels=4,
                                         out_channels=self.nb_filters,
                                         kernel_size=self.first_kernel,
                                         dilation=1, padding='same'))
        for i in range(1, self.nb_conv):
            self.convlayers.append(nn.Conv1d(in_channels=self.nb_filters,
                                             out_channels=self.nb_filters,
                                             kernel_size=self.rest_kernel,
                                             dilation=2**i, padding='same'))

        # Profile prediction head. kernel_size=25 with padding=12 preserves the
        # sequence length at the default stride of 1.
        self.deconv = nn.ConvTranspose1d(self.nb_filters, 2, kernel_size=25, padding=12)

        # Total count prediction head
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.linear = nn.Linear(self.nb_filters, 2)

    def forward(self, x):
        """Run the body and both heads.

        Parameters
        ----------
        x: torch.Tensor
            input with 4 channels, either batched ``(N, 4, L)`` or unbatched
            ``(4, L)``

        Returns
        -------
        x: torch.Tensor
            body activations, ``(N, nb_filters, L)`` (or ``(nb_filters, L)``)
        px: torch.Tensor
            profile head output flattened to ``(N * L, 2)`` (``(L, 2)`` when
            unbatched); each row holds the two output channels of one position
        cx: torch.Tensor
            total count head output, ``(N, 2)`` (or ``(2,)``)
        """
        x = F.relu(self.convlayers[0](x))
        for layer in self.convlayers[1:]:
            # Residual skip connection around every dilated layer.
            x = F.relu(layer(x)) + x

        # Profile shape. The deconvolution returns (..., 2, L); the channel
        # axis has to be moved last BEFORE flattening, otherwise each row of
        # the (-1, 2) result would pair two adjacent positions of the same
        # channel instead of the two channels of one position.
        px = self.deconv(x)
        px = px.transpose(-2, -1).reshape((-1, 2))

        # Total count head: average over positions, drop the length-1 axis,
        # then project the pooled features to two outputs.
        cx = self.global_pool(x)
        cx = cx.squeeze(-1)
        cx = self.linear(cx)

        return x, px, cx

The model predicts the base-resolution Tn5 insertion count profile over a 1,000 bp window using two complementary outputs: (1) the total Tn5 insertion count over the 1,000 bp region, and (2) a multinomial probability of Tn5 insertion at each position in the 1,000 bp sequence. The predicted (expected) count at a given position is the product of the predicted total count and the multinomial probability at that position.

In [4]:
# Smoke test on random data. The (4, 1000) tensor has no batch axis, so Conv1d treats it as a
# single unbatched example with 4 channels and length 1000; a batch would be (N, 4, 1000).
m = BPNet()
dummy_seq = torch.randn(4, 1000)  # not called `input`, which would shadow the builtin
output = m(dummy_seq)

# Shape of the total count head output (third element of the returned tuple).
output[2].shape

torch.Size([2])