# Create pseudo-bulk bigwig file 

In [33]:
!samtools index ../data/initial_10x_outputs/atac_peaks/D20_1_ATAC.bam ../data/initial_10x_outputs/atac_peaks/D20_1_ATAC.bam.bai

In [44]:
!python create_bw_cell_type.py

bamFilesList: ['../results/bam_cell_type/D20_1/Myogenic.bam']
binLength: 50
numberOfSamples: None
blackListFileName: None
skipZeroOverZero: False
bed_and_bin: False
genomeChunkSize: None
defaultFragmentLength: read length
numberOfProcessors: 4
verbose: False
region: None
bedFile: None
minMappingQuality: None
ignoreDuplicates: False
chrsToSkip: []
stepSize: 50
center_read: False
samFlag_include: None
samFlag_exclude: None
minFragmentLength: 0
maxFragmentLength: 0
zerosToNans: False
smoothLength: None
save_data: False
out_file_for_raw_data: None
maxPairedFragmentLength: 1000
[W::bam_hdr_read] EOF marker is absent. The input is probably truncated
[E::bgzf_read_block] Failed to read BGZF block data at offset 12081241563 expected 25580 bytes; hread returned 22386
[E::bgzf_read] Read block operation failed with error 4 after 0 of 4 bytes
samtools index: failed to create index for "../results/bam_cell_type/D20_1/Mesenchymal.bam": No such file or directory
The file '../results/bam_cell_type/D2

# Dataloader for discrete values

In [None]:
# Load the AnnData object stored at ../results/concat.h5ad.
adata = anndata.read_h5ad('../results/concat.h5ad')

# NOTE(review): `pseudo_bulk` and `fetch_sequence` are defined outside this cell.
# Presumably `pseudo_bulk` aggregates cells by the 'cell_type' annotation and
# `fetch_sequence` attaches genome sequence from the given FASTA to `adata`
# (its return value is not used here) -- confirm against their definitions.
adata = pseudo_bulk(adata,'cell_type')
fetch_sequence(adata, path_genome='../data/hg38.fa')

# Bare expression: displays the object as the notebook cell output.
adata

# Create model class + functions

In [45]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

class BPNet(nn.Module):
    """BPNet-style architecture, following the paper referenced below.

    Parameters
    -----------
    nb_conv: int (default 8)
        total number of convolutional layers in the body: one plain first
        convolution followed by ``nb_conv - 1`` dilated convolutions

    nb_filters: int (default 64)
        number of filters in the convolutional layers

    first_kernel: int (default 21)
        size of the kernel in the first convolutional layer

    rest_kernel: int (default 3)
        size of the kernel in all convolutional layers except the first one

    profile_kernel_size: int (default 75)
        size of the kernel in the profile (transposed) convolution

    out_pred_len: int (default 1000)
        number of bp for which ATAC signal is predicted

    Model Architecture
    ------------------------

    - Body: sequence of unpadded convolutional layers with residual skip
    connections, dilated convolutions (dilation 2**i for the i-th layer)
    and ReLU activation functions

    - Head:
        > Profile prediction head: a transposed convolution down to one
        channel, centre-cropped to ``out_pred_len`` positions and flattened.
        ``forward`` returns these values as-is (no softmax is applied here).
        > Total count prediction: global average pooling over positions
        followed by a linear layer with a single output.

    The predicted (expected) count at a specific position is the product of
    the predicted total counts and the multinomial probability at that
    position.

    -------------------------

    Reference: Avsec, Ž., Weilert, M., Shrikumar, A. et al. Base-resolution models of transcription-factor binding
    reveal soft motif syntax. Nat Genet 53, 354–366 (2021). https://doi.org/10.1038/s41588-021-00782-6
    """

    def __init__(
        self,
        nb_conv: int = 8,
        nb_filters: int = 64,
        first_kernel: int = 21,
        rest_kernel: int = 3,
        profile_kernel_size: int = 75,
        out_pred_len: int = 1000,
    ):
        super().__init__()

        # Hyper-parameters, kept on the instance for later inspection.
        self.nb_conv = nb_conv
        self.nb_filters = nb_filters
        self.first_kernel = first_kernel
        self.rest_kernel = rest_kernel
        self.profile_kernel = profile_kernel_size
        self.out_pred_len = out_pred_len

        # Body: first convolution reads the 4 input channels, the remaining
        # layers are dilated convolutions with exponentially growing dilation.
        self.convlayers = nn.ModuleList()
        self.convlayers.append(
            nn.Conv1d(
                in_channels=4,
                out_channels=self.nb_filters,
                kernel_size=self.first_kernel,
            )
        )
        for i in range(1, self.nb_conv):
            self.convlayers.append(
                nn.Conv1d(
                    in_channels=self.nb_filters,
                    out_channels=self.nb_filters,
                    kernel_size=self.rest_kernel,
                    dilation=2**i,
                )
            )

        # Profile prediction head
        self.profile_conv = nn.ConvTranspose1d(
            self.nb_filters, 1, kernel_size=self.profile_kernel
        )
        self.flatten = nn.Flatten()

        # Total count prediction head
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.linear = nn.Linear(self.nb_filters, 1)

    def forward(self, x):
        """Run the body and both heads.

        Parameters
        ----------
        x: torch.Tensor
            input of shape (batch, 4, length)

        Returns
        -------
        x: torch.Tensor
            output of the convolutional body, (batch, nb_filters, body_length)
        profile: torch.Tensor
            profile head output, (batch, out_pred_len)
        count: torch.Tensor
            total count head output, (batch, 1)

        Raises
        ------
        ValueError
            if the input is too short for the profile head to cover
            ``out_pred_len`` positions
        """
        # Residual + dilated convolution layers
        # -----------------------------------------------
        x = F.relu(self.convlayers[0](x))

        for layer in self.convlayers[1:]:
            conv_x = F.relu(layer(x))

            # Centre-crop the previous activation to the (shorter) length of
            # the unpadded convolution output. An explicit end index is used
            # because `[c:-c]` yields an empty tensor when c == 0.
            conv_len = conv_x.size(2)
            start = (x.size(2) - conv_len) // 2
            x = conv_x + x[:, :, start:start + conv_len]

        # Profile head
        # -----------------------------------------------
        profile = self.profile_conv(x)

        surplus = profile.size(2) - self.out_pred_len
        if surplus < 0:
            raise ValueError(
                f"profile head produced {profile.size(2)} positions, fewer than "
                f"out_pred_len={self.out_pred_len}; use a longer input sequence"
            )
        # Centre-crop to exactly out_pred_len positions, also when the surplus
        # is zero or odd.
        start = surplus // 2
        profile = profile[:, :, start:start + self.out_pred_len]
        profile = self.flatten(profile)

        # Total count head
        # -----------------------------------------------
        # Drop only the pooled length axis; a bare squeeze() would also drop
        # the batch axis when the batch size is 1.
        count = self.global_pool(x).squeeze(-1)
        count = self.linear(count)

        return x, profile, count

The model predicts the base-resolution Tn5 insertion count profile over a 1,000 bp region using two complementary outputs: (1) the total Tn5 insertion count over the 1,000 bp region, and (2) a multinomial probability of Tn5 insertion at each position in the 1,000 bp sequence. The predicted (expected) count at a specific position is the product of the predicted total count and the multinomial probability at that position.

In [47]:
# Smoke test: build the model with its default hyper-parameters and run one
# forward pass on a random tensor of shape (32, 4, 2114).
m = BPNet()
# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the notebook session; consider renaming once no other cell depends on it.
input = torch.randn(32, 4, 2114)
x, profile, count = m(input)

# Bare expression: displays the profile head output as the cell output.
profile

tensor([[-4.9177e+00, -4.1933e+00, -4.1938e+00,  ..., -6.8152e+00,
         -2.7607e+00, -1.0761e+01],
        [-8.8051e+00, -9.7412e-01, -5.7450e+00,  ..., -5.6425e+00,
         -3.1305e+00, -5.7297e+00],
        [-3.5067e+00, -4.0354e+00, -9.2691e+00,  ..., -6.7895e+00,
         -1.0056e+01, -4.8932e+00],
        ...,
        [-2.6666e+00, -1.8417e+00, -6.3831e+00,  ..., -7.5926e+00,
         -7.9752e+00, -3.2860e+00],
        [-5.9738e+00, -1.1024e+01, -6.0759e+00,  ..., -7.0113e+00,
         -1.0954e+01, -4.0736e+00],
        [ 2.6391e-03, -2.6899e+00, -4.6821e+00,  ..., -6.7192e+00,
         -1.1125e+00, -7.1009e+00]], grad_fn=<ReshapeAliasBackward0>)

In [7]:
# Scratch cell: builds a Conv1d (16 input channels, 33 output channels,
# kernel size 3, stride 2) only to display its repr; the layer is not kept.
nn.Conv1d(16, 33, 3, stride=2)

Conv1d(16, 33, kernel_size=(3,), stride=(2,))

In [6]:
m

BPNet(
  (convlayers): ModuleList(
    (0): Conv1d(4, 64, kernel_size=(25,), stride=(1,))
    (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(2,))
    (2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(4,))
    (3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(8,))
    (4): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(16,))
    (5): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(32,))
    (6): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(64,))
    (7): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(128,))
    (8): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(256,))
    (9): Conv1d(64, 64, kernel_size=(3,), stride=(1,), dilation=(512,))
  )
  (deconv): ConvTranspose1d(64, 2, kernel_size=(25,), stride=(1,), padding=(12,))
  (global_pool): AdaptiveAvgPool1d(output_size=1)
  (linear): Linear(in_features=64, out_features=2, bias=True)
)