# Predicting transcription factor (TF) binding in DNA sequences using convolutional networks

## Imports

In [1]:
import csv
import math 
import random
import gzip
import torch
from sklearn import metrics
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import os 
import warnings
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# datasets can be obtained e.g. from:
# https://github.com/MedChaabane/deepRAM/tree/master/datasets/ChIP-seq

def seqtopad(sequence, motlen):
    """One-hot encode a DNA sequence with uniform padding on both ends.

    The sequence is padded with ``motlen - 1`` positions on each side.
    Padding positions and positions holding ``'N'`` are filled with 0.25 in
    all four channels. Positions holding ``'A'``, ``'C'``, ``'G'`` or ``'T'``
    get 1 in the matching channel and 0 elsewhere. Any other character
    (e.g. lowercase bases) yields 0 in all four channels.

    Args:
        sequence: DNA string to encode.
        motlen: motif (filter) length that determines the padding width.

    Returns:
        A float64 array of shape ``(4, len(sequence) + 2 * motlen - 2)`` whose
        rows are ordered A, C, G, T.
    """
    base = ['A', 'C', 'G', 'T']
    seq_len = len(sequence)
    rows = seq_len + 2 * motlen - 2
    # Start from the uniform background so only real nucleotides need writing.
    S = np.full([rows, 4], 0.25)
    for i in range(rows):
        pos = i - motlen + 1
        # Explicit bounds check: padding rows must never index the sequence.
        # Indexing with a negative position raised IndexError whenever the
        # padding was wider than the sequence (including the empty sequence).
        if 0 <= pos < seq_len and sequence[pos] != 'N':
            nucleotide = sequence[pos]
            S[i] = [1.0 if nucleotide == b else 0.0 for b in base]
    return np.transpose(S)

def openFile(path, motiflen=24):
    """Load a gzipped, tab-separated ChIP-seq file and split it 80/20.

    The first line of the file is skipped as a header. For every remaining
    row, column 2 is read as the DNA sequence and column 3 as the integer
    label. Each sequence is encoded with ``seqtopad``.

    The encoded samples are shuffled together with their raw sequences, so
    the returned validation sequences correspond element-by-element to the
    returned validation samples.

    Args:
        path: path to the gzip-compressed input file.
        motiflen: motif length forwarded to ``seqtopad`` for padding.

    Returns:
        A tuple ``(firsttrain, firstvalid, train_dataset, sequences,
        validseq)`` where

        * ``train_dataset`` is the shuffled list of ``[encoded, [label]]``,
        * ``firsttrain`` is its first 4/5 and ``firstvalid`` the remainder,
        * ``sequences`` is the list of raw sequences in file order,
        * ``validseq`` holds the raw sequences of ``firstvalid``, in the
          same order.
    """
    paired = []      # ([encoded, [label]], raw_sequence), shuffled as a unit
    sequences = []   # raw sequences in file order
    with gzip.open(path, 'rt') as data:
        # Skip the header line; tolerate a completely empty file.
        next(data, None)
        reader = csv.reader(data, delimiter='\t')
        for row in reader:
            sample = [seqtopad(row[2], motiflen), [int(row[3])]]
            paired.append((sample, row[2]))
            sequences.append(row[2])

    # Shuffling the pairs keeps every raw sequence attached to its sample.
    random.shuffle(paired)
    train_dataset = [sample for sample, _ in paired]
    shuffled_sequences = [seq for _, seq in paired]

    size = len(train_dataset) // 5
    split = 4 * size
    firsttrain = train_dataset[:split]
    # Start exactly at the split point so no sample is dropped.
    firstvalid = train_dataset[split:]
    validseq = shuffled_sequences[split:]

    return firsttrain, firstvalid, train_dataset, sequences, validseq

class chipseq_dataset(Dataset):
    """Torch dataset over a list of ``[features, label]`` pairs.

    Features and labels are each stacked into one float32 tensor, exposed as
    ``x_data`` and ``y_data``; ``len`` holds the number of samples.
    """

    def __init__(self, xy=None):
        features = [sample[0] for sample in xy]
        targets = [sample[1] for sample in xy]
        # Stack through NumPy first so every sample is cast to float32.
        self.x_data = torch.from_numpy(np.asarray(features, dtype=np.float32))
        self.y_data = torch.from_numpy(np.asarray(targets, dtype=np.float32))
        self.len = len(self.x_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` tensors stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

# Read the ChIP-seq file once and unpack the train/validation split, the
# full shuffled dataset, all raw sequences and the validation sequences.
(train1, valid1, alldataset,
 sequences, seq_motif) = openFile("SRF_H1-hESC_SRF_HudsonAlpha_B.seq.gz")


In [3]:
# Wrap the train/validation splits as torch datasets.
train1_dataset = chipseq_dataset(train1)
valid1_dataset = chipseq_dataset(valid1)

batch_size = 64
# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(dataset=train1_dataset,
                          batch_size=batch_size, shuffle=True)
# Validation data is only evaluated, never trained on, so it is not
# shuffled: the batch order is deterministic and predictions stay aligned
# with the order of the samples in valid1.
valid_loader = DataLoader(dataset=valid1_dataset,
                          batch_size=batch_size, shuffle=False)

## Network Architecture

In [6]:
class DeepBind(nn.Module):
    """Single-layer convolutional model for binary sequence classification.

    The forward pass applies a 1-D convolution over a 4-channel input, a
    ReLU, a global max over the position axis, and a linear layer followed
    by a sigmoid.

    All weights are registered as ``nn.Parameter`` so that ``.to(device)``,
    ``parameters()`` and ``state_dict()`` see them; the module does not
    depend on any global variable.

    Args:
        nummotif: number of convolutional filters (motif detectors).
        motiflen: length of each filter.
        sigmaConv: standard deviation used to initialise the filter weights.
        sigmaNeu: standard deviation used to initialise the linear layer.
    """

    def __init__(self, nummotif, motiflen,
                 sigmaConv, sigmaNeu):
        super().__init__()
        self.sigmaConv = sigmaConv
        self.sigmaNeu = sigmaNeu
        self.input_channels = 4
        self.activation = nn.ReLU()
        self.FC_size = nummotif

        # Convolution filters: N(0, sigmaConv) weights, N(0, 1) biases.
        self.conv_weights = nn.Parameter(
            torch.empty(nummotif, self.input_channels, motiflen))
        self.conv_bias = nn.Parameter(torch.empty(nummotif))
        nn.init.normal_(self.conv_weights, mean=0, std=sigmaConv)
        nn.init.normal_(self.conv_bias)

        # Linear output layer: N(0, sigmaNeu) weights and bias.
        self.wNeu = nn.Parameter(torch.empty(self.FC_size, 1))
        self.wNeuBias = nn.Parameter(torch.empty(1))
        nn.init.normal_(self.wNeu, mean=0, std=self.sigmaNeu)
        nn.init.normal_(self.wNeuBias, mean=0, std=self.sigmaNeu)

    def get_weights(self):
        """Return this instance's trainable tensors for the optimizer."""
        # Read from self, not from a global ``model``, so every instance
        # hands out its own weights.
        return [self.conv_weights, self.conv_bias,
                self.wNeu, self.wNeuBias]

    def forward(self, x):
        """Return a sigmoid score of shape ``(batch, 1)``.

        Args:
            x: float tensor of shape ``(batch, 4, length)`` with
                ``length >= motiflen``.
        """
        # 1d convolution:
        x = nn.functional.conv1d(x, self.conv_weights, bias=self.conv_bias,
                                 stride=1, padding=0)
        # activation
        x = self.activation(x)
        # max pooling over all positions
        x, _ = torch.max(x, dim=2)
        # fully connected layer
        x = x @ self.wNeu + self.wNeuBias
        return torch.sigmoid(x)

## Train the Model

In [7]:
import torch.nn.functional as F

# All tensors and the model live on the CPU.
device = 'cpu'
# 16 motif detectors of length 14, with the two initialisation scales.
model = DeepBind(16, 14, 1e-06, 0.001).to(device)

optimizer = torch.optim.SGD(
    model.get_weights(),
    lr=0.01,
    momentum=0.9,
    nesterov=True,
    weight_decay=3e-06,
)

model.train()
print(model)

learning_steps = 0
# The step budget is only checked between passes over train_loader, so the
# pass that crosses 5000 steps always runs to completion.
while not learning_steps > 5000:
    for i, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = F.binary_cross_entropy(output, target)
        # Report the batch loss every 100 optimisation steps.
        if not learning_steps % 100:
            print(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        learning_steps += 1

# Persist the whole trained model object.
torch.save(model, 'best_model.pkl')


DeepBind(
  (activation): ReLU()
)
tensor(0.6932, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6906, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6927, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6939, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6867, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.6004, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.4615, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.3564, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.3093, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.3564, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.4020, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.3672, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.3877, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.2545, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.2914, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.2541, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.3287, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.1930, grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.21

## Evaluate the model

In [8]:
# Score every validation batch without tracking gradients, then compute
# the ROC AUC over all collected predictions.
model.eval()
pred_list, labels_list = [], []
with torch.no_grad():
    for i, (data, target) in enumerate(valid_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        n_samples = output.shape[0]
        # Flatten the (batch, 1) scores and labels to 1-D arrays.
        pred = output.cpu().detach().numpy().reshape(n_samples)
        labels = target.cpu().numpy().reshape(n_samples)
        pred_list.append(pred)
        labels_list.append(labels)

labels = np.concatenate(labels_list)
predictions = np.concatenate(pred_list)
auc = metrics.roc_auc_score(labels, predictions)
print('AUC on validation data ', auc)


AUC on validation data  0.9078787878787878


In [None]:
# Show the weights of the first convolutional filter (index 0 along the
# filter axis of conv_weights).
model.conv_weights[0]