In [1]:
import numpy as np
import pandas as pd
import torch
import torchvision
from torchvision import transforms
import torch.utils.data as td
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import math

froot = './proseqSimulator'

df = pd.read_csv(froot + ".csv")

Sequences were generated with: ./motifSimulator.py --m CTCF --N 2000 --len 500 --proseq 500,50000 --o proseqSimulator

In [2]:
ALPHABET = 'ACGT'
NALPH = len(ALPHABET)
INVALPH = [-1] * ord('Z')
for i, char in enumerate(ALPHABET):
    INVALPH[ord(char)] = i
        
def seq_to_one_hot(seq):
    seqlen = len(seq)
    res = np.zeros(NALPH * seqlen, dtype=np.uint8)    
    arr = np.array(list(seq))
    for j, c in enumerate(arr):
        res[NALPH*j + INVALPH[ord(c)]] = 1
    return res

In [3]:
# convert to X,Y pairs representing single-column counts (Y) 
# and corresponding centered 50bp segments of the sequence (X)
stride = 5
featlen = 50
offset = math.ceil(featlen/2)
seqlen = len(df['seq'][0])
centidx = range(offset, seqlen-offset, stride)

j = 0
y = np.zeros(len(centidx) * len(df),dtype=np.int16)
x = np.zeros((len(centidx)*len(df),4*featlen),dtype=np.uint8)

for i in range(len(df)):
    allcounts = np.array(df['readCounts'][i].strip('[]').split(),dtype=np.int16)  # better way?
    allseq = df['seq'][i]

    for r in centidx:
        y[j] = allcounts[r]
        x[j,:] = seq_to_one_hot(allseq[r-offset:r-offset+featlen])
        j += 1

In [4]:
# set them up as tensors
xtens = torch.Tensor(x)
ytens = torch.Tensor(y)

In [5]:
# split into train, test, and validation sets
allset = TensorDataset(xtens, ytens)
trnset, valset, tstset = td.random_split(allset, [0.5,0.25,0.25])

# set up data loaders
trndl = DataLoader(trnset, batch_size=128, shuffle=True)
tstdl = DataLoader(tstset, batch_size=128, shuffle=True)
valdl = DataLoader(valset, batch_size=128, shuffle=True)

In [8]:
# set up the model
import torch.nn as nn

# this model is slightly adapted from an image-processing CNN in 
#"Machine Learning with PyTorch and Scikit-Learn", Raschka et al.
model = nn.Sequential()
model.add_module(
    'conv1',
    nn.Conv1d(
        in_channels=1, out_channels=8,
        kernel_size=21, padding=10
    )
)
model.add_module('relu1', nn.ReLU())
model.add_module('pool1', nn.MaxPool1d(kernel_size=21))

model.add_module(
    'conv2',
    nn.Conv1d(
        in_channels=8, out_channels=8,
        kernel_size=5, padding=2
    )
)
model.add_module('relu2', nn.ReLU())
model.add_module('pool2', nn.MaxPool1d(kernel_size=5))

model.add_module('flatten', nn.Flatten())
model.add_module('linear', nn.Linear(8,1))

# check model
print(model)

x = torch.ones((128,1,200))
print(model(x).shape)
nparm = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: " + str(nparm))

Sequential(
  (conv1): Conv1d(1, 8, kernel_size=(21,), stride=(1,), padding=(10,))
  (relu1): ReLU()
  (pool1): MaxPool1d(kernel_size=21, stride=21, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(8, 8, kernel_size=(5,), stride=(1,), padding=(2,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=8, out_features=1, bias=True)
)
torch.Size([128, 1])
Number of parameters: 513


In [9]:
# custom loss function
def my_loss(prediction, label):
#    loss = torch.mean((torch.exp(prediction) - label)**2)
    
    # poisson loss
    poismean = torch.exp(prediction)
    loss = torch.sum(-1*(-poismean + label*torch.log(poismean)))
    
    return loss

In [10]:
# debug the model
xbch, ybch = next(iter(trndl))
xsz = xbch.size()[0]
xbch_re = torch.reshape(xbch, (xsz, 1, 200))
pred = model(xbch_re)

print(my_loss(pred, ybch))

tensor(16274.6553, grad_fn=<SumBackward0>)


In [11]:
def train(model, num_epochs, train_dl, valid_dl):
    loss_hist_train = [0] * num_epochs
    loss_hist_valid = [0] * num_epochs
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}')
        model.train()
        for x_batch, y_batch in train_dl:
            # have to fix dimensionality
            x_batch_sz = x_batch.size()[0]
            x_batch_re = torch.reshape(x_batch, (x_batch_sz, 1, 200))
            pred = model(x_batch_re)    
            loss = my_loss(pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_hist_train[epoch] += loss.item()*y_batch.size(0)
            
        loss_hist_train[epoch] /= len(train_dl.dataset)
        
        model.eval()
        
        with torch.no_grad():
            for x_batch, y_batch in valid_dl:
                # have to fix dimensionality
                x_batch_sz = x_batch.size()[0]
                x_batch_re = torch.reshape(x_batch, (x_batch_sz, 1, 200))
                pred = model(x_batch_re)    
                loss = my_loss(pred, y_batch)
                loss_hist_valid[epoch] += loss.item()*y_batch.size(0)
                
            loss_hist_valid[epoch] /= len(valid_dl.dataset)
            
        print(f'Epoch {epoch+1} trn_loss: '
              f'{loss_hist_train[epoch]:.4f} val_loss: '
              f'{loss_hist_valid[epoch]:.4f}')
        
    return loss_hist_train, loss_hist_valid

In [12]:
#loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
torch.manual_seed(1)
num_epochs = 20
hist = train(model, num_epochs, trndl, valdl)

Epoch 1
Epoch 1 trn_loss: 14485.0515 val_loss: 14286.0882
Epoch 2
Epoch 2 trn_loss: 14448.4256 val_loss: 14077.4855
Epoch 3
Epoch 3 trn_loss: 14444.6594 val_loss: 14245.9547
Epoch 4
Epoch 4 trn_loss: 14432.1860 val_loss: 14072.3658
Epoch 5
Epoch 5 trn_loss: 14434.7362 val_loss: 14095.2608
Epoch 6
Epoch 6 trn_loss: 14426.2765 val_loss: 14074.8800
Epoch 7
Epoch 7 trn_loss: 14429.7737 val_loss: 14077.0009
Epoch 8
Epoch 8 trn_loss: 14423.9435 val_loss: 14142.6516
Epoch 9
Epoch 9 trn_loss: 14427.4596 val_loss: 14213.9407
Epoch 10
Epoch 10 trn_loss: 14438.6623 val_loss: 14109.4650
Epoch 11
Epoch 11 trn_loss: 14424.9856 val_loss: 14168.6757
Epoch 12
Epoch 12 trn_loss: 14430.1867 val_loss: 14138.4609
Epoch 13
Epoch 13 trn_loss: 14416.5405 val_loss: 14072.1584
Epoch 14
Epoch 14 trn_loss: 14428.7585 val_loss: 14143.1187
Epoch 15
Epoch 15 trn_loss: 14424.6293 val_loss: 14079.5368
Epoch 16
Epoch 16 trn_loss: 14417.9162 val_loss: 14077.0470
Epoch 17
Epoch 17 trn_loss: 14419.9357 val_loss: 14124.713