In [34]:
import numpy as np
import pandas as pd
import torch
import torchvision
from torchvision import transforms
import torch.utils.data as td
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import math

froot = './proseqSimulator'

df = pd.read_csv(froot + ".csv")

In [35]:
ALPHABET = 'ACGT'
NALPH = len(ALPHABET)
INVALPH = [-1] * ord('Z')
for i, char in enumerate(ALPHABET):
    INVALPH[ord(char)] = i
        
def seq_to_one_hot(seq):
    seqlen = len(seq)
    res = np.zeros(NALPH * seqlen, dtype=np.uint8)    
    arr = np.array(list(seq))
    for j, c in enumerate(arr):
        res[NALPH*j + INVALPH[ord(c)]] = 1
    return res

In [39]:
# convert to X,Y pairs representing single-column counts (Y) 
# and corresponding centered 50bp segments of the sequence (X)
stride = 10
featlen = 50
offset = math.ceil(featlen/2)
seqlen = len(df['seq'][0])
centidx = range(offset, seqlen-offset, stride)

j = 0
y = np.zeros(len(centidx) * len(df),dtype=np.int16)
x = np.zeros((len(centidx)*len(df),4*featlen),dtype=np.uint8)

for i in range(len(df)):
    allcounts = np.array(df['readCounts'][i].strip('[]').split(),dtype=np.int16)  # better way?
    allseq = df['seq'][i]

    for r in centidx:
        y[j] = allcounts[r]
        x[j,:] = seq_to_one_hot(allseq[r-offset:r-offset+featlen])
        j += 1

In [41]:
# set them up as tensors
xtens = torch.Tensor(x)
ytens = torch.Tensor(y)

In [47]:
# split into train, test, and validation sets
allset = TensorDataset(xtens, ytens)
trnset, valset, tstset = td.random_split(allset, [0.5,0.25,0.25])

# set up data loaders
trndl = DataLoader(trnset, batch_size=64, shuffle=True)
tstdl = DataLoader(tstset, batch_size=64, shuffle=True)
valdl = DataLoader(valset, batch_size=64, shuffle=True)

In [92]:
# set up the model
import torch.nn as nn

# this model is slightly adapted from an image-processing CNN in 
#"Machine Learning with PyTorch and Scikit-Learn", Raschka et al.
model = nn.Sequential()
model.add_module(
    'conv1',
    nn.Conv1d(
        in_channels=1, out_channels=8,
        kernel_size=10, padding=10
    )
)
model.add_module('relu1', nn.ReLU())
model.add_module('pool1', nn.MaxPool1d(kernel_size=10))
model.add_module('flatten', nn.Flatten())
model.add_module('linear', nn.Linear(168,1))

# check model
print(model)

x = torch.ones((64,1,200))
print(model(x).shape)
nparm = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: " + str(nparm))

Sequential(
  (conv1): Conv1d(1, 8, kernel_size=(10,), stride=(1,), padding=(10,))
  (relu1): ReLU()
  (pool1): MaxPool1d(kernel_size=10, stride=10, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=168, out_features=1, bias=True)
)
torch.Size([64, 1])
Number of parameters: 257


In [95]:
def train(model, num_epochs, train_dl, valid_dl):
    loss_hist_train = [0] * num_epochs
    loss_hist_valid = [0] * num_epochs
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}')
        model.train()
        for x_batch, y_batch in train_dl:
            # have to fix dimensionality
            x_batch_sz = x_batch.size()[0]
            x_batch_re = torch.reshape(x_batch, (x_batch_sz, 1, 200))
            pred = model(x_batch_re)    
            loss = loss_fn(pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_hist_train[epoch] += loss.item()*y_batch.size(0)
            
        loss_hist_train[epoch] /= len(train_dl.dataset)
        
        model.eval()
        
        with torch.no_grad():
            for x_batch, y_batch in valid_dl:
                # have to fix dimensionality
                x_batch_sz = x_batch.size()[0]
                x_batch_re = torch.reshape(x_batch, (x_batch_sz, 1, 200))
                pred = model(x_batch_re)    
                loss = loss_fn(pred, y_batch)
                loss_hist_valid[epoch] += loss.item()*y_batch.size(0)
                
            loss_hist_valid[epoch] /= len(valid_dl.dataset)
            
        print(f'Epoch {epoch+1} trn_loss: '
              f'{loss_hist_train[epoch]:.4f} val_loss: '
              f'{loss_hist_valid[epoch]:.4f}')
        
    return loss_hist_train, loss_hist_valid

In [96]:
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
torch.manual_seed(1)
num_epochs = 20
hist = train(model, num_epochs, trndl, valdl)

Epoch 1
Epoch 1 trn_loss: 1.1332 val_loss: 1.1484
Epoch 2
Epoch 2 trn_loss: 1.1194 val_loss: 1.1484
Epoch 3
Epoch 3 trn_loss: 1.1184 val_loss: 1.1496
Epoch 4
Epoch 4 trn_loss: 1.1176 val_loss: 1.1444
Epoch 5
Epoch 5 trn_loss: 1.1170 val_loss: 1.1448
Epoch 6
Epoch 6 trn_loss: 1.1166 val_loss: 1.1479
Epoch 7
Epoch 7 trn_loss: 1.1163 val_loss: 1.1444
Epoch 8
Epoch 8 trn_loss: 1.1162 val_loss: 1.1471
Epoch 9
Epoch 9 trn_loss: 1.1163 val_loss: 1.1442
Epoch 10
Epoch 10 trn_loss: 1.1157 val_loss: 1.1432
Epoch 11
Epoch 11 trn_loss: 1.1157 val_loss: 1.1434
Epoch 12
Epoch 12 trn_loss: 1.1159 val_loss: 1.1484
Epoch 13
Epoch 13 trn_loss: 1.1157 val_loss: 1.1473
Epoch 14
Epoch 14 trn_loss: 1.1154 val_loss: 1.1441
Epoch 15
Epoch 15 trn_loss: 1.1158 val_loss: 1.1437
Epoch 16
Epoch 16 trn_loss: 1.1156 val_loss: 1.1432
Epoch 17
Epoch 17 trn_loss: 1.1150 val_loss: 1.1430
Epoch 18
Epoch 18 trn_loss: 1.1150 val_loss: 1.1426
Epoch 19
Epoch 19 trn_loss: 1.1152 val_loss: 1.1426
Epoch 20
Epoch 20 trn_loss: 1.