In [48]:
import argparse
import numpy as np
import math
import random as r
import pandas as pd

import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

froot = './motifSimulator'

df = pd.read_csv(froot + ".csv")

In [50]:
ALPHABET = 'ACGT'
NALPH = len(ALPHABET)
INVALPH = [-1] * ord('Z')
for i, char in enumerate(ALPHABET):
    INVALPH[ord(char)] = i
        
def seq_to_one_hot(seq):
    seqlen = len(seq)
    res = np.zeros(NALPH * seqlen, dtype=np.uint8)    
    arr = np.array(list(seq))
    for j, c in enumerate(arr):
        res[NALPH*j + INVALPH[ord(c)]] = 1
    return res

In [53]:
from torch.utils.data import TensorDataset

# first convert all sequences to one-hot representation
# also convert both features and labels to tensors
feat = torch.Tensor(np.array(list(map(seq_to_one_hot, df['seq']))))  # is there a simpler way?
labl = torch.Tensor(df['hasMotif']).to(torch.uint8)

In [55]:
# split into train, test, and validation sets
import torch.utils.data as td
allset = TensorDataset(feat, labl)
trnset, valset, tstset = td.random_split(allset, [0.5,0.25,0.25])

# set up data loaders
trndl = DataLoader(trnset, batch_size=8, shuffle=True)
tstdl = DataLoader(tstset, batch_size=8, shuffle=True)
valdl = DataLoader(valset, batch_size=8, shuffle=True)

In [84]:
# set up the model
import torch.nn as nn

# this model is slightly adapted from an image-processing CNN in 
#"Machine Learning with PyTorch and Scikit-Learn", Raschka et al.
model = nn.Sequential()
model.add_module(
    'conv1',
    nn.Conv1d(
        in_channels=1, out_channels=8,
        kernel_size=21, padding=10
    )
)
model.add_module('relu1', nn.ReLU())
model.add_module('pool1', nn.MaxPool1d(kernel_size=21))
model.add_module(
    'conv2',
    nn.Conv1d(
        in_channels=8, out_channels=8,
        kernel_size=10, padding=2
    )
)
model.add_module('relu2', nn.ReLU())
model.add_module('pool2', nn.MaxPool1d(kernel_size=5))

model.add_module('flatten', nn.Flatten())
model.add_module('linear', nn.Linear(80,2))
model.add_module('fc1', nn.Softmax(dim=1))

# check model
print(model)

Sequential(
  (conv1): Conv1d(1, 8, kernel_size=(21,), stride=(1,), padding=(10,))
  (relu1): ReLU()
  (pool1): MaxPool1d(kernel_size=21, stride=21, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(8, 8, kernel_size=(10,), stride=(1,), padding=(2,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=80, out_features=2, bias=True)
  (fc1): Softmax(dim=1)
)


In [85]:
x = torch.ones((8,1,1200))
model(x).shape

torch.Size([8, 2])

In [86]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [87]:
def train(model, num_epochs, train_dl, valid_dl):
    loss_hist_train = [0] * num_epochs
    accuracy_hist_train = [0] * num_epochs
    loss_hist_valid = [0] * num_epochs
    accuracy_hist_valid = [0] * num_epochs
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}')
        model.train()
        for x_batch, y_batch in train_dl:
            # have to fix dimensionality
            x_batch_sz = x_batch.size()[0]
            x_batch_re = torch.reshape(x_batch, (x_batch_sz, 1, 1200))
            pred = model(x_batch_re)    
            loss = loss_fn(pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_hist_train[epoch] += loss.item()*y_batch.size(0)
            is_correct = (torch.argmax(pred, dim=1) == y_batch).float()
            accuracy_hist_train[epoch] += is_correct.sum()
            
        loss_hist_train[epoch] /= len(train_dl.dataset)
        accuracy_hist_train[epoch] /= len(train_dl.dataset)
        
        model.eval()
        
        with torch.no_grad():
            for x_batch, y_batch in valid_dl:
                # have to fix dimensionality
                x_batch_sz = x_batch.size()[0]
                x_batch_re = torch.reshape(x_batch, (x_batch_sz, 1, 1200))
                pred = model(x_batch_re)    
                loss = loss_fn(pred, y_batch)
                loss_hist_valid[epoch] += loss.item()*y_batch.size(0)
                is_correct = (torch.argmax(pred, dim=1) == y_batch).float()
                accuracy_hist_valid[epoch] += is_correct.sum()
                
            loss_hist_valid[epoch] /= len(valid_dl.dataset)
            accuracy_hist_valid[epoch] /= len(valid_dl.dataset)
            
        print(f'Epoch {epoch+1} accuracy: '
              f'{accuracy_hist_train[epoch]:.4f} val_accuracy: '
              f'{accuracy_hist_valid[epoch]:.4f}')
        
    return loss_hist_train, loss_hist_valid, accuracy_hist_train, accuracy_hist_valid

In [88]:
torch.manual_seed(1)
num_epochs = 20
hist = train(model, num_epochs, trndl, valdl)

Epoch 1
Epoch 1 accuracy: 0.6190 val_accuracy: 0.7387
Epoch 2
Epoch 2 accuracy: 0.8260 val_accuracy: 0.8451
Epoch 3
Epoch 3 accuracy: 0.8636 val_accuracy: 0.8751
Epoch 4
Epoch 4 accuracy: 0.8831 val_accuracy: 0.8805
Epoch 5
Epoch 5 accuracy: 0.8949 val_accuracy: 0.8909
Epoch 6
Epoch 6 accuracy: 0.9010 val_accuracy: 0.9061
Epoch 7
Epoch 7 accuracy: 0.9071 val_accuracy: 0.9053
Epoch 8
Epoch 8 accuracy: 0.9089 val_accuracy: 0.9015
Epoch 9
Epoch 9 accuracy: 0.9122 val_accuracy: 0.9181
Epoch 10
Epoch 10 accuracy: 0.9147 val_accuracy: 0.9199
Epoch 11
Epoch 11 accuracy: 0.9164 val_accuracy: 0.9199
Epoch 12
Epoch 12 accuracy: 0.9203 val_accuracy: 0.9181
Epoch 13
Epoch 13 accuracy: 0.9221 val_accuracy: 0.9179
Epoch 14
Epoch 14 accuracy: 0.9244 val_accuracy: 0.9240
Epoch 15
Epoch 15 accuracy: 0.9248 val_accuracy: 0.9233
Epoch 16
Epoch 16 accuracy: 0.9262 val_accuracy: 0.9221
Epoch 17
Epoch 17 accuracy: 0.9304 val_accuracy: 0.9252
Epoch 18
Epoch 18 accuracy: 0.9301 val_accuracy: 0.9268
Epoch 19
E