In [1]:
#Import needed libraries
import pandas as pd
import numpy as np
import scipy
import scipy.stats
import random
import os
import pickle
import theano

#Importing Torch

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda as cuda

import matplotlib.pyplot as plt
%matplotlib notebook

from skorch.net import NeuralNetClassifier



# CUDA initializing
We want to build a device-agnostic code. </br>
- using the documentation: https://pytorch.org/docs/master/notes/cuda.html
- requires to run argparse : see tutorial https://docs.python.org/2/howto/argparse.html (I understand what it does now — don't think it's super useful)

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assume that we are on a CUDA machine, then this should print a CUDA device:

print(device)

cuda:0


# Import data

In [3]:
# Buid the feature matrix
data = pd.read_csv('/home/xsong/Alma/2017---Deep-learning-yeast-UTRs/Data/Random_UTRs.csv')

## One-hot encoding of the sequences.

i.e. we're converting the sequences from being represented as a 50 character string of bases to a 4x50 matrix of 1's and 0's, with each row corresponding to a base and every column a position in the UTR.

<b>Generate different data sets</b>

In [4]:
# a sorted numpy array of UTR indexes, from least reads to most reads
sorted_inds = data.sort_values('t0').index.values

train_inds = sorted_inds[:int(0.95*len(sorted_inds))] # 95% of the data as the training set

test_inds = sorted_inds[int(0.95*len(sorted_inds)):] # UTRs with most reads at time point 0 as the test set

In [5]:
print(train_inds)
print(test_inds)

[279798 363120 118098 ... 438117  73271 122431]
[ 56861 207040 391302 ... 221380  65465 232749]


<b>One-hot encoding of the different data sets</b><br>
Each has a specific name

In [6]:
# From the work of Cuperus et al.
# one hot encoding of UTRs
# X = one hot encoding matrix
# Y = growth rates

def one_hot_encoding(df, seq_column, expression):

    bases = ['A','C','G','T']
    base_dict = dict(zip(bases,range(4))) # {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}

    n = len(df)
    
    # length of the UTR sequence
    # we also add 10 empty spaces to either side
    total_width = df[seq_column].str.len().max() + 20
    
    # initialize an empty numpy ndarray of the appropriate size
    X = np.zeros((n, 1, 4, total_width))
    
    # an array with the sequences that we will one-hot encode
    seqs = df[seq_column].values
    
    # loop through the array of sequences to create an array that keras will actually read
    for i in range(n):
        seq = seqs[i]
        
        # loop through each individual sequence, from the 5' to 3' end
        for b in range(len(seq)):
            # this will assign a 1 to the appropriate base and position for this UTR sequence
            X[i, 0, base_dict[seq[b]], int(b + round((total_width - len(seq))/2.))] = 1.
    
        # keep track of where we are
        if (i%100000)==0:
            print(i),
        
    X = X.astype(theano.config.floatX)
    Y = np.array(df[expression].values,
                   dtype = theano.config.floatX)[:, np.newaxis]
    
    return X, Y, total_width

In [7]:
X, Y, total_width = one_hot_encoding(data, 'UTR', 'growth_rate')

0
100000
200000
300000
400000


In [10]:
Y_torch = torch.from_numpy(Y).float()
print(type(Y_torch))

<class 'torch.Tensor'>


In [12]:
X_torch = torch.from_numpy(X).float()
print(type(X_torch))

<class 'torch.Tensor'>


# Generate Model

## Buid the neural network

In [15]:
hyperparams= {'conv_width' : [9, 13, 17, 25],
               'conv_filters' : [32, 64, 128, 256],
               'conv_layers' : [2, 3, 4],
               'dense_layers' : [1, 2],
               'conv_dropout' : [None, 0.15],
               'dense_dropout' : [None, 0.1, 0.25, 0.5],
               'dense_units' : [32, 64, 128, 256]}

In [25]:
class Net(nn.Module):
    def __init__(self, x):
        super(Net, self).__init__()
        # input channel, output channels = number of filters, convolution kernel size
        # kernel
        self.conv1 = nn.Conv2d(1, hyperparams['conv_filters'], [4, hyperparams['conv_width']])
        self.conv2 = nn.Conv2d(1, 16, [1,hyperparams['conv_width']])
        self.conv3 = nn.Conv2d(1, 16, [1,hyperparams['conv_width']])
        self.fc1 = nn.Linear(1, 12)
        self.lin_out1 = nn.Linear(120, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc1(x))
        x = self.lol1(x)
        x = nn.Dropout(p=hyperparams['conv_dropout']) #

        return x

 #   def num_flat_features(self, x):
 #      size = x.size()[1:]  # all dimensions except the batch dimension
 #       num_features = 1
 #       for s in size:
 #           num_features *= s
 #      return num_features

net = Net(data)
print(net)

TypeError: unsupported operand type(s) for %: 'list' and 'int'

In [None]:
params = list(net.parameters())
print(len(params))
print(params[0].size()) 

## Training


# Choice of optimizer & loss function => MSE 
# Using backpropagation

# define model
model = net

# define loss function
loss_func = nn.MSELoss() 

# define optimizer
optimizer = torch.optim.Adam(net.parameters(), lr = 0.0001)
track_loss = []

#Verification & Training
for epoch in range(2):  # loop over the dataset multiple time
        target = Y_torch [train_inds]
        input = X_torch [train_inds]
        target.requires_grad=False
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = net(input)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()
        #input = torch.randn(3, 5, requires_grad=True)

        
        #Store MSE value
        track_loss.append(loss)

print('Finished Training')

# Training with Skorch
## Either this or next possibility

# define model
model = net

# define loss function
loss_func = nn.MSELoss() 

# define optimizer
optimizer = torch.optim.Adam(net.parameters(), lr = 0.0001)
track_loss = []

import skorch
from skorch.net import NeuralNetClassifier
from skorch.history import History

net = NeuralNetClassifier(
        module=net, 
        criterion=nn.MSELoss(), 
        optimizer='torch.optim.sgd.SGD', 
        lr=0.01, 
        max_epochs=10, 
        batch_size=128, 
        iterator_train= 'torch.utils.X_torch.dataloader.DataLoader', 
        iterator_valid= 'torch.utils.X_torch.dataloader.DataLoader', 
        dataset='skorch.dataset.Dataset', 
        train_split=skorch.dataset.CVSplit(X_torch[train_inds]), 
        callbacks=None, 
        device = 'cpu',
        verbose=1,
)

net.fit(X_torch, 
        y='none', 
        inputs=X_torch
)

Y_pred = net.predict(X_valid)

# GridsearchCV 
## I wonder if this is ok

In [None]:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(net, hyperparams, refit=False, cv=3, scoring='accuracy')

gs.fit(X[train_inds], Y[train_inds])
print(gs.best_score_, gs.best_params_)

In [None]:
Y_pred = net.predict(X_valid)

## Plot predictions vs data

In [None]:
# data
x = Y_pred.flatten()
y = Y.flatten()

# calculate R^2
r2 = scipy.stats.pearsonr(x, y)[0]**2


g = sns.jointplot(x,
                  y,
                  stat_func = None,
                  kind = 'scatter',
                  s = 5,
                  alpha = 0.1,
                  size = 5)

g.ax_joint.set_xlabel('Predicted log$_2$ Growth Rate')
g.ax_joint.set_ylabel('Measured log$_2$ Growth Rate')


text = "R$^2$ = {:0.2}".format(r2)
plt.annotate(text, xy=(-5.5, 0.95), xycoords='axes fraction')

plt.title("CNN predictions vs. test set", x = -3, y = 1.25)

In [None]:
# Plotting of the loss function
plt.plot(Loss function)
plt.ylabel('Value of the loss function')
plt.xlabel('Time (epochs)')
plt.show()