In [None]:
# Import needed libraries
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns
import theano
import torch
import torch.cuda as cuda
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
from skorch.net import NeuralNetClassifier

# CUDA initializing
We want to write device-agnostic code.<br>
- Reference: the PyTorch CUDA notes, https://pytorch.org/docs/master/notes/cuda.html
- The approach shown in the documentation requires argparse (see the tutorial: https://docs.python.org/2/howto/argparse.html). I understand what it does now, but I don't think it's very useful here.

In [None]:
# Pick the first GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)  # shows which device was selected

# Import data

In [None]:
# Load the random-UTR library into a DataFrame.
# The CSV location defaults to the original absolute path but can be overridden
# with the UTR_DATA_CSV environment variable, so the notebook can run on another
# machine without editing this cell.
# Later cells read the 'UTR', 'growth_rate' and 't0' columns of this frame.
data_path = os.environ.get(
    'UTR_DATA_CSV',
    '/home/xsong/Alma/2017---Deep-learning-yeast-UTRs/Data/Random_UTRs.csv',
)
data = pd.read_csv(data_path)

## One-hot encoding of the sequences.

That is, each sequence is converted from a 50-character string of bases into a 4x50 matrix of 1s and 0s, with each row corresponding to a base and each column to a position in the UTR. (The code below additionally pads 10 empty positions on either side, so the encoded width is 70.)

In [None]:
# From the work of Cuperus et al.
# one hot encoding of UTRs
# X = one hot encoding matrix
# Y = growth rates

def one_hot_encoding(df, seq_column, expression, dtype=np.float32):
    """One-hot encode a column of nucleotide sequences and collect the targets.

    Every sequence is centred inside a window that is 20 positions wider than
    the longest sequence in ``df[seq_column]`` (i.e. 10 empty positions on
    either side of the longest one). Row 0..3 of the encoding correspond to
    the bases A, C, G, T.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the sequences and the values to predict.
    seq_column : str
        Name of the column containing the sequences (strings over A/C/G/T).
    expression : str
        Name of the column containing the target values.
    dtype : numpy dtype, optional
        Floating-point type of the returned arrays. Defaults to float32, so
        the function no longer depends on ``theano.config.floatX``.

    Returns
    -------
    X : numpy.ndarray, shape (n, 1, 4, total_width)
        One-hot encoded sequences.
    Y : numpy.ndarray, shape (n, 1)
        Target values as a column vector.
    total_width : int
        Width of the encoding window (longest sequence length + 20).

    Raises
    ------
    KeyError
        If a sequence contains a character other than A, C, G or T.
    """
    base_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # The sequences that will be one-hot encoded.
    seqs = df[seq_column].values
    n = len(seqs)

    # Longest sequence plus 10 empty positions on either side.
    # An empty frame yields a zero-row result instead of crashing.
    total_width = max((len(seq) for seq in seqs), default=0) + 20

    # Allocate directly in the target dtype to avoid a float64 temporary.
    X = np.zeros((n, 1, 4, total_width), dtype=dtype)

    for i, seq in enumerate(seqs):
        # Column at which this sequence starts so that it sits centred in the
        # window; computed once per sequence rather than once per base.
        offset = int(round((total_width - len(seq)) / 2.))

        # Row index (base) for every position, walking from the 5' to 3' end.
        rows = np.array([base_dict[base] for base in seq], dtype=np.intp)
        cols = np.arange(offset, offset + len(seq))
        X[i, 0, rows, cols] = 1.

        # Keep track of where we are.
        if (i % 100000) == 0:
            print(i)

    Y = np.array(df[expression].values, dtype=dtype)[:, np.newaxis]

    return X, Y, total_width

In [None]:
# One-hot encode the 'UTR' column and collect the 'growth_rate' targets.
# X has shape (n, 1, 4, total_width) and Y has shape (n, 1).
X, Y, total_width = one_hot_encoding(data, 'UTR', 'growth_rate')

In [None]:
# Convert the NumPy arrays to float32 tensors and move them to the device
# selected earlier (GPU when available, CPU otherwise). Using `.to(device)`
# instead of `.cuda()` keeps the notebook runnable on machines without CUDA.
X_torch = torch.from_numpy(X).float().to(device)
Y_torch = torch.from_numpy(Y).float().to(device)

## Generate different data sets

In [None]:
# UTR indexes sorted from least reads to most reads at time point 0.
# NOTE(review): these are index *labels* of `data`; later cells use them to
# index X_torch / Y_torch by position, which presumably relies on `data`
# having the default 0..n-1 index -- confirm.
sorted_inds = data.sort_values('t0').index.values
n_train = int(0.95 * len(sorted_inds))
train_inds = sorted_inds[:n_train]   # 95% of the data as the training set
test_inds = sorted_inds[n_train:]    # UTRs with most reads at time point 0 as the test set

# Shuffle the training indices reproducibly with a seeded generator.
# The previous `random.shuffle(train_inds, lambda: seed)` relied on the second
# argument of random.shuffle, which was removed in Python 3.11, and a constant
# "random" function produces a fixed rearrangement rather than a shuffle.
# `permutation` returns a copy, so `sorted_inds` keeps its sorted order.
seed = 0
train_inds = np.random.RandomState(seed).permutation(train_inds)

# Generate Model

**TODO:** figure out how to add dropout and a flatten step to the network.
**Open question:** how do hidden units work in fully connected layers?

## Build the neural network

In [None]:
size = 1         # number of filters in every convolution layer
batch_size = 10  # NOTE(review): not read by Net; kept because other cells may use it


class Net(nn.Module):
    """Three-layer 1-D style CNN over one-hot encoded sequences.

    The input is expected as (batch, 1, 4, input_width): one channel, four
    rows (one per base) and `input_width` sequence positions. The first
    convolution spans all four rows, the following two slide along the
    sequence only. The feature maps are flattened, passed through a hidden
    fully connected layer of 120 units and reduced to a single value.

    Parameters
    ----------
    x : object, optional
        Unused; accepted only so existing calls such as ``Net(data)`` keep
        working.
    input_width : int, optional
        Width of the encoded sequences. The default of 70 reproduces the
        original hard-coded fully connected input size of 34.

    Raises
    ------
    ValueError
        If `input_width` is too small for three width-13 convolutions.
    """

    def __init__(self, x=None, input_width=70):
        super(Net, self).__init__()
        kernel_width = 13

        # Conv2d(in_channels, out_channels = number of filters, kernel size).
        # conv2/conv3 take `size` input channels so that the layers still
        # chain correctly when more than one filter is used.
        self.conv1 = nn.Conv2d(1, size, [4, kernel_width])
        self.conv2 = nn.Conv2d(size, size, [1, kernel_width])
        self.conv3 = nn.Conv2d(size, size, [1, kernel_width])

        # Each un-padded convolution shortens the sequence by kernel_width - 1.
        conv_out_width = input_width - 3 * (kernel_width - 1)
        if conv_out_width <= 0:
            raise ValueError(
                "input_width={} is too small for three convolutions of width {}".format(
                    input_width, kernel_width))

        self.fc1 = nn.Linear(size * conv_out_width, 120)
        self.lin_out1 = nn.Linear(120, 1)

    def forward(self, x):
        """Return one prediction per sample, shaped (batch, 1, 1, 1)."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        # Flatten all feature maps of a sample into one vector so the fully
        # connected layer sees every filter, not just the last axis.
        n_samples = x.size(0)
        x = x.reshape(n_samples, self.num_flat_features(x))

        x = F.relu(self.fc1(x))
        x = self.lin_out1(x)
        # To add dropout, create `self.drop = nn.Dropout(p=0.15)` in __init__
        # and call `x = self.drop(x)` here; assigning the module itself to `x`
        # would replace the activations.

        # Keep the 4-D output shape that existing callers index as
        # output[:, 0, 0, 0].
        return x.reshape(n_samples, 1, 1, 1)

    def num_flat_features(self, x):
        """Return the number of values per sample (product of all non-batch dimensions)."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

# Build the network and place its parameters on the selected device.
net = Net(data).to(device)
print(net)
# Cell output: True when the first parameter tensor lives on the GPU.
first_parameter = next(net.parameters())
first_parameter.is_cuda

In [None]:
# Inspect the learnable tensors: how many there are and the shape of the first.
params = [parameter for parameter in net.parameters()]
print(len(params))
print(params[0].shape)

## Training & Cross Validation


In [None]:
# Train `net` with k-fold cross-validation on the training indices.
# Mean squared error is used both as the training loss (back-propagated with
# Adam) and as the dev-set score that drives checkpointing and learning-rate
# decay; classification accuracy is not defined for this regression task.

# Hyperparameters
number_epochs = 20
k_fold = 10
initial_learning_rate = 0.005  # rate the per-fold optimizer is created with
lr_decay = 0.8                 # factor applied when the dev score does not improve
train_batch_size = 128
dev_batch_size = 1024
checkpoint_dir = "model-cross-validate"

loss_func = nn.MSELoss()
track_loss = []  # training loss of every mini-batch, stored as Python floats

# torch.save does not create missing directories.
os.makedirs(checkpoint_dir, exist_ok=True)

# Snapshot of the current weights so that every fold starts from the same
# initialisation instead of continuing from the previous fold.
initial_state = {key: value.clone() for key, value in net.state_dict().items()}

# Partition the (already shuffled) training indices into k folds.
folds = np.array_split(np.asarray(train_inds), k_fold)

best_mse = float("inf")
best_state = None

for split in range(k_fold):
    # Fold `split` is held out as the dev set; the other folds are trained on.
    dev_idx = torch.as_tensor(folds[split], dtype=torch.long, device=X_torch.device)
    fit_idx = torch.as_tensor(
        np.concatenate([fold for j, fold in enumerate(folds) if j != split]),
        dtype=torch.long, device=X_torch.device)

    train_loader = Data.DataLoader(
        Data.TensorDataset(X_torch[fit_idx], Y_torch[fit_idx]),
        batch_size=train_batch_size, shuffle=True)
    dev_loader = Data.DataLoader(
        Data.TensorDataset(X_torch[dev_idx], Y_torch[dev_idx]),
        batch_size=dev_batch_size)

    net.load_state_dict(initial_state)
    learning_rate = initial_learning_rate
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    best_fold_mse = float("inf")

    for epoch in range(number_epochs):  # loop over the training folds multiple times
        net.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()                   # zero the parameter gradients

            # forward + backward + optimize
            output = net(x_batch).reshape(-1)       # predictions as a vector
            loss = loss_func(output, y_batch.reshape(-1))
            loss.backward()
            optimizer.step()

            # Store the value only, not the tensor, so the autograd graph of
            # every batch can be freed.
            track_loss.append(loss.item())

        # Score the held-out fold without building an autograd graph.
        net.eval()
        squared_error = 0.0
        n_dev = 0
        with torch.no_grad():
            for x_batch, y_batch in dev_loader:
                residual = net(x_batch).reshape(-1) - y_batch.reshape(-1)
                squared_error += (residual ** 2).sum().item()
                n_dev += residual.numel()
        dev_mse = squared_error / max(n_dev, 1)
        print("fold {} epoch {} dev MSE: {:.4f}".format(split, epoch, dev_mse))

        if dev_mse < best_fold_mse:
            best_fold_mse = dev_mse
            print("save the model")
            torch.save(net.state_dict(),
                       os.path.join(checkpoint_dir, "model-{}.th".format(split)))
            if dev_mse < best_mse:
                best_mse = dev_mse
                best_state = {key: value.clone() for key, value in net.state_dict().items()}
        else:
            # No improvement: continue with a smaller learning rate.
            learning_rate *= lr_decay
            optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Leave `net` holding the weights with the lowest dev MSE seen in any fold, so
# that the cells below evaluate a trained model.
if best_state is not None:
    net.load_state_dict(best_state)
print('Finished Training')

In [None]:
# Plot the training loss recorded for every mini-batch.
# Entries may be 0-d tensors (possibly on the GPU and attached to the autograd
# graph), which matplotlib cannot plot directly, so convert them to floats.
loss_values = [float(value) for value in track_loss]

fig, ax = plt.subplots()
ax.plot(loss_values)
ax.set_ylabel('Value of the loss function')
ax.set_xlabel('Time')
plt.show()

## Plot predictions vs data

In [None]:
# Predict the held-out test set.
# Inference runs in eval mode and without gradient tracking so that no
# autograd graph is kept; the result is moved to the CPU so that NumPy/SciPy
# code can consume it.
net.eval()
with torch.no_grad():
    Y_pred = net(X_torch[test_inds]).cpu()

In [None]:
# Predicted vs. measured growth rates of the test set as flat NumPy arrays.
# `Y_pred` may be a tensor (possibly on the GPU / attached to the autograd
# graph), so detach it and copy it to host memory before handing it to SciPy.
x = torch.as_tensor(Y_pred).detach().cpu().numpy().ravel()
y = np.asarray(Y[test_inds]).ravel()

# R^2 computed as the squared Pearson correlation coefficient.
r2 = scipy.stats.pearsonr(x, y)[0]**2

# `stat_func` was removed from seaborn and `size` was renamed to `height`;
# x and y are passed by keyword, as required by current seaborn versions.
g = sns.jointplot(x = x,
                  y = y,
                  kind = 'scatter',
                  s = 5,
                  alpha = 0.1,
                  height = 5)

g.ax_joint.set_xlabel('Predicted log$_2$ Growth Rate')
g.ax_joint.set_ylabel('Measured log$_2$ Growth Rate')

# Anchor the annotation and title to the joint axes / figure instead of
# relying on off-axes coordinates of whichever axes happens to be current.
text = "R$^2$ = {:0.2}".format(r2)
g.ax_joint.annotate(text, xy=(0.05, 0.95), xycoords='axes fraction', va='top')

g.ax_joint.figure.suptitle("CNN predictions vs. test set", y = 1.02)