In [1]:
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib

import os
import sys

#import pandas as pd # to read csv and handle dataframe

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable


import torchvision
import torchvision.transforms as transforms

matplotlib.rcParams['figure.figsize'] = (20,10)

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
print(torch.version.cuda)
print(device)
#Ikke i brug endnu

9.0.176
cuda:0


## Kode til at gemme modeltilstande

In [3]:
def SaveState(model):
    x = datetime.datetime.now()
    filename = '../checkpoints/multi_model_state_%s.pt' % (x.strftime("%Y%m%d-%H%M"))
    torch.save(model.state_dict(), filename)
    print('Model saved as:\n%s' % os.path.abspath(filename))
    
def LoadState(filename):
    model = torch.load(filename)
    model.eval()
    return model

## Indlæs data
Det antages at dataen ligger i `'../data'`

In [4]:
data_raw = np.load('../data/cullpdb+profile_5926.npy')

## Omform data

In [5]:
data = data_raw.reshape((-1,700,57))

In [6]:
amino_acid_recidues  = data[...,:22]
amino_seq_profiles   = data[...,35:]
sec_structure_labels = data[...,22:31]
solvent_access       = data[...,33:35]

ext_x = np.concatenate((amino_acid_recidues, amino_seq_profiles), axis=2)
ext_y = np.concatenate((sec_structure_labels, solvent_access), axis=2)

x = ext_x
y = ext_y
#x = amino_acid_recidues
#y = sec_structure_labels
#y = solvent_access

input_channels  = x.shape[2]
output_channels = y.shape[2]

print('Kanaler:\nInput:  %d\nOutput: %d' % (input_channels, output_channels))

print('Fuldt datasæt shape:')
print('X: ', x.shape)
print('Y: ', y.shape)

y_train_unrot = y[:5430]
y_test_unrot = y[5435:5690]
y_validation_unrot = y[5690:5926]

#x = x.reshape(-1,22,700)
#y = y.reshape(-1,9,700)
x = np.rot90(x, axes=(1,2))
y = np.rot90(y, axes=(1,2))

x = np.flip(x, 1)
y = np.flip(y, 1)

print('Fuldt datasæt vendt shape:')
print('X: ', x.shape)
print('Y: ', y.shape)

x_train = x[:5430]
y_train = y[:5430]

x_test = x[5435:5690]
y_test = y[5435:5690]

x_validation = x[5690:5926]
y_validation = y[5690:5926]

print('Splittet ud i training og testing:')
print('(Train) X: ', x_train.shape)
print('(Train) Y: ', y_train.shape)
print('(Test)  X: ', x_test.shape)
print('(Test)  Y: ', y_test.shape)
print('(Validation)  X: ', x_validation.shape)
print('(Validation)  Y: ', y_validation.shape)

torch_X_train = torch.from_numpy(x_train).type(torch.FloatTensor).to(device)
torch_Y_train = torch.from_numpy(y_train).type(torch.FloatTensor).to(device)
torch_X_test  = torch.from_numpy(x_test).type(torch.FloatTensor).to(device)
torch_Y_test  = torch.from_numpy(y_test).type(torch.FloatTensor).to(device)
torch_X_validation  = torch.from_numpy(x_validation).type(torch.FloatTensor).to(device)
torch_Y_validation  = torch.from_numpy(y_validation).type(torch.FloatTensor).to(device)

Kanaler:
Input:  44
Output: 11
Fuldt datasæt shape:
X:  (5926, 700, 44)
Y:  (5926, 700, 11)
Fuldt datasæt vendt shape:
X:  (5926, 44, 700)
Y:  (5926, 11, 700)
Splittet ud i training og testing:
(Train) X:  (5430, 44, 700)
(Train) Y:  (5430, 11, 700)
(Test)  X:  (255, 44, 700)
(Test)  Y:  (255, 11, 700)
(Validation)  X:  (236, 44, 700)
(Validation)  Y:  (236, 11, 700)


## Sæt data sammen i DataLoader

In [7]:
BATCH_SIZE = 50 # 250 har virket godt før

train = torch.utils.data.TensorDataset(torch_X_train, torch_Y_train)
train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=BATCH_SIZE, shuffle=False)

## Definér modellen

In [8]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv1d(
                in_channels=input_channels,       
                out_channels=layer_widths[0],      
                kernel_size=kernel_sizes[0],        
                stride=1,             
                padding=int(kernel_sizes[0]/2),            
            ),                        
            nn.ReLU(),                
        )
        self.conv2 = nn.Sequential(   
            nn.Conv1d(
                in_channels=layer_widths[0],       
                out_channels=layer_widths[1],      
                kernel_size=kernel_sizes[1],        
                stride=1,             
                padding=int(kernel_sizes[1]/2),            
            ),                        
            nn.ReLU(),                
        )
        self.conv3 = nn.Sequential(   
            nn.Conv1d(
                in_channels=layer_widths[1],       
                out_channels=layer_widths[2],      
                kernel_size=kernel_sizes[2],        
                stride=1,             
                padding=int(kernel_sizes[2]/2),            
            ),                        
            nn.ReLU(),                
        )
        self.out = nn.Sequential(     
            nn.Conv1d(
                in_channels=layer_widths[2],       
                out_channels=output_channels,       
                kernel_size=kernel_sizes[3],
                stride=1,             
                padding=int(kernel_sizes[3]/2),            
            ),
            nn.ReLU(),
            #nn.Softmax(dim=1),
        )
        self.soft = nn.Softmax(dim=1)
        self.sig1 = torch.nn.Sigmoid()
        self.sig2 = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        output = self.out(x)
        a = self.soft(output[:,:-2,:])
        b = self.sig1(output[:,-2,:])
        c = self.sig2(output[:,-1,:])
        
        return a, b, c #output #, x    # return x for visualization

## Instantiér modellen

In [54]:
layer_width = 80
layer_widths = [80, 80, 80]
kernel_sizes = [21, 11, 11, 7]   # Bedste so far : [5, 21, 11, 5]

cnn = CNN().to(device)
print(cnn)
print('Model is on device: "%s"' % device)
# Hyperparametre
LR = 0.0025              # learning rate

optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)
train_loss_func = nn.BCELoss()  #Binary Cross Entropy Loss
loss_func2 = nn.BCELoss()  #Binary Cross Entropy Loss
validation_loss_func = nn.BCELoss()
test_loss_func = nn.BCELoss()

CNN(
  (conv1): Sequential(
    (0): Conv1d(44, 80, kernel_size=(21,), stride=(1,), padding=(10,))
    (1): ReLU()
  )
  (conv2): Sequential(
    (0): Conv1d(80, 80, kernel_size=(11,), stride=(1,), padding=(5,))
    (1): ReLU()
  )
  (conv3): Sequential(
    (0): Conv1d(80, 80, kernel_size=(11,), stride=(1,), padding=(5,))
    (1): ReLU()
  )
  (out): Sequential(
    (0): Conv1d(80, 11, kernel_size=(7,), stride=(1,), padding=(3,))
    (1): ReLU()
  )
  (soft): Softmax()
  (sig1): Sigmoid()
  (sig2): Sigmoid()
)
Model is on device: "cuda:0"


In [10]:
# Til visualisering senere
losses     = []
accuracies = []
steps      = []
steps_cum  = []
epochs     = []

## Funktion til at måle accuracy

In [11]:
def CalculateAccuracy(calc_values_structure, calc_values_rel, calc_values_abs, real_values):
    NoSeq = 8
    real_values_structure = real_values[:,:,:-2]
    real_values_relsolv   = real_values[:,:,-2]
    real_values_abssolv   = real_values[:,:,-1]
    
    real_labels = np.argmax(real_values_structure, axis=2)            # Kollaps one-hot til rene labels
    real_mask = real_labels == NoSeq                        # Lav maske af dem der er NoSeq
    
    calc_values_structure = calc_values_structure.cpu().detach().numpy()        # Omform til numpy
    calc_values_structure = np.flip(calc_values_structure, 1)                   # Omgør spejlning og 
    calc_values_structure = np.rot90(calc_values_structure, k=-1, axes=(1,2))   # rotation
    
    calc_relsolv = calc_values_rel.cpu().detach().numpy()        # Omform til numpy
    #calc_relsolv = np.flip(calc_relsolv, 1)                   # Omgør spejlning og 
    #calc_relsolv = np.rot90(calc_relsolv, k=-1)#, axes=(1,2))   # rotation
    
    calc_abssolv = calc_values_abs.cpu().detach().numpy()        # Omform til numpy
    #calc_abssolv = np.flip(calc_abssolv, 1)                   # Omgør spejlning og 
    #calc_abssolv = np.rot90(calc_abssolv, k=-1)#, axes=(1,2))   # rotation
    
    calc_labels  = np.argmax(calc_values_structure, axis=2)             # Kollaps one-hot til rene labels
    calc_relsolv = np.around(calc_relsolv)#, axis=2)            # Kollaps one-hot til rene labels
    calc_abssolv = np.around(calc_abssolv)#, axis=2)            # Kollaps one-hot til rene labels
    
    correct_structures = calc_labels == real_labels                    # Find hvilke forudsigelser der er korrekte
    correct_structures_masked = np.ma.masked_array(correct_structures, real_mask) # Filtrér dem er er NoSeq
    
    correct_relsolv = calc_relsolv == real_values_relsolv                    # Find hvilke forudsigelser der er korrekte
    correct_relsolv_masked = np.ma.masked_array(correct_relsolv, real_mask)  # Filtrér dem er er NoSeq
    
    correct_abssolv = calc_abssolv == real_values_abssolv                    # Find hvilke forudsigelser der er korrekte
    correct_abssolv_masked = np.ma.masked_array(correct_abssolv, real_mask)  # Filtrér dem er er NoSeq
    
    structure_mean = np.mean(correct_structures_masked)                       # Tag gennemsnittet af sættet
    relsolv_mean = np.mean(correct_relsolv_masked)                       # Tag gennemsnittet af sættet
    abssolv_mean = np.mean(correct_abssolv_masked)                       # Tag gennemsnittet af sættet
    
    return structure_mean, relsolv_mean, abssolv_mean

In [12]:
def handleLoss(loss_function, calculated_struct, calculated_rel, calculated_abs, correct):
    #first_x  = F.softmax(calculated[:,:9,:], dim=1)
    #second_x = sigmoid1(calculated[:,9,:])
    #third_x  = sigmoid2(calculated[:,10,:])
    #print(calculated[:,9,:])
    #print(second_x)
    first_x  = calculated_struct
    second_x = calculated_rel
    third_x  = calculated_abs
    
    first_y  = correct[:,:-2,:]
    second_y = correct[:,-2,:]
    third_y  = correct[:,-1,:]
    
    loss1 = loss_function(first_x,  first_y)
    loss2 = loss_function(second_x, second_y)
    loss3 = loss_function(third_x,  third_y)
    #print(loss1)
    #print(loss2)
    #print(loss3)
    totes = (loss1 + loss2 + loss3)
    #print(totes)
    #totes.backward()
    #loss1.backward(retain_graph=True)
    #loss3.backward(retain_graph=True)
    #loss2.backward(retain_graph=True)
    return totes
    #print(first_x[0,:,0])
    #print(second.shape)

## Træn modellen

In [55]:
EPOCH = 5               # train the training data n times

step_cum = -1
for epoch in range(EPOCH):
    print('\nEpoch: ', epoch+1)
    for step, (b_x, b_y) in enumerate(train_loader):
        step_cum += 1
        a, b, c = cnn(b_x)#[0]               # cnn output
        optimizer.zero_grad()           # clear gradients for this training step
        loss = handleLoss(train_loss_func, a, b, c, b_y)
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
        if step % 10 == 0:
            a, b, c = cnn(torch_X_validation)
            vloss = handleLoss(validation_loss_func, a, b, c, torch_Y_validation)
            acc_struc, acc_rel, acc_abs = CalculateAccuracy(a, b, c, y_validation_unrot)
            sys.stdout.write('\rStep [%d]:\tTrain:[Loss: %.5f] \t Validation:[Loss: %.4f, Accuracies (struc, rel, abs): %.2f%%, %.2f%%, %.2f%%]' % (step, loss.item(), vloss.item(), acc_struc*100, acc_rel*100, acc_abs*100))
            #epochs.append(epoch)
            #losses.append(vloss.item())
            #accuracies.append(acc)
            #steps.append(step)
            #steps_cum.append(step_cum)
            
print('\nDone training.')


Epoch:  1
Step [100]:	Train:[Loss: 1.57085] 	 Validation:[Loss: 1.5796, Accuracies (struc, rel, abs): 58.56%, 80.31%, 78.79%]
Epoch:  2
Step [100]:	Train:[Loss: 1.55914] 	 Validation:[Loss: 1.5706, Accuracies (struc, rel, abs): 62.25%, 81.03%, 79.43%]
Epoch:  3
Step [100]:	Train:[Loss: 1.55386] 	 Validation:[Loss: 1.5666, Accuracies (struc, rel, abs): 64.11%, 81.30%, 79.85%]
Epoch:  4
Step [100]:	Train:[Loss: 1.55081] 	 Validation:[Loss: 1.5637, Accuracies (struc, rel, abs): 65.28%, 81.58%, 79.94%]
Epoch:  5
Step [100]:	Train:[Loss: 1.54928] 	 Validation:[Loss: 1.5626, Accuracies (struc, rel, abs): 65.36%, 81.85%, 80.20%]
Done training.


## Endelig test af accuracy

In [23]:
print('Running on the test set:')
a, b, c = cnn(torch_X_test)
tloss = handleLoss(test_loss_func, a, b, c, torch_Y_test)
x, y, z = CalculateAccuracy(a, b, c, y_test_unrot)
sys.stdout.write('Loss: %.5f,\tAccuracy (struct, rel, abs): %.3f%%, %.3f%%, %.3f%%' % (tloss.item(), x*100, y*100, z*100))

Running on the test set:
Loss: 1.55829,	Accuracy (struct, rel, abs): 64.929%, 81.955%, 80.337%

In [22]:
SaveState(cnn)

Model saved as:
/home/simonsen/Documents/Uni/bachelor/git/Bachelor19/checkpoints/multi_model_state_20190603-1420.pt
