In [None]:
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib

import os
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable

In [None]:
# Default figure size (width, height) in inches for every plot in this notebook.
plt.rcParams['figure.figsize'] = (20, 10)

## Vælg device

In [None]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Report the CUDA version PyTorch was built against and the selected device.
print(torch.version.cuda)
print(device)

## Indlæs data
Det antages, at datafilen `cullpdb+profile_5926.npy` ligger i mappen `'../data'`.

In [None]:
# Path is relative to the notebook's working directory.
data_path = '../data/cullpdb+profile_5926.npy'
data_raw = np.load(data_path)

## Omform data

In [None]:
# Unflatten the raw array: one entry per protein, 700 residue positions,
# 57 feature columns per position.
data = data_raw.reshape((-1,700,57))

# Feature groups, selected by column range on the last axis.
amino_acid_recidues  = data[...,:22]
amino_seq_profiles   = data[...,35:]
sec_structure_labels = data[...,22:31]
# Collapse the 9 label columns to a single class index per residue, kept as a
# trailing axis of size 1 so it can be concatenated with the solvent columns.
sec_structure_actual_labels = np.argmax(sec_structure_labels, axis=2).reshape((-1, 700, 1))
solvent_access       = data[...,33:35]

# Model input: residue columns followed by the profile columns.
ext_x = np.concatenate((amino_acid_recidues, amino_seq_profiles), axis=2)

x = ext_x
# Targets per residue: [class index, solvent column 33, solvent column 34].
y = np.concatenate((sec_structure_actual_labels, solvent_access), axis=2)

input_channels  = x.shape[2]
# NOTE(review): within this notebook output_channels is only used in the print
# below; the CNN itself hard-codes its number of output channels.
output_channels = 9

print('Kanaler:\nInput:  %d\nOutput: %d' % (input_channels, output_channels))

print('Fuldt datasæt shape:')
print('X: ', x.shape)
print('Y: ', y.shape)

# Split boundaries (sample indices). Samples TRAIN_END..TEST_START-1 are in
# none of the three splits -- NOTE(review): confirm this gap is intentional.
TRAIN_END      = 5430
TEST_START     = 5435
TEST_END       = 5690
VALIDATION_END = 5926

train_slice      = slice(0, TRAIN_END)
test_slice       = slice(TEST_START, TEST_END)
validation_slice = slice(TEST_END, VALIDATION_END)

# Targets in their original (sample, position, target) layout, as NumPy arrays.
y_train_unrot = y[train_slice]
y_test_unrot = y[test_slice]
y_validation_unrot = y[validation_slice]

# Conv1d wants (sample, channel, position), so swap the last two axes.
# This is exactly what the former rot90(axes=(1,2)) followed by flip(axis=1) produced.
x = np.transpose(x, (0, 2, 1))

print('Fuldt datasæt vendt shape:')
print('X: ', x.shape)
print('Y: ', y.shape)

x_train = x[train_slice]
y_train = y[train_slice]

x_test = x[test_slice]
y_test = y[test_slice]

x_validation = x[validation_slice]
y_validation = y[validation_slice]

print('Splittet ud i training og testing:')
print('(Train) X: ', x_train.shape)
print('(Train) Y: ', y_train.shape)
print('(Test)  X: ', x_test.shape)
print('(Test)  Y: ', y_test.shape)
print('(Validation)  X: ', x_validation.shape)
print('(Validation)  Y: ', y_validation.shape)


def _as_float_tensor(array):
    """Convert a NumPy array to a float32 tensor placed on the selected device."""
    return torch.from_numpy(array).type(torch.FloatTensor).to(device)


torch_X_train = _as_float_tensor(x_train)
torch_Y_train = _as_float_tensor(y_train)
torch_X_test  = _as_float_tensor(x_test)
torch_Y_test  = _as_float_tensor(y_test)
torch_X_validation  = _as_float_tensor(x_validation)
torch_Y_validation  = _as_float_tensor(y_validation)

## Sæt data sammen i DataLoader

In [None]:
# Number of proteins per optimisation step.
BATCH_SIZE = 4

# Pair each input tensor with its target tensor and draw shuffled mini-batches.
train = torch.utils.data.TensorDataset(torch_X_train, torch_Y_train)
train_loader = torch.utils.data.DataLoader(
    train,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

## Definér modellen

In [None]:
class CNN(nn.Module):
    """Stack of four 1D convolutions producing per-position solvent logits.

    Three Conv1d + ReLU blocks are followed by a final Conv1d with three output
    channels. ``forward`` returns channel 0 and channel 1 of that final layer;
    channel 2 is computed but never returned (kept so the layer shape, and thus
    any saved weights, stay unchanged).

    Args:
        in_channels: Number of input channels. Defaults to the notebook-level
            ``input_channels``.
        width: Number of channels of every hidden layer. Defaults to the
            notebook-level ``layer_width``.
        kernels: Sequence with the kernel size of each of the four
            convolutions. Defaults to the notebook-level ``kernel_sizes``.
            Padding is ``kernel // 2``, which preserves the sequence length
            only for odd kernel sizes.
    """

    def __init__(self, in_channels=None, width=None, kernels=None):
        super().__init__()

        # Fall back to the notebook globals so that a bare CNN() keeps working.
        # They are looked up here (at construction time), not at class definition.
        if in_channels is None:
            in_channels = input_channels
        if width is None:
            width = layer_width
        if kernels is None:
            kernels = kernel_sizes

        self.conv1 = self._conv_relu(in_channels, width, kernels[0])
        self.conv2 = self._conv_relu(width, width, kernels[1])
        self.conv3 = self._conv_relu(width, width, kernels[2])
        self.out = nn.Sequential(
            self._conv(width, 3, kernels[3]),
        )

    @staticmethod
    def _conv(in_channels, out_channels, kernel_size):
        """Build a stride-1 Conv1d padded by half the kernel size on each side."""
        return nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
        )

    @staticmethod
    def _conv_relu(in_channels, out_channels, kernel_size):
        """Build a Conv1d followed by a ReLU, wrapped in a Sequential."""
        return nn.Sequential(
            CNN._conv(in_channels, out_channels, kernel_size),
            nn.ReLU(),
        )

    def forward(self, x):
        """Run the network and return ``(rel_solvent, abs_solvent)`` logits.

        Args:
            x: Input tensor laid out as (batch, channels, positions).

        Returns:
            Tuple of two tensors, each (batch, positions): output channel 0
            and output channel 1. No sigmoid is applied here.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        output = self.out(x)
        rel_solvent = output[:,0,:]
        abs_solvent = output[:,1,:]
        return rel_solvent, abs_solvent

## Instantiér modellen

In [None]:
# Hyperparameters. CNN() reads layer_width and kernel_sizes when it is
# constructed, so both must be assigned before the model is created.
kernel_sizes = [5, 7, 9, 11]
layer_width = 90
LR = 0.0005              # learning rate

# Build the network and move its parameters to the selected device.
cnn = CNN()
cnn = cnn.to(device)
print(cnn)
print('Model is on device: "%s"' % device)

# The loss takes raw logits; the separate sigmoid is used when measuring accuracy.
loss_func = nn.BCEWithLogitsLoss()
sigge = nn.Sigmoid()
optimizer = torch.optim.Adam(params=cnn.parameters(), lr=LR)

## Arrays til at gemme data til visualisering

In [None]:
# Accuracy histories; suffix _v is the validation set, _t the test set.
solv_rel_accs_v, solv_rel_accs_t = [], []
solv_abs_accs_v, solv_abs_accs_t = [], []

# Step counters and other per-run bookkeeping.
steps, steps_cum, epochs = [], [], []
losses, collected = [], []

## Funktion til at måle accuracy

In [None]:
def CalculateAccuracy(calc_values_rel, calc_values_abs, real_values, no_seq_label=8):
    """Compute the accuracy of the two binary solvent predictions.

    Positions whose class label (``real_values[:, :, 0]``) equals
    ``no_seq_label`` are excluded from both averages.

    Args:
        calc_values_rel: Tensor of logits for the first solvent target,
            shaped like ``real_values[:, :, 1]``.
        calc_values_abs: Tensor of logits for the second solvent target,
            shaped like ``real_values[:, :, 2]``.
        real_values: NumPy array or tensor indexed as
            (sample, position, [label, rel target, abs target]).
        no_seq_label: Class label marking positions to ignore.

    Returns:
        Tuple ``(relsolv_mean, abssolv_mean)``: the fraction of non-ignored
        positions where the rounded sigmoid of the logit equals the target.
    """
    # Accept tensors as well as arrays; the masking below is done in NumPy.
    if isinstance(real_values, torch.Tensor):
        real_values = real_values.detach().cpu().numpy()

    real_labels         = real_values[:,:,0]
    real_values_relsolv = real_values[:,:,1]
    real_values_abssolv = real_values[:,:,2]

    # True where the position must be ignored.
    real_mask = real_labels == no_seq_label

    # torch.sigmoid is used directly so the function does not rely on a
    # module object created in another notebook cell.
    calc_relsolv = torch.sigmoid(calc_values_rel).cpu().detach().numpy()
    calc_abssolv = torch.sigmoid(calc_values_abs).cpu().detach().numpy()

    # Round the probabilities to a hard 0/1 prediction.
    calc_relsolv = np.around(calc_relsolv)
    calc_abssolv = np.around(calc_abssolv)

    # Element-wise correctness, with the ignored positions masked out.
    correct_relsolv = calc_relsolv == real_values_relsolv
    correct_relsolv_masked = np.ma.masked_array(correct_relsolv, real_mask)

    correct_abssolv = calc_abssolv == real_values_abssolv
    correct_abssolv_masked = np.ma.masked_array(correct_abssolv, real_mask)

    # The mean of a masked boolean array is the accuracy over unmasked entries.
    relsolv_mean = np.mean(correct_relsolv_masked)
    abssolv_mean = np.mean(correct_abssolv_masked)

    return relsolv_mean, abssolv_mean

## Træn modellen

In [None]:
EPOCH = 10
LOG_EVERY = 100  # within each epoch, evaluate and record accuracy every LOG_EVERY steps


def _evaluate_accuracy(inputs, targets):
    """Return (relative, absolute) accuracy of the current model on one data set.

    The forward pass runs under torch.no_grad(): nothing is back-propagated
    from an evaluation, so no autograd graph needs to be kept for the whole set.
    """
    with torch.no_grad():
        rel_logits, abs_logits = cnn(inputs)
    return CalculateAccuracy(rel_logits, abs_logits, targets)


step_cum = -1  # running step count across epochs; the first step gets index 0
for epoch in range(EPOCH):
    print('\nEpoch: ', epoch+1)
    for step, (b_x, b_y) in enumerate(train_loader):
        step_cum += 1
        relsolv, abssolv = cnn(b_x)
        # Target columns 1 and 2 hold the two solvent targets.
        # NOTE(review): the loss covers every position, including the ones
        # CalculateAccuracy masks out -- confirm this is intended.
        loss1 = loss_func(relsolv, b_y[:,:,1])
        loss2 = loss_func(abssolv, b_y[:,:,2])
        loss_sum = loss1 + loss2
        optimizer.zero_grad()
        loss_sum.backward()
        optimizer.step()

        if step % LOG_EVERY == 0:
            rel_acc_v, abs_acc_v = _evaluate_accuracy(torch_X_validation, y_validation_unrot)
            rel_acc_t, abs_acc_t = _evaluate_accuracy(torch_X_test, y_test_unrot)

            # '\r' rewrites the same console line on every update.
            sys.stdout.write('\rValidation:[Relative: %.4f, Absolute: %.4f]\tTest:[Relative: %.4f, Absolute: %.4f]' % (rel_acc_v, abs_acc_v, rel_acc_t, abs_acc_t))

            solv_rel_accs_v.append(rel_acc_v)
            solv_rel_accs_t.append(rel_acc_t)
            solv_abs_accs_v.append(abs_acc_v)
            solv_abs_accs_t.append(abs_acc_t)
            steps_cum.append(step_cum)

# Accuracy after the last optimisation step.
rel_acc_v, abs_acc_v = _evaluate_accuracy(torch_X_validation, y_validation_unrot)
rel_acc_t, abs_acc_t = _evaluate_accuracy(torch_X_test, y_test_unrot)
print('\n%.4f\t%.4f\t%.4f\t%.4f' % (rel_acc_v, abs_acc_v, rel_acc_t, abs_acc_t))

print('\nDone training.')

## Endelig test af accuracy

In [None]:
print('Running on the test set:')
# cnn() returns a (relative, absolute) pair of logit tensors, so each output is
# scored against its own target column, the same way the training loop does.
with torch.no_grad():
    test_output = cnn(torch_X_test)
    test_relsolv, test_abssolv = test_output
    tloss = loss_func(test_relsolv, torch_Y_test[:,:,1]) + loss_func(test_abssolv, torch_Y_test[:,:,2])
# acc is a (relative accuracy, absolute accuracy) pair.
acc = CalculateAccuracy(test_relsolv, test_abssolv, y_test_unrot)
rel_acc, abs_acc = acc
sys.stdout.write('Loss: %.5f,\tAccuracy: [Relative: %.4f%%, Absolute: %.4f%%]' % (tloss.item(), rel_acc*100, abs_acc*100))

## Gem værdier
Gemmer accuracy-værdierne med `%store`, så de kan hentes i `visualization.ipynb`.

In [None]:
# Persist the recorded accuracy histories with IPython's %store magic.
%store solv_rel_accs_v  
%store solv_rel_accs_t  
%store solv_abs_accs_v  
%store solv_abs_accs_t  