In [None]:
#Neural Network Model - lib Pytorch

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import pdb
#from better_lstm import LSTM

import torch.nn.functional as F

from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn import metrics
from keras.preprocessing.text import Tokenizer

np.random.seed(1234)
torch.manual_seed(1234)
torch.backends.cudnn.benchmark = False

In [None]:
#pip install git+https://github.com/keitakurita/Better_LSTM_PyTorch

In [None]:
#Read Data

#read txt data
negative_training_set1_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/negative_training_set1(1038).txt', sep = "\t", header = None)
negative_training_set2_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/negative_training_set2(1038).txt', sep = "\t", header = None)
negative_training_set3_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/negative_training_set3(1038).txt', sep = "\t", header = None)
negative_training_set4_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/negative_training_set4(1038).txt', sep = "\t", header = None)
negative_training_set5_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/negative_training_set5(1038).txt', sep = "\t", header = None)
positive_training_set_txt  = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/positive_training_set(1038).txt',  sep = "\t", header = None)

negative_test_set_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/negative_test_set(260).txt', sep = "\t", header = None)
positive_test_set_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/positive_test_set(260).txt', sep = "\t", header = None)

independent_negative_set_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/independent_negative_set(3033).txt', sep = "\t", header = None)
independent_positive_set_txt = pd.read_csv('https://raw.githubusercontent.com/bharuno/Methylation/main/independent_positive_set(1131).txt', sep = "\t", header = None)


In [None]:
#Data Preprocessing - Convert to Dataframe

#def for convert txt data to dataframe
def preprocess_data(data):
    data1 = data[data.isnull().any(axis=1)].reset_index()
    data2 = data.dropna().reset_index()
    data3 = pd.concat([data1, data2], axis=1, sort=False, ignore_index=True)
    data3.drop(columns=[0,2,3], inplace=True)
    data3.rename(index=str, columns={1: "name", 4: "position", 5: "sequence"}, inplace = True)
    return data3

#applying def
negative_training_set1 = preprocess_data(negative_training_set1_txt)
negative_training_set2 = preprocess_data(negative_training_set2_txt)
negative_training_set3 = preprocess_data(negative_training_set3_txt)
negative_training_set4 = preprocess_data(negative_training_set4_txt)
negative_training_set5 = preprocess_data(negative_training_set5_txt)
positive_training_set = preprocess_data(positive_training_set_txt)

negative_test_set = preprocess_data(negative_test_set_txt)
positive_test_set = preprocess_data(positive_test_set_txt)

independent_negative_set = preprocess_data(independent_negative_set_txt)
independent_positive_set = preprocess_data(independent_positive_set_txt)

In [None]:
negative_training_set1

Unnamed: 0,name,position,sequence
0,>Q8VBY2,234,NLYLVFDLLRKGPVMEVPC
1,>Tb927.11.12390,641,ICMMLLEGLRQSASFGFDT
2,>Tb927.5.3710,438,MLCIIYMVARPYLQMHPTR
3,>Tb927.11.560,134,GDASILFFTRITAWLRLTY
4,>Tb11.01.1560,29,TCPKGCNSYRHVSTGSDEC
...,...,...,...
1033,>Q9NR22,204,SMLNTVIFARDKWLKPGGL
1034,>Q6KCD5,1195,EMMDSSTFKRFTASIENIL
1035,>Q6KCD5,2715,AICCPKYKDRPQIARVVQR
1036,>Tb927.8.780,542,EKGQVVKQLRESERQLEMT


In [None]:
#Data Preprocessing - Sequence Extraction and Labeling

#take the sequence
negative_seq1 = np.array([ list(word) for word in negative_training_set1.sequence.values])
negative_seq2 = np.array([ list(word) for word in negative_training_set2.sequence.values])
negative_seq3 = np.array([ list(word) for word in negative_training_set3.sequence.values])
negative_seq4 = np.array([ list(word) for word in negative_training_set4.sequence.values])
negative_seq5 = np.array([ list(word) for word in negative_training_set5.sequence.values])
negative_seq = np.concatenate((negative_seq1,
                               negative_seq2,
                               negative_seq3,
                               negative_seq4,
                               negative_seq5), axis=0, out=None)
positive_seq = np.array([ list(word) for word in positive_training_set.sequence.values])

negative_seq_val = np.array([ list(word) for word in independent_negative_set.sequence.values])
positive_seq_val = np.array([ list(word) for word in independent_positive_set.sequence.values])

negative_seq_test = np.array([ list(word) for word in negative_test_set.sequence.values])
positive_seq_test = np.array([ list(word) for word in positive_test_set.sequence.values])

#create label - training data
negative_lab1 = np.zeros((negative_seq1.shape[0],), dtype=int)
negative_lab = np.zeros((negative_seq.shape[0],), dtype=int)
positive_lab = np.ones((positive_seq.shape[0],), dtype=int)
negative_lab_val = np.zeros((negative_seq_val.shape[0],), dtype=int)
positive_lab_val = np.ones((positive_seq_val.shape[0],), dtype=int)
negative_lab_test = np.zeros((negative_seq_test.shape[0],), dtype=int)
positive_lab_test = np.ones((positive_seq_test.shape[0],), dtype=int)

In [None]:
dataset_Y

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
#Data Preprocessing - Decide dataset characteristics, Merging and Tokenizing 

#Parameters

balanced_data = False
token = True
window = 19 #3 to 19 odd number

if (balanced_data):
  df_train_pos = positive_seq
  df_train_neg = negative_seq1
  df_lab_pos = positive_lab
  df_lab_neg = negative_lab1
else:
  df_train_pos = positive_seq
  df_train_neg = negative_seq
  df_lab_pos = positive_lab
  df_lab_neg = negative_lab

start_w = 9-int(window/2)
end_w = 9+int(window/2)+1

#merge
dataset_X = np.concatenate((df_train_pos, df_train_neg), axis=0, out=None)
dataset_Y = np.concatenate((df_lab_pos, df_lab_neg), axis=0, out=None)
dataset_X_val = np.concatenate((positive_seq_val, negative_seq_val), axis=0, out=None)
dataset_Y_val = np.concatenate((positive_lab_val, negative_lab_val), axis=0, out=None)
dataset_X_test = np.concatenate((positive_seq_test, negative_seq_test), axis=0, out=None)
dataset_Y_test = np.concatenate((positive_lab_test, negative_lab_test), axis=0, out=None)

negative_seq_val_eq = negative_seq_val[1:len(positive_seq_val)+1]
negative_lab_val_eq = negative_lab_val[1:len(positive_lab_val)+1]
dataset_X_val_eq = np.concatenate((positive_seq_val, negative_seq_val_eq), axis=0, out=None)
dataset_Y_val_eq = np.concatenate((positive_lab_val, negative_lab_val_eq), axis=0, out=None)

#Tokenizing, Unique character got its own number - training
asam = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V','X']
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(asam)

dataset_X_token = []
for i in range(len(dataset_X)):
    temp = tokenizer.texts_to_sequences(dataset_X[i])
    dataset_X_token = np.append(dataset_X_token, temp)

dataset_X_token = dataset_X_token-1
dataset_X_token = dataset_X_token.reshape(len(dataset_X),19)
dataset_X_token =  dataset_X_token[:, range(start_w, end_w)]

#Tokenizing, Unique character got its own number - validation
dataset_X_token_val = []
for i in range(len(dataset_X_val)):
    temp = tokenizer.texts_to_sequences(dataset_X_val[i])
    dataset_X_token_val = np.append(dataset_X_token_val, temp)

dataset_X_token_val = dataset_X_token_val-1
dataset_X_token_val = dataset_X_token_val.reshape(len(dataset_X_val),19)
dataset_X_token_val =  dataset_X_token_val[:, range(start_w, end_w)]

#Tokenizing, Unique character got its own number - validation equal
dataset_X_token_val_eq = []
for i in range(len(dataset_X_val_eq)):
    temp = tokenizer.texts_to_sequences(dataset_X_val_eq[i])
    dataset_X_token_val_eq = np.append(dataset_X_token_val_eq, temp)

dataset_X_token_val_eq = dataset_X_token_val_eq-1
dataset_X_token_val_eq = dataset_X_token_val_eq.reshape(len(dataset_X_val_eq),19)
dataset_X_token_val_eq =  dataset_X_token_val_eq[:, range(start_w, end_w)]

#Tokenizing, Unique character got its own number - testing
dataset_X_token_test = []
for i in range(len(dataset_X_test)):
    temp = tokenizer.texts_to_sequences(dataset_X_test[i])
    dataset_X_token_test = np.append(dataset_X_token_test, temp)

dataset_X_token_test = dataset_X_token_test-1
dataset_X_token_test = dataset_X_token_test.reshape(len(dataset_X_test),19)
dataset_X_token_test =  dataset_X_token_test[:, range(start_w, end_w)]

#for if con ensamble
modelCNN_pred_val_imbalance = []
modelLSTM_pred_val_imbalance = [] 

In [None]:
dataset_X_token

array([[ 2., 15.,  7., ..., 18.,  9., 13.],
       [15.,  0.,  7., ...,  1., 17.,  5.],
       [15., 10.,  5., ...,  1., 11., 16.],
       ...,
       [ 3., 15., 16., ..., 11.,  1., 12.],
       [10., 13., 10., ...,  9.,  3., 14.],
       [ 9.,  0., 10., ..., 19.,  2.,  2.]])

In [None]:
#Creating the Neural Network

#logistic model
class ModelLog(nn.Module):
    def __init__(self, n_input_features):
        super(ModelLog, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)
        
        self.linear = nn.Linear(n_input_features, 1)

    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred

# NN sample
class NeuralNet1(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet1, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, 1)  
    
    def forward(self, x):
        out = self.linear1(x)
        out = self.relu(out)
        out = self.linear2(out)
        # sigmoid at the end
        y_pred = torch.sigmoid(out)
        return y_pred

# Our method
class Model1(nn.Module):
    def __init__(self, embedding_vec):
        super(Model1, self).__init__()
        self.embedding = nn.Embedding(len(asam), embedding_vec) 
        self.flatten = nn.Flatten()
        self.batchNorm1 = nn.BatchNorm1d(embedding_vec*n_features)
        self.linear1 = nn.Linear(embedding_vec*n_features, embedding_vec*n_features)
        #self.batchNorm2 = nn.BatchNorm1d(embedding_vec*n_features)
        self.output = nn.Linear(embedding_vec*n_features, 2)  
    
    def forward(self, x):
        out = self.embedding(x)
        out = self.flatten(out)
        out = self.batchNorm1(out)
        out = self.linear1(out)        
        #out = self.batchNorm2(out)
        y_pred = self.output(out)
        # sigmoid at the end
        #y_pred = torch.sigmoid(out)
        return y_pred

# Our method
class Model2(nn.Module):
    def __init__(self, embedding_vec):
        super(Model2, self).__init__()
        self.embedding = nn.Embedding(len(asam), embedding_vec) 
        self.flatten = nn.Flatten()
        self.batchNorm1 = nn.BatchNorm1d(embedding_vec*n_features)
        self.linear1 = nn.Linear(embedding_vec*n_features, embedding_vec*n_features)
        #self.batchNorm2 = nn.BatchNorm1d(embedding_vec*n_features)
        self.output = nn.Linear(embedding_vec*n_features, 1)  
    
    def forward(self, x):
        out = self.embedding(x)
        out = self.flatten(out)
        out = self.batchNorm1(out)
        out = self.linear1(out)        
        #out = self.batchNorm2(out)
        out = self.output(out)
        # sigmoid at the end
        y_pred = torch.sigmoid(out)
        return y_pred

# CNN
class ModelCNN(nn.Module):
    def __init__(self):
        super(ModelCNN, self).__init__()
        self.embedding = nn.Embedding(len(asam), 21) 
        self.conv1 = nn.Conv2d(1, 64, (9,3)) 
        self.conv2 = nn.Conv2d(64, 128, (9,3), padding = (4,1))
        self.drop1 = nn.Dropout(p=0.6)
        self.drop2 = nn.Dropout(p=0.6)
        self.drop3 = nn.Dropout(p=0.5)
        self.drop4 = nn.Dropout(p=0.5)
        self.pool = nn.MaxPool2d(2, 2)
        self.flatten = nn.Flatten()

        self.linear1 = nn.Linear(5760, 768) 
        self.linear2 = nn.Linear(768, 256) 
        self.linear3 = nn.Linear(256, 2) 
    
    def forward(self, x):
        out = self.embedding(x)
        out = out.view(out.shape[0], 1, out.shape[1], out.shape[2]) #channel, add dim, seq, emd vec

        out = self.conv1(out)
        out = F.relu(out)
        out = self.drop1(out)
        
        out = self.conv2(out)
        out = F.relu(out)
        out = self.drop2(out)
        
        out = self.pool(out)
        out = self.flatten(out)
        
        out = self.linear1(out)
        out = self.drop3(out)
        out = self.linear2(out)
        out = self.drop4(out)

        y_pred = self.linear3(out)
        return y_pred

# LSTM
class ModelLSTM(nn.Module):
    def __init__(self):
        super(ModelLSTM, self).__init__()
        self.embedding = nn.Embedding(len(asam), 21) 

        self.lstm1 = nn.LSTM(21, 128, 1)
        self.lstm2 = nn.LSTM(128, 64, 1)
        self.drop1 = nn.Dropout(p=0.5)
        #self.betterlstm = LSTM(128, 64, dropoutw=0.2)
        self.linear1 = nn.Linear(64, 32) 
        self.linear2 = nn.Linear(32, 2) 

    def forward(self, x):
        out = self.embedding(x)
        out = out.permute(1,0,2)
        out = self.lstm1(out)
        out = self.lstm2(out[0])
        out = self.drop1(out[0])
        #out = self.betterlstm(out[0])
        #pdb.set_trace()
        #out = out[0]
        #out = self.linear1(out[0][out[0].shape[0]-1])
        out = self.linear1(out[out.shape[0]-1])
        y_pred = self.linear2(out)
        return y_pred

# Neo
class ModelNeo(nn.Module):
    def __init__(self):
        super(ModelNeo, self).__init__()
        self.embedding = nn.Embedding(len(asam), 21)

        self.lstm1 = nn.LSTM(21, 64, 1)
        self.lstm2 = nn.LSTM(64, 64, 1)
        self.lstm3 = nn.LSTM(64, 64, 1)
        self.lstm4 = nn.LSTM(64, 64, 1)
        self.batchnormL1 = nn.BatchNorm1d(64)
        self.batchnormL2 = nn.BatchNorm1d(64)
        self.batchnormL3 = nn.BatchNorm1d(64)
        self.batchnormL4 = nn.BatchNorm1d(64)
        self.dropL1 = nn.Dropout(p=0.5)
        self.dropL2 = nn.Dropout(p=0.5)
        self.dropL3 = nn.Dropout(p=0.5)
        self.dropL4 = nn.Dropout(p=0.5)
        self.linearL1 = nn.Linear(64, 32) 

        self.conv1 = nn.Conv2d(1, 64, (3,21), padding = (1,0)) 
        self.conv2 = nn.Conv2d(64, 64, (3,1), padding = (1,0)) 
        self.conv3 = nn.Conv2d(64, 64, (3,1), padding = (1,0)) 
        self.conv4 = nn.Conv2d(64, 64, (3,1), padding = (1,0))
        self.batchnormC1 = nn.BatchNorm2d(64)
        self.batchnormC2 = nn.BatchNorm2d(64)
        self.batchnormC3 = nn.BatchNorm2d(64)
        self.batchnormC4 = nn.BatchNorm2d(64)
        self.dropC1 = nn.Dropout(p=0.5)
        self.dropC2 = nn.Dropout(p=0.5)
        self.dropC3 = nn.Dropout(p=0.5)
        self.dropC4 = nn.Dropout(p=0.5)
        #self.pool = nn.MaxPool2d(2, 2)
        self.flatten = nn.Flatten()
        self.linearC1 = nn.Linear(1216, 32) 
        
        self.linear2 = nn.Linear(32, 2) 

    def forward(self, x):
        emb = self.embedding(x)
        
        #pdb.set_trace()
        lstm = emb.permute(1,0,2)
        
        lstm = self.lstm1(lstm)
        # lstm = lstm[0].permute(1,2,0)
        # lstm = self.batchnormL1(lstm)
        # lstm = lstm.permute(2,0,1)
        lstm1 = self.dropL1(lstm[0])

        lstm = self.lstm2(lstm1)
        # lstm = lstm[0].permute(1,2,0)
        # lstm = self.batchnormL2(lstm)
        # lstm = lstm.permute(2,0,1)
        lstm = self.dropL2(lstm[0])
        #lstm2 = lstm + lstm1

        # lstm = self.lstm3(lstm2)
        # lstm = lstm[0].permute(1,2,0)
        # lstm = self.batchnormL3(lstm)
        # lstm = lstm.permute(2,0,1)
        # lstm = self.dropL3(lstm)
        # lstm3 = lstm + lstm2

        # lstm = self.lstm4(lstm3)
        # lstm = lstm[0].permute(1,2,0)
        # lstm = self.batchnormL4(lstm)
        # lstm = lstm.permute(2,0,1)
        # lstm = self.dropL4(lstm)
        # lstm4 = lstm + lstm3
        #lstm = lstm2
        
        lstm = self.linearL1(lstm[lstm.shape[0]-1])
        
        cnn = emb.view(emb.shape[0], 1, emb.shape[1], emb.shape[2]) #channel, add dim, seq, emd vec
        
        cnn = self.conv1(cnn)
        cnn = F.relu(cnn)
        cnn = self.batchnormC1(cnn)
        cnn1 = self.dropC1(cnn)

        cnn = self.conv2(cnn1)
        cnn = F.relu(cnn)
        cnn = self.batchnormC2(cnn)
        cnn = self.dropC2(cnn)
        cnn2 = cnn + cnn1

        cnn = self.conv3(cnn2)
        cnn = F.relu(cnn)
        cnn = self.batchnormC3(cnn)
        cnn = self.dropC3(cnn)
        cnn3 = cnn + cnn2

        cnn = self.conv4(cnn3)
        cnn = F.relu(cnn)
        cnn = self.batchnormC4(cnn)
        cnn = self.dropC4(cnn)
        cnn4 = cnn + cnn3
        

        #cnn = self.pool(cnn)
        cnn = self.flatten(cnn4)
        cnn = self.linearC1(cnn)

        out = cnn
        y_pred = self.linear2(out)
        return y_pred

In [None]:
#Training the data

#model = ModelLog(n_features)
#model = NeuralNet1(input_size=n_features, hidden_size=5)

#Choose model
model_which = 'modelNeo' #'model1' 'modelCNN' 'modelLSTM'

#adjust input output type
if (model_which == 'model1'):
  inputType = np.int
  output_onehot = False
  output_type = np.int

elif (model_which == 'model2'):
  inputType = np.int
  output_onehot = False
  output_type = np.float32
  
elif (model_which == 'modelCNN'):
  inputType = np.int
  output_onehot = False
  output_type = np.int  

elif (model_which == 'modelLSTM'):
  inputType = np.int
  output_onehot = False
  output_type = np.int
  
elif (model_which == 'modelNeo'):
  inputType = np.int
  output_onehot = False
  output_type = np.int

else:
  raise Exception("Sorry, no model named that")


#Shuffle Dataset
if (token):
  X_train, y_train = shuffle(dataset_X_token, dataset_Y, random_state=13)
  X_val, y_val = shuffle(dataset_X_token_val, dataset_Y_val, random_state=13)
  X_val_eq, y_val_eq = shuffle(dataset_X_token_val_eq, dataset_Y_val_eq, random_state=13)
  X_test, y_test = shuffle(dataset_X_token_test, dataset_Y_test, random_state=13)
else:
  X_train, y_train = shuffle(dataset_X, dataset_Y, random_state=13)
  X_val, y_val = shuffle(dataset_X_val, dataset_Y_val, random_state=13)
  X_val_eq, y_val_eq = shuffle(dataset_X_val_eq, dataset_Y_val_eq, random_state=13)
  X_test, y_test = shuffle(dataset_X_test, dataset_Y_test, random_state=13)

#convert X vars to torch
X_train_torch = torch.from_numpy(X_train.astype(inputType)).cuda()
X_val_torch = torch.from_numpy(X_val.astype(inputType)).cuda()
X_val_eq_torch = torch.from_numpy(X_val_eq.astype(inputType)).cuda()
X_test_torch = torch.from_numpy(X_test.astype(inputType)).cuda()

#adjust Y vars size
output_size = 1
if (output_onehot):
  temp1 = np.expand_dims(y_train, axis=0).reshape(y_train.shape[0],1)
  temp1 = temp1.reshape(y_train.shape[0],1)
  temp2 = np.expand_dims(y_train, axis=0)*-1+1
  temp2 = temp2.reshape(y_train.shape[0],1)
  y_train = np.concatenate((temp1, temp2), axis=1, out=None)

  temp1 = np.expand_dims(y_val, axis=0).reshape(y_val.shape[0],1)
  temp1 = temp1.reshape(y_val.shape[0],1)
  temp2 = np.expand_dims(y_val, axis=0)*-1+1
  temp2 = temp2.reshape(y_val.shape[0],1)
  y_val = np.concatenate((temp1, temp2), axis=1, out=None)
  
  temp1 = np.expand_dims(y_val_eq, axis=0).reshape(y_val_eq.shape[0],1)
  temp1 = temp1.reshape(y_val_eq.shape[0],1)
  temp2 = np.expand_dims(y_val_eq, axis=0)*-1+1
  temp2 = temp2.reshape(y_val_eq.shape[0],1)
  y_val_eq = np.concatenate((temp1, temp2), axis=1, out=None)

  temp1 = np.expand_dims(y_test, axis=0).reshape(y_test.shape[0],1)
  temp1 = temp1.reshape(y_test.shape[0],1)
  temp2 = np.expand_dims(y_test, axis=0)*-1+1
  temp2 = temp2.reshape(y_test.shape[0],1)
  y_test = np.concatenate((temp1, temp2), axis=1, out=None)

  output_size = 2

#convert Y vars to torch
y_train_torch = torch.from_numpy(y_train.astype(output_type)).cuda()
y_val_torch = torch.from_numpy(y_val.astype(output_type)).cuda()
y_val_eq_torch = torch.from_numpy(y_val_eq.astype(output_type)).cuda()
y_test_torch = torch.from_numpy(y_test.astype(output_type)).cuda()

#reshape for model 2(sigmoid out)
if (model_which == 'model2'):
  y_train_torch = y_train_torch.view(y_train_torch.shape[0], output_size).cuda()
  y_val_torch = y_val_torch.view(y_val_torch.shape[0], output_size).cuda()
  y_val_eq_torch = y_val_eq_torch.view(y_val_eq_torch.shape[0], output_size).cuda()
  y_test_torch = y_test_torch.view(y_test_torch.shape[0], output_size).cuda()

n_samples, n_features = X_train_torch.shape

#model hyper parameter
if (model_which == 'model1'):
  embedding_vector = 3
  model = Model1(embedding_vector).cuda()
  num_epochs = 10000
  learning_rate = 0.001
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

elif (model_which == 'model2'):
  embedding_vector = 3
  model = Model2(embedding_vector).cuda()
  num_epochs = 1000
  learning_rate = 0.001
  criterion = nn.BCELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
  
elif (model_which == 'modelCNN'):
  model = ModelCNN().cuda()
  num_epochs = 500
  learning_rate = 0.0001
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  
elif (model_which == 'modelLSTM'):
  model = ModelLSTM().cuda()
  num_epochs = 500
  learning_rate = 0.001
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  
elif (model_which == 'modelNeo'):
  model = ModelNeo().cuda()
  num_epochs = 500
  learning_rate = 0.001
  weight_loss = torch.tensor([0.17, 0.83]).cuda()
  criterion = nn.CrossEntropyLoss()#weight=weight_loss
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  


#Validation
print('train X shape: ', X_train_torch.shape)
print('train Y shape: ', y_train_torch.shape)
print()
print('validation X shape: ', X_val_torch.shape)
print('validation Y shape: ', y_val_torch.shape)
print()
print('test X shape: ', X_test_torch.shape)
print('test Y shape: ', y_test_torch.shape)
print()
print('input sample: ', X_train_torch[0])
print()
print('Model: ', model_which)
if (balanced_data):
  print('Training data: balance')
else:
  print('Training data: imbalance')
print()



train X shape:  torch.Size([6228, 19])
train Y shape:  torch.Size([6228])

validation X shape:  torch.Size([4164, 19])
validation Y shape:  torch.Size([4164])

test X shape:  torch.Size([520, 19])
test Y shape:  torch.Size([520])

input sample:  tensor([16, 12, 15, 15, 15, 16,  6, 19, 15,  1,  4,  9,  0,  8, 10,  8,  1, 16,
         6], device='cuda:0')

Model:  modelNeo
Training data: imbalance



In [None]:
valid_best_loss = np.inf
best_epoch = 0
#Training loop
for epoch in range(num_epochs):
    # Forward pass and loss
    y_pred = model(X_train_torch)
    loss = criterion(y_pred, y_train_torch)

    # Backward pass and update
    loss.backward()
    optimizer.step()

    # zero grad before new step
    optimizer.zero_grad()

    # save best epoch
    if (loss.item() < valid_best_loss):
      valid_best_loss = loss.item()
      best_epoch = epoch
      torch.save(model.state_dict(), "best_model.pth")

    # epoch progress 
    if (epoch+1) % (num_epochs/10) == 0:
        if (model_which == 'model2'): #if use sigmoid
          y_pred_cls = y_pred.round()
          acc = y_pred_cls.eq(y_train_torch).sum() / float(y_train_torch.shape[0])
        else:
          acc = torch.max(y_pred, 1)[1].eq(y_train_torch).sum() / float(y_train_torch.shape[0])         


        with torch.no_grad():
          y_predicted_balanced = model(X_val_eq_torch)
          if (model_which == 'model2'):#if use sigmoid
            y_predicted_cls_balanced = y_predicted_balanced.round()
            accv = y_predicted_cls_balanced.eq(y_val_eq_torch).sum() / float(y_val_eq_torch.shape[0])
          else:
            accv = torch.max(y_predicted_balanced, 1)[1].eq(y_val_eq_torch).sum() / float(y_val_eq_torch.shape[0])
 
          print(f'epoch: {epoch+1}, accuracy = {acc:.4f}, loss = {loss.item():.4f}, val acc = {accv.item():.4f}')

print(f'Best Epoch: {best_epoch}')

epoch: 50, accuracy = 0.8847, loss = 0.3054, val acc = 0.7630
epoch: 100, accuracy = 0.8998, loss = 0.2650, val acc = 0.7648
epoch: 150, accuracy = 0.9041, loss = 0.2496, val acc = 0.7706
epoch: 200, accuracy = 0.9139, loss = 0.2239, val acc = 0.7692
epoch: 250, accuracy = 0.9194, loss = 0.2085, val acc = 0.7688
epoch: 300, accuracy = 0.9180, loss = 0.2008, val acc = 0.7675
epoch: 350, accuracy = 0.9258, loss = 0.1848, val acc = 0.7653
epoch: 400, accuracy = 0.9298, loss = 0.1749, val acc = 0.7657
epoch: 450, accuracy = 0.9353, loss = 0.1616, val acc = 0.7670
epoch: 500, accuracy = 0.9316, loss = 0.1647, val acc = 0.7635
Best Epoch: 477


In [None]:
 #validation
model.load_state_dict(torch.load("best_model.pth"))
with torch.no_grad():
  y_predicted_imbalanced = model(X_val_torch)
  y_predicted_balanced = model(X_val_eq_torch)
  acc1 = torch.max(y_predicted_imbalanced, 1)[1].eq(y_val_torch).sum() / float(y_val_torch.shape[0])
  acc2 = torch.max(y_predicted_balanced, 1)[1].eq(y_val_eq_torch).sum() / float(y_val_eq_torch.shape[0])
  
  f1_score_imbalance = f1_score(torch.max(y_predicted_imbalanced, 1)[1].cpu().numpy(), y_val_torch.cpu().numpy(), average='macro')
  f1_score_balance = f1_score(torch.max(y_predicted_balanced, 1)[1].cpu().numpy(), y_val_eq_torch.cpu().numpy(), average='macro')

  mcc_imbalance = matthews_corrcoef(torch.max(y_predicted_imbalanced, 1)[1].cpu().numpy(), y_val_torch.cpu().numpy())
  mcc_balance = matthews_corrcoef(torch.max(y_predicted_balanced, 1)[1].cpu().numpy(), y_val_eq_torch.cpu().numpy())
  
  fpr, tpr, thresholds = metrics.roc_curve(y_val_torch.cpu().numpy(), torch.max(y_predicted_imbalanced, 1)[1].cpu().numpy(), pos_label=1)
  auc_imbalance = metrics.auc(fpr, tpr)
  fpr, tpr, thresholds = metrics.roc_curve(y_val_eq_torch.cpu().numpy(), torch.max(y_predicted_balanced, 1)[1].cpu().numpy(), pos_label=1)
  auc_balance = metrics.auc(fpr, tpr)

print(f'Validation accuracy (imbalance): {acc1.item():.4f}, F1: {f1_score_imbalance.item():.4f}, mcc: {mcc_imbalance.item():.4f}, auc: {auc_imbalance.item():.4f}')
print(confusion_matrix(torch.max(y_predicted_imbalanced, 1)[1].cpu().numpy(), y_val_torch.cpu().numpy()))
print()
print(f'Validation accuracy (balance): {acc2.item():.4f}, F1: {f1_score_balance.item():.4f}, mcc: {mcc_balance.item():.4f}, auc: {auc_imbalance.item():.4f}')
print(confusion_matrix(torch.max(y_predicted_balanced, 1)[1].cpu().numpy(), y_val_eq_torch.cpu().numpy()))

Validation accuracy (imbalance): 0.8941, F1: 0.8503, mcc: 0.7239, auc: 0.8175
[[2988  396]
 [  45  735]]

Validation accuracy (balance): 0.7666, F1: 0.7532, mcc: 0.6023, auc: 0.8175
[[1130  527]
 [   1  604]]


In [None]:

if (model_which == 'modelCNN'):
  print('modelCNN')
  modelCNN_pred_val_imbalance = nn.Softmax()(y_predicted_imbalanced)
  modelCNN_pred_val_balance = nn.Softmax()(y_predicted_balanced)


elif (model_which == 'modelLSTM'):
  print('modelLSTM')
  modelLSTM_pred_val_imbalance = nn.Softmax()(y_predicted_imbalanced)
  modelLSTM_pred_val_balance = nn.Softmax()(y_predicted_balanced)
else:
  print('not running softmax')

not running softmax


In [None]:

if (modelCNN_pred_val_imbalance!=[]) & (modelLSTM_pred_val_imbalance !=[]):
 print('both executed')
 ensamble_pred_imbalance = (modelLSTM_pred_val_imbalance * 0.17) + (modelCNN_pred_val_imbalance * 0.83)
 ensamble_pred_balance = (modelLSTM_pred_val_balance * 0.17) + (modelCNN_pred_val_balance * 0.83)

 acc1 = torch.max(ensamble_pred_imbalance, 1)[1].eq(y_val_torch).sum() / float(y_val_torch.shape[0])
 acc2 = torch.max(ensamble_pred_balance, 1)[1].eq(y_val_eq_torch).sum() / float(y_val_eq_torch.shape[0])
 f1_score_imbalance = f1_score(torch.max(ensamble_pred_imbalance, 1)[1].cpu().numpy(), y_val_torch.cpu().numpy(), average='macro')
 f1_score_balance = f1_score(torch.max(ensamble_pred_balance, 1)[1].cpu().numpy(), y_val_eq_torch.cpu().numpy(), average='macro')
 
 print()
 print(f'Validation accuracy (imbalance): {acc1.item():.4f}, F1: {f1_score_imbalance.item():.4f}')
 print(confusion_matrix(torch.max(ensamble_pred_imbalance, 1)[1].cpu().numpy(), y_val_torch.cpu().numpy()))
 print(f'Validation accuracy (balance): {acc2.item():.4f}, F1: {f1_score_balance.item():.4f}')
 print(confusion_matrix(torch.max(ensamble_pred_balance, 1)[1].cpu().numpy(), y_val_eq_torch.cpu().numpy()))
 
else:
  print('one of the model or both, have not been executed yet')

one of the model or both, have not been executed yet


In [None]:
#test

#print best model
model.load_state_dict(torch.load("best_model.pth"))
with torch.no_grad():
  y_predicted_test = model(X_test_torch)
  acc_test = torch.max(y_predicted_test, 1)[1].eq(y_test_torch).sum() / float(y_test_torch.shape[0])
  f1_score_test = f1_score(torch.max(y_predicted_test, 1)[1].cpu().numpy(), y_test_torch.cpu().numpy(), average='macro')
  mcc_test = matthews_corrcoef(torch.max(y_predicted_test, 1)[1].cpu().numpy(), y_test_torch.cpu().numpy())
  fpr, tpr, thresholds = metrics.roc_curve(y_test_torch.cpu().numpy(), torch.max(y_predicted_test, 1)[1].cpu().numpy(), pos_label=1)
  auc_test = metrics.auc(fpr, tpr)

print(f'Test accuracy: {acc_test.item():.4f}, F1: {f1_score_test.item():.4f}, mcc: {mcc_test.item():.4f}, auc: {auc_test.item():.4f}')
print(confusion_matrix(torch.max(y_predicted_test, 1)[1].cpu().numpy(), y_test_torch.cpu().numpy()))

Test accuracy: 0.7462, F1: 0.7303, mcc: 0.5628, auc: 0.7462
[[257 129]
 [  3 131]]
