In [None]:
!pip install optuna
import optuna
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.utils.data as utils
import time
import torch.optim as optim
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler, SubsetRandomSampler,Dataset)
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Preprocess Data

In [None]:
def prepare_data(seqs):
    """Tokenize nucleotide sequences character by character.

    Each sequence is upper-cased and every character is mapped to an integer
    id. The vocabulary starts with the fixed entries
    ``<pad>, <unk>, A, T, C, G, N, X`` (ids 0-7); any other character seen is
    appended to the vocabulary with the next free id.

    Args:
        seqs: iterable of strings.

    Returns:
        tuple ``(tokenized_seqs, voc2ind)`` where ``tokenized_seqs`` is a list
        of lists of int ids (one inner list per input sequence, unpadded) and
        ``voc2ind`` is the character -> id dictionary, including any
        characters added while tokenizing.
    """
    voc2ind = {voc: ind for ind, voc in enumerate(
        ['<pad>', '<unk>', 'A', 'T', 'C', 'G', 'N', 'X'])}

    tokenized_seqs = []
    for seq in seqs:
        tokenized_seq = []
        # Upper-case BEFORE iterating: re-binding `seq` inside the loop does
        # not affect an iterator that was already created from the original
        # string, so lowercase bases used to receive their own vocabulary ids.
        for e in seq.upper():
            if e not in voc2ind:
                # The next free id is always the current vocabulary size.
                voc2ind[e] = len(voc2ind)
            tokenized_seq.append(voc2ind[e])
        tokenized_seqs.append(tokenized_seq)

    return tokenized_seqs, voc2ind

def prepare_labels(labels):
    """Encode labels as consecutive integers in order of first appearance.

    Args:
        labels: iterable of hashable label values.

    Returns:
        tuple ``(tokenized_labels, label2token)``: the list of integer ids,
        one per input label, and the label -> id dictionary. The first
        distinct label seen gets id 0, the second id 1, and so on.
    """
    label2token = {}
    tokenized_labels = []
    for label in labels:
        # setdefault only stores the proposed id when the label is new, and
        # the proposed id (current dict size) is the next unused integer.
        token = label2token.setdefault(label, len(label2token))
        tokenized_labels.append(token)
    return tokenized_labels, label2token

def pad(tokenized_seqs, voc2ind):
    """Right-pad token lists to a common length and stack them into an array.

    Args:
        tokenized_seqs: list of lists of token ids.
        voc2ind: vocabulary dictionary; its ``'<pad>'`` entry is the fill id.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with one row per sequence, each
        row extended on the right with the pad id up to the longest length.
    """
    # Length of the longest sequence (0 when there are no sequences).
    longest = max((len(tokens) for tokens in tokenized_seqs), default=0)

    padded_seqs = [
        tokens + [voc2ind['<pad>']] * (longest - len(tokens))
        for tokens in tokenized_seqs
    ]
    return np.array(padded_seqs, dtype=np.float32)

def data_loader(train_inputs, val_inputs, train_labels, val_labels,
                batch_size=128):
    """Wrap train/validation arrays in PyTorch ``DataLoader`` objects.

    Every argument is converted with ``torch.tensor``. The training loader
    draws batches through a ``RandomSampler``; the validation loader walks
    its data in order through a ``SequentialSampler``.

    Args:
        train_inputs, val_inputs: array-likes of model inputs.
        train_labels, val_labels: array-likes of targets.
        batch_size: number of samples per batch for both loaders.

    Returns:
        tuple ``(train_dataloader, val_dataloader)``.
    """
    x_train = torch.tensor(train_inputs)
    x_val = torch.tensor(val_inputs)
    y_train = torch.tensor(train_labels)
    y_val = torch.tensor(val_labels)

    train_set = TensorDataset(x_train, y_train)
    val_set = TensorDataset(x_val, y_val)

    train_dataloader = DataLoader(
        train_set, sampler=RandomSampler(train_set), batch_size=batch_size)
    val_dataloader = DataLoader(
        val_set, sampler=SequentialSampler(val_set), batch_size=batch_size)

    return train_dataloader, val_dataloader

def preprocess(filenames, test_size = 0.2, batch_size = 64, random_state = 42):
  """Load the spreadsheet, tokenize sequences/labels and build data loaders.

  Steps: read the Excel file, drop every column whose name contains ``'1'``,
  rename the last remaining column to ``'Class'``, keep only ``GenBarcode``
  (renamed ``Gene``) and ``Class``, discard rows whose class is ``'Missing'``
  or ``'Undetermined'`` and rows with missing values, then tokenize, pad,
  and make a stratified train/test split.

  Args:
      filenames: path (or anything ``pd.read_excel`` accepts) of the workbook.
      test_size: fraction of samples held out for the test loader.
      batch_size: batch size of both loaders (default 64, the value that was
          previously hard-coded).
      random_state: seed of the stratified split (default 42, as before).

  Returns:
      tuple ``(tokenized_seqs, voc2ind, tokenized_labels, train_dataloader,
      test_dataloader)`` where ``tokenized_seqs`` is the padded array of all
      sequences and ``tokenized_labels`` the integer labels of all samples.
  """
  df = pd.read_excel(filenames)
  # Remove all columns whose header contains the character '1'.
  df = df[df.columns.drop(list(df.filter(regex='1')))]
  # Rename the last column to 'Class'. set_axis(..., inplace=True) is not
  # used because the `inplace` argument was removed in pandas 2.0.
  df = df.set_axis([*df.columns[:-1], 'Class'], axis=1)
  df = df[['GenBarcode', 'Class']].rename(columns={'GenBarcode': 'Gene'})
  # Discard unusable class labels, then any row with a missing value.
  df = df[~df['Class'].isin(['Missing', 'Undetermined'])].dropna()
  seqs = df.Gene.values
  labels = df.Class.values

  tokenized_seqs, voc2ind = prepare_data(seqs)
  tokenized_seqs = pad(tokenized_seqs, voc2ind)

  tokenized_labels, label2token = prepare_labels(labels)

  # Stratify so both splits keep the class proportions of the full data set.
  train_inputs, test_inputs, train_labels, test_labels = train_test_split(
      tokenized_seqs, tokenized_labels, test_size=test_size,
      random_state=random_state, stratify=tokenized_labels)
  train_dataloader, test_dataloader = data_loader(train_inputs, test_inputs,
                                                  train_labels, test_labels,
                                                  batch_size=batch_size)
  return tokenized_seqs, voc2ind, tokenized_labels, train_dataloader, test_dataloader

# Build Model

In [None]:
class BiLSTM(nn.Module):
    """Embedding -> bidirectional LSTM -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of rows of the embedding table.
        input_dim: embedding width, which is also the LSTM input size.
        hidden_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        output_dim: number of sigmoid outputs per sample.
    """

    def __init__(self, vocab_size,input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.feature_size = input_dim
        self.encoder = nn.Embedding(self.vocab_size, self.feature_size)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional = True)
        # Both directions are concatenated, hence 2 * hidden_dim features.
        self.fc = nn.Linear(2*hidden_dim, output_dim)

    def forward(self, x):
        """Return sigmoid scores of shape ``(batch, output_dim)``.

        Args:
            x: token ids as a ``(batch, seq_len)`` tensor or nested list;
               values are cast to ``int64`` before the embedding lookup.
        """
        # as_tensor avoids the copy (and the "To copy construct from a
        # tensor" UserWarning) that torch.tensor() causes when x already is a
        # tensor, and places the ids on the same device as the model.
        x = torch.as_tensor(x, device=self.encoder.weight.device).to(torch.int64)
        x = self.encoder(x)
        # nn.LSTM uses zero initial hidden/cell states when none are given,
        # created on the input's device, so no manual h0/c0 is required.
        out, _ = self.lstm(x)
        # Only the last time step is classified; slicing first avoids running
        # the linear layer and sigmoid over every position.
        # NOTE(review): with right-padded batches the last step is usually a
        # pad position, and the reverse direction has processed only that one
        # step there - consider packed sequences or the final hidden states.
        return torch.sigmoid(self.fc(out[:, -1]))

# Hyperparameter Tuning

In [None]:
def build_model(params):
  """Instantiate a single-output BiLSTM for one hyperparameter configuration.

  Reads the notebook globals ``voc2ind`` (its size is the vocabulary size)
  and ``feature`` (passed as the embedding / LSTM input width).

  Args:
      params: dict with ``'hidden'`` (hidden size per direction) and
          ``'lstm'`` (number of stacked LSTM layers).

  Returns:
      A freshly initialised ``BiLSTM`` with ``output_dim=1``.
  """
  return BiLSTM(
      vocab_size=len(voc2ind),
      input_dim=feature,
      hidden_dim=params['hidden'],
      num_layers=params['lstm'],
      output_dim=1,
  )

def train_model(params, model, num_epochs=100, dataloader=None):
  """Train ``model`` with binary cross-entropy and return its best epoch loss.

  Args:
      params: dict whose ``'optimizer'`` entry names a class in
          ``torch.optim`` (e.g. ``"Adam"``); it is built with ``lr=0.001``.
      model: module mapping a batch of inputs to probabilities of shape
          ``(batch, 1)``.
      num_epochs: number of passes over the data (default 100, the value
          that was previously hard-coded).
      dataloader: iterable of ``(inputs, labels)`` batches. Defaults to the
          notebook-level ``train_dataloader``.

  Returns:
      The smallest per-epoch mean training loss observed.

  Raises:
      ValueError: if ``num_epochs`` is below 1 or the loader yields no samples.
  """
  if num_epochs < 1:
    raise ValueError("num_epochs must be at least 1")
  batches = train_dataloader if dataloader is None else dataloader

  criterion = torch.nn.BCELoss(reduction='mean')
  optimizer = getattr(optim, params['optimizer'])(model.parameters(), lr=0.001)
  hist = np.zeros(num_epochs)

  model.train()
  for t in range(num_epochs):
    loss_sum, n_seen = 0.0, 0
    for x_train, y_train in batches:
      # BCELoss needs float targets shaped like the (batch, 1) output.
      y_train = y_train.unsqueeze(1).float()

      optimizer.zero_grad()
      loss = criterion(model(x_train), y_train)
      loss.backward()
      optimizer.step()

      # Weight each batch mean by its size so a short final batch does not
      # distort the epoch average.
      batch_n = y_train.size(0)
      loss_sum += loss.item() * batch_n
      n_seen += batch_n
    if n_seen == 0:
      raise ValueError("train_model received an empty dataloader")
    # Record the epoch mean rather than only the last mini-batch's loss,
    # which is a noisy estimate of how well the configuration trains.
    hist[t] = loss_sum / n_seen

  # Taken once after training: `hist` starts as zeros, so a minimum computed
  # mid-training would be dominated by the still-unfilled entries.
  return hist.min()

def objective(trial):
    """Optuna objective: build and train one configuration, return its loss.

    Args:
        trial: ``optuna.trial.Trial`` used to draw the optimizer name, the
            number of LSTM layers and the hidden size.

    Returns:
        The value returned by ``train_model`` for the sampled configuration;
        the study minimises it.
    """
    params = {
        'optimizer': trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"]),
        # `step` is passed by keyword: it is keyword-only in current Optuna
        # releases and passing it positionally is deprecated.
        'lstm': trial.suggest_int("lstm", 1, 3, step=1),
        'hidden': trial.suggest_int("hidden", 8, 32, step=8),
    }

    model = build_model(params)
    return train_model(params, model)

In [None]:
# Load the data set and build the train/test loaders.
(tokenized_seqs, voc2ind, tokenized_labels,
 train_dataloader, test_dataloader) = preprocess('pf_drug_pyrim.xlsx')

# Padded sequence length; build_model passes it on as the embedding width.
feature = tokenized_seqs.shape[1]
num_epochs = 100

# Search grid: 3 optimizers x 3 depths x 3 hidden sizes = 27 combinations.
params = dict(
    optimizer=["Adam", "RMSprop", "SGD"],
    lstm=[1, 2, 3],
    hidden=[8, 16, 32],
)

study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.GridSampler(params),
)
# One trial per grid point.
study.optimize(objective, n_trials=27)

[32m[I 2022-10-09 03:34:24,303][0m A new study created in memory with name: no-name-cac4dfa7-5458-43ba-a0df-5d7ffbc7d6e0[0m
  
[32m[I 2022-10-09 03:51:06,803][0m Trial 0 finished with value: 0.06998824328184128 and parameters: {'optimizer': 'SGD', 'lstm': 2, 'hidden': 8}. Best is trial 0 with value: 0.06998824328184128.[0m
[32m[I 2022-10-09 04:17:03,017][0m Trial 1 finished with value: 0.0014208841603249311 and parameters: {'optimizer': 'Adam', 'lstm': 3, 'hidden': 16}. Best is trial 1 with value: 0.0014208841603249311.[0m
[32m[I 2022-10-09 04:28:17,348][0m Trial 2 finished with value: 0.005149521864950657 and parameters: {'optimizer': 'RMSprop', 'lstm': 1, 'hidden': 16}. Best is trial 1 with value: 0.0014208841603249311.[0m
[32m[I 2022-10-09 04:37:13,998][0m Trial 3 finished with value: 0.007728338707238436 and parameters: {'optimizer': 'RMSprop', 'lstm': 1, 'hidden': 8}. Best is trial 1 with value: 0.0014208841603249311.[0m
[32m[I 2022-10-09 05:01:51,854][0m Trial 4 

In [None]:
# Keep only the sampled parameters and the objective value of every trial,
# ordered by layers, optimizer and hidden size, with a fresh 0..n-1 index.
hyperparameter_tuning = (
    study.trials_dataframe()[['params_lstm', 'params_optimizer', 'params_hidden', 'value']]
    .sort_values(by=['params_lstm', 'params_optimizer', 'params_hidden'])
    .reset_index(drop=True)
)
hyperparameter_tuning.head()

Unnamed: 0,params_lstm,params_optimizer,params_hidden,value
0,1,Adam,8,0.009201
1,1,Adam,16,0.010367
2,1,Adam,32,0.001564
3,1,RMSprop,8,0.007728
4,1,RMSprop,16,0.00515


In [None]:
# Save the sorted tuning table to an Excel workbook in the working directory.
hyperparameter_tuning.to_excel('Hyperparameter Tuning.xlsx')