<a href="https://colab.research.google.com/github/aneesh2711/Bioinformatics/blob/master/Keras_Multi_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from numpy import inf
import math

from sklearn.metrics import f1_score
import tensorflow
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Ask TensorFlow for the name of the GPU device it can see; the notebook
# refuses to continue unless that name is exactly the first GPU.
device_name = tensorflow.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# Mount Google Drive under /content/drive so the data files can be read
# through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')
# Root directory of the input data; later cells build CSV paths by appending
# to this string.
data_fn = "/content/drive/My Drive/Colab Notebooks/Protien_prediction/data"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Identifier of the term whose '<branch_term>_train/_valid/_test.csv' label
# files are loaded during training; it is also passed through the search space.
branch_term = "GO0071840"

In [None]:
def oversampleData(tasks,train_set):
  """Balance every task's training data by repeating minority-class rows.

  Args:
    tasks: iterable of task keys present in ``train_set``.
    train_set: mapping ``task -> [features, labels]`` where ``features`` is a
      2-D array (one row per sample) and ``labels`` is array-like with one
      label per row. Only rows labelled 1 (positive) or 0 (negative) are
      kept; rows with any other label are dropped.

  Returns:
    dict ``task -> [features, labels]``. All positive rows come first,
    followed by all negative rows, and ``labels`` is an integer array of
    matching 1s and 0s. When both classes are present, the smaller class is
    cycled (row 0, 1, ..., then row 0 again) until it is as long as the
    larger one. When a class has no rows there is nothing to replicate, so
    the task is returned un-replicated instead of failing with a division
    by zero.
  """
  train_set_final = {}
  for task in tasks:
    features = train_set[task][0]
    labels = np.asarray(train_set[task][1]).ravel()
    data_pos = features[labels == 1, :]
    data_neg = features[labels == 0, :]
    pos_len = len(data_pos)
    neg_len = len(data_neg)

    # Replication is only possible when the minority class has at least one
    # row to copy; a single-class task is left at its natural size.
    if pos_len > 0 and neg_len > 0:
      target_len = max(pos_len, neg_len)
      # Modular indexing cycles through the rows in order, which matches
      # "repeat the block, then truncate" and is the identity for the
      # class that is already the larger one.
      data_pos = data_pos[np.arange(target_len) % pos_len]
      data_neg = data_neg[np.arange(target_len) % neg_len]

    balanced_labels = np.concatenate((np.ones(len(data_pos), dtype=int),
                                      np.zeros(len(data_neg), dtype=int)))
    train_set_final[task] = [np.concatenate((data_pos, data_neg)), balanced_labels]
  return train_set_final

In [None]:
# Candidate layer widths; each of the three per-depth choices below draws
# independently from the same list.
shared_width_options = [256, 512, 768, 1024]
spec_width_options = [64, 128, 256, 512]

# Hyperopt search space consumed by the objective function. Entries that are
# plain values (branch_term, loss) are constants passed through unchanged.
space = {
  # Network depth.
  'shared_layers_num': hp.choice('shared_layers_num', [1, 2, 3]),
  'spec_layers_num': hp.choice('spec_layers_num', [1, 2, 3]),
  # Dropout rates.
  'dropout_i': hp.uniform('dropout_i', .05, .95),
  'dropout_o': hp.uniform('dropout_o', .05, .95),
  # Width of each shared layer.
  'shared_hidden_number_1': hp.choice('shared_hidden_number_1', shared_width_options),
  'shared_hidden_number_2': hp.choice('shared_hidden_number_2', shared_width_options),
  'shared_hidden_number_3': hp.choice('shared_hidden_number_3', shared_width_options),
  # Width of each task-specific layer.
  'spec_hidden_number_1': hp.choice('spec_hidden_number_1', spec_width_options),
  'spec_hidden_number_2': hp.choice('spec_hidden_number_2', spec_width_options),
  'spec_hidden_number_3': hp.choice('spec_hidden_number_3', spec_width_options),
  # Training setup.
  'batch_size': hp.choice('batch_size', [32, 64, 128, 256]),
  'optimizer': hp.choice('optimizer', ['Nadam', 'Adagrad', 'Adadelta', 'Adam', 'Rmsprop']),
  'branch_term': branch_term,
  'loss': 'binary_crossentropy',
  'oversample': hp.choice('oversample', [True,False]),
}

In [None]:

def _load_split(all_data_x, branch_term, split):
  """Load the label table of one split and align it with the feature table.

  ``split`` is the file-name suffix ('train', 'valid' or 'test'). Only
  proteins that also appear in ``all_data_x`` are kept. Returns ``(x, y)``
  DataFrames with the same row order.
  """
  split_fn = data_fn + '/train_sets/' + branch_term + '_' + split + '.csv'
  split_y = pd.read_csv(split_fn, sep='\t', header=0, index_col=0)
  proteins = [p for p in split_y.index if p in all_data_x.index]
  return all_data_x.loc[proteins, :], split_y.loc[proteins, :]


def _task_sets(split_x, split_y, tasks):
  """Build ``task -> [features, labels]`` arrays, skipping proteins whose label for the task is inf."""
  sets = {}
  for task in tasks:
    proteins_in_task = split_y.index[split_y.loc[:, task] != inf]
    sets[task] = [split_x.loc[proteins_in_task, :].values,
                  split_y.loc[proteins_in_task, task].values]
  return sets


def _best_f1_threshold(test_labels, predict_labels):
  """Scan thresholds 0.01 .. 0.99 and return ``(Fmax, T)``; T is the best threshold in percent."""
  Fmax = 0
  T = 0
  for i in range(1, 100):
    f1 = f1_score(test_labels, predict_labels > i/100)
    if f1 > Fmax:
      Fmax = f1
      T = i
  return Fmax, T


def fun(params):
  """Hyperopt objective: train one multi-task network and score it by Fmax.

  A stack of shared Dense layers feeds one task-specific head per label
  column; every head is wrapped in its own Keras model over the same input.
  Training runs in two passes over all tasks: first with every layer
  trainable, then again with the shared layers frozen.

  Args:
    params: one sample from the search space. Keys read here are
      'shared_layers_num', 'spec_layers_num', 'shared_hidden_number_1..3',
      'spec_hidden_number_1..3', 'dropout_i', 'dropout_o', 'optimizer',
      'loss', 'batch_size', 'oversample' and 'branch_term'.

  Returns:
    dict with 'loss' (negative Fmax, so minimising it maximises F1), 'tres'
    (the best threshold as an integer percentage) and 'status'.
  """
  # Imported locally so this cell does not depend on how tensorflow was
  # imported elsewhere in the notebook.
  from tensorflow.keras import backend as keras_backend

  shared_layers_num = params['shared_layers_num']
  spec_layers_num = params['spec_layers_num']
  shared_layers_units = [params['shared_hidden_number_1'],params['shared_hidden_number_2'],params['shared_hidden_number_3']]
  spec_layers_units = [params['spec_hidden_number_1'],params['spec_hidden_number_2'],params['spec_hidden_number_3']]
  dropout_i = params['dropout_i']
  dropout_o = params['dropout_o']
  optimizer = params['optimizer']
  loss = params['loss']
  metrics = ['accuracy']
  batch_size = params['batch_size']
  oversample = params['oversample']
  branch_term = params['branch_term']

  # Models from earlier trials are never used again; release the global
  # Keras state they hold so memory does not grow with every evaluation.
  keras_backend.clear_session()

  # ---- data -------------------------------------------------------------
  all_data_x = pd.read_csv(data_fn + '/all_data_X.csv', sep='\t', header=0, index_col=0)
  # Index entries may carry literal double quotes; strip them so they can be
  # matched against the label tables.
  all_data_x.index = [p.replace('"', '') for p in all_data_x.index]

  train_x, train_y = _load_split(all_data_x, branch_term, 'train')
  validate_x, validate_y = _load_split(all_data_x, branch_term, 'valid')

  tasks = train_y.columns
  train_set = _task_sets(train_x, train_y, tasks)
  validate_set = _task_sets(validate_x, validate_y, tasks)

  train_set_final = train_set
  if oversample:
    train_set_final = oversampleData(tasks,train_set)

  # ---- model ------------------------------------------------------------
  # Take the input width from the data instead of hard-coding it, so the
  # network always matches the feature table that was actually loaded.
  inputs = Input(shape=(train_x.shape[1],))
  x = Dropout(dropout_i)(inputs)
  for i in range(shared_layers_num):
    x = Dense(shared_layers_units[i], activation='relu', name="shared-"+str(i))(x)
    x = Dropout(dropout_o)(x)

  models = {}
  for task in tasks:
    # Every task gets at least one specific layer on top of the shared trunk.
    y = Dense(spec_layers_units[0], activation='relu', name=task+"-spec-0")(x)
    y = Dropout(dropout_o)(y)
    for i in range(1, spec_layers_num):
      y = Dense(spec_layers_units[i], activation='relu', name=task+"-spec-"+str(i))(y)
      y = Dropout(dropout_o)(y)
    output = Dense(1, activation='sigmoid', name=task)(y)
    models[task] = Model(inputs=inputs, outputs=output, name=task)

  # ---- training ---------------------------------------------------------
  # NOTE(review): early stopping watches the *training* loss although
  # validation data is passed to fit(); confirm whether 'val_loss' was meant.
  callback = EarlyStopping(monitor='loss', patience=3)

  def train_all_tasks():
    """(Re)compile every task model, then fit the tasks one after another."""
    for task in tasks:
      models[task].compile(optimizer=optimizer, loss=loss, metrics=metrics)
    for task in tasks:
      models[task].fit(train_set_final[task][0], train_set_final[task][1],
                       epochs=50, batch_size=batch_size, verbose=0,
                       callbacks=[callback],
                       validation_data=(validate_set[task][0], validate_set[task][1]))

  # Pass 1: all layers trainable.
  train_all_tasks()

  # Pass 2: freeze the shared layers and train again. The models must be
  # recompiled for the changed `trainable` flags to take effect.
  for task in tasks:
    for i in range(shared_layers_num):
      models[task].get_layer("shared-"+str(i)).trainable = False
  train_all_tasks()

  # ---- evaluation -------------------------------------------------------
  # NOTE(review): the objective returned to hyperopt is computed on the test
  # split, so the reported score is not an unbiased estimate; confirm this is
  # intended.
  test_x, test_y = _load_split(all_data_x, branch_term, 'test')
  test_set = _task_sets(test_x, test_y, tasks)

  # Pool labels and predictions of all tasks into two flat vectors. Starting
  # from empty lists keeps the pooled arrays in NumPy's default float dtype.
  test_labels = []
  predict_labels = []
  for task in tasks:
    task_x, task_y = test_set[task]
    predictions = models[task].predict(task_x)
    test_labels = np.concatenate((test_labels, task_y))
    predict_labels = np.concatenate((predict_labels, predictions.reshape(len(predictions))))

  Fmax, T = _best_f1_threshold(test_labels, predict_labels)
  print(str(Fmax) + "---"+ str(T))
  return {'loss': -Fmax, 'tres':T, 'status': STATUS_OK}




In [None]:
# Trials keeps the result of every evaluation of the objective.
trials = Trials()
# return_argmin=False makes fmin return the best parameter values themselves
# rather than the indices of the chosen options.
best = fmin(
  fn=fun,
  space=space,
  algo=tpe.suggest,
  max_evals=10,
  trials=trials,
  return_argmin=False,
)
print('best: ', best)

0.3064929736383219---49
0.3649795347908555---19
0.362758326823379---20
0.27646338241298973---49
0.44014285714285717---32
0.2677152132542259---49
0.3607681755829904---25
0.3619768778311135---39
0.2590868321018238---47
0.48664335143983933---34
100%|██████████| 10/10 [25:21<00:00, 152.10s/it, best loss: -0.48664335143983933]
best:  {'batch_size': 256, 'branch_term': 'GO0071840', 'dropout_i': 0.3243499474136037, 'dropout_o': 0.17749052057828274, 'loss': 'binary_crossentropy', 'optimizer': 'Rmsprop', 'oversample': False, 'shared_hidden_number_1': 512, 'shared_hidden_number_2': 512, 'shared_hidden_number_3': 1024, 'shared_layers_num': 1, 'spec_hidden_number_1': 256, 'spec_hidden_number_2': 512, 'spec_hidden_number_3': 128, 'spec_layers_num': 3}
