## Defining the data and create_model functions

In [None]:
from __future__ import print_function
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# The GPU id to use, usually either "0" or "1"
os.environ["CUDA_VISIBLE_DEVICES"]="1" 
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.utils import multi_gpu_model, Sequence, np_utils
import math
from keras.optimizers import SGD, Adam, RMSprop, Nadam
from keras.callbacks import EarlyStopping, TensorBoard
import scipy.stats as ss
import matplotlib.pyplot as plt
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform
from keras.backend.tensorflow_backend import set_session
from keras import backend as K

def data():
    """Load embeddings, the host protein universe and high-confidence interactions.

    Reads four hard-coded files relative to the working directory:

    * pathogen embeddings (space separated, identifier in the first column),
    * host embeddings (same layout),
    * the host phenotype association file (first whitespace-separated token
      of each line is taken as a protein identifier),
    * the interaction score file (tab separated: host, pathogen taxon id,
      score field containing ``miscore:<float>``).

    An interaction is a positive when its score is above 0.5, both partners
    have an embedding and the host is in the protein universe.

    NOTE(review): ``create_model`` reads ``host_dict`` and ``patho_dict`` as
    free variables rather than parameters, so those two names (and
    ``positives`` / ``protein_set``) must not be renamed. This matches the
    hyperas convention of inlining this body ahead of the model code --
    confirm against the hyperas version in use.

    Returns:
        tuple: ``(pathogens, positives, protein_set)`` where ``pathogens`` is
        a sorted array of the unique pathogen ids occurring in ``positives``,
        ``positives`` is a set of ``(host, pathogen)`` pairs and
        ``protein_set`` is the set of host protein ids.
    """
    # Pathogen embeddings: column 0 is the identifier, the rest is the vector.
    embedding_file = 'opa2vec/backup/embeddings_extended_300.out'
    frame = pd.read_csv(embedding_file, header=None, sep=' ', skiprows=0)
    embds_data = frame.values
    patho_dict = dict(zip(embds_data[:, 0], embds_data[:, 1:]))

    # Host embeddings, same file layout.
    embedding_file = 'opa2vec2/backup/embeddings_HP_300.out'
    frame = pd.read_csv(embedding_file, header=None, sep=' ', skiprows=0)
    embds_data = frame.values
    host_dict = dict(zip(embds_data[:, 0], embds_data[:, 1:]))

    # Universe of host proteins that are considered at all.
    protein_set = set()
    with open('host_pheno_asso/human_pheno_asso_onlyhuman.txt', 'r') as f:
        for line in f:
            items = line.strip().split()
            if items:  # tolerate blank lines
                protein_set.add(items[0])

    # Candidate positives: interactions whose miscore exceeds 0.5.
    positives_set = set()
    with open('./data/hpidb2.entrez.score.txt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            # Lines without a third field or without a "key:value" score are skipped.
            if len(items) >= 3 and ':' in items[2]:
                if float(items[2].split('miscore:')[1]) > 0.5:
                    positives_set.add((items[0], 'NCBITaxon_' + items[1]))

    # Keep only pairs that can actually be embedded and ranked.
    # pathogen_missed is a diagnostic tally (pairs per pathogen lacking an
    # embedding); it is not part of the return value.
    pathogen_missed = {}
    positives = set()
    for pair in positives_set:
        host, patho = pair
        if host in host_dict and patho in patho_dict and host in protein_set:
            positives.add(pair)
        if patho not in patho_dict:
            pathogen_missed[patho] = pathogen_missed.get(patho, 0) + 1

    # Previously this name was returned without ever being assigned.
    pathogens = np.unique([patho for _, patho in positives])

    return pathogens, positives, protein_set

def create_model(pathogens, positives, protein_set):
    """Hyperas model template scored by leave-one-pathogen-out ranking.

    A feed-forward classifier over 600-dimensional inputs (host embedding
    concatenated with pathogen embedding) is built from the sampled
    hyper-parameters, saved once, and then reloaded and trained from that
    saved state for every held-out pathogen. After each training epoch all
    proteins in ``protein_set`` are ranked for the held-out pathogen and the
    ranks of its known positives are tallied per epoch. A rank-based AUC is
    computed per epoch over all held-out pathogens.

    ``host_dict`` and ``patho_dict`` are read as free variables; they are
    not parameters of this function.

    Args:
        pathogens: ignored; recomputed from ``positives`` below.
        positives: set of ``(host, pathogen)`` pairs known to interact.
        protein_set: host protein ids used as the ranking universe.

    Returns:
        dict: ``loss`` (negative of the best per-epoch AUC), ``status`` and
        ``model`` (the model trained for the last held-out pathogen).

    Side effects: writes ``model_300.h5``, appends the per-epoch AUC list to
    ``hyperopt_300.aucs`` and prints progress.
    """

    class Generator(Sequence):
        """Keras Sequence that builds embedding batches from (host, pathogen) ids."""

        def __init__(self, x_set, y_set, batch_size):
            self.x, self.y = x_set, y_set
            self.batch_size = batch_size
            self.length = len(self.x)
            self.nbatch = int(np.ceil(self.length / float(self.batch_size)))

        def __len__(self):
            return self.nbatch

        def __getitem__(self, idx):
            start = idx * self.batch_size
            stop = min(start + self.batch_size, self.length)  # last batch may be short
            X_batch_list = np.empty((stop - start, 600), dtype=np.float32)
            y_batch_list = np.empty(stop - start, dtype=np.float32)

            for row, ids in enumerate(range(start, stop)):
                array1 = host_dict[self.x[ids][0]]
                array2 = patho_dict[self.x[ids][1]]
                X_batch_list[row] = np.concatenate([array1, array2])
                y_batch_list[row] = self.y[ids]
            return X_batch_list, y_batch_list

    batch_size = 2**11
    epochs = 5
    # One rank -> count tally per epoch, accumulated over all held-out pathogens.
    rank_counts = [dict() for _ in range(epochs)]

    model = Sequential()
    model.add(Dense(units={{choice([512, 256, 128, 64, 32])}}, activation={{choice(['relu', 'sigmoid'])}}, input_shape=(600,)))
    model.add(BatchNormalization())
    model.add(Dropout(rate={{uniform(0, 1)}}))
    # NOTE(review): the third hidden block is only reachable when the first
    # choice is 'four' and the second, independently sampled choice is
    # 'three' -- confirm this nesting is intended.
    if {{choice(['three', 'four'])}} == 'four':
        model.add(Dense(units={{choice([64, 32, 16, 8])}}, activation={{choice(['relu', 'sigmoid'])}}))
        model.add(BatchNormalization())
        model.add(Dropout(rate = {{uniform(0, 1)}}))
        if {{choice(['three', 'four'])}} == 'three':
            model.add(Dense(units={{choice([8, 4, 2])}}, activation={{choice(['relu', 'sigmoid'])}}))
            model.add(BatchNormalization())
            model.add(Dropout(rate={{uniform(0, 1)}}))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer={{choice(['rmsprop', 'adam'])}},
                  metrics=['accuracy'])

    # Saved once so that every fold is trained from the same saved state.
    model.save('model_300.h5')
    pathogens = np.unique(np.array(list(positives))[:,1])

    for counter, taxon in enumerate(pathogens, 1):
        K.clear_session()
        parallel_model = load_model('model_300.h5')
        print('taxon ', counter)
        val_pathos = {taxon}
        train_pathos = set(pathogens) - val_pathos

        train_positives = []
        val_positives = []
        train_positives_set = set()
        for host, patho in positives:
            if patho in train_pathos:
                train_positives_set.add((host, patho))
                train_positives.append((host, patho, 1))
            if patho in val_pathos:
                val_positives.append((host, patho, 1))
        print(len(train_positives), len(val_positives))

        # Negatives: every embeddable (host, training pathogen) pair that is
        # not a known positive. Iterating the unique training pathogens
        # avoids revisiting the same pathogen once per positive pair.
        train_negatives = []
        for host in protein_set:
            if host not in host_dict:
                continue
            for patho in train_pathos:
                if patho in patho_dict and (host, patho) not in train_positives_set:
                    train_negatives.append((host, patho, 0))

        # Oversample positives to roughly balance the classes; never repeat
        # zero times, which would drop all positives from training.
        repeats = max(1, len(train_negatives) // len(train_positives))
        train_positives = np.repeat(np.array(train_positives), repeats, axis=0)
        train_negatives = np.array(train_negatives)
        triple_train = np.concatenate((train_positives, train_negatives), axis=0)
        np.random.shuffle(triple_train)

        factor = 1  # fraction of the shuffled training triples that is used
        n_train = int(factor * len(triple_train))
        generator = Generator(triple_train[:n_train, 0:2], triple_train[:n_train, 2], batch_size)

        # Ranking candidates for the held-out pathogen: its known positives
        # first, followed by every other protein in the universe. This does
        # not change between epochs, so it is built once per fold.
        positive_hosts = set(host for host, _, _ in val_positives)
        num_positive = len(val_positives)
        protein_list = list(val_positives)
        for protein in protein_set:
            if protein not in positive_hosts:
                protein_list.append((protein, taxon, 0))
        protein_list = np.array(protein_list)
        val_generator = Generator(protein_list[:, 0:2], protein_list[:, 2], 1000)

        for i in range(epochs):
            parallel_model.fit_generator(generator=generator,
                                         epochs=1,
                                         steps_per_epoch=len(generator),
                                         verbose=0,
                                         validation_data=generator,
                                         validation_steps=1,
                                         max_queue_size=20,
                                         use_multiprocessing=True,
                                         workers=6)

            sim_list = parallel_model.predict_generator(generator=val_generator,
                                                        verbose=0,
                                                        steps=len(val_generator),
                                                        max_queue_size=20,
                                                        use_multiprocessing=True,
                                                        workers=6)
            # Rank 1 = highest predicted score; positives occupy the first rows.
            y_rank = ss.rankdata(-sim_list, method='average')
            x_list = y_rank[:num_positive]
            print(np.mean(x_list))
            for x in x_list:
                rank_counts[i][x] = rank_counts[i].get(x, 0) + 1

    aucs = []
    for i in range(epochs):
        auc_x = sorted(rank_counts[i])
        auc_y = []
        tpr = 0
        # 1.0 forces true division even without the __future__ division import.
        step = 1.0 / sum(rank_counts[i].values())
        for x in auc_x:
            tpr += rank_counts[i][x] * step
            auc_y.append(tpr)
        # Close the curve at the worst possible rank.
        auc_x.append(len(protein_set))
        auc_y.append(1)
        auc1 = np.trapz(auc_y, auc_x) / len(protein_set)
        print('Rank based auc is: %f' % (auc1))
        aucs.append(auc1)
    max_auc = max(aucs)
    # Context manager so the log is flushed and closed after every trial.
    with open('hyperopt_300.aucs', 'a+') as output:
        output.write(str(aucs) + '\n')
    return {'loss': -max_auc, 'status': STATUS_OK, 'model': parallel_model}

## Running hyperas

In [None]:
if __name__ == '__main__':
    # Install a TensorFlow session with allow_growth enabled as the Keras
    # backend session before the search starts.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    # Run 100 TPE trials; notebook_name tells hyperas which notebook holds
    # the data/create_model sources.
    best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=100,
                                          trials=Trials(),
                                          notebook_name='hyperopt_300')
    print("Evaluation of best performing model:")
    print("Best performing model chosen hyper-parameters:")
    print(best_run)