-
Notifications
You must be signed in to change notification settings - Fork 1
/
deepSimDEF_for_protein_protein_interaction.py
385 lines (331 loc) · 24.6 KB
/
deepSimDEF_for_protein_protein_interaction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
# CUDA_VISIBLE_DEVICES=gpu-number python deepSimDEF_for_protein_protein_interaction.py arguments
#
# deepSimDEF driver script for the protein-protein interaction (PPI) task:
# parses command-line arguments and (under __main__) trains, evaluates, or
# runs a trained deepSimDEF network in production over gene-product pairs.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}  (set before TF import to silence its C++ logging)
import sys
import logging
import random
import operator
import numpy as np
import pprint
import argparse
import tensorflow.keras
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)  # silence TF python-side warnings as well
from networks import deepSimDEF_network
from datasets import ppi_dataset, generic_production_dataset
from dataloaders import generic_dataloader, generic_production_dataloader
from sklearn.metrics import f1_score
import datetime
from pytz import timezone
import collections
from utils import *
tz = timezone('US/Eastern') # To monitor training time (showing start & end points of a fixed timezone when the code runs on a remote server)
pp = pprint.PrettyPrinter(indent=4)
# Path of a saved model checkpoint to resume from; None means start from scratch
# (also the default of the --checkpoint argument below).
#checkpoint = '[base_dir]/2020.03.04-23h40m37s_server_name/model_checkpoints/epoch_58'
checkpoint = None
parser = argparse.ArgumentParser(description='Calculate gene-product pairs similarity.')
# experiment arguments
parser.add_argument('--deepsimdef_mode', default='training', type=str, help='mode of the model can be either "training", "evaluation", or "production"')
parser.add_argument('--nb_fold', default=10, type=int, help='number of folds of training and evaluation in n-fold cross-validation (default: 10)')
parser.add_argument('--iea', default=True, type=str2bool, help='whether to consider "inferred from electronic annotations" or not')
parser.add_argument('--sub_ontology', default='all', type=str, help='considering annotations of what subontologies, "bp", "cc", "mf", or "all" (default: "all")')
parser.add_argument('--inpute_file', default='default', type=str, help='inpute file of the gene product terms and the score(s), if not provide use default file')
parser.add_argument('--production_input_file', default='', type=str, help='test file of the gene product terms used in production mode (you should provide directory too)')
parser.add_argument('--production_output_file', default='', type=str, help='result file of the gene product terms used in production mode (you should provide directory too)')
parser.add_argument('--experiment_mode', default=2, type=int, help='1: any pairs of unseen genes; 2: only pair in which both genes are unseen')
parser.add_argument('--partial_shuffle_percent', default=0.0, type=float, help='Should be more than 0.0 for "Negative Control" experiments (default: 0.0)')
parser.add_argument('--cutoff', default=True, type=str2bool, help='whether to consider finding a proper cutoff point (on predicted probabilities) in binary classification')
parser.add_argument('--species', default='yeast', type=str, help='the species of interest for evaluation (human, yeast, etc)')
# network arguments
parser.add_argument('--dropout', default=0.3, type=float, help='dropout applied to dense layers of the network (default: 0.3)')
parser.add_argument('--embedding_dropout', default=0.15, type=float, help='dropout applied to embedding layers of the network; i.e., percentage of features dropped out completely (default: 0.15)')
parser.add_argument('--annotation_dropout', default=0.0, type=float, help='dropout applied to annotations of a gene at training time; i.e., percentage of annotations ignored (default: 0.0)')
parser.add_argument('--pretrained_embedding', default=True, type=str2bool, help='whether the GO term embeddings loaded should be computed in advance from a pretrained unsupervised model (default: True)')
parser.add_argument('--updatable_embedding', default=True, type=str2bool, help='whether the GO term embeddings should be updatable during the traning (default: True)')
parser.add_argument('--activation_hidden', default='relu', type=str, help='activation function of hidden layers (default: "relu")')
parser.add_argument('--activation_highway', default='relu', type=str, help='activation function of highway layer (default: "relu")')
parser.add_argument('--activation_output', default='sigmoid', type=str, help='activation function of last, i.e. output, layer (default: "sigmoid")')
parser.add_argument('--embedding_dim', default=100, type=int, help='dimentionality of GO term embeddings, i.e. number of latent features (default: 100)')
parser.add_argument('--highway_layer', default=True, type=str2bool, help='True if highway layer instead of cosince similarity (default: True)')
parser.add_argument('--cosine_similarity', default=False, type=str2bool, help='True cosince similarity instead of highway layer (default: False)')
# training arguments
parser.add_argument('--nb_epoch', default=400, type=int, help='number of epochs for training')
parser.add_argument('--batch_size', default=256, type=int, help='batch size (default: 256)')
parser.add_argument('--loss', default='binary_crossentropy', type=str, help='loss type of the onjective function that gets optimized ("binary_crossentropy" or "mean_squared_error")')
parser.add_argument('--optimizer', default='adam', type=str, help='optimizer algorithm, can be: "adam", "rmsprop", etc. (default: "adam")')
parser.add_argument('--learning_rate', default=0.001, type=float, help='starting learning rate for optimization')
parser.add_argument('--adaptive_lr', default=True, type=str2bool, help='whether to use adavtive learning rate or not')
parser.add_argument('--adaptive_lr_rate', default=10, type=int, help='after how many epoch, decay the learning rate')
# checkpointting arguments
parser.add_argument('--checkpoint', default=checkpoint, help='starting from scratch or using model checkpoints')
parser.add_argument('--save_model', default=False, type=str2bool, help='model checkpointing, whether to save the models during training')
parser.add_argument('--save_embeddings', default=False, type=str2bool, help='storing weights of the embedding layers, whether to save updated embeddings')
parser.add_argument('--save_interval', default=5, type=int, help='-1, checkpoint if see improvement in the result; otherwise after each interval (default: -1)')
parser.add_argument('--log_dir', default='logs/', type=str, help='base log folder (will be created if it does not exist)')
parser.add_argument('--log_name', default='PPI_test', type=str, help='prefix name to use when logging this model')
# misc arguments
parser.add_argument('--verbose', default=False, type=str2bool, help='if print extra information during model training')
parser.add_argument('--reproducible', default=True, type=str2bool, help='whether we want to have a reproducible result (mostly helpful with training on a CPU at the cost of training speed)')
parser.add_argument('--seed', default=2021, type=int, help='seed used for Random Number Generation if "reproducible=True"')
def fit_ppi(models, args):
    """Train the n-fold deepSimDEF models on the PPI task.

    For every epoch (resuming after the global ``checkpoint_baseline``) each
    fold's model is fit for one more Keras epoch on its training split; in the
    n-fold setting it is then scored (binary F1) on its held-out split while a
    range of classification cutoffs is scanned, per-model and per-epoch stats
    are logged, and models/embeddings are optionally checkpointed.

    NOTE(review): this function reads and mutates many module-level globals
    prepared in __main__ (checkpoint_baseline, sub_ontology_interested,
    logdir, thresholds, best_f1_measures, nb_test_genes, the annotation
    tensors, ...), so it can only be called from this script.

    Args:
        models: list of compiled Keras models, one per fold.
        args: parsed command-line arguments (argparse.Namespace).
    """
    """if we need the cutoff point for accuracy"""
    if args.cutoff: seq = np.round(np.arange(0.15, 0.9, 0.01), 2)  # candidate probability cutoffs 0.15 .. 0.89
    else: seq = [0.5]
    best_epoch = 0
    final = []  # one [best cutoff, best cross-fold F1] entry appended per completed epoch
    start_time = datetime.datetime.now(tz)
    former_iteration_endpoint = start_time
    print("~~~~~~~~~~~~~ TIME ~~~~~~~~~~~~~~")
    print("Time started: {}".format(start_time.strftime("%Y-%m-%d %H:%M:%S")))
    """Training loop"""
    for e in range(checkpoint_baseline, args.nb_epoch):
        print("~~~~~~~ {} ({}) ~~~~~~~ EPOCH {}/{} (Embedding dimention: {}) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n".format(
            '/'.join(sub_ontology_interested), args.species, e+1, args.nb_epoch, args.embedding_dim))
        _model_cutoff_preds = []  # per fold: {cutoff: F1 on that fold's test split} for this epoch
        if args.adaptive_lr:
            learning_rate = exp_decay(epoch=e//args.adaptive_lr_rate, initial_lrate=args.learning_rate) # claculating the desired learning rate using the exponential decay formula
        else:
            learning_rate = args.learning_rate
        for model_index in range(len(models)): # Going through each model one by one
            _model_cutoff_preds.append({})
            # Preparing the data
            train_pair, X_train, y_train, test_pair, X_test, y_test, train_gene, test_gene = generic_dataloader(
                model_index, nb_test_genes, gene_shuffled_indx, gene_1, gene_2, fully_annotated_genes,
                gene_1_annotation, gene_2_annotation, prediction_value, sub_ontology_interested,
                args.experiment_mode)
            # Persist the fold's train/test pair split once, on the very first epoch
            if e==0: save_gene_pairs(logdir=logdir, model_id=model_index, train_pair=train_pair,
                                     test_pair=test_pair, train_gene=train_gene, test_gene=test_gene)
            if args.nb_fold==1: X_train, y_train = X_test, y_test # If single model, we need to redefine the training data due to absence of folds
            # Training and Prediction
            model = models[model_index]
            model.optimizer.lr = learning_rate # decreasing a model's learning rate already calculated by exponential decay formula
            history = model.fit(X_train, y_train, batch_size=args.batch_size, epochs=1, shuffle=True)
            if args.nb_fold!=1: # Only evaluation and report in n-fold cross-validation set up
                res = model.predict(X_test)
                # Finding the proper cutoff point
                for cutoff in seq:
                    predictions = np.asarray(1*(cutoff<res))  # binarize predicted probabilities at this cutoff
                    _model_cutoff_preds[model_index][cutoff] = np.round(f1_score(y_test, predictions, average='binary'), 5)
                #finding the best f1_measure and the best cutoff point
                f1_measure = max(_model_cutoff_preds[model_index].items(), key=operator.itemgetter(1))[1]
                thresholds[model_index] = max(_model_cutoff_preds[model_index].items(), key=operator.itemgetter(1))[0]
                # Keeping track of improving or receding f1_measure
                best_f1_measure = best_f1_measures[model_index]
                if best_f1_measure < f1_measure:
                    best_f1_measures[model_index] = f1_measure
                    st = "(+){}".format(best_f1_measures[model_index])
                else:
                    st = "(-){}".format(best_f1_measures[model_index])
                print(">>> F1-score ({}): {} Best({}): {} ({}: {})\n".format(
                    model_index+1, f1_measure, model_index+1, st, thresholds[model_index], np.round(f1_measure-best_f1_measure, 5)))
                # loging the model results
                log_model_result_for_ppi(e+1, model_index+1, learning_rate,
                                         best_f1_measures[model_index]*100, (f1_measure-best_f1_measure)*100, f1_measure*100, logdir)
        if args.nb_fold!=1: # Stats on all folds in this epoch
            _res = {}
            for cutoff in seq:
                _res[cutoff] = 0 # holds average f1_measure of n-folded models across different cutoffs
                for model_index in range(args.nb_fold):
                    _res[cutoff] += _model_cutoff_preds[model_index][cutoff]/args.nb_fold
            f1_res = max(_res.items(), key=operator.itemgetter(1))[1] # best f1-measure for all models among all cutoffs in this epoch
            threshold_res = max(_res.items(), key=operator.itemgetter(1))[0] # best cutoff point for the best f1-measure in this epoch
            final.append([threshold_res, f1_res])
            # Stats on what we have done so far from the begining of training
            if e==checkpoint_baseline:
                best_epoch = e+1
                best_cv_cutoff = final[0][0]
                best_cv_f1_measure = final[0][1]
            else:
                # rescan all completed epochs for the running best cross-fold F1
                for epoch, final_result in enumerate(final):
                    if best_cv_f1_measure < final[epoch][1]:
                        best_epoch = checkpoint_baseline+epoch+1
                        best_cv_cutoff = final[epoch][0]
                        best_cv_f1_measure = final[epoch][1]
            print(" F1-score for this epoch: {:.2f}% ({})-- Best F1-score::==> {:.2f}% ({}) (for epoch #{} of {})\n".format(
                f1_res*100, threshold_res, best_cv_f1_measure*100, best_cv_cutoff, best_epoch, args.nb_epoch))
        # save models and embeddings
        if args.save_interval==-1 and best_epoch==e+1: # save if improved the result
            if args.save_model: # save models
                save_model(path=logdir, models=models, epoch=e+1, verbose=args.verbose)
            if args.save_embeddings: # save (updated) GO term embeddings
                save_embeddings(path=logdir,
                                models=models,
                                go_term_indeces=go_term_indeces,
                                sub_ontology_interested=sub_ontology_interested,
                                embedding_save=go_term_embedding_save_in,
                                epoch=e+1,
                                verbose=args.verbose)
        elif args.save_interval!=-1 and (e+1)%args.save_interval==0: # save after each interval
            if args.save_model: # save models
                save_model(path=logdir, models=models, epoch=e+1, verbose=args.verbose)
            if args.save_embeddings: # save (updated) GO term embeddings
                save_embeddings(path=logdir,
                                models=models,
                                go_term_indeces=go_term_indeces,
                                sub_ontology_interested=sub_ontology_interested,
                                embedding_save=go_term_embedding_save_in,
                                epoch=e+1,
                                verbose=args.verbose)
        # Calculating 'Computation Time' for this round of iteration
        former_iteration_endpoint, current_iteration_elapsed = cal_iter_time(former_iteration_endpoint, e, args, tz)
        # loging the epoch results
        if args.nb_fold!=1: # Stats on all folds in this epoch
            log_epoch_result_for_ppi(e+1, args.nb_epoch, current_iteration_elapsed,
                                     best_cv_f1_measure*100, f1_res*100, best_epoch, logdir)
def eval_ppi(models, args):
    """Evaluate already-trained n-fold deepSimDEF models on the PPI task.

    For each fold the held-out split is predicted, the binary F1 is computed
    across a range of classification cutoffs, and finally the F1 averaged over
    folds at the single best shared cutoff is printed.

    NOTE(review): like fit_ppi, this relies on module-level globals prepared
    in __main__ (nb_test_genes, gene tensors, thresholds, ...), so it can only
    be called from this script.

    Args:
        models: list of Keras models (typically restored from a checkpoint),
            one per fold.
        args: parsed command-line arguments (argparse.Namespace).
    """
    """if we need the cutoff point for accuracy"""
    if args.cutoff: seq = np.round(np.arange(0.15, 0.9, 0.01), 2)  # candidate probability cutoffs 0.15 .. 0.89
    else: seq = [0.5]
    _model_cutoff_preds = []  # per fold: {cutoff: F1 on that fold's test split}
    preds = []  # per-fold best F1 (collected for reporting)
    for model_index in range(len(models)): # Going through each model one by one
        _model_cutoff_preds.append({})
        train_pair, X_train, y_train, test_pair, X_test, y_test, train_gene, test_gene = generic_dataloader(
            model_index, nb_test_genes, gene_shuffled_indx, gene_1, gene_2, fully_annotated_genes,
            gene_1_annotation, gene_2_annotation, prediction_value, sub_ontology_interested,
            args.experiment_mode)
        # Training and Prediction
        model = models[model_index]
        if args.nb_fold!=1: # Only evaluation and report in n-fold cross-validation set up
            res = model.predict(X_test)
            # Finding the proper cutoff point
            for cutoff in seq:
                predictions = np.asarray(1*(cutoff<res))  # binarize predicted probabilities at this cutoff
                _model_cutoff_preds[model_index][cutoff] = np.round(f1_score(y_test, predictions, average='binary'), 5)
            #finding the best f1_measure and the best cutoff point
            f1_measure = max(_model_cutoff_preds[model_index].items(), key=operator.itemgetter(1))[1]
            thresholds[model_index] = max(_model_cutoff_preds[model_index].items(), key=operator.itemgetter(1))[0]
            print(f">>> F1-score (fold {(model_index+1):2d}): {(f1_measure*100):.2f}")
            preds.append(f1_measure)
    # Average the per-fold F1 at each shared cutoff and keep the best cutoff
    final_res = 0
    best_cutoff = 0
    for cutoff in seq:
        final_res_temp = 0
        for model_index in range(len(models)):
            final_res_temp += _model_cutoff_preds[model_index][cutoff]/len(models)
        if final_res < final_res_temp:
            final_res = final_res_temp
            best_cutoff = cutoff
    print(f">>> Final F1-score (averaged over folds with classification cutoff {best_cutoff}): {(final_res*100):.2f}")
if __name__ == "__main__":
    # Entry point: parse arguments, then dispatch on --deepsimdef_mode to
    # training / evaluation (n-fold cross-validation) or production scoring.
    args = parser.parse_args()
    # some assertions before proceeding
    # Bug fix: these checks (and the log-name recovery below) previously read
    # the module-level 'checkpoint' variable, so a checkpoint supplied on the
    # command line with --checkpoint was ignored (and crashed later on
    # None.rsplit). They now consistently use args.checkpoint.
    assert not ((args.deepsimdef_mode == 'evaluation') and (args.checkpoint is None)), "'checkpoint' cann't be None in 'evaluation' mode of '--deepsimdef_mode' as the model needs a trained deepSimDEF network"
    assert not ((args.deepsimdef_mode == 'production') and (args.checkpoint is None)), "'checkpoint' cann't be None in 'production' mode of '--deepsimdef_mode' as the model needs a trained deepSimDEF network"
    assert not ((args.deepsimdef_mode == 'production') and (not os.path.isfile(args.production_input_file))), "the provided 'args.production_input_file' does not exist, provide proper directory"
    assert not ((args.nb_fold == 1) and (args.save_interval == -1)), "'--save_interval' cann't be -1 when '--nb_fold' is 1; define a 'positive integer' interval"
    assert not (args.highway_layer and args.cosine_similarity), "Either '--highway_layer' can be True or '--cosine_similarity', not both."
    if args.deepsimdef_mode=='training' or args.deepsimdef_mode=='evaluation':
        # printing out the argument of the model
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("Arguments are:")
        pp.pprint(vars(args))
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
        # directory to the files needed for traning and testing
        if args.inpute_file=='default':
            data_file_name = f'{args.species}_protein_protein_interaction.tsv'
        else:
            data_file_name = args.inpute_file
        ppi_data_dir = './data/species/{}/ppi/processed'.format(args.species) # directory to the ppi datasets, pay attention to the file names and thier content format
        embedding_dir = f'./data/gene_ontology/definition_embedding/{args.embedding_dim}_dimensional' # directory to the GO term embeddings, pay attention to the file names and thier content format
        gene_annotations_dir = './data/species/{}/association_file/processed'.format(args.species) # directory to the gene annotations, pay attention to the file names and thier content format
        # set RNG
        if args.reproducible: make_reproducible(args.seed)
        # some variables needed to work with sub-ontologies
        if args.sub_ontology=='all': sub_ontology_interested = ['BP', 'CC', 'MF']
        elif args.sub_ontology=='bp': sub_ontology_interested = ['BP']
        elif args.sub_ontology=='cc': sub_ontology_interested = ['CC']
        elif args.sub_ontology=='mf': sub_ontology_interested = ['MF']
        sub_ontology_all = ['BP', 'CC', 'MF'] # for experimets, to make sure all genes have annotations from all three ontologies
        # do we use a checkpointed model, if not we can set it
        if args.checkpoint is None:
            args.log_name = f"{args.log_name}_{args.species}/pretrained_emb_{args.pretrained_embedding}_iea_{args.iea}_ontology_{args.sub_ontology}"
            logdir, checkpoint_baseline = log(args)
        else:
            # recover the run's log directory and epoch baseline from the
            # checkpoint path '<logdir>/model_checkpoints/epoch_<N>'
            logdir = args.checkpoint.rsplit("/", 2)[0]
            checkpoint_baseline = int(args.checkpoint.rsplit("_")[-1])
            args.log_name = args.checkpoint.rsplit("/", 3)[0]  # bug fix: was the module-level 'checkpoint' (possibly None)
            print(f"The checkpoint directory is: '{logdir}'\n")
        # some variables to work with GO-term embeddings later
        go_term_embedding_file_path = {} # directory of embedding files (for every subontolgy)
        go_term_embedding_save_in = {} # files into which the updated GO term embeddings will be stored
        for sbo in sub_ontology_interested:
            go_term_embedding_file_path[sbo] = '{}/GO_{}_Embeddings_{}D.emb'.format(embedding_dir, sbo, args.embedding_dim)
            go_term_embedding_save_in[sbo] = 'GO_{}_Embeddings_{}D_Updated'.format(sbo, args.embedding_dim)
        # getting GO annotations
        gene_indeces, gene_annotations, go_term_indeces, max_ann_len, max_ann_len_indx = extract_annotation_2nd_form(
            sub_ontology_all, gene_annotations_dir, args.iea, args.verbose)
        fully_annotated_genes = [] # we keep only those genes for which we have annatoation from all sub-ontologies
        for sbo in sub_ontology_all:
            fully_annotated_genes.append(gene_indeces[sbo].keys())
        fully_annotated_genes = sorted(list(set(fully_annotated_genes[0]).intersection(*fully_annotated_genes)))
        if args.verbose: print("Number of fully annotated gene products: {}\n".format(len(fully_annotated_genes)))
        """Shuffling the genes"""
        gene_shuffled_indx = np.arange(len(fully_annotated_genes))
        np.random.shuffle(gene_shuffled_indx)
        VALIDATION_SPLIT = 1.0/args.nb_fold  # fraction of genes/pairs held out per fold
        nb_test_genes = int(VALIDATION_SPLIT * len(fully_annotated_genes))
        gene_1, gene_2, gene_1_annotation, gene_2_annotation, prediction_value = ppi_dataset(
            ppi_data_dir, data_file_name, fully_annotated_genes,
            gene_annotations, gene_indeces, max_ann_len, args.partial_shuffle_percent, sub_ontology_interested)
        # (redundant VALIDATION_SPLIT recomputation removed -- same value as above)
        gene_pair_indx = np.arange(gene_1_annotation[sub_ontology_interested[0]].shape[0])
        np.random.shuffle(gene_pair_indx)
        nb_test_gene_pairs = int(VALIDATION_SPLIT * gene_1_annotation[sub_ontology_interested[0]].shape[0])
        if args.verbose:
            for sbo in sub_ontology_interested:
                print("Shape of data tensor for gene 1 ({}): {}".format(sbo, gene_1_annotation[sbo].shape))
                print("Shape of data tensor for gene 2 ({}): {}\n".format(sbo, gene_2_annotation[sbo].shape))
            print("Shape of output ppi tensors: {}\n".format(prediction_value.shape))
        # build one network (plus its best-F1/threshold bookkeeping) per fold
        models = []
        best_f1_measures = []
        thresholds = []
        for m in range(args.nb_fold):
            network = deepSimDEF_network(args, model_ind=m, max_ann_len=max_ann_len, go_term_embedding_file_path=go_term_embedding_file_path,
                                         sub_ontology_interested=sub_ontology_interested, go_term_indeces=go_term_indeces)
            models.append(network)
            best_f1_measures.append(0)
            thresholds.append(0)
        if args.deepsimdef_mode=='training':
            fit_ppi(models, args)
            # final save after the last epoch regardless of --save_interval
            save_model(path=logdir, models=models, epoch=args.nb_epoch, verbose=args.verbose)
            save_embeddings(path=logdir,
                            models=models,
                            go_term_indeces=go_term_indeces,
                            sub_ontology_interested=sub_ontology_interested,
                            embedding_save=go_term_embedding_save_in,
                            epoch=args.nb_epoch,
                            verbose=args.verbose)
        elif args.deepsimdef_mode=='evaluation':
            eval_ppi(models, args)
    elif args.deepsimdef_mode=='production':
        # score the gene pairs from --production_input_file with a restored model
        data_file_dir, data_file_name = os.path.dirname(args.production_input_file), os.path.basename(args.production_input_file)
        gene_annotations_dir = './data/species/{}/association_file/processed'.format(args.species) # directory to the gene annotations, pay attention to the file names and thier content format
        # some variables needed to work with sub-ontologies
        if args.sub_ontology=='all': sub_ontology_interested = ['BP', 'CC', 'MF']
        elif args.sub_ontology=='bp': sub_ontology_interested = ['BP']
        elif args.sub_ontology=='cc': sub_ontology_interested = ['CC']
        elif args.sub_ontology=='mf': sub_ontology_interested = ['MF']
        sub_ontology_all = ['BP', 'CC', 'MF'] # to make sure all genes have annotations from all three ontologies, modify this if you want to work with single channel networks
        gene_indeces, gene_annotations, go_term_indeces, max_ann_len, max_ann_len_indx = extract_annotation_2nd_form(
            sub_ontology_all, gene_annotations_dir, args.iea, args.verbose)
        fully_annotated_genes = [] # we keep only those genes for which we have annatoation from all sub-ontologies
        for sbo in sub_ontology_all:
            fully_annotated_genes.append(gene_indeces[sbo].keys())
        fully_annotated_genes = sorted(list(set(fully_annotated_genes[0]).intersection(*fully_annotated_genes)))
        gene_1, gene_2, gene_1_annotation, gene_2_annotation = generic_production_dataset(
            data_file_dir, data_file_name, fully_annotated_genes,
            gene_annotations, gene_indeces, max_ann_len, sub_ontology_interested)
        model = deepSimDEF_network(args, model_ind=0)
        test_pair, X_test, test_gene = generic_production_dataloader(gene_1, gene_2, fully_annotated_genes,
                                                                     gene_1_annotation, gene_2_annotation, sub_ontology_interested)
        preds = model.predict(X_test)
        # emit '<gene1>\t<gene2>\t<score>' per pair, to stdout or to the output file
        if len(args.production_output_file)==0:
            for i, (pair, pred) in enumerate(zip(test_pair, preds)):
                print("{}\t{:.8f}".format(pair.replace(" ", "\t"), pred[0]))
        else:
            with open(args.production_output_file, "w") as wf:
                for i, (pair, pred) in enumerate(zip(test_pair, preds)):
                    wf.write("{}\t{:.8f}\n".format(pair.replace(" ", "\t"), pred[0]))