-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate_Results.m
293 lines (206 loc) · 9.39 KB
/
evaluate_Results.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
%
% Copyright (c) 2016, Andrej Aderhold
%
% This script processes the results of the iCheMA main script and extract
% edge posterior probabilities. Given a known gold-standard (provided with
% the Biopepa example data) the AUROC and AUPREC scores are calculated
% with the edge probabilities.
%
function evaluate_Results()
% This is for AUROC_AUPREC.m
addpath('Scripts');
% The results are located here
results_dir = sprintf('Results/');
% The gradient type, can be 'RBFGradient' (analytic), or 'coarseGradient'.
gradient_type = 'RBFGradient';
% A vector of data ids.
data_instances = [1];
% Different network names.
biopepa_networks = {'wildtype'}; %, 'mutant_PRR7_PRR9', 'mutant_PRR5_PRR7_PRR9', 'mutant_PRR5_PRR7_PRR9_TOC1', 'mutant_TOC1', 'mutant_PRR7_PRR9_TOC1'};
% Make AUROC and AUPREC calculation for each network and data instance
% independently.
for network = biopepa_networks
network = biopepa_networks{1};
% Loop over data instances
for dataid = data_instances
fprintf('handling network %s, data instance %i\n', network, dataid);
edge_vec = process_Results(network, dataid, gradient_type, results_dir);
%
% Get the gold-standard networks
%
if strcmp(network, 'wildtype')
realnet = importdata('Data/Goldstandards/true_network_wildtype_mrna.txt');
elseif strcmp(network, 'mutant_PRR7_PRR9')
realnet = importdata('Data/Goldstandards/true_network_prr7-9_mrna.txt');
elseif strcmp(network, 'mutant_PRR7_PRR9_TOC1')
realnet = importdata('Data/Goldstandards/true_network_prr7-9-toc1_mrna.txt');
elseif strcmp(network, 'mutant_PRR5_PRR7_PRR9')
realnet = importdata('Data/Goldstandards/true_network_prr5-7-9_mrna.txt');
elseif strcmp(network, 'mutant_PRR5_PRR7_PRR9_TOC1')
realnet = importdata('Data/Goldstandards/true_network_prr5-7-9-toc1_mrna.txt');
elseif strcmp(network, 'mutant_TOC1')
realnet = importdata('Data/Goldstandards/true_network_toc1_mrna.txt');
end
% transpose realnet matrix and transform into vector, excluding -1 edges
realnet = realnet'; % transpose, in order to make as.vector work and be consistent with weights.vec
realnet_vec = reshape(realnet, 1, size(realnet,1)*size(realnet,2));
realnet_vec = realnet_vec(realnet_vec ~= -1);
% Calculate the scores
[auroc, auprec] = AUROC_AUPREC(realnet_vec, edge_vec);
% Save the edge posterior probabilities.
fileout = sprintf(['%s/EVAL/EDGES_%s_%s_id%i.txt'], results_dir, network, gradient_type, dataid);
edge_vec = edge_vec';
dlmwrite(fileout, edge_vec, 'Delimiter', ',');
% Save the AUROC and AUPREC scores
fileout = sprintf(['%s/EVAL/AUROC_AUPREC_%s_%s_id%i.txt'], results_dir, network, gradient_type, dataid);
auroc_auprec = [auroc, auprec];
dlmwrite(fileout, auroc_auprec, 'Delimiter', ',');
end
end
end
%
% Extract the marginal log likelihoods for all the response and parent set
% combinations and calculate a posterior probability for each response
% parent pair.
%
% Return the response - parent matrix as a vector.
%
%
function[edge_vec] = process_Results(network, dataid, gradient_type, results_dir)
% Some basic settings to read in and handle the results data
predictor_type = 'mRNA';
RELOCATE_LIGHT_VAR = 1; % Set this to 0 if not Biopepa data
responses = 7; % Biopepa specific setting
total_nodes = 8; % Biopepa specific setting
fanin = 3;
RESULTS = {};
%
%
% Stage 1: Read in the marginal log likelihoods for each response and
% each parent set.
%
%
for response_node = 1:responses
fprintf('.');
% take out response_node as parent
parent_nodes = setdiff(1:total_nodes, response_node);
% this counter is used to identify the parent set generated with
% recombination of different parent ids for different parent sizes
parent_sets= {};
parent_sets{end+1} = []; % the first is the empty set
max_parents = fanin;
if fanin == -1
max_parents = length(parent_nodes);
end
% create all possible parent set combinations from 1 parent node to
% max_parents node.
for ii = 1:max_parents
comb_tmp = nchoosek(parent_nodes, ii);
for j = 1:size(comb_tmp,1)
% append set
parent_sets{end+1} = comb_tmp(j,:);
end
end
% Save it for later
RESULTS{response_node}.parent_sets = parent_sets;
RESULTS{response_node}.scores = {};
response_max_val_logLL = -Inf;
response_max_val_logLL_prior = -Inf;
%
% Loop over the different parent set configurations and read out
% the marginal log likelihood (LL) for each one. Determine the highest
% LL of each parent set given its different activator/inhibitor
% setups. Ignore all other setups and use the highest one for
% further probability calculations involving the parents in the
% set.
%
for parent_set_id = 1:length(parent_sets)
% Identify results file and load it
filein = sprintf('%s/%s/OUT_r%i_psi%i_BIOPEPA-%s_run%i.mat', results_dir, gradient_type, response_node, parent_set_id, network, dataid);
load(filein);
% Save the marginal log likelihood
RESULTS{response_node}.scores{parent_set_id}.scores = results;
% Use this to identify the parent set with the highest log
% likelihood
score_tmp = results;
% Just to be save, delete results.
clear results;
%
% get max values for each parent_set
%
max_val_logLL = -Inf;
% loop through cell array and extract largest likelihood
for jj = 1:length(score_tmp)
if score_tmp{jj} > max_val_logLL
max_val_logLL = score_tmp{jj};
end
end
RESULTS{response_node}.scores{parent_set_id}.max_score = max_val_logLL;
% Check if this is the overall maximum score for this response
% (used to move likelihood to maximum of 0 for better exp()
% translation
if max_val_logLL > response_max_val_logLL
response_max_val_logLL = max_val_logLL;
end
end
RESULTS{response_node}.response_max_score = response_max_val_logLL;
end % End of response
fprintf('\n');
%
%
% Stage 2: For each response, calculate the probability of the parents.
%
%
SCORES = NaN * zeros(responses, total_nodes);
SCORES_ALL = NaN * zeros(responses, total_nodes);
for response_node = 1:responses
result = RESULTS{response_node};
% for each possible parent
for parent = 1:total_nodes
% ignore self-loop
if parent == response_node
continue;
end
% these are used to calculate probability, marginalized over all
% scores (score_edge_all)
score_edge_logLL = 0;
score_edge_all_logLL = 0;
% look into all parent_sets
for parent_set_id = 1:length(result.parent_sets)
% --> max value for parent set (can have different
% values dependent on activator/inhibitor setup
% result.scores{parent_set_id}.max_Thermo_logLL_prior
%
% --> overall max. value for all parent set, used to
% shift all log score to max value of 0 for better
% exp() usage
% result.response_max_Thermo_logLL_prior
% only likelihood
tmp_ll_logLL = result.scores{parent_set_id}.max_score - result.response_max_score;
tmp_prob_logLL = exp(tmp_ll_logLL);
% if the parent is in the parent_set .. take the intercept of
% parent set and the parent, if it is 0 then its in the set,
% negate it and we are in side the if condition
if ~isempty(intersect(result.parent_sets{parent_set_id}, parent))
score_edge_logLL = score_edge_logLL + tmp_prob_logLL;
end
score_edge_all_logLL = score_edge_all_logLL + tmp_prob_logLL;
end
SCORES(response_node, parent) = score_edge_logLL;
SCORES_ALL(response_node, parent) = score_edge_all_logLL;
end
end
DAG_PROB = SCORES ./ SCORES_ALL;
DAG_PROB(isnan(DAG_PROB)) = -1;
%
% Relocate light*P to 3rd position.
%
if(RELOCATE_LIGHT_VAR)
DAG_PROB = [DAG_PROB(:,1:2),DAG_PROB(:,8), DAG_PROB(:,3:7)];
end
% Transpose and convert to vector.
DAG_PROB = DAG_PROB';
edge_vec = DAG_PROB(:);
% Get rid of -1 (self-loops). This variable is returned.
edge_vec( find(edge_vec == -1) ) = [];
end