-
Notifications
You must be signed in to change notification settings - Fork 7
/
datagraph.py
573 lines (477 loc) · 27.8 KB
/
datagraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
import os, time, pathlib
from typing import List
from functools import reduce
import numpy as np
import pandas as pd
from tqdm import tqdm
class DataGraph:
'''
Given the crawled dataset as a CSV file, this class forms a graph over the full dataset by taking advantage of the connections between different modalities. Based
on these connections, an edge table is made. The class also supports querying: for a given query, it returns the file locations of the user-specified sub-dataset of the full dataset.
The graph is made based on the references made by different DICOMS. Different connections are given different edge type values, to make parsing easier. The edge types are as follows:-
1) edge_type:0 RTDOSE(key:ref_rt) -> RTSTRUCT(pair: series/instance)
2) edge_type:1 RTDOSE(key:ref_ct) -> CT(pair: series)
3) edge_type:2 RTSTRUCT(key:ref_ct) -> CT(pair: series)
4) edge_type:3 RTSTRUCT(key:ref_ct) -> PT(pair: series)
5) edge_type:4 CT(key:study) -> PT(pair: study)
6) edge_type:5 RTDOSE(key: ref_pl) -> RTPLAN(pair: instance)
7) edge_type:6 RTPLAN(key: ref_rs) -> RTSTRUCT(pair: series/instance)
Once the edge table is formed, one can query on the graph to get the desired results. For uniformity, the supported query is list of modalities to consider
For ex:
query = ["CT","RTDOSE","RTSTRUCT","PT"], will return interconnected studies containing the listed DICOM modalities. The interconnected studies for example may look like
(RTDOSE->RTSTRUCT->CT<-PT<-RTSTRUCT)
'''
def __init__(self,
path_crawl: str,
edge_path: str = "./patient_id_full_edges.csv",
visualize: bool = False) -> None:
'''
Parameters
----------
path_crawl
The csv returned by the crawler
edge_path
This path denotes where the graph in the form of edge table is stored or to be stored
'''
self.df = pd.read_csv(path_crawl, index_col=0)
self.edge_path = edge_path
self.df_new = None
if os.path.exists(self.edge_path):
print("Edge table is already present. Loading the data...")
self.df_edges = pd.read_csv(self.edge_path)
else:
print("Edge table not present. Forming the edge table based on the crawl data...")
self.form_graph()
if visualize:
self.visualize_graph()
def form_graph(self):
'''
Forms edge table based on the crawled data
'''
# enforce string type to all columns to prevent dtype merge errors for empty columns
for col in self.df:
self.df[col] = self.df[col].astype(str)
#Get reference_rs information from RTDOSE-RTPLAN connections
df_filter = pd.merge(self.df, self.df[["instance_uid","reference_rs"]],
left_on="reference_pl",
right_on="instance_uid",
how="left")
df_filter.loc[(df_filter.reference_rs_x.isna()) & (~df_filter.reference_rs_y.isna()),"reference_rs_x"] = df_filter.loc[(df_filter.reference_rs_x.isna()) & (~df_filter.reference_rs_y.isna()),"reference_rs_y"].values
df_filter.drop(columns=["reference_rs_y", "instance_uid_y"], inplace=True)
df_filter.rename(columns={"reference_rs_x":"reference_rs", "instance_uid_x":"instance_uid"}, inplace=True)
#Remove entries with no RTDOSE reference, for extra check, such cases are mostprobably removed in the earlier step
df_filter = df_filter.loc[~((df_filter["modality"] == "RTDOSE") & (df_filter["reference_ct"].isna()) & (df_filter["reference_rs"].isna()))]
#Get all study ids
# all_study = df_filter.study.unique()
start = time.time()
#Defining Master df to store all the Edge dataframes
# self.df_master = []
# for i in tqdm(range(len(all_study))):
# self._form_edge_study(df_filter, all_study, i)
# df_edge_patient = form_edge_study(df,all_study,i)
self.df_edges = self._form_edges(self.df) #pd.concat(self.df_master, axis=0, ignore_index=True)
end = time.time()
print(f"\nTotal time taken: {end - start}")
self.df_edges.loc[self.df_edges.study_x.isna(),"study_x"] = self.df_edges.loc[self.df_edges.study_x.isna(), "study"]
#dropping some columns
self.df_edges.drop(columns=["study_y", "patient_ID_y", "series_description_y", "study_description_y", "study"],inplace=True)
self.df_edges.sort_values(by="patient_ID_x", ascending=True)
print(f"Saving edge table in {self.edge_path}")
self.df_edges.to_csv(self.edge_path, index=False)
def visualize_graph(self):
    """
    Renders the edge table as an interactive network with pyvis (a wrapper around visJS).
    The output file is datanet.html, written to the directory that holds the edge table.
    """
    from pyvis.network import Network  # type: ignore (PyLance)

    print("Generating visualizations...")
    data_net = Network(height='100%', width='100%', bgcolor='#222222', font_color='white')

    edges = self.df_edges
    tooltip = "<br>Patient_id: {}<br>Series: {}<br>reference_ct: {}<br>reference_rs: {}"
    records = zip(edges["series_y"], edges["series_x"],
                  edges["modality_y"], edges["modality_x"],
                  edges["patient_ID_x"],
                  edges["reference_ct_y"], edges["reference_rs_y"])

    # One node per series (labelled with its modality, grouped by patient) and one edge per row
    for src, tgt, src_mod, tgt_mod, patient, ref_ct, ref_rs in records:
        data_net.add_node(src, src_mod, title=src_mod, group=patient)
        data_net.add_node(tgt, tgt_mod, title=tgt_mod, group=patient)
        data_net.add_edge(src, tgt)
        data_net.get_node(src)["title"] = tooltip.format(patient, src, ref_ct, ref_rs)
        data_net.get_node(tgt)["title"] = tooltip.format(patient, tgt, ref_ct, ref_rs)

    # Annotate every node with its number of neighbours
    adjacency = data_net.get_adj_list()
    for node in data_net.nodes:
        degree = len(adjacency[node['id']])
        node["title"] += "<br>Number of connections: {}".format(degree)
        node["value"] = degree

    vis_path = pathlib.Path(os.path.dirname(self.edge_path), "datanet.html").as_posix()
    data_net.show(vis_path)
def _form_edges(self, df):
'''
For a given study id forms edge table
'''
df_list = []
# Split into each modality
plan = df[df["modality"] == "RTPLAN"]
dose = df[df["modality"] == "RTDOSE"]
struct = df[df["modality"] == "RTSTRUCT"]
ct = df[df["modality"] == "CT"]
mr = df[df["modality"] == "MR"]
pet = df[df["modality"] == "PT"]
edge_types = np.arange(7)
for edge in edge_types:
if edge==0: # FORMS RTDOSE->RTSTRUCT, can be formed on both series and instance uid
df_comb1 = pd.merge(struct, dose, left_on="instance_uid", right_on="reference_rs")
df_comb2 = pd.merge(struct, dose, left_on="series", right_on="reference_rs")
df_combined = pd.concat([df_comb1, df_comb2])
#Cases where both series and instance_uid are the same for struct
df_combined = df_combined.drop_duplicates(subset=["instance_uid_x"])
elif edge==1: # FORMS RTDOSE->CT
df_combined = pd.merge(ct, dose, left_on="series", right_on="reference_ct")
elif edge==2: # FORMS RTSTRUCT->CT on ref_ct to series
df_ct = pd.merge(ct, struct, left_on="series", right_on="reference_ct")
df_mr = pd.merge(mr, struct, left_on="series", right_on="reference_ct")
df_combined = pd.concat([df_ct, df_mr])
elif edge==3: # FORMS RTSTRUCT->PET on ref_ct to series
df_combined = pd.merge(pet, struct, left_on="series", right_on="reference_ct")
elif edge==4: # FORMS PET->CT on study
df_combined = pd.merge(ct, pet, left_on="study", right_on="study")
elif edge==5:
df_combined = pd.merge(plan, dose, left_on="instance_uid", right_on="reference_pl")
else:
df_combined = pd.merge(struct, plan, left_on="instance_uid", right_on="reference_rs")
df_combined["edge_type"] = edge
df_list.append(df_combined)
df_edges = pd.concat(df_list, axis=0, ignore_index=True)
return df_edges
def _form_edge_study(self, df, all_study, study_id):
'''
For a given study id forms edge table
'''
df_study = df.loc[self.df["study"] == all_study[study_id]]
df_list = []
# Split into each modality
plan = df_study.loc[df_study["modality"] == "RTPLAN"]
dose = df_study.loc[df_study["modality"] == "RTDOSE"]
struct = df_study.loc[df_study["modality"] == "RTSTRUCT"]
ct = df_study.loc[df_study["modality"] == "CT"]
mr = df_study.loc[df_study["modality"] == "MR"]
pet = df_study.loc[df_study["modality"] == "PT"]
seg = df_study.loc[df_study["modality"] == "SEG"]
edge_types = np.arange(8)
for edge in edge_types:
if edge==0: # FORMS RTDOSE->RTSTRUCT, can be formed on both series and instance uid
df_comb1 = pd.merge(struct, dose, left_on="instance_uid", right_on="reference_rs")
df_comb2 = pd.merge(struct, dose, left_on="series", right_on="reference_rs")
df_combined = pd.concat([df_comb1, df_comb2])
#Cases where both series and instance_uid are the same for struct
df_combined = df_combined.drop_duplicates(subset=["instance_uid_x"])
elif edge==1: # FORMS RTDOSE->CT
df_combined = pd.merge(ct, dose, left_on="series", right_on="reference_ct")
elif edge==2: # FORMS RTSTRUCT->CT on ref_ct to series
df_ct = pd.merge(ct, struct, left_on="series", right_on="reference_ct")
df_mr = pd.merge(mr, struct, left_on="series", right_on="reference_ct")
df_combined = pd.concat([df_ct, df_mr])
elif edge==3: # FORMS RTSTRUCT->PET on ref_ct to series
df_combined = pd.merge(pet, struct, left_on="series", right_on="reference_ct")
elif edge==4: # FORMS PET->CT on study
df_combined = pd.merge(ct, pet, left_on="study", right_on="study")
elif edge==5:
df_combined = pd.merge(plan, dose, left_on="instance", right_on="reference_pl")
elif edge==7: # FORMS RTSTRUCT->CT on ref_ct to series
df_ct_seg = pd.merge(ct, seg, left_on="series", right_on="reference_ct")
df_mr_seg = pd.merge(mr, seg, left_on="series", right_on="reference_ct")
df_combined = pd.concat([df_ct_seg, df_mr_seg])
else:
df_combined = pd.merge(struct, plan, left_on="instance", right_on="reference_rs")
df_combined["edge_type"] = edge
df_list.append(df_combined)
df_edges = pd.concat(df_list, axis=0, ignore_index=True)
self.df_master.append(df_edges)
def parser(self, query_string: str) -> pd.DataFrame:
'''
For a given query string(Check the documentation), returns the dataframe consisting of two columns namely modality and folder location of the connected nodes
Parameters
----------
df
Dataframe consisting of the crawled data
df_edges
Processed Dataframe forming a graph, stored in the form of edge table
query_string
Query string based on which dataset will be formed
Query ideas:
There are four basic supported modalities are RTDOSE, RTSTRUCT, CT, PT, MRI
The options are, the string can be in any order:
1) RTDOSE
2) RTSTRUCT
3) CT
4) PT
5) PT,RTSTRUCT
6) CT,PT
7) CT,RTSTRUCT
8) CT,RTDOSE
9) RTDOSE,RTSTRUCT,CT
10) RTDOSE,CT,PT
11) RTSTRUCT,CT,PT
12) RTDOSE,RTSTRUCT,CT,PT
'''
#Basic processing of just one modality
supp_mods = ["RTDOSE", "RTSTRUCT", "CT", "PT", 'MR']
edge_def = {"RTSTRUCT,RTDOSE" : 0, "CT,RTDOSE" : 1, "CT,RTSTRUCT" : 2, "PET,RTSTRUCT" : 3, "CT,PT" : 4, 'MR,RTSTRUCT': 2, "RTPLAN,RTSTRUCT": 6, "RTPLAN,RTDOSE": 5, "CT,SEG": 7, "MR,SEG": 7, "MR,RTSTRUCT": 2}
self.mods = query_string.split(",")
self.mods_n = len(self.mods)
#Deals with single node queries
if query_string in supp_mods:
final_df = self.df.loc[self.df.modality == query_string, ["study", "patient_ID", "series", "folder", "subseries"]]
final_df.rename(columns = {"series": f"series_{query_string}",
"study": f"study_{query_string}",
"folder": f"folder_{query_string}",
"subseries": f"subseries_{query_string}", }, inplace=True)
elif self.mods_n == 2:
#Reverse the query string
query_string_rev = (",").join(self.mods[::-1])
if query_string in edge_def.keys():
edge_type = edge_def[query_string]
valid = query_string
elif query_string_rev in edge_def.keys():
edge_type = edge_def[query_string_rev]
valid = query_string_rev
else:
raise ValueError("Invalid Query. Select valid pairs.")
#For cases such as the CT-RTSTRUCT and CT-RTDOSE, there exists multiple pathways due to which just searching on the edgetype gives wrong results
if edge_type in [0, 1, 2]:
edge_list = [0, 1, 2]
if edge_type==0:
#Search for subgraphs with edges 0 or (1 and 2)
regex_term = '(((?=.*0)|(?=.*5)(?=.*6))|((?=.*1)(?=.*2)))'
mod = [i for i in self.mods if i in ['CT', 'MR']][0] # making folder_mod CT/MR agnostic <-- still needs testing
final_df = self.graph_query(regex_term, edge_list, f"folder_{mod}")
elif edge_type==1:
#Search for subgraphs with edges 1 or (0 and 2)
regex_term = '((?=.*1)|(((?=.*0)|(?=.*5)(?=.*6))(?=.*2)))'
final_df = self.graph_query(regex_term, edge_list, "RTSTRUCT")
elif edge_type==2:
#Search for subgraphs with edges 2 or (1 and 0)
regex_term = '((?=.*2)|(((?=.*0)|(?=.*5)(?=.*6))(?=.*1)))'
final_df = self.graph_query(regex_term, edge_list, "RTDOSE")
else:
final_df = self.df_edges.loc[self.df_edges.edge_type == edge_type, ["study_x","patient_ID_x", "study_x", "study_y", "series_x","folder_x","series_y","folder_y", "subseries_x", "subseries_y"]]
node_dest = valid.split(",")[0]
node_origin = valid.split(",")[1]
final_df.rename(columns={"study_x": "study",
"patient_ID_x": "patient_ID",
"series_x": f"series_{node_dest}",
"series_y": f"series_{node_origin}",
"study_x": f"study_{node_dest}",
"study_y": f"study_{node_origin}",
"folder_x": f"folder_{node_dest}",
"folder_y": f"folder_{node_origin}",
"subseries_x": f"subseries_{node_dest}",
"subseries_y": f"subseries_{node_origin}", }, inplace=True)
elif self.mods_n > 2:
#Processing of combinations of modality
bads = ["RTPLAN"]
# CT/MR,RTSTRUCT,RTDOSE
if (("CT" in query_string) or ('MR' in query_string)) & ("RTSTRUCT" in query_string) & ("RTDOSE" in query_string) & ("PT" not in query_string):
#Fetch the required data. Checks whether each study has edge 2 and (1 or 0)
regex_term = '((?=.*1)|(?=.*0)|(?=.*5)(?=.*6))(?=.*2)'
edge_list = [0, 1, 2, 5, 6]
# CT/MR,RTSTRUCT,RTDOSE,PT
elif (("CT" in query_string) or ('MR' in query_string)) & ("RTSTRUCT" in query_string) & ("RTDOSE" in query_string) & ("PT" in query_string):
#Fetch the required data. Checks whether each study has edge 2,3,4 and (1 or 0)
regex_term = '((?=.*1)|(?=.*0)|(?=.*5)(?=.*6))(?=.*2)(?=.*3)(?=.*4)' # fix
edge_list = [0, 1, 2, 3, 4]
#CT/MR,RTSTRUCT,PT
elif (("CT" in query_string) or ('MR' in query_string)) & ("RTSTRUCT" in query_string) & ("PT" in query_string) & ("RTDOSE" not in query_string):
#Fetch the required data. Checks whether each study has edge 2,3,4
regex_term = '(?=.*2)(?=.*3)(?=.*4)'
edge_list = [2, 3, 4]
#CT/MR,RTDOSE,PT
elif (("CT" in query_string) or ('MR' in query_string)) & ("RTSTRUCT" not in query_string) & ("PT" in query_string) & ("RTDOSE" in query_string):
#Fetch the required data. Checks whether each study has edge 4 and (1 or (2 and 0)). Remove RTSTRUCT later
regex_term = '(?=.*4)((?=.*1)|((?=.*2)((?=.*0)|(?=.*5)(?=.*6))))'
edge_list = [0, 1, 2, 4, 5, 6]
bads.append("RTSTRUCT")
else:
raise ValueError("Please enter the correct query")
final_df = self.graph_query(regex_term, edge_list, bads)
else:
raise ValueError("Please enter the correct query")
final_df.reset_index(drop=True, inplace=True)
final_df["index_chng"] = final_df.index.astype(str) + "_" + final_df["patient_ID"].astype(str)
final_df.set_index("index_chng", inplace=True)
final_df.rename_axis(None, inplace=True)
#change relative paths to absolute paths
for col in final_df.columns:
if col.startswith("folder"):
# print(self.edge_path, os.path.dirname(self.edge_path))
final_df[col] = final_df[col].apply(lambda x: pathlib.Path(os.path.split(os.path.dirname(self.edge_path))[0], x).as_posix() if isinstance(x, str) else x) #input folder joined with the rel path
return final_df
def graph_query(self,
regex_term: str,
edge_list: List[int],
change_df: List[str],
return_components: bool = False,
remove_less_comp: bool = True):
'''
Based on the regex forms the final dataframe. You can
query the edge table based on the regex to get the
subgraph in which the queried edges will be present.
The components are process further to get the final
dataframe of the required modalities.
Parameters
----------
regex_term
To search the string in edge_type column of self.df_new which is aggregate of all the edges in a single study
edge_list
The list of edges that should be returned in the subgraph
return_components
True to return the dictionary of the componets present with the condition present in the regex
change_df
Use only when you want to remove columns containing that string
remove_less_comp
False when you want to keep components with modalities less than the modalitiy listed in the query
'''
if self.df_new is None:
self._form_agg() #Form aggregates
# Fetch the required data. Checks whether each study has edge 4 and (1 or (2 and 0)). Can remove later
relevant_study_id = self.df_new.loc[(self.df_new.edge_type.str.contains(regex_term)), "study_x"].unique()
# Based on the correct study ids, fetches the relevant edges
df_processed = self.df_edges.loc[self.df_edges.study_x.isin(relevant_study_id) & (self.df_edges.edge_type.isin(edge_list))]
# The components are deleted if it has less number of nodes than the passed modalities, change this so as to alter that condition
final_df = self._get_df(df_processed, relevant_study_id, remove_less_comp)
# Removing columns
for bad in change_df:
# Find columns with change_df string present
col_ids = [cols for cols in list(final_df.columns)[1:] if bad != cols.split("_")[1]]
final_df = final_df[[*list(final_df.columns)[:1], *col_ids]]
if return_components:
return self.final_dict
else:
return final_df
def _form_agg(self):
'''
Form aggregates for easier parsing, gets the edge types for each study and aggregates as a string. This way one can do regex based on what type of subgraph the user wants
'''
self.df_edges['edge_type_str'] = self.df_edges['edge_type'].astype(str)
self.df_new = self.df_edges.groupby("study_x").agg({'edge_type_str':self.list_edges})
self.df_new.reset_index(level=0, inplace=True)
self.df_new["edge_type"] = self.df_new["edge_type_str"]
def _get_df(self,
df_edges_processed,
rel_studyids,
remove_less_comp = True):
'''
Assumption
----------
The components are determined based on the unique CTs.
Please ensure the data conforms to this case. Based on
our preliminary analysis, there are no cases where CT
and PT are present but are disconnected.
Hence this assumption should hold for most of the cases
This function returns dataframe consisting of folder
location and modality for subgraphs
Parameters
----------
df_edges_processed
Dataframe processed containing only the desired edges from the full graph
rel_studyids
Relevant study ids to process(This operation is a bit costly
so better not to perform on full graph for maximum performance)
remove_less_comp
True for removing components with less number of edges than the query
Changelog
---------
* June 14th, 2022: Changing from studyID-based to sample-based for loop
* Oct 11th, 2022: Reverted to studyID-based loop + improved readability and make CT,RTSTRUCT,RTDOSE mode pass tests
'''
#Storing all the components across all the studies
self.final_dict = []
final_df = []
#For checking later if all the required modalities are present in a component or not
mods_wanted = set(self.mods)
#Determine the number of components
for i, study in enumerate(rel_studyids): # per study_id
df_temp = df_edges_processed.loc[df_edges_processed.study_x == study]
CT_locs = df_temp.loc[df_temp.modality_x.isin(['CT', 'MR'])]
CT_series = CT_locs.series_x.unique()
A = []
save_folder_comp = []
#Initialization. For each component intialize a dictionary with the CTs and their connections
for ct in CT_series:
df_connections = CT_locs.loc[CT_locs.series_x == ct]
if len(df_connections) > 0:
row = df_connections.iloc[0]
else:
row = df_connections
series = row.series_x
modality = row.modality_x
folder = row.folder_x
#For each component, this loop stores the CT and its connections
temp = {"study": study,
ct: {"modality": modality,
"folder": folder}}
#For saving the components in a format easier for the main pipeline
folder_save = {"study": study,
'patient_ID': row.patient_ID_x,
f'series_{modality}': series,
f'folder_{modality}': folder}
#This loop stores connection of the CT
for k in range(len(df_connections)):
row_y = df_connections.iloc[k]
series_y = row_y.series_y
folder_y = row_y.folder_y
modality_y = row_y.modality_y
temp[row.series_y] = {"modality": modality_y,
"folder": folder_y,
"conn_to": modality}
#Checks if there is already existing connection
key, key_series = self._check_save(folder_save, modality_y, modality) #CT/MR
folder_save[key_series] = series_y
folder_save[key] = folder_y
A.append(temp)
save_folder_comp.append(folder_save)
#For rest of the edges left out, the connections are formed by going through the dictionary. For cases such as RTstruct-RTDose and PET-RTstruct
rest_locs = df_temp.loc[~df_temp.modality_x.isin(['CT', 'MR']), ["series_x", "modality_x","folder_x", "series_y", "modality_y", "folder_y"]]
for j in range(len(rest_locs)):
edge = rest_locs.iloc[j]
for k in range(len(CT_series)):
A[k][edge['series_y']] = {"modality": edge['modality_y'],
"folder": edge['folder_y'],
"conn_to": edge['modality_x']}
modality_origin = edge['modality_x']
# RTDOSE is connected via either RTstruct or/and CT, but we usually don't care, so naming it commonly
if edge['modality_y'] == "RTDOSE":
modality_origin = "CT"
key, key_series = self._check_save(save_folder_comp[k], edge['modality_y'], modality_origin)
save_folder_comp[k][key_series] = edge['series_y']
save_folder_comp[k][key] = edge['folder_y']
flag = False
remove_index = []
if remove_less_comp:
for j in range(len(CT_series)):
#Check if the number of nodes in a components isn't less than the query nodes, if yes then remove that component
mods_present = set([items.split("_")[1] for items in save_folder_comp[j].keys() if items.split("_")[0] == "folder"])
#Checking if all the read modalities are present in a component
if mods_wanted.issubset(mods_present) == True:
remove_index.append(j)
save_folder_comp = [save_folder_comp[idx] for idx in remove_index]
A = [A[idx] for idx in remove_index]
self.final_dict.extend(A)
final_df.extend(save_folder_comp)
final_df = pd.DataFrame(final_df)
return final_df
@staticmethod
def _check_save(save_dict,node,dest):
key = f"folder_{node}_{dest}"
key_series = f"series_{node}_{dest}"
i = 1
while key in save_dict.keys():
key = f"folder_{node}_{dest}_{i}"
key_series = f"series_{node}_{dest}_{i}"
i +=1
return key,key_series
@staticmethod
def list_edges(series):
return reduce(lambda x, y:str(x) + str(y), series)