# AUTOGENERATED! DO NOT EDIT! File to edit: ../source_nbs/lib_nbs/utils_challenge.ipynb.
# %% auto 0
__all__ = ['majority_filter', 'unique_labelled', 'label_filter', 'label_continuous_to_list', 'label_list_to_continuous',
'array_to_df', 'df_to_array', 'file_nonOverlap_reOrg', 'get_VIP', 'changepoint_assignment',
'changepoint_alpha_beta', 'jaccard_index', 'single_changepoint_error', 'ensemble_changepoint_error',
'create_binary_segment', 'jaccard_between_segments', 'segment_assignment', 'metric_anomalous_exponent',
'metric_diffusion_coefficient', 'metric_diffusive_state', 'check_no_changepoints', 'segment_property_errors',
'extract_ensemble', 'multimode_dist', 'distribution_distance', 'error_Ensemble_dataset',
'check_prediction_length', 'separate_prediction_values', 'load_file_to_df', 'error_SingleTraj_dataset',
'when_error_single', 'run_single_task', 'run_ensemble_task', 'listdir_nohidden', 'codalab_scoring',
'transform_ref_to_res']
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 2
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.stats import mode
import pandas
from tqdm.auto import tqdm
import warnings
from pathlib import Path
from .models_phenom import models_phenom
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 6
def majority_filter(seq, width):
'''
Given a vector, applies a majority filter of given width.
Parameters
----------
seq : list
Vector to filter.
width : int
Size of the window in which the filter is applied.
Returns
-------
list
Filtered vector
'''
offset = width // 2
seq = [0] * offset + seq
seq_filtered = []
for i in range(len(seq) - offset):
a = seq[i:i+width]
seq_filtered.append(mode(a, keepdims = False)[0])
return seq_filtered
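# Minimal usage sketch (illustrative toy values): a width-3 majority vote
# removes the single-frame 0 from the label vector.
#   majority_filter([1, 1, 0, 1, 1, 2, 2, 2], width=3)
#   -> [1, 1, 1, 1, 1, 2, 2, 2]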
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 8
def unique_labelled(arr : list # List or array from which to extract the unique values
                    )-> tuple : # (uniques, inverse_indices): unique values and each element's label
'''
Transforms the values of an input array to their corresponding label given by the uniques in the array.
'''
# Dictionary to store the first occurrence of each element
unique_dict = {}
uniques = np.array([])
inverse_indices = np.array([])
# Iterate through the array and populate the dictionary and uniques list
for index, value in enumerate(arr):
if value not in unique_dict:
unique_dict[value] = len(uniques)
uniques = np.append(uniques, value)
inverse_indices = np.append(inverse_indices, unique_dict[value])
return uniques, inverse_indices
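# Minimal usage sketch (illustrative toy values): each value is mapped to the
# index of its first occurrence.
#   unique_labelled([0.5, 0.5, 1.2, 0.5])
#   -> (array([0.5, 1.2]), array([0., 0., 1., 0.]))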
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 10
def label_filter(label,
window_size = 5,
min_seg = 3):
'''
    Given a vector of changing labels, applies a majority filter so that every
    segment of constant label is at least as long as the set minimum segment length.
Parameters
----------
label : list
label vector to filter.
window_size : int
Size of the window in which the majority filter is applied.
min_seg : int
Minimum segment size after filtering.
Returns
-------
np.array
Filtered label vector
'''
if np.min(label) < 0:
raise ValueError('This function only works with positive labels')
# if there are no changes:
if np.sum(label[1:] != label[:-1]) == 0:
return label
# define dummy vector of same value distribution as
# label but which values are given by their unique tag/label
values, dummy = unique_labelled(label)
    # check if there are segments smaller than the minimal segment (min_seg)
cp = np.argwhere(dummy[1:] != dummy[:-1])+1
cp = np.append(0, cp)
current_min = (cp[1:]-cp[:-1]).flatten().min()
while (current_min < min_seg):
filt = majority_filter(dummy.tolist(), width = window_size)
filt = np.array(filt)
        # check if there are segments smaller than the minimal segment (min_seg)
cp = np.argwhere(filt[1:] != filt[:-1])+1
# If all changepoints were eliminated
if cp.size == 0:
dummy = filt
break
cp = np.append(0, cp)
current_min = (cp[1:]-cp[:-1]).flatten().min()
if (dummy == filt).all():
# If all failed and still have segments smaller than min_seg
seg_lengths = (cp[1:]-cp[:-1]).flatten().astype(int)
seg_smaller = np.argwhere(seg_lengths < min_seg).flatten()
            # We go over each segment and assign the values 'by hand'
for idxsegs in seg_smaller:
if seg_lengths[idxsegs] == 1:
filt[(cp[idxsegs]+1)] = filt[cp[idxsegs]]
elif seg_lengths[idxsegs] == 2:
filt[(cp[idxsegs]+1)] = filt[cp[idxsegs]]
filt[(cp[idxsegs]+2)] = filt[cp[idxsegs]+3]
dummy = filt
break
dummy = filt
# Check boundaries
if dummy[0] != dummy[1] or dummy[1] != dummy[2]:
dummy[:2] = dummy[2]
if dummy[-2] != dummy[-3] or dummy[-1] != dummy[-2]:
dummy[-3:] = dummy[-3]
# reset to label values
dummy_ret = np.zeros_like(dummy).astype(float)
for idx, v in enumerate(values):
dummy_ret[dummy == idx] = v
return dummy_ret
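# Minimal usage sketch (illustrative toy values): a single-frame blip shorter
# than min_seg is filtered out, leaving a constant label.
#   label_filter(np.array([1., 1., 1., 1., 1., 2., 1., 1., 1., 1.]))
#   -> array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])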
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 20
def label_continuous_to_list(labs):
'''
    Given an array of T x 2 labels containing the anomalous exponent and diffusion
    coefficient at each timestep, returns 3 arrays containing the changepoints,
    exponents and coefficients, respectively.
If labs is size T x 3, then we consider that diffusive states are given and also
return those.
Parameters
----------
labs : array
        T x 2 or T x 3 labels containing the anomalous exponent, diffusion
        coefficient and (optionally) diffusive state.
Returns
-------
tuple
- First element is the list of change points
- The rest are corresponding segment properties (order: alpha, Ds and states)
'''
# Check if states were given
are_states = False
if labs.shape[1] == 3:
are_states = True
    # Check at which timesteps any of the variables changes
CP = np.argwhere((labs[:-1, :] != labs[1:, :]).sum(1) != 0).flatten()+1
T = labs.shape[0]
alphas = np.zeros(len(CP)+1)
Ds = np.zeros(len(CP)+1)
if are_states: states = np.zeros(len(CP)+1)
for idx, cp in enumerate(np.append(CP, T)):
alphas[idx] = labs[cp-1, 0]
Ds[idx] = labs[cp-1, 1]
if are_states: states[idx] = labs[cp-1, 2]
CP = np.append(CP, T)
if are_states:
return CP, alphas, Ds, states
else:
return CP, alphas, Ds
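# Minimal usage sketch (illustrative toy values): a 10-frame label whose
# properties change at frame 5 yields one changepoint plus the trajectory length.
#   labs = np.column_stack((np.repeat([0.5, 1.5], 5), np.repeat([0.1, 1.0], 5)))
#   label_continuous_to_list(labs)
#   -> (array([ 5, 10]), array([0.5, 1.5]), array([0.1, 1. ]))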
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 24
def label_list_to_continuous(CP, label):
'''
    Given a list of change points and the labels of the diffusion properties of the
    resulting segments, generates an array of continuous labels. The last change point
    indicates the array length.
Parameters
----------
CP : array, list
list of change points. Last change point indicates label length.
label : array, list
list of segment properties
Returns
-------
array
Continuous label created from the given change points and segment properties
'''
if isinstance(label, list):
label = np.array(label)
segs = create_binary_segment(CP[:-1], CP[-1])
return (segs.transpose()*label).sum(1)
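# Minimal usage sketch (illustrative toy values): rebuild a continuous label
# from changepoints [5, 10] (the final 10 sets the trajectory length); the
# result is a length-10 array taking 0.5 on the first segment and 1.5 on the second.
#   label_list_to_continuous([5, 10], [0.5, 1.5])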
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 28
from .utils_trajectories import segs_inside_fov
def array_to_df(trajs,
labels,
min_length = 10,
fov_origin = [0,0], fov_length= 100.0, cutoff_length = 10):
'''
Given arrays for the position and labels of trajectories, creates a dataframe with that
    data. The function also applies the demanded FOV. If you don't want a field of view, choose a
    FOV length bigger (smaller) than your maximum (minimum) trajectory position.
Parameters
----------
trajs : array
Trajectories to store in the df (dimension: T x N x 3)
labels : array
Labels to store in the df (dimension: T x N x 3)
fov_origin : tuple
Bottom left point of the square defining the FOV.
fov_length : float
Size of the box defining the FOV.
cutoff_length : int
Minimum length of a trajectory inside the FOV to be considered in the output dataset.
Returns
-------
tuple
- df_in (dataframe): dataframe with trajectories
        - df_out (dataframe): dataframe with labels
'''
xs = []
ys = []
idxs = []
df_out = pandas.DataFrame(columns = ['traj_idx', 'Ds', 'alphas', 'states', 'changepoints'])
idx_t = 0
for traj, l_alpha, l_D, l_s in zip(tqdm(trajs), labels[:, :, 0], labels[:, :, 1], labels[:, :, 2]):
        # Check which parts of the trajectory fall inside the FOV
idx_inside_segments = segs_inside_fov(traj, fov_origin, fov_length, cutoff_length)
if idx_inside_segments is not None:
for idx_in in idx_inside_segments:
seg_x = traj[idx_in[0]:idx_in[1], 0]
seg_y = traj[idx_in[0]:idx_in[1], 1]
seg_alpha = l_alpha[idx_in[0]:idx_in[1]]
seg_D = l_D[idx_in[0]:idx_in[1]]
seg_state = l_s[idx_in[0]:idx_in[1]]
# Filtering
seg_alpha = label_filter(seg_alpha)
seg_D = label_filter(seg_D)
seg_state = label_filter(seg_state)
# Stacking data of input dataframe
xs += seg_x.tolist()
ys += seg_y.tolist()
idxs += (np.ones(len(seg_x))*idx_t).tolist()
# Transforming to list of changepoints and physical properties
merge = np.hstack((seg_alpha.reshape(seg_alpha.shape[0], 1),
seg_D.reshape(seg_D.shape[0], 1),
seg_state.reshape(seg_state.shape[0], 1)))
CP, alphas, Ds, states = label_continuous_to_list(merge)
# Saving each segment info in output dataframe
df_out.loc[df_out.shape[0]] = [idx_t, Ds, alphas, states, CP]
# Updating segment index
idx_t += 1
# Saving trajectories in Dataframe
tr_to_df = np.vstack((idxs,
xs,
ys)).transpose()
df_in = pandas.DataFrame(tr_to_df, columns = ['traj_idx', 'x', 'y'])
return df_in, df_out
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 33
def df_to_array(df, pad = -1):
'''
    Transforms a dataframe like the ones given in the ANDI 2 challenge (i.e. 4 columns:
traj_idx, frame, x, y) into a numpy array. To deal with irregular temporal supports,
we pad the array whenever the trajectory is not present.
The output array has the typical shape of ANDI datasets: TxNx2
Parameters
----------
df : dataframe
Dataframe with four columns 'traj_idx': the trajectory index, 'frame' the time frame and
'x' and 'y' the positions of the particle.
pad : int
Number to use as padding.
Returns
-------
array
Array containing the trajectories from the dataframe, with usual ANDI shape (TxNx2).
'''
max_T = int(df.frame.max()+1)
num_part = int(df.iloc[-1].traj_idx)
array_trajs = np.ones((max_T, num_part+1, 2))*pad
for idx in np.unique(df.traj_idx).astype(int):
df_part = df.loc[df.traj_idx == idx]
array_trajs[df_part.frame.values.astype(int), idx, 0] = df_part.x.values
array_trajs[df_part.frame.values.astype(int), idx, 1] = df_part.y.values
return array_trajs
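# Minimal usage sketch (illustrative toy values): particle 1 only exists at
# frame 1, so its frame-0 entries are padded with -1.
#   df = pandas.DataFrame({'traj_idx': [0, 0, 1], 'frame': [0, 1, 1],
#                          'x': [0.0, 0.1, 5.0], 'y': [0.0, 0.2, 5.0]})
#   df_to_array(df).shape
#   -> (2, 2, 2)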
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 35
from pathlib import Path
import shutil
def file_nonOverlap_reOrg(# Original folder with data produced by datasets_challenge.challenge_phenom_dataset
raw_folder: Path,
# Folder where to put reorganized files
target_folder: Path,
                          # Experiments to reorganize (len() gives the number of experiments)
                          experiments: list,
# Number of FOVS
num_fovs: int,
# Track to consider
tracks = [1,2],
                          # If True, moves all data (labels, etc.). Set True only if saving reference/groundtruth data.
                          # Moreover, if True, also saves the trajectories for the video track
save_labels = False,
# Which task to consider
task = ['single', 'ensemble'],
                          # If True, prints the percentage of states for each experiment
print_percentage = True):
'''
    This considers that you have num_fovs*len(experiments) 'fake' experiments
    and organizes them based on the challenge instructions.
'''
if save_labels:
names_files = ['traj_labs_', 'trajs_', 'videos_', 'ens_labs_', 'vip_idx_']
extensions = ['.txt', '.csv', '.tiff', '.txt', '.txt']
else:
names_files = ['trajs_', 'videos_']
extensions = ['.csv', '.tiff']
exp = 0
ensemble_info = []
# Get model and num_states
info_exp = np.loadtxt(raw_folder/f'ens_labs_exp_0_fov_0.txt', max_rows=1, dtype = str)
model_exp, num_states = info_exp[1][:-1], info_exp[-1].astype(int)
percentage_exp = np.zeros((num_fovs, num_states))
for k in range(num_fovs*len(experiments)):
# ----- Check when we are done with one experiment and go to next -----
if (k % num_fovs == 0 and k != 0):
# First save the ensemble information of the current experiment
if num_states > 1:
percentage_exp = np.sum(percentage_exp, axis = 0)
percentage_exp /= percentage_exp.sum()
ensemble_fov[-1,:] = percentage_exp
if num_states == 1:
ensemble_fov[-1] = 1
if print_percentage:
print(f'Experiment {exp}: {np.round(ensemble_fov[-1], 2)}')
if save_labels:
for track in tracks:
with open(target_folder/f'track_{track}/exp_{exp}/ensemble_labels.txt', 'w') as f:
f.truncate(0)
f.write(f'model: {model_exp}; num_state: {num_states} \n')
np.savetxt(f, ensemble_fov, delimiter = ';')
# Then restart for next experiment
exp += 1
ensemble_info = []
info_exp = np.loadtxt(raw_folder/f'ens_labs_exp_{k}_fov_0.txt', max_rows=1, dtype = str)
model_exp, num_states = info_exp[1][:-1], info_exp[-1].astype(int)
percentage_exp = np.zeros((num_fovs, num_states))
# ----- Move the folders -----
for track in tracks:
(target_folder/(f'track_{track}/'+f'exp_{exp}')).mkdir(parents=True, exist_ok=True)
# Move single trajectory information
for name, ext in zip(names_files, extensions):
if track == 1 and name == 'trajs_' and save_labels == False: continue
if track == 2 and (name == 'videos_' or name == 'vip_idx_'): continue
shutil.copyfile(src = raw_folder/(name + f'exp_{k}_fov_0'+ext),
dst = target_folder/(f'track_{track}/exp_{exp}/' + name + f'fov_{k%num_fovs}' + ext))
### ----- Collect ensemble information -----
ensemble_fov = np.loadtxt(raw_folder/f'ens_labs_exp_{k}_fov_0.txt',
skiprows = 1, delimiter = ';')
if num_states > 1:
percentage_exp[k%num_fovs] = ensemble_fov[-1, :].copy()
# Save the ensemble information of the LAST experiment
if num_states > 1:
percentage_exp = np.sum(percentage_exp, axis = 0)
percentage_exp /= percentage_exp.sum()
ensemble_fov[-1,:] = percentage_exp
if num_states == 1:
ensemble_fov[-1] = 1
if print_percentage:
print(f'Experiment {exp}: {np.round(ensemble_fov[-1], 2)}')
if save_labels:
for track in tracks:
with open(target_folder/f'track_{track}/exp_{exp}/ensemble_labels.txt', 'w') as f:
f.truncate(0)
f.write(f'model: {model_exp}; num_state: {num_states} \n')
np.savetxt(f, ensemble_fov, delimiter = ';')
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 37
from scipy.spatial import distance
def get_VIP(array_trajs, num_vip = 5, min_distance_part = 2, pad = -1,
boundary = False, boundary_origin = (0,0), min_distance_bound = 0,
sort_length = True):
'''
    Given an array of trajectories, finds the VIP particles that participants will
    need to characterize in the video track.
    The function first finds the particles that exist at frame 0 (i.e. whose first value
    is different from pad). Then, it iterates over these particles to find num_vip that are
    at a distance greater than min_distance_part from each other in the first frame.
Parameters
----------
array_trajs : array
Position of the trajectories that will be considered for the VIP search.
num_vip : int
Number of VIP particles to flag.
min_distance_part : float
Minimum distance between two VIP particles.
pad : int
Number used to indicate in the temporal support that the particle is outside of the FOV.
boundary : bool, float
If float, defines the length of the box acting as boundary
boundary_origin : tuple
X and Y coords of the boundary
min_distance_bound : float
        Minimum distance a particle has to keep from the boundary in order to be considered a VIP particle
sort_length : bool
        If True, candidates for VIP particles are chosen in descending order of trajectory
        length. This ensures that the longest ones are chosen first.
Returns
-------
list
List of indices of the chosen VIP particles
'''
if not boundary:
candidates_vip = np.argwhere(array_trajs[0,:,0] != pad).flatten()
else:
# Define masks
boundary_x0 = array_trajs[0,:,0] > (boundary_origin[0] + min_distance_bound)
boundary_xL = array_trajs[0,:,0] < (boundary_origin[0] + boundary - min_distance_bound)
boundary_y0 = array_trajs[0,:,1] > (boundary_origin[1] + min_distance_bound)
boundary_yL = array_trajs[0,:,1] < (boundary_origin[1] + boundary - min_distance_bound)
padding = array_trajs[0,:,0] != pad
candidates_vip = np.argwhere(boundary_x0 & boundary_xL & boundary_y0 & boundary_yL & padding).flatten()
if len(candidates_vip) < num_vip:
        raise ValueError('Number of VIP particles demanded is bigger than the number of available particles.')
elected = []
count_while = 0
if sort_length:
array_candidates = array_trajs[:, candidates_vip, :]
lengths = np.ones(array_candidates.shape[1])*array_candidates.shape[0]
where_pad = np.argwhere(array_candidates[:,:,0] == pad)
lengths[where_pad[:,1]] = where_pad[:,0]
        # We sort the particles by their lengths (note the minus sign for descending order)
candidates_vip = candidates_vip[np.argsort(-lengths)]
while len(elected) < num_vip:
if sort_length and count_while == 0:
            # on the first pass we start from the longest candidate; after a failed
            # pass we restart from a random candidate, even when sorting
elected = [candidates_vip[0]]
else:
elected = [np.random.choice(candidates_vip)]
for c_idx in candidates_vip:
if c_idx == elected[0]:
continue
if len(array_trajs[0, elected,:].shape) < 2:
all_rest = np.expand_dims(array_trajs[0, elected,:], 0)
else:
all_rest = array_trajs[0, elected,:]
dist = distance.cdist(np.expand_dims(array_trajs[0,c_idx,:], 0), all_rest, metric='euclidean').transpose()
if dist.min() > min_distance_part:
elected.append(c_idx)
if len(elected) == num_vip:
break
count_while += 1
if count_while > 100:
            raise ValueError('Could not find suitable VIP particles. This is due to either having too few particles or them being too close together.')
return elected
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 42
def _get_error_bounds():
'''
    Sets the current maximum admissible errors for the different diffusive properties.
'''
# For single trajectory
threshold_error_alpha = 2
threshold_error_D = 1e5
threshold_error_s = 0
threshold_cp = 10
    # For the ensemble task, bounds relate to the Wasserstein distance. See the test below the distribution_distance function.
threshold_ensemble_alpha = np.abs(models_phenom().bound_alpha[0]-models_phenom().bound_alpha[1])
threshold_ensemble_D = np.abs(models_phenom().bound_D[0]-models_phenom().bound_D[1])
return threshold_error_alpha, threshold_error_D, threshold_error_s, threshold_cp, threshold_ensemble_alpha, threshold_ensemble_D
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 44
def changepoint_assignment(GT, preds):
'''
Given a list of groundtruth and predicted changepoints, solves the assignment problem via
the Munkres algorithm (aka Hungarian algorithm) and returns two arrays containing the index of the
paired groundtruth and predicted changepoints, respectively.
    The distance between change points is the Euclidean distance.
Parameters
----------
GT : list
List of groundtruth change points.
preds : list
List of predicted change points.
Returns
-------
tuple
- tuple of two arrays, each corresponding to the assigned GT and pred changepoints
- Cost matrix
'''
cost_matrix = np.zeros((len(GT), len(preds)))
for idxg, gt in enumerate(GT):
for idxp, pred in enumerate(preds):
cost_matrix[idxg, idxp] = np.abs(gt-pred)
return linear_sum_assignment(cost_matrix), cost_matrix
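# Minimal usage sketch (illustrative toy values): the two GT changepoints are
# paired with the two closest of the three predictions; the extra one stays unmatched.
#   (rows, cols), cost = changepoint_assignment(np.array([10, 40]), np.array([12, 38, 60]))
#   rows, cols
#   -> (array([0, 1]), array([0, 1]))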
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 46
def changepoint_alpha_beta(GT, preds, threshold = 10):
'''
Calculate the alpha and beta measure of paired changepoints.
    Inspired by Supplementary Note 3 in https://www.nature.com/articles/nmeth.2808
Parameters
----------
GT : list
List of groundtruth change points.
preds : list
List of predicted change points.
threshold : float
Distance from which predictions are considered to have failed. They are then assigned this number.
Returns
-------
tuple
alpha, beta
'''
    assignment, _ = changepoint_assignment(GT, preds)
    assignment = np.array(assignment)
    # cast to arrays so the paired indexing below also works with plain lists
    GT = np.asarray(GT); preds = np.asarray(preds)
    distance = np.abs(GT[assignment[0]] - preds[assignment[1]])
distance[distance > threshold] = threshold
distance = np.sum(distance)
d_x_phi = threshold*len(GT)
d_ybar_phi = max([0, (len(preds)-len(GT))*threshold])
alpha = 1-distance/d_x_phi
beta = (d_x_phi-distance)/(d_x_phi+d_ybar_phi)
return alpha, beta
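# Minimal usage sketch (illustrative toy values): both changepoints are off by
# 2 frames, so the summed clipped distance is 4 and d_x_phi = 10*2 = 20, giving
# alpha = 1 - 4/20 = 0.8 and beta = (20-4)/20 = 0.8.
#   changepoint_alpha_beta(np.array([10, 40]), np.array([12, 38]))
#   -> (0.8, 0.8)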
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 48
def jaccard_index(TP: int, # true positive
FP: int, # false positive
FN: int # false negative
)-> float: # Jaccard Index
'''
    Given the true positive, false positive and false negative counts, calculates the Jaccard index.
'''
return TP/(TP+FP+FN)
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 49
def single_changepoint_error(GT, preds, threshold = 5):
'''
Given the groundtruth and predicted changepoints for a single trajectory, first solves the assignment problem between changepoints,
then calculates the RMSE of the true positive pairs and the Jaccard index.
Parameters
----------
GT : list
List of groundtruth change points.
preds : list
List of predicted change points.
threshold : float
Distance from which predictions are considered to have failed. They are then assigned this number.
Returns
-------
tuple
- TP_rmse: root mean square error of the true positive change points.
        - Jaccard index of the predicted change points.
'''
assignment, _ = changepoint_assignment(GT, preds)
assignment = np.array(assignment)
TP, FP, FN = 0, 0, 0
TP_rmse = []
for p in assignment.transpose():
if np.abs(GT[p[0]] - preds[p[1]]) < threshold:
TP += 1
TP_rmse.append((GT[p[0]] - preds[p[1]])**2)
else:
FP += 1
FN += 1
    # Calculating RMSE (zero when there are no true positive pairs)
    TP_rmse = np.sqrt(np.mean(TP_rmse)) if len(TP_rmse) > 0 else 0
# Checking false positive and missed events
if len(preds) > len(GT):
FP += len(preds) - len(GT)
elif len(preds) < len(GT):
FN += len(GT) - len(preds)
return TP_rmse, jaccard_index(TP, FP, FN)
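# Minimal usage sketch (illustrative toy values): both predictions fall within
# the threshold, so there are two true positives with RMSE 2 and Jaccard index 1.
#   single_changepoint_error(np.array([10, 40]), np.array([12, 38]), threshold=5)
#   -> (2.0, 1.0)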
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 50
def ensemble_changepoint_error(GT_ensemble, pred_ensemble, threshold = 5):
'''
Given an ensemble of groundtruth and predicted change points, iterates
over each trajectory's changepoints. For each, it solves the assignment problem
between changepoints. Then, calculates the RMSE of the true positive pairs and
    the Jaccard index over the whole ensemble of changepoints (i.e. not the mean of
    per-trajectory indices).
Parameters
----------
GT_ensemble : list, array
        Ensemble of groundtruth change points.
pred_ensemble : list
Ensemble of predicted change points.
threshold : float
Distance from which predictions are considered to have failed. They are then assigned this number.
Returns
-------
tuple
- TP_rmse: root mean square error of the true positive change points.
- Jaccard Index of the ensemble predictions
'''
TP, FP, FN = 0, 0, 0
TP_empty_GT = 0
TP_rmse = []
num_cp_GT = 0
for gt_traj, pred_traj in zip(GT_ensemble, pred_ensemble):
num_cp_GT += len(gt_traj)
assignment, _ = changepoint_assignment(gt_traj, pred_traj)
assignment = np.array(assignment)
for p in assignment.transpose():
if np.abs(gt_traj[p[0]] - pred_traj[p[1]]) < threshold:
TP += 1
TP_rmse.append((gt_traj[p[0]] - pred_traj[p[1]])**2)
else:
FP += 1
FN += 1
# Checking false positive and missed events
if len(pred_traj) > len(gt_traj):
FP += len(pred_traj) - len(gt_traj)
elif len(pred_traj) < len(gt_traj):
FN += len(gt_traj) - len(pred_traj)
        # Case where a trajectory has no CPs in both GT and preds (counted as a TP)
if assignment.shape[1] == 0 and len(pred_traj) == len(gt_traj):
TP_empty_GT += 1
if TP+FP+FN == 0:
        if num_cp_GT == 0: # this means there were no CPs in either GT or preds
            return 0, 1
        wrn_str = 'No segments found in your predictions dataset.'
warnings.warn(wrn_str)
return threshold, 0
# Calculating RMSE
if len(TP_rmse) > 0:
TP_rmse = np.sqrt(np.mean(TP_rmse))
else:
# We consider here that, if you don't predict any CP, there can't be
# a TP, hence TP_rmse must be zero.
TP_rmse = 0
return TP_rmse, jaccard_index(TP+TP_empty_GT, FP, FN)
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 54
def create_binary_segment(CP: list, # list of changepoints
T: int # length of the trajectory
)-> list: # list of arrays with value 1 in the temporal support of the current segment.
'''
    Given a set of changepoints and the length of the trajectory, creates segments which are equal to one
    if the segment takes place at that position and zero otherwise.
'''
segments = np.zeros((len(CP)+1, T))
CP = np.append(0, CP)
for idx, (cp1, cp2) in enumerate(zip(CP[:-1], CP[1:])):
segments[idx, cp1+1:cp2+1] = 1
segments[-1, CP[-1]+1:] = 1
segments[0, 0] = 1
return segments
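# Minimal usage sketch (illustrative toy values): one changepoint at frame 3 of
# a 6-frame trajectory gives two binary masks covering the two segments.
#   create_binary_segment([3], 6)
#   -> array([[1., 1., 1., 1., 0., 0.],
#             [0., 0., 0., 0., 1., 1.]])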
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 56
def jaccard_between_segments(gt, pred):
'''
    Given two segments, calculates the Jaccard index between them by considering TP as correct labeling,
    FN as missed events and FP as leftover predictions.
Parameters
----------
gt : array
groundtruth segment, equal to one in the temporal support of the given segment, zero otherwise.
pred : array
predicted segment, equal to one in the temporal support of the given segment, zero otherwise.
Returns
-------
float
Jaccard index between the given segments.
'''
if len(gt) > len(pred):
pred = np.append(pred, np.zeros(len(gt) - len(pred)))
elif len(pred) > len(gt):
gt = np.append(gt, np.zeros(len(pred) - len(gt)))
tp = np.sum(np.logical_and(pred == 1, gt == 1))
fp = np.sum(np.logical_and(pred == 1, gt == 0))
fn = np.sum(np.logical_and(pred == 0, gt == 1))
# special case for absence of changepoint
if tp+fp+fn == 0: return 0
else: return jaccard_index(tp, fp, fn)
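# Minimal usage sketch (illustrative toy values): two frames overlap and one GT
# frame is missed, so JI = TP/(TP+FP+FN) = 2/(2+0+1) = 2/3.
#   jaccard_between_segments(np.array([1, 1, 1, 0, 0]), np.array([1, 1, 0, 0, 0]))
#   -> 0.666...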
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 57
def segment_assignment(GT, preds, T:int = None):
'''
    Given a list of groundtruth and predicted changepoints, generates a set of segments. Then constructs
    a cost matrix by calculating the Jaccard index between segments. From this cost matrix, we solve the
    assignment problem via the Munkres algorithm (aka Hungarian algorithm) and return two arrays
    containing the indices of the paired groundtruth and predicted segments, respectively.
    If T = None, we consider that GT and preds may have different lengths. In that case, the end
    of the segments is the last CP of each set of CPs.
Parameters
----------
GT : list
List of groundtruth change points.
preds : list
List of predicted change points.
T : int, None
Length of the trajectory. If None, considers different GT and preds length.
Returns
-------
tuple
- tuple of two arrays, each corresponding to the assigned GT and pred changepoints
- Cost matrix calculated via JI of segments
'''
if T is not None:
T_gt = T_pred = T
# Check if the GT or predictions are a single integer or an empty array
if isinstance(GT, int): GT = [GT]
elif len(GT) == 0: GT = [T-1]
if isinstance(preds, int): preds = [preds]
elif len(preds) == 0: preds = [T-1]
else:
T_gt = GT[-1]
if len(GT) > 1:
GT = GT[:-1]
T_pred = preds[-1]
if len(preds) > 1:
preds = preds[:-1]
seg_GT = create_binary_segment(GT, T_gt)
seg_preds = create_binary_segment(preds, T_pred)
cost_matrix = np.zeros((seg_GT.shape[0], seg_preds.shape[0]))
for idxg, gt in enumerate(seg_GT):
for idxp, pred in enumerate(seg_preds):
cost_matrix[idxg, idxp] = 1-jaccard_between_segments(gt, pred)
return linear_sum_assignment(cost_matrix), cost_matrix
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 67
from sklearn.metrics import mean_squared_log_error as msle, f1_score
def metric_anomalous_exponent(gt = None,
pred = None,
max_error = np.abs(models_phenom().bound_alpha[0]-models_phenom().bound_alpha[1])):
'''
Compute the mean absolute error (mae) between anomalous exponents.
Checks the current bounds of anomalous exponents from models_phenom to calculate the maximum error.
'''
error = np.mean(np.abs(gt-pred))
if error > max_error:
return max_error
else:
return error
def metric_diffusion_coefficient(gt = None, pred = None,
threshold_min = models_phenom().bound_D[0],
max_error = msle([models_phenom().bound_D[0]],
[models_phenom().bound_D[1]])):
'''
Compute the mean squared log error (msle) between diffusion coefficients.
Checks the current bounds of diffusion from models_phenom to calculate the maximum error.
'''
# considering the presence of zeros and negatives
pred = np.array(pred).copy(); gt = np.array(gt).copy()
pred[pred <= threshold_min] = threshold_min
gt[gt <= threshold_min] = threshold_min
# mean squared log error
error = msle(gt, pred)
if error > max_error:
return max_error
else:
return error
def metric_diffusive_state(gt = None, pred = None):
'''
Compute the F1 score between diffusive states.
'''
return f1_score(gt.astype(int), pred.astype(int), average = 'micro')
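# Minimal usage sketches (illustrative toy values) for the segment metrics:
#   metric_anomalous_exponent(np.array([1.0, 1.0]), np.array([1.2, 0.8]))
#   -> 0.2 (mean absolute error, clipped at max_error)
#   metric_diffusive_state(np.array([1., 0., 1.]), np.array([1., 1., 1.]))
#   -> 0.666... (micro-averaged F1, which equals accuracy here)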
# %% ../source_nbs/lib_nbs/utils_challenge.ipynb 71
def check_no_changepoints(GT_cp, GT_alpha, GT_D, GT_s,
preds_cp, preds_alpha, preds_D, preds_s,
                          T: int | None = None):
'''
    Given predictions over changepoints and variables, checks whether the GT and/or the predictions
    lack change points. If so, takes that into account to pair variables.
Parameters
----------
GT_cp : list, int, float
Groundtruth change points
GT_alpha : list, float
Groundtruth anomalous exponent
GT_D : list, float
Groundtruth diffusion coefficient
GT_s : list, float
Groundtruth diffusive state
preds_cp : list, int, float
Predicted change points
preds_alpha : list, float
Predicted anomalous exponent
preds_D : list, float
Predicted diffusion coefficient
preds_s : list, float
Predicted diffusive state
    T : int, optional
        Length of the trajectories. If None, the last change point is taken as the length.
Returns
-------
tuple
        - False if both GT and preds contain change points; True if change points were missing.
        - The next three are either all None if change points were detected, or the paired exponents,
          coefficients and states if some change points were missing.
'''
if isinstance(GT_cp, int) or isinstance(GT_cp, float):
GT_cp = [GT_cp]
if isinstance(preds_cp, int) or isinstance(preds_cp, float):
preds_cp = [preds_cp]
no_GT_cp = False; no_preds_cp = False
# CP always contain the final point of the trajectory, hence minimal length is one
if len(GT_cp) == 1: no_GT_cp = True
if len(preds_cp) == 1: no_preds_cp = True
if no_GT_cp + no_preds_cp == 0:
return False, None, None, None
else:
[row_ind, col_ind], _ = segment_assignment(GT_cp, preds_cp, T)
if no_GT_cp and not no_preds_cp:
paired_alpha = np.array([[GT_alpha[0], preds_alpha[col_ind[0]]]])
paired_D = np.array([[GT_D[0], preds_D[col_ind[0]]]])
paired_s = np.array([[GT_s[0], preds_s[col_ind[0]]]])
if no_preds_cp and not no_GT_cp:
row_position = np.argwhere(col_ind == 0).flatten()[0]
paired_alpha = np.array([[GT_alpha[row_position], preds_alpha[col_ind[row_position]]]])
paired_D = np.array([[GT_D[row_position], preds_D[col_ind[row_position]]]])
paired_s = np.array([[GT_s[row_position], preds_s[col_ind[row_position]]]])
if no_preds_cp and no_GT_cp:
paired_alpha = np.array([[GT_alpha[0], preds_alpha[0]]])