-
Notifications
You must be signed in to change notification settings - Fork 14
/
datasets_phenom.py
324 lines (270 loc) · 13.9 KB
/
datasets_phenom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
# AUTOGENERATED! DO NOT EDIT! File to edit: ../source_nbs/lib_nbs/datasets_phenom.ipynb.
# %% auto 0
__all__ = ['datasets_phenom']
# %% ../source_nbs/lib_nbs/datasets_phenom.ipynb 2
from .models_phenom import models_phenom
import inspect
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import copy
from pathlib import Path
import warnings
# %% ../source_nbs/lib_nbs/datasets_phenom.ipynb 5
class datasets_phenom():
    def __init__(self,
                 models_class = None):
        '''
        This class generates, saves and loads datasets of trajectories simulated
        from various phenomenological diffusion models (available at
        andi_datasets.models_phenom).

        Parameters
        ----------
        models_class : object, optional
            Instance providing the diffusion models as bound methods. Defaults
            to a fresh ``models_phenom()``. The default is created here rather
            than in the signature: a default evaluated at class-definition time
            would be a single instance silently shared by every
            ``datasets_phenom`` object.
        '''
        self.models_class = models_phenom() if models_class is None else models_class
        self._get_models()

    def _get_models(self):
        '''Loads the available models from the models class.

        Populates ``self.avail_models_name`` (method names) and
        ``self.avail_models_func`` (the bound methods themselves), kept in the
        same order so an index into one is valid for the other.
        '''
        available_models = inspect.getmembers(self.models_class, inspect.ismethod)
        # getmembers returns alphabetically sorted pairs; '__init__' sorts
        # first, so [1:] drops it, and [::-1] reverses the remainder.
        available_models = available_models[1:][::-1]
        self.avail_models_name = [x[0] for x in available_models]
        self.avail_models_func = [x[1] for x in available_models]

    def _get_inputs_models(self, model, get_default_values = False):
        '''Given the name of a phenom model, returns the inputs to that model.

        Parameters
        ----------
        model : str
            Name of the diffusion model (must be in ``self.avail_models_name``).
        get_default_values : bool
            If True, also return the defaults of the model's arguments.

        Returns
        -------
        list or tuple
            Argument names (excluding ``self``); with ``get_default_values``,
            a ``(params, defaults)`` tuple.
        '''
        model_f = self.avail_models_func[self.avail_models_name.index(model)]
        defaults = inspect.getfullargspec(model_f).defaults
        params = inspect.getfullargspec(model_f).args[1:]
        if get_default_values:
            return params, defaults
        else:
            return params
# %% ../source_nbs/lib_nbs/datasets_phenom.ipynb 7
class datasets_phenom(datasets_phenom):
    def create_dataset(self,
                       dics: list | dict | None = None,
                       T: None | int = None,
                       N_model: None | int = None,
                       path: str = '',
                       save: bool = False,
                       load: bool = False):
        '''
        Given a list of dictionaries, generates trajectories of the demanded properties.
        The only compulsory input for every dictionary is `model`, i.e. the model from which
        trajectories must be generated. The rest of inputs are optional.
        You can see the input parameters of the different models in `andi_datasets.models_phenom`.
        This function checks and handles the input dictionaries and manages the creation,
        loading and saving of trajectories.

        Parameters
        ----------
        dics : list, dictionary, None
            - if list or dictionary: the function generates trajectories with the properties stated in each dictionary.
            - if None: the function generates trajectories with default parameters set for the ANDI 2 challenge (phenom) for every available diffusion model.
        T : int, None
            - if int: overrides the values of trajectory length in the dictionaries.
            - if None: uses the trajectory length values in the dictionaries.
            Caution: the minimum T of all dictionaries will be considered!
        N_model : int, None
            - if int: overrides the values of number of trajectories in the dictionaries.
            - if None: uses the number of trajectories in the dictionaries.
        save : bool
            If True, saves the generated dataset (see self._save_trajectories).
        load : bool
            If True, loads a dataset from path (see self._load_trajectories).
        path : str
            Path from where to save or load the dataset.

        Returns
        -------
        tuple
            - trajs (array TxNx2): particles' position. N considers here the sum of all trajectories generated from the input dictionaries. Note: if the dimensions of all trajectories are not equal, then trajs is a list.
            - labels (array TxNx2): particles' labels (see ._multi_state for details on labels)
        '''
        self.T = T
        self.N_model = N_model
        self.path = Path(path)
        self.dics = dics

        'Managing dictionaries'
        # If the input is a single dictionary, transform it to list
        if isinstance(self.dics, dict):
            self.dics = [self.dics]
        if self.dics is None:
            # No input: one dataset per available model, all with default
            # parameters, hence all with the same (default) dimension.
            self.dics = [{'model': model} for model in self.avail_models_name]
            # Fix: previously left unset on this path, which raised an
            # AttributeError later in _create_trajectories.
            self.diff_dims = False
        else:
            # 'dim' is optional (not used by some models); default is 2.
            dims = [dic.get('dim', 2) for dic in self.dics]
            # True when the requested datasets mix dimensions; used later to
            # decide between stacking arrays and returning a list.
            self.diff_dims = np.unique(dims).shape[0] > 1

        'Managing folders of the datasets'
        self.save = save
        self.load = load
        if self.save or self.load:
            # Loading takes precedence over saving.
            if self.load:
                self.save = False
            if not self.path.exists() and self.load:
                raise FileNotFoundError('The directory from where you want to load the dataset does not exist')
            if self.save:
                self.path.mkdir(parents=True, exist_ok=True)

        'Create trajectories'
        trajs, labels = self._create_trajectories()
        return trajs, labels
# %% ../source_nbs/lib_nbs/datasets_phenom.ipynb 13
class datasets_phenom(datasets_phenom):
    def _create_trajectories(self):
        '''
        Generates (or loads) one dataset per input dictionary and stacks them.

        For each dictionary, the model's CSV ledger is consulted via
        ``_inspect_dic``: if a matching dataset was saved before, it is loaded
        from disk; otherwise the model function is called to simulate it.

        Returns
        -------
        tuple
            data_t: generated trajectories (array, or list of arrays when the
            requested dimensions differ between dictionaries).
            data_l: corresponding labels.
        '''
        data_t, data_l = None, None
        for dic in copy.deepcopy(self.dics):
            df, dataset_idx = self._inspect_dic(dic)

            if dataset_idx is False:
                # No previously-saved match: simulate a fresh dataset.
                model_f = self.avail_models_func[self.avail_models_name.index(dic['model'])]
                # All keys except 'model' are forwarded as model arguments.
                model_kwargs = {key: val for key, val in dic.items() if key != 'model'}
                trajs, labels = model_f(**model_kwargs)
                if self.save:
                    self._save_trajectories(trajs = trajs,
                                            labels = labels,
                                            dic = dic,
                                            df = df,
                                            dataset_idx = dataset_idx,
                                            path = self.path)
            else:
                # A saved dataset with identical parameters exists: reuse it.
                trajs, labels = self._load_trajectories(model_name = dic['model'],
                                                        dataset_idx = dataset_idx,
                                                        path = self.path)

            # Stack onto the accumulated dataset.
            if data_t is None:
                data_t, data_l = trajs, labels
            else:
                if self.diff_dims:
                    # Mixed dimensions cannot be hstacked: collect in a list.
                    if not isinstance(data_t, list):
                        data_t = [data_t]
                    data_t.append(trajs)
                else:
                    data_t = np.hstack((data_t, trajs))
                data_l = np.hstack((data_l, labels))
        return data_t, data_l

    def _save_trajectories(self, trajs, labels, dic, df, dataset_idx, path):
        '''
        Persists a generated dataset in two places:
        - the model's .csv ledger, where the input parameters are appended so
          that future calls can detect that this dataset already exists;
        - a .npy file holding trajectories and labels stacked depth-wise.
        '''
        # df.shape[0] (the next row index) doubles as the file suffix.
        npy_file = (path / f"{dic['model']}_{df.shape[0]}").with_suffix('.npy')
        # Append this dataset's parameters to the CSV ledger.
        ledger = pd.concat([df, pd.DataFrame([dic])], ignore_index=True)
        ledger.to_csv((path / dic['model']).with_suffix('.csv'))
        # Trajectories and labels share one array along the last axis.
        np.save(npy_file, np.dstack((trajs, labels)))

    def _load_trajectories(self, model_name, dataset_idx, path):
        '''
        Loads trajectories and labels saved by ``_save_trajectories``.
        '''
        npy_file = (path / f"{model_name}_{dataset_idx}").with_suffix('.npy')
        stacked = np.load(npy_file)
        # First two channels are positions, the rest are labels.
        return stacked[:, :, :2], stacked[:, :, 2:]
# %% ../source_nbs/lib_nbs/datasets_phenom.ipynb 22
class datasets_phenom(datasets_phenom):
    def _inspect_dic(self, dic):
        '''
        Checks the information of the input dictionaries so that they fulfil the constraints of the program, completes missing information
        with default values and then decides about loading/saving depending on parameters.

        Parameters
        ----------
        dic : dict
            Dictionary with the information of the trajectories we want to generate.
            Mutated in place: missing model arguments are filled with their
            defaults and None values are replaced by False.

        Returns
        -------
        tuple
            df: dataframe collecting the information of the datasets previously saved for this model.
            dataset_idx: index in df of the dataset to load, or False if the dataset must be generated.
        '''
        # T / N overrides requested at create_dataset level take precedence.
        if self.N_model is not None:
            dic['N'] = self.N_model
        if self.T is not None:
            dic['T'] = self.T

        model_m = dic['model']
        model_f = self.avail_models_func[self.avail_models_name.index(model_m)]

        # Arguments and defaults of the model's function (skipping self).
        spec = inspect.getfullargspec(model_f)
        args = spec.args[1:]
        # spec.defaults is None when the function has no defaults at all.
        defaults = spec.defaults if spec.defaults is not None else ()

        # Check if CSV with information of dataset exists. If not, create it.
        try:
            df = pd.read_csv(self.path/(model_m+'.csv'), index_col=0)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # convert to dataframe and add model
            df = pd.DataFrame(columns = args+['model'])

        # Assign missing keys in dic with default values.
        # Fix: getfullargspec aligns defaults with the TAIL of the argument
        # list, so pair them from the end; zip(args, defaults) would assign
        # the wrong defaults whenever some leading arguments have none.
        for arg, default in zip(args[len(args) - len(defaults):], defaults):
            if arg not in dic.keys():
                dic[arg] = default

        # Check if updated keys of dic equal keys of csv.
        if set(df.keys()) != set(dic.keys()):
            raise ValueError('Input model dictionary does not match models properties')

        # Check if the dataset already exists:
        df_conditions = df.copy()
        # Nones in dataframes are transformed into Nans. We change back this here
        # but instead of putting None, we put False.
        df_conditions = df_conditions.where(pd.notnull(df_conditions), False)
        for key in dic:
            # Transforming Nones to False in variables dictionaries (see problem with df just above)
            if dic[key] is None:
                dic[key] = False
            # We need to transform it to str to do a fair comparison between matrices (e.g. transition matrix, Ds, alphas,...)
            df_conditions = df_conditions.loc[(df_conditions[key].astype(str) == str(dic[key]))]
            if len(df_conditions.index) == 0:
                break

        # If dataset exists
        if len(df_conditions.index) > 0:
            # if the dataset exists and save was True, do not save but load
            if self.save:
                wrn_str = f'The dataset you want to save already exists (file: {model_m}_{df_conditions.index[0]}.npy). Switching to Load mode.'
                warnings.warn(wrn_str)
                dataset_idx = df_conditions.index[0]
            elif self.load:
                dataset_idx = df_conditions.index[0]
            else:
                dataset_idx = False
        # If dataset does not exist
        else:
            if self.load:
                raise ValueError('The dataset you want to load does not exist.')
            else:
                # False signals "generate it"; this allows mixing saving and loading.
                dataset_idx = False
        return df, dataset_idx
# %% ../source_nbs/lib_nbs/datasets_phenom.ipynb 28
class datasets_phenom(datasets_phenom):
    def _get_args(self, model, return_defaults = False):
        '''
        Given the name of a diffusion model, return its input arguments.

        Parameters
        ----------
        model : str
            Name of the diffusion model (see self.available_models_name)
        return_defaults : bool
            If True, the function will also return the default values of each input argument.

        Returns
        -------
        tuple
            args (list): list of input arguments.
            defaults (optional, list): list of default value for the input arguments.
        '''
        model_idx = self.avail_models_name.index(model)
        # Introspect the model function once; 'self' (first arg) is skipped.
        spec = inspect.getfullargspec(self.avail_models_func[model_idx])
        arg_names = spec.args[1:]
        if return_defaults:
            return arg_names, spec.defaults
        return arg_names