In [1]:
import sys
import os
import numpy as np
from scipy.io import loadmat
import collections

import copy
import tqdm

# Directory that is listed for the experiment's MATLAB result files (*.mat).
# File names are appended with plain `+`, so the trailing slash is required.
DATA_PATH = '../../exp_doubleEC_28_log_nonadapt/'
# Directory that receives the converted per-run .npz files (also joined with `+`).
DATA_OUTPUT = '../../test-output-npz/'

# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(DATA_OUTPUT, exist_ok=True)

In [2]:
# Cache of generated namedtuple classes, keyed by the tuple of field names, so
# that records sharing the same field names also share one Python type.
named_tuple_types = {}


def flatten(item, verbose=False):
    '''
    Strip singleton wrapper dimensions from a loaded array-like value.

    Rules, checked in this order:
      * object/void dtype with shape (1, 1)  -> recurse into element [0, 0]
        (presumably a MATLAB cell or struct wrapper);
      * void dtype with shape ()             -> namedtuple 'Structure' whose
        fields are the recursively flattened record fields;
      * any other value with shape (1, 1)    -> element [0, 0];
      * any other value with shape (1,)      -> element [0];
      * everything else                      -> returned unchanged.

    item    -- value exposing `.dtype` and `.shape` (numpy array or scalar)
    verbose -- when true, print which rule matched (except for the two plain
               singleton-unwrapping rules, which are silent)
    '''
    kind = item.dtype.kind
    shape = item.shape

    # Singleton container: unwrap it and look at what is inside.
    if kind in ('O', 'V') and shape == (1, 1):
        if verbose:
            print("Object")
        return flatten(item[0, 0], verbose)

    # Single record: convert it field by field into a namedtuple.
    if kind == 'V' and shape == ():
        if verbose:
            print("Void")
        field_names = item.dtype.names
        if field_names not in named_tuple_types:
            named_tuple_types[field_names] = collections.namedtuple('Structure', field_names)
        fields = [flatten(field, verbose) for field in item]
        assert len(fields) == len(field_names)
        return named_tuple_types[field_names](*fields)

    # Plain data: drop singleton dimensions, otherwise hand the value back.
    if shape == (1, 1):
        return item[0, 0]
    if shape == (1,):
        return item[0]
    if verbose:
        print('Other - ?')
    return item
            
    

In [4]:
class MatFileRun:
    '''
    One run taken out of a loaded MatFileDataset.

    Attributes:
        glob      - the owning dataset (source of the per-file parameters)
        y_evals   - flattened `y_evals[index, 0]` entry of the dataset
        cmaes_out - flattened `cmaes_out[0, index]` entry of the dataset
    '''
    def __init__(self, mat_file_dataset, index):
        '''
        mat_file_dataset -- dataset object exposing `y_evals` and `cmaes_out`
        index            -- zero-based run number inside that dataset
        '''
        self.glob = mat_file_dataset
        # Note the different orientation: y_evals is indexed by row,
        # cmaes_out by column.
        self.y_evals = flatten(mat_file_dataset.y_evals[index, 0])
        self.cmaes_out = flatten(mat_file_dataset.cmaes_out[0, index])

    @staticmethod
    def keep_array(st):
        '''
        Normalise a value to a 1-D array where that is unambiguous.

          * Python or numpy numeric/boolean scalar -> array of shape (1,)
          * 2-D array with a single row            -> that row
          * 2-D array with a single column         -> that column
          * anything else                          -> returned unchanged

        numpy scalars are included explicitly: numpy integer and boolean
        scalars are not instances of Python `int`, so without this a 1x1
        integer array reduced to a scalar would stay 0-d while a float scalar
        would become a 1-element array.
        '''
        if isinstance(st, (float, int, np.number, np.bool_)):
            return np.array([st])
        if isinstance(st, np.ndarray) and st.ndim == 2:
            # Row vectors take precedence over column vectors (matters for 1x1).
            if st.shape[0] == 1:
                return st[0, :]
            if st.shape[1] == 1:
                return st[:, 0]
        return st

    def save(self, path):
        '''
        Write this run to `path` with `np.savez`.

        Per-file parameters are read from `self.glob`, per-run data from
        `self.cmaes_out`. The trailing comments are the original author's
        notes on each key; question marks mark meanings that were unconfirmed.
        '''
        run = self.cmaes_out
        model_opts = self.glob.surrogateParams.modelOpts
        np.savez(
            path,
            dimensions=self.glob.bbParams.dimensions,     # number of dimensions
            function_id=self.glob.bbParams.functions,     # function evaluated
            restarts=self.glob.cmaesParams.Restarts,      # restarts (maximum?)
            exp_id=self.glob.exp_id,                      # name of experiment

            surrogate_param_set_size_max=model_opts.trainsetSizeMax,
            surrogate_param_range=model_opts.trainRange,
            surrogate_param_type=model_opts.trainsetType,
            surrogate_data_means=run.means,
            surrogate_data_sigmas=run.sigmas,
            surrogate_data_c=run.diagCs,

            points=run.arxvalids.T,                                      # points
            fvalues=self.keep_array(run.fvalues),                        # baseline
            orig_evaled=self.keep_array(run.origEvaled.astype(bool)),    # fvalues is orig?
            gen_split=self.keep_array(run.generationStarts - 1),         # generation starts, shifted by -1
            iruns=self.keep_array(run.iruns),                            # ??
            evals=run.evals,                                             # evaluations of orig. fitness function?
            coco=self.keep_array(run.fvaluesOrig),                       # !!! why is this different??
        )

In [5]:
        
        
class MatFileDataset:
    '''
    Contents of one experiment result .mat file.

    Every variable of the file whose name does not start with '__' becomes an
    attribute of the instance. The expected top-level variables are:

        bbParams          - flattened in __init__
        cmaesParams       - flattened in __init__
        cmaes_out         - kept as loaded; indexed as [0, run]
        exp_id            - flattened in __init__; str
        exp_results       - flattened in __init__
        exp_settings      - flattened in __init__
        surrogateParams   - flattened in __init__
        y_evals           - kept as loaded; indexed as [run, 0],
                            len(y_evals) is used as the number of runs

    Iterating over the dataset yields one MatFileRun per run.
    '''
    def __init__(self, data_path):
        '''
        data_path -- path of the .mat file to load
        '''
        data_file = loadmat(
            data_path,
            verify_compressed_data_integrity=True,
            mat_dtype=False,
            struct_as_record=True,
        )

        self._options_top_level = ['bbParams', 'cmaesParams', 'cmaes_out', 'exp_id', 'exp_results', 'exp_settings', 'surrogateParams', 'y_evals']

        # Expose the file's variables as attributes; the '__'-prefixed entries
        # of the loaded dictionary are skipped.
        self.__dict__.update(
            {name: value for name, value in data_file.items() if not name.startswith('__')}
        )

        # Per-file values are flattened once here. cmaes_out and y_evals stay
        # raw because MatFileRun flattens the individual run entries.
        self.bbParams = flatten(self.bbParams)
        self.cmaesParams = flatten(self.cmaesParams)
        self.exp_id = flatten(self.exp_id)
        self.exp_results = flatten(self.exp_results)
        self.exp_settings = flatten(self.exp_settings)
        self.surrogateParams = flatten(self.surrogateParams)

    def consistency_check(self):
        '''
        Assert that all expected top-level variables are present and that
        `exp_id` is a string. Raises AssertionError otherwise.
        '''
        for name in self._options_top_level:
            assert hasattr(self, name), f"missing top-level variable: {name}"

        assert isinstance(self.exp_id, str), "exp_id is not a str"

        #assert len(self.cmaes_out) == len(self.bbParams.instances) == len(self.y_evals)

    @staticmethod
    def safe_cell_removal(array):
        '''
        Return the only element of a 1x1 array.

        Raises AssertionError unless `array.shape == (1, 1)`. Checking only
        `len(array)` would let a (1, n) array through and silently drop all
        elements but the first.
        '''
        assert array.shape == (1, 1)
        return array[0, 0]

    @staticmethod
    def convert_dtype_array_to_dictionary(array):
        '''
        Turn a single structured record (shape ()) into a
        {field name: field value} dictionary.
        '''
        assert array.shape == tuple()
        return dict(zip(array.dtype.names, array))

    def __iter__(self):
        '''
        Iterate over the runs of this file as MatFileRun objects.

        All runs are built before the first one is returned, so a run that
        cannot be converted fails before any run is handed to the caller.
        '''
        runs = [MatFileRun(self, i) for i in range(len(self.y_evals))]
        return iter(runs)
        

In [6]:
# Disabled scratch cell (the `if False:` guard keeps it from ever running):
# loads one sample result file so its structure can be inspected by hand.
if False:
    mf = MatFileDataset(DATA_PATH + "exp_doubleEC_28_log_nonadapt_results_1_2D_3.mat")
    # NOTE(review): MatFileDataset.__init__ keeps the loaded dictionary in a
    # local variable only, so `mf.data_file` would not exist as written.
    #list(mf.data_file.keys())
    mf.cmaes_out[0,1][0,0][0,0].dtype
    a = mf.bbParams


In [7]:
# Collect the experiment's result files: the name must carry both the
# experiment prefix and the .mat extension.
mat_files = [
    entry
    for entry in os.listdir(DATA_PATH)
    if entry.endswith('.mat') and entry.startswith('exp_doubleEC_28_log_nonadapt')
]

# Convert every file; each run inside a file is written to its own .npz named
# after the source file plus the zero-based run index.
for filename in tqdm.tqdm(mat_files):
    data_path = DATA_PATH + filename
    strip_filename = filename[:-4]  # file name without the 4-character '.mat' suffix

    mfd = MatFileDataset(data_path)
    for runid, run in enumerate(mfd):
        run.save(DATA_OUTPUT + strip_filename + f"_{runid}.npz")
    

100%|██████████| 1056/1056 [03:42<00:00,  4.75it/s]


In [52]:
# Disabled verification cell: the code below sits inside a bare string literal,
# so none of it executes.
#
# When enabled, it walks the files in '../../npz-data', loads the file of the
# same name from DATA_OUTPUT and compares every stored array: boolean arrays
# must match exactly, other arrays pass when the mean absolute difference is
# below 10e-6. A failed comparison is retried against the new array with its
# first element dropped (`b[key][1:]`); only keys failing both are printed,
# together with the message of each attempt.
#
# NOTE(review): `print(run)` uses a name this cell never defines (it only
# exists as a leftover of the conversion loop); `run_name` was presumably
# intended - fix before re-enabling.
# NOTE(review): `10e-6` equals 1e-5 - confirm that this is the intended tolerance.
# NOTE(review): `import re` is not used by the code in the string.
'''
import re
def numpy_match(a,b):
    #print(f"{a.shape} vs {b.shape}")
    try:
        #return (np.all(np.equal(a,b)), 'ok')
        if a.dtype.kind == 'b':
            return (np.all(np.equal(a,b)), 'ok')
        else:
            diff = np.mean(np.abs(a-b)) < 10e-6
            #alll = np.all(diff)
            #if not alll:
            #    print(a[~diff] - b[~diff])
            
            return (diff, 'ok')
    except Exception as e:
        return (False, str(e))



for run_name in os.listdir('../../npz-data'):
    print(run)
    a = np.load('../../npz-data/' + run_name)
    b = np.load(DATA_OUTPUT +  run_name)
    
    exc1 = exc2 =  '<neni>'
    
    for key, val in a.items():
        (r, exc1) = numpy_match(val, b[key])
        if not r:
            (r, exc2) = numpy_match(val, b[key][1:])
        
        if not r:
            print(f"{run_name} {key} {r}\n\t{exc1}\n\t{exc2}")
   ''' 

exp_doubleEC_28_log_nonadapt_results_21_10D_833_1.npz coco False
	operands could not be broadcast together with shapes (40278,) (40279,) 
	ok
exp_doubleEC_28_log_nonadapt_results_21_10D_835_3.npz coco False
	operands could not be broadcast together with shapes (27804,) (27805,) 
	ok
exp_doubleEC_28_log_nonadapt_results_21_10D_831_3.npz coco False
	operands could not be broadcast together with shapes (14301,) (14302,) 
	ok
exp_doubleEC_28_log_nonadapt_results_21_10D_835_1.npz coco False
	operands could not be broadcast together with shapes (40320,) (40321,) 
	ok
exp_doubleEC_28_log_nonadapt_results_21_10D_831_2.npz coco False
	operands could not be broadcast together with shapes (41055,) (41056,) 
	ok
exp_doubleEC_28_log_nonadapt_results_21_10D_829_2.npz coco False
	operands could not be broadcast together with shapes (15792,) (15793,) 
	ok
exp_doubleEC_28_log_nonadapt_results_21_10D_837_1.npz coco False
	operands could not be broadcast together with shapes (4872,) (4873,) 
	ok
exp_doub