From 5b7f3a0f12775cc9e3797b0092e2ed88f81e99b5 Mon Sep 17 00:00:00 2001
From: Manuel Gunther
Date: Mon, 5 Oct 2015 17:54:11 -0600
Subject: [PATCH] Added OpenBR to Bob score file conversion function

---
 bob/measure/openbr.py  | 149 +++++++++++++++++++++++++++++++++++++++++
 bob/measure/test_io.py |  53 +++++++++++++++
 doc/guide.rst          |  32 ++++++++-
 3 files changed, 233 insertions(+), 1 deletion(-)

diff --git a/bob/measure/openbr.py b/bob/measure/openbr.py
index 25fc755..62ff8f1 100644
--- a/bob/measure/openbr.py
+++ b/bob/measure/openbr.py
@@ -165,3 +165,152 @@ def _write_matrix(filename, matrix):
   # OK, now finally write the file in the desired format
   _write_matrix(mask_file, mask)
   _write_matrix(matrix_file, matrix)
+
+
+def write_score_file(
+    matrix_file,
+    mask_file,
+    score_file,
+    models_ids = None,
+    probes_ids = None,
+    model_names = None,
+    probe_names = None,
+    score_file_format = '4column',
+    replace_nan = None
+):
+  """Writes the Bob score file in the desired ``score_file_format`` (four or five column), given the OpenBR matrix and mask files.
+
+  In principle, the score file can be written based on the matrix and mask files alone, and this format suffices to compute CMC curves.
+  However, the contents of the score files can be adapted.
+  If given, the ``models_ids`` and ``probes_ids`` define the **client ids** of model and probe, and they have to be in the same order as used to compute the OpenBR matrix.
+  The ``model_names`` and ``probe_names`` define the **paths** of model and probe, and they should be in the same order as the ids.
+
+  In rare cases, the OpenBR matrix contains NaN values, which Bob's score files cannot handle.
+  You can use the ``replace_nan`` parameter to decide what to do with these values.
+  By default (``None``), these values are ignored, i.e., not written into the score file.
+  This is what OpenBR does as well.
+  However, you can also set ``replace_nan`` to any value, which will be written instead of the NaN values.
+
+  Keyword parameters:
+
+  matrix_file : str
+    The OpenBR matrix file that should be read.
+    Usually, the file name extension is ``.mtx``
+
+  mask_file : str
+    The OpenBR mask file that should be read.
+    Usually, the file name extension is ``.mask``
+
+  score_file : str
+    The 4 or 5 column style score file that should be written.
+
+  models_ids : [str] or ``None``
+    The client ids of the models that will be written in the first column of the score file.
+    If given, the size must be identical to the number of models (gallery templates) in the OpenBR files; otherwise a ``ValueError`` is raised.
+    If not given, client ids of the model will be identical to the **gallery index** in the matrix file.
+
+  probes_ids : [str] or ``None``
+    The client ids of the probes that will be written in the second/third column of the four/five column score file.
+    If given, the size must be identical to the number of probe templates in the OpenBR files.
+    It will be checked that the OpenBR mask fits to the model/probe client ids; a ``ValueError`` is raised otherwise.
+    If not given, the probe ids will be estimated automatically, i.e., to fit the OpenBR matrix.
+
+  model_names : [str] or ``None``
+    A list of model paths written in the second column of the five column score file.
+    If not given, the model index in the OpenBR file will be used.
+
+    .. note::
+       This entry is ignored in the four column score file format.
+
+  probe_names : [str] or ``None``
+    A list of probe paths to be written in the third/fourth column in the four/five column score file.
+    If given, the size must be identical to the number of probe templates in the OpenBR files.
+    If not given, the probe index in the OpenBR file will be used.
+
+  score_file_format : one of ``('4column', '5column')``
+    The format, in which the ``score_file`` should be written.
+
+  replace_nan : float or ``None``
+    If NaN values are encountered in the OpenBR matrix (which are not ignored due to the mask being non-NULL), this value will be written instead.
+    If ``None``, the values will not be written in the score file at all.
+  """
+  def _read_matrix(filename):
+    ## Helper function to read a matrix file as written by OpenBR
+    with open(filename, 'rb') as f:
+      # get version; the file is opened in binary mode, so compare against bytes
+      header = f.readline()
+      assert header[:2] == b"S2"
+      # skip gallery and probe files
+      f.readline()
+      f.readline()
+      # read size and type of matrix; decode so that the text comparisons below work in Python 2 and 3
+      size = f.readline()
+      splits = size.decode('latin-1').rstrip().split()
+      # TODO: check the endianness of the magic number stored in splits[3]
+      assert splits[0][0] == 'M'
+      rows,cols = int(splits[1]), int(splits[2])
+      # read matrix data
+      data = numpy.fromfile(f, dtype={'B':numpy.uint8, 'F': numpy.float32}[splits[0][1]])
+      assert data.shape[0] == rows*cols
+      data.shape = (rows,cols)
+      return data
+
+  # check parameters
+  if score_file_format not in ("4column", "5column"):
+    raise ValueError("The given score file format %s is not known; choose one of ('4column', '5column')" % score_file_format)
+  # get type of score file
+  four_col = score_file_format == "4column"
+
+  # read the two matrices
+  scores = _read_matrix(matrix_file)
+  mask = _read_matrix(mask_file)
+
+  # generate the id lists, if not given
+  if models_ids is None:
+    models_ids = [str(g+1) for g in range(mask.shape[1])]
+  if len(models_ids) != mask.shape[1]: raise ValueError("The number of model ids (%d) differs from the number of models (%d) in the mask file" % (len(models_ids), mask.shape[1]))
+
+  if probes_ids is None:
+    probes_ids = []
+    # iterate over all probes
+    for p in range(mask.shape[0]):
+      # get indices, where model and probe id should be identical (numpy.where returns a tuple with one index array per dimension)
+      equal_indices = numpy.where(mask[p] == 0xff)[0]
+      if len(equal_indices):
+        # model id found, use the first one
+        probes_ids.append(models_ids[equal_indices[0]])
+      else:
+        # no model found; add non-existing id
+        probes_ids.append("unknown")
+  else:
+    if len(probes_ids) != mask.shape[0]: raise ValueError("The number of probe ids (%d) differs from the number of probes (%d) in the mask file" % (len(probes_ids), mask.shape[0]))
+    # check that the probes client ids are in the correct order
+    for p in range(mask.shape[0]):
+      for g in range(mask.shape[1]):
+        if mask[p,g] == 0x7f:
+          if models_ids[g] == probes_ids[p]: raise ValueError("The probe id %s with index %d should not be identical to model id %s with index %d" % (probes_ids[p], p, models_ids[g], g))
+        elif mask[p,g] == 0xff:
+          if models_ids[g] != probes_ids[p]: raise ValueError("The probe id %s with index %d should be identical to model id %s with index %d" % (probes_ids[p], p, models_ids[g], g))
+
+  # generate model and probe names, if not given
+  if not four_col and model_names is None:
+    model_names = [str(g+1) for g in range(mask.shape[1])]
+  if probe_names is None:
+    probe_names = [str(p+1) for p in range(mask.shape[0])]
+
+
+  # iterate through the files and write scores
+  with open(score_file, 'w') as f:
+    for g in range(mask.shape[1]):
+      for p in range(mask.shape[0]):
+        if mask[p,g]:
+          score = scores[p,g]
+          # handle NaN values
+          if numpy.isnan(score):
+            if replace_nan is None: continue
+            score = replace_nan
+          # write score file
+          if four_col:
+            f.write("%s %s %s %3.8f\n" % (models_ids[g], probes_ids[p], probe_names[p], score))
+          else:
+            f.write("%s %s %s %s %3.8f\n" % (models_ids[g], model_names[g], probes_ids[p], probe_names[p], score))
diff --git a/bob/measure/test_io.py b/bob/measure/test_io.py
index f3c268c..924c098 100644
--- a/bob/measure/test_io.py
+++ b/bob/measure/test_io.py
@@ -8,6 +8,7 @@
 """Tests the IO functionality of bob.measure."""
 
 import bob.measure
+import numpy
 import tempfile, os, shutil
 import bob.io.base.test_utils
 
@@ -117,3 +118,55 @@ def test_openbr_search():
 
   finally:
     shutil.rmtree(temp_dir)
+
+
+
+def test_from_openbr():
+  # This function tests that the conversion from the OpenBR matrices works as expected
+  temp_dir = tempfile.mkdtemp(prefix='bob_test')
+
+  # define input files
+  openbr_extensions = ('.mtx', '.mask')
+  matrix_file, mask_file = [bob.io.base.test_utils.datafile('scores%s' % ext, 'bob.measure') for ext in openbr_extensions]
+
+  score_file = os.path.join(temp_dir, "scores")
+  load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column}
+
+  try:
+    for variant in ('4col', '5col'):
+      # first, do not define keyword arguments -- let the file get the model and probe ids being created automatically
+      bob.measure.openbr.write_score_file(matrix_file, mask_file, score_file, score_file_format="%sumn"%variant)
+      assert os.path.exists(score_file)
+      # read the score file with Bob's functionality
+      columns = list(load_functions[variant](score_file))
+
+      # check the contents
+      assert len(columns) == 2000
+
+      # now, generate model and probe names and ids
+      dev_ids = (3,4,7,8,9,13,15,18,19,22,23,25,28,30,31,32,35,37,38,40)
+      model_names = ["s%d" % c for c in dev_ids]
+      probe_names = ["s%d/%d" %(c,i) for c in dev_ids for i in (1,3,6,8,10)]
+      models_ids = ["%d" % c for c in dev_ids]
+      probes_ids = ["%d" % c for c in dev_ids for i in (1,3,6,8,10)]
+
+      bob.measure.openbr.write_score_file(matrix_file, mask_file, score_file, models_ids=models_ids, probes_ids=probes_ids, model_names=model_names, probe_names=probe_names, score_file_format="%sumn"%variant)
+
+      # check that we re-generated the bob score file
+      reference_file = bob.io.base.test_utils.datafile('scores-cmc-%s.txt' % variant, 'bob.measure')
+
+      # assert that we can (almost) reproduce the score file
+      # ... read both files
+      columns = list(load_functions[variant](score_file))
+      reference = list(load_functions[variant](reference_file))
+      assert len(columns) == len(reference)
+      for i in range(len(columns)):
+        for j in range(len(columns[i])-1):
+          # check that the model and probe names are fine
+          assert columns[i][j] == reference[i][j], str(columns[i]) + " != " + str(reference[i])
+        # check that the score is close (OpenBR writes scores in float32 precision only)
+        assert abs(columns[i][-1] - numpy.float32(reference[i][-1])) <= 1e-8, str(columns[i][-1]) + " != " + str(reference[i][-1])
+        assert numpy.isclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1])
+
+  finally:
+    shutil.rmtree(temp_dir)
diff --git a/doc/guide.rst b/doc/guide.rst index bf19b1f..726c4d5 100644 --- a/doc/guide.rst +++ b/doc/guide.rst @@ -411,7 +411,10 @@ Score file conversion Sometimes, it is required to export the score files generated by Bob to a different format, e.g., to be able to generate a plot comparing Bob's systems with other systems. In this package, we provide source code to convert between different types of score files. -One of the supported formats is the matrix format that the National Institute of Standards and Technology (NIST) uses, and which is supported by OpenBR. +Bob to OpenBR +============= + +One of the supported formats is the matrix format that the National Institute of Standards and Technology (NIST) uses, and which is supported by OpenBR_. The scores are stored in two binary matrices, where the first matrix (usually with a ``.mtx`` filename extension) contains the raw scores, while a second mask matrix (extension ``.mask``) contains information, which scores are positives, and which are negatives. To convert from Bob's four column or five column score file to a pair of these matrices, you can use the :py:func:`bob.measure.openbr.write_matrix` function. 
@@ -448,6 +451,32 @@ It specifies the number of highest scores per probe that should be kept. If the ``search`` parameter is set to a negative value, all scores will be kept. If the ``search`` parameter is higher as the actual number of models, ``NaN`` scores will be appended, and the according mask values will be set to ``0`` (i.e., to be ignored). + +OpenBR to Bob +============= + +On the other hand, you might also want to generate a Bob-compatible (four or five column) score file based on a pair of OpenBR matrix and mask files. +This is possible by using the :py:func:`bob.measure.openbr.write_score_file` function. +In its basic form, it takes the given pair of matrix and mask files, as well as the desired output score file: + +.. code-block:: py + + >>> bob.measure.openbr.write_score_file('openbr.mtx', 'openbr.mask', 'four-column-score-file') + +This score file is sufficient to compute a CMC curve (see `CMC`_), however it does not contain relevant client ids or paths for models and probes. +Particularly, it assumes that each client has exactly one associated model. + +To add or correct this information, you can use additional parameters to :py:func:`bob.measure.openbr.write_score_file`. +Client ids of models and probes can be added using the ``models_ids`` and ``probes_ids`` keyword arguments. +The length of these lists must be identical to the number of models and probes as given in the matrix files, **and they must be in the same order as used to compute the OpenBR matrix**. +This means that the same same-client and different-client pairs as indicated by the OpenBR mask will be generated, which will be checked inside the function. + +To add model and probe path information, use the ``model_names`` and ``probe_names`` parameters, which need to have the same size and order as the ``models_ids`` and ``probes_ids``. +This information is simply stored in the score file, and no further check is applied. + +.. 
note:: The ``model_names`` parameter is used only when writing score files in ``score_file_format='5column'``; in the ``'4column'`` format, this parameter is ignored. + + .. include:: links.rst .. Place youre references here: @@ -455,3 +484,4 @@ If the ``search`` parameter is higher as the actual number of models, ``NaN`` sc .. _`The Expected Performance Curve`: http://publications.idiap.ch/downloads/reports/2005/bengio_2005_icml.pdf .. _`The DET curve in assessment of detection task performance`: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.117.4489&rep=rep1&type=pdf .. _`plot()`: http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot +.. _openbr: http://openbiometrics.org