Added OpenBR to Bob score file conversion function

bioidiap · Oct 5, 2015 · 5b7f3a0 · 5b7f3a0
1 parent d72d342
commit 5b7f3a0
Show file tree

Hide file tree

Showing 3 changed files with 234 additions and 1 deletion.
diff --git a/bob/measure/openbr.py b/bob/measure/openbr.py
@@ -165,3 +165,152 @@ def _write_matrix(filename, matrix):
   # OK, now finally write the file in the desired format
   _write_matrix(mask_file, mask)
   _write_matrix(matrix_file, matrix)
+
+
+def write_score_file(
+    matrix_file,
+    mask_file,
+    score_file,
+    models_ids = None,
+    probes_ids = None,
+    model_names = None,
+    probe_names = None,
+    score_file_format = '4column',
+    replace_nan = None
+):
+  """Writes the Bob score file in the desired ``score_file_format`` (four or five column), given the OpenBR matrix and mask files.
+
+  In principle, the score file can be written based on the matrix and mask files alone, and the resulting format suffices to compute CMC curves.
+  However, the contents of the score files can be adapted.
+  If given, the ``models_ids`` and ``probes_ids`` define the **client ids** of model and probe, and they have to be in the same order as used to compute the OpenBR matrix.
+  The ``model_names`` and ``probe_names`` define the **paths** of model and probe, and they should be in the same order as the ids.
+
+  In rare cases, the OpenBR matrix contains NaN values, which Bob's score files cannot handle.
+  You can use the ``replace_nan`` parameter to decide what to do with these values.
+  By default (``None``), these values are ignored, i.e., not written into the score file.
+  This is what OpenBR is doing as well.
+  However, you can also set ``replace_nan`` to any value, which will be written instead of the NaN values.
+
+  A ``ValueError`` is raised if the score file format is unknown, if one of the OpenBR files cannot be parsed, if the length of the given id lists does not match the matrix size, or if the given ids contradict the OpenBR mask.
+
+  Keyword parameters:
+
+  matrix_file : str
+    The OpenBR matrix file that should be read.
+    Usually, the file name extension is ``.mtx``
+
+  mask_file : str
+    The OpenBR mask file that should be read.
+    Usually, the file name extension is ``.mask``
+
+  score_file : str
+    The 4 or 5 column style score file that should be written.
+
+  models_ids : [str] or ``None``
+    The client ids of the models that will be written in the first column of the score file.
+    If given, the size must be identical to the number of models (gallery templates) in the OpenBR files.
+    If not given, client ids of the model will be identical to the **gallery index** in the matrix file.
+
+  probes_ids : [str] or ``None``:
+    The client ids of the probes that will be written in the second/third column of the four/five column score file.
+    If given, the size must be identical to the number of probe templates in the OpenBR files.
+    It will be checked that the OpenBR mask fits to the model/probe client ids.
+    If not given, the probe ids will be estimated automatically, i.e., to fit the OpenBR matrix.
+    Probes without any matching model get the id ``"unknown"``.
+
+  model_names : [str] or ``None``
+    A list of model paths written in the second column of the five column score file.
+    If not given, the model index in the OpenBR file will be used.
+
+    .. note::
+       This entry is ignored in the four column score file format.
+
+  probe_names : [str] or ``None``
+    A list of probe paths to be written in the third/fourth column in the four/five column score file.
+    If given, the size must be identical to the number of probe templates in the OpenBR files.
+    If not given, the probe index in the OpenBR file will be used.
+
+  score_file_format : one of ``('4column', '5column')``
+    The format, in which the ``score_file`` should be written.
+
+  replace_nan : float or ``None``:
+    If NaN values are encountered in the OpenBR matrix (which are not ignored due to the mask being non-NULL), this value will be written instead.
+    If ``None``, the values will not be written in the score file at all.
+  """
+  def _read_matrix(filename):
+    ## Helper function to read a matrix file as written by OpenBR
+    dtypes = {'B': numpy.uint8, 'F': numpy.float32}
+    with open(filename, 'rb') as f:
+      # The file is read in binary mode since the matrix data is raw binary.
+      # The text header lines are decoded explicitly, so that the comparisons below work with both Python 2 and Python 3.
+      # latin-1 maps every byte to a character and, hence, cannot fail on the binary magic number stored in the size line.
+      header = f.readline().decode('latin-1')
+      if header[:2] != "S2":
+        raise ValueError("The file %s is not a valid OpenBR matrix file: the header does not start with 'S2'" % filename)
+      # skip gallery and probe files
+      f.readline()
+      f.readline()
+      # read size and type of matrix
+      splits = f.readline().decode('latin-1').rstrip().split()
+      # TODO: check the endianess of the magic number stored in split[3]
+      if len(splits) < 3 or len(splits[0]) < 2 or splits[0][0] != 'M' or splits[0][1] not in dtypes:
+        raise ValueError("The file %s is not a valid OpenBR matrix file: cannot parse the matrix type and size" % filename)
+      w,h = int(splits[1]), int(splits[2])
+      # read matrix data
+      data = numpy.fromfile(f, dtype=dtypes[splits[0][1]])
+      if data.shape[0] != w*h:
+        raise ValueError("The file %s contains %d matrix elements, but %d were expected" % (filename, data.shape[0], w*h))
+      data.shape = (w,h)
+    return data
+
+  # check parameters
+  if score_file_format not in ("4column", "5column"):
+    raise ValueError("The given score file format %s is not known; choose one of ('4column', '5column')" % score_file_format)
+  # get type of score file
+  four_col = score_file_format == "4column"
+
+  # read the two matrices; both are indexed as [probe, model]
+  scores = _read_matrix(matrix_file)
+  mask = _read_matrix(mask_file)
+
+  # generate the id lists, if not given
+  if models_ids is None:
+    models_ids = [str(g+1) for g in range(mask.shape[1])]
+  if len(models_ids) != mask.shape[1]:
+    raise ValueError("The number of models ids (%d) differs from the number of models in the mask file (%d)" % (len(models_ids), mask.shape[1]))
+
+  if probes_ids is None:
+    probes_ids = []
+    # iterate over all probes
+    for p in range(mask.shape[0]):
+      # get indices, where model and probe id should be identical
+      # (numpy.where returns a tuple with one index array per dimension; we need the array itself)
+      equal_indices = numpy.where(mask[p] == 0xff)[0]
+      if len(equal_indices):
+        # model id found, use the first one
+        probes_ids.append(models_ids[equal_indices[0]])
+      else:
+        # no model found; add non-existing id
+        probes_ids.append("unknown")
+  else:
+    if len(probes_ids) != mask.shape[0]:
+      raise ValueError("The number of probes ids (%d) differs from the number of probes in the mask file (%d)" % (len(probes_ids), mask.shape[0]))
+    # check that the probes client ids are in the correct order
+    for p in range(mask.shape[0]):
+      for g in range(mask.shape[1]):
+        if mask[p,g] == 0x7f:
+          if models_ids[g] == probes_ids[p]: raise ValueError("The probe id %s with index %d should not be identical to model id %s with index %d" % (probes_ids[p], p, models_ids[g], g))
+        elif mask[p,g] == 0xff:
+          if models_ids[g] != probes_ids[p]: raise ValueError("The probe id %s with index %d should be identical to model id %s with index %d" % (probes_ids[p], p, models_ids[g], g))
+
+  # generate model and probe names, if not given
+  if not four_col and model_names is None:
+    model_names = [str(g+1) for g in range(mask.shape[1])]
+  if probe_names is None:
+    probe_names = [str(p+1) for p in range(mask.shape[0])]
+
+
+  # iterate through the files and write scores
+  with open(score_file, 'w') as f:
+    for g in range(mask.shape[1]):
+      for p in range(mask.shape[0]):
+        # a mask value of 0 means that this model/probe pair is to be ignored
+        if mask[p,g]:
+          score = scores[p,g]
+          # handle NaN values
+          if numpy.isnan(score):
+            if replace_nan is None: continue
+            score = replace_nan
+          # write score file
+          if four_col:
+            f.write("%s %s %s %3.8f\n" % (models_ids[g], probes_ids[p], probe_names[p], score))
+          else:
+            f.write("%s %s %s %s %3.8f\n" % (models_ids[g], model_names[g], probes_ids[p], probe_names[p], score))
diff --git a/bob/measure/test_io.py b/bob/measure/test_io.py
@@ -8,6 +8,7 @@
 """Tests the IO functionality of bob.measure."""
 
 import bob.measure
+import numpy
 import tempfile, os, shutil
 
 import bob.io.base.test_utils
@@ -117,3 +118,56 @@ def test_openbr_search():
 
   finally:
     shutil.rmtree(temp_dir)
+
+
+
+def test_from_openbr():
+  # This function tests that the conversion from the OpenBR matrices works as expected:
+  # the matrix/mask pair is converted to four and five column score files, once with
+  # automatically generated ids and once with explicit ids and names, and the latter
+  # result is compared with the reference score files shipped with the package.
+  temp_dir = tempfile.mkdtemp(prefix='bob_test')
+
+  # everything after the creation of the temporary directory runs inside the try block,
+  # so that the directory is removed even if looking up the data files fails
+  try:
+    # define input files
+    openbr_extensions = ('.mtx', '.mask')
+    matrix_file, mask_file = [bob.io.base.test_utils.datafile('scores%s' % ext, 'bob.measure') for ext in openbr_extensions]
+
+    score_file = os.path.join(temp_dir, "scores")
+    load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column}
+
+    for variant in ('4col', '5col'):
+      # first, do not define keyword arguments -- let the file get the model and probe ids being created automatically
+      bob.measure.openbr.write_score_file(matrix_file, mask_file, score_file, score_file_format="%sumn"%variant)
+      assert os.path.exists(score_file)
+      # read the score file with bobs functionality
+      columns = list(load_functions[variant](score_file))
+
+      # check the contents
+      assert len(columns) == 2000
+
+      # now, generate model and probe names and ids
+      dev_ids = (3,4,7,8,9,13,15,18,19,22,23,25,28,30,31,32,35,37,38,40)
+      model_names = ["s%d" % c for c in dev_ids]
+      probe_names = ["s%d/%d" %(c,i) for c in dev_ids for i in (1,3,6,8,10)]
+      models_ids = ["%d" % c for c in dev_ids]
+      probes_ids = ["%d" % c for c in dev_ids for i in (1,3,6,8,10)]
+
+      bob.measure.openbr.write_score_file(matrix_file, mask_file, score_file, models_ids=models_ids, probes_ids=probes_ids, model_names=model_names, probe_names=probe_names, score_file_format="%sumn"%variant)
+
+      # check that we re-generated the bob score file
+      reference_file = bob.io.base.test_utils.datafile('scores-cmc-%s.txt' % variant, 'bob.measure')
+
+      # assert that we can (almost) reproduce the score file
+      # ... read both files
+      columns = list(load_functions[variant](score_file))
+      reference = list(load_functions[variant](reference_file))
+      assert len(columns) == len(reference)
+      for row, ref in zip(columns, reference):
+        # check that the model and probe names are fine (all entries but the last one, which is the score)
+        assert tuple(row[:-1]) == tuple(ref[:-1]), str(row) + " != " + str(ref)
+        # check that the score is close (OpenBR write scores in float32 precision only)
+        assert abs(row[-1] - numpy.float32(ref[-1])) <= 1e-8, str(row[-1]) + " != " + str(ref[-1])
+        assert numpy.isclose(row[-1], ref[-1], atol = 1e-3, rtol=1e-8), str(row[-1]) + " != " + str(ref[-1])
+
+  finally:
+    shutil.rmtree(temp_dir)
diff --git a/doc/guide.rst b/doc/guide.rst
@@ -411,7 +411,10 @@ Score file conversion
 Sometimes, it is required to export the score files generated by Bob to a different format, e.g., to be able to generate a plot comparing Bob's systems with other systems.
 In this package, we provide source code to convert between different types of score files.
 
-One of the supported formats is the matrix format that the National Institute of Standards and Technology (NIST) uses, and which is supported by OpenBR.
+Bob to OpenBR
+=============
+
+One of the supported formats is the matrix format that the National Institute of Standards and Technology (NIST) uses, and which is supported by OpenBR_.
 The scores are stored in two binary matrices, where the first matrix (usually with a ``.mtx`` filename extension) contains the raw scores, while a second mask matrix (extension ``.mask``) contains information, which scores are positives, and which are negatives.
 
 To convert from Bob's four column or five column score file to a pair of these matrices, you can use the :py:func:`bob.measure.openbr.write_matrix` function.
@@ -448,10 +451,37 @@ It specifies the number of highest scores per probe that should be kept.
 If the ``search`` parameter is set to a negative value, all scores will be kept.
 If the ``search`` parameter is higher as the actual number of models, ``NaN`` scores will be appended, and the according mask values will be set to ``0`` (i.e., to be ignored).
 
+
+OpenBR to Bob
+=============
+
+On the other hand, you might also want to generate a Bob-compatible (four or five column) score file based on a pair of OpenBR matrix and mask files.
+This is possible by using the :py:func:`bob.measure.openbr.write_score_file` function.
+In its basic form, it takes the given pair of matrix and mask files, as well as the desired output score file:
+
+.. code-block:: py
+
+   >>> bob.measure.openbr.write_score_file('openbr.mtx', 'openbr.mask', 'four-column-score-file')
+
+This score file is sufficient to compute a CMC curve (see `CMC`_), however it does not contain relevant client ids or paths for models and probes.
+Particularly, it assumes that each client has exactly one associated model.
+
+To add or correct this information, you can use additional parameters to :py:func:`bob.measure.openbr.write_score_file`.
+Client ids of models and probes can be added using the ``models_ids`` and ``probes_ids`` keyword arguments.
+The length of these lists must be identical to the number of models and probes as given in the matrix files, **and they must be in the same order as used to compute the OpenBR matrix**.
+This includes that the same same-client and different-client pairs as indicated by the OpenBR mask will be generated, which will be checked inside the function.
+
+To add model and probe path information, use the ``model_names`` and ``probe_names`` parameters, which need to have the same size and order as the ``models_ids`` and ``probes_ids``.
+This information is simply stored in the score file, and no further check is applied.
+
+.. note:: The ``model_names`` parameter is used only when writing score files in ``score_file_format='5column'``; in the ``'4column'`` format, this parameter is ignored.
+
+
 .. include:: links.rst
 
 .. Place youre references here:
 
 .. _`The Expected Performance Curve`: http://publications.idiap.ch/downloads/reports/2005/bengio_2005_icml.pdf
 .. _`The DET curve in assessment of detection task performance`: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.117.4489&rep=rep1&type=pdf
 .. _`plot()`: http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot
+.. _openbr: http://openbiometrics.org