Added search format for OpenBR files; added documentation

bioidiap · Sep 18, 2015 · dc772a2 · dc772a2
1 parent f035a5a
commit dc772a2
Show file tree

Hide file tree

Showing 8 changed files with 213 additions and 38 deletions.
diff --git a/bob/measure/data/search.mask b/bob/measure/data/search.mask
diff --git a/bob/measure/data/search.mtx b/bob/measure/data/search.mtx
diff --git a/bob/measure/load.py b/bob/measure/load.py
@@ -110,7 +110,7 @@ def cmc_four_column(filename):
   For each probe file, the tuple consists of a list of negative scores and a list of positive scores.
   Usually, the list of positive scores should contain only one element, but more are allowed.
 
-  The result of this function can directly be passed to, e.g., the bob.measure.cmc function.
+  The result of this function can directly be passed to, e.g., the :py:func:`bob.measure.cmc` function.
   """
   # extract positives and negatives
   pos_dict = {}
@@ -213,7 +213,7 @@ def cmc_five_column(filename):
   For each probe file, the tuple consists of a list of negative scores and a list of positive scores.
   Usually, the list of positive scores should contain only one element, but more are allowed.
 
-  The result of this function can directly be passed to, e.g., the bob.measure.cmc function.
+  The result of this function can directly be passed to, e.g., the :py:func:`bob.measure.cmc` function.
   """
   # extract positives and negatives
   pos_dict = {}

diff --git a/bob/measure/openbr.py b/bob/measure/openbr.py
@@ -15,10 +15,14 @@ def write_matrix(
     probe_names = None,
     score_file_format = '4column',
     gallery_file_name = 'unknown-gallery.lst',
-    probe_file_name = 'unknown-probe.lst'
+    probe_file_name = 'unknown-probe.lst',
+    search = None
 ):
   """Writes the OpenBR matrix and mask files (version 2), given the score file.
   If gallery and probe names are provided, the matrices in both files will be sorted by gallery and probe names.
+  Otherwise, the order will be the same as given in the score file.
+
+  If ``search`` is given (as an integer), the resulting matrix files will be in the *search* format, keeping the given number of gallery scores with the highest values for each probe.
 
   .. warning::
     When provided with a 4-column score file, this function will work only, if there is only a single model id for each client.
@@ -30,32 +34,40 @@ def write_matrix(
 
   matrix_file : str
     The OpenBR matrix file that should be written.
-    Usually, the file name extension is .mtx
+    Usually, the file name extension is ``.mtx``
 
   mask_file : str
     The OpenBR mask file that should be written.
     The mask file defines, which values are positives, negatives or to be ignored.
-
-  gallery_file_name : str
-    The name of the gallery file that will be written in the header of the OpenBR files.
-
-  probe_file_name : str
-    The name of the probe file that will be written in the header of the OpenBR files.
+    Usually, the file name extension is ``.mask``
 
   model_names : [str] or ``None``
     If given, the matrix will be written in the same order as the given model names.
     The model names must be identical with the second column in the 5-column ``score_file``.
 
     .. note::
-       If the score file is in four column format, the model_names must be the client ids stored in the first row.
+       If the score file is in four column format, the model_names must be the client ids stored in the first column.
        In this case, there might be only a single model per client
 
     Only the scores of the given models will be considered.
 
   probe_names : [str] or ``None``
-    If given, the matrix will be written in the same order as the given probe names (the path of the probe).
-    The probe names are identical to the third line of the ``score_file``.
+    If given, the matrix will be written in the same order as the given probe names (the ``path`` of the probe).
+    The probe names are identical to the third column of the 4-column (or the fourth column of the 5-column) ``score_file``.
     Only the scores of the given probe names will be considered in this case.
+
+  score_file_format : one of ``('4column', '5column')``
+    The format, in which the ``score_file`` is.
+
+  gallery_file_name : str
+    The name of the gallery file that will be written in the header of the OpenBR files.
+
+  probe_file_name : str
+    The name of the probe file that will be written in the header of the OpenBR files.
+
+  search : int or ``None``
+    If given, the scores will be sorted per probe, keeping the specified number of highest scores. If a negative value is given, the number of models is used instead, i.e., all scores are kept.
+    If the given number is higher than the number of models, ``NaN`` values will be added, and the mask will contain ``0x00`` values.
   """
 
   def _write_matrix(filename, matrix):
@@ -77,7 +89,7 @@ def _write_matrix(filename, matrix):
   read_function = {'4column' : four_column, '5column' : five_column}[score_file_format]
   offset = {'4column' : 0, '5column' : 1}[score_file_format]
 
-  # first, read the score file and estimate model ids and probe names, if not given
+  # first, read the score file and estimate model and probe names, if not given
   if model_names is None or probe_names is None:
     model_names, probe_names = [], []
     model_set, probe_set = set(), set()
@@ -92,32 +104,62 @@ def _write_matrix(filename, matrix):
         probe_names.append(probe)
         probe_set.add(probe)
 
-  # create a shortcut to get indices for client and probe subset (to increase speed)
-  model_dict, probe_dict = {}, {}
-  for i,m in enumerate(model_names): model_dict[m]=i
-  for i,p in enumerate(probe_names): probe_dict[p]=i
+  if search is None:
+    # create a shortcut to get indices for client and probe subset (to increase speed)
+    model_dict, probe_dict = {}, {}
+    for i,m in enumerate(model_names): model_dict[m]=i
+    for i,p in enumerate(probe_names): probe_dict[p]=i
+
+    # create the matrices in the desired size
+    matrix = numpy.ndarray((len(probe_names), len(model_names)), numpy.float32)
+    matrix[:] = numpy.nan
+    mask = numpy.zeros(matrix.shape, numpy.uint8)
+
+    # now, iterate through the score file and fill in the matrix
+    for line in read_function(score_file):
+      client, model, id, probe, score = line[0], line[offset], line[1+offset], line[2+offset], line[3+offset]
+
+      assert model in model_dict
+      assert probe in probe_dict
+
+      model_index = model_dict[model]
+      probe_index = probe_dict[probe]
+
+      # check, if we have already written something into that matrix element
+      if mask[probe_index, model_index]:
+        logger.warn("Overwriting existing matrix '%f' element of client '%s' and probe '%s' with '%f'", matrix[probe_index, model_index], client, probe, score)
 
-  # now, create the matrices in the desired size
-  matrix = numpy.ndarray((len(probe_names), len(model_names)), numpy.float32)
-  matrix[:] = numpy.nan
-  mask = numpy.zeros(matrix.shape, numpy.uint8)
+      matrix[probe_index, model_index] = score
+      mask[probe_index, model_index] = 0xff if client == id else 0x7f
 
-  # now, iterate through the score file and fill in the matrix
-  for line in read_function(score_file):
-    client, model, id, probe, score = line[0], line[offset], line[1+offset], line[2+offset], line[3+offset]
+  else:
+    # get the correct search parameter, if negative
+    if search < 0:
+      search = len(model_names)
+
+    # create the matrices in the desired size
+    matrix = numpy.ndarray((len(probe_names), search), numpy.float32)
+    matrix[:] = numpy.nan
+    mask = numpy.zeros(matrix.shape, numpy.uint8)
+
+    # get the scores, sorted by probe
+    scores = {}
+    for line in read_function(score_file):
+      client, model, id, probe, score = line[0], line[offset], line[1+offset], line[2+offset], line[3+offset]
 
-    assert model in model_dict
-    assert probe in probe_dict
+      if probe not in scores:
+        scores[probe] = []
+      scores[probe].append((score, 0xff if client == id else 0x7f))
 
-    model_index = model_dict[model]
-    probe_index = probe_dict[probe]
+    # go ahead and sort the scores per probe
+    scores = {k:sorted(v, key=lambda x: x[0], reverse=True) for k,v in scores.iteritems()}
 
-    # check, if we have already written something into that matrix element
-    if mask[probe_index, model_index]:
-      logger.warn("Overwriting existing matrix '%f' element of client '%s' and probe '%s' with '%f'", matrix[probe_index, model_index], client, probe, score)
+    # now, write matrix
+    for p, probe in enumerate(probe_names):
+      if probe in scores:
+        for m in range(min(search, len(scores[probe]))):
+          matrix[p,m], mask[p,m] = scores[probe][m]
 
-    matrix[probe_index, model_index] = score
-    mask[probe_index, model_index] = 0xff if client == id else 0x7f
 
   # OK, now finally write the file in the desired format
   _write_matrix(mask_file, mask)

diff --git a/bob/measure/plot.py b/bob/measure/plot.py
@@ -364,7 +364,24 @@ def det_axis(v, **kwargs):
   return mpl.axis(tv, **kwargs)
 
 def cmc(cmc_scores, logx = True, **kwargs):
-  """Plots the (cumulative) match characteristics curve and returns the maximum rank."""
+  """Plots the (cumulative) match characteristics curve and returns the maximum rank.
+
+  This function plots a CMC curve using the given CMC scores, which can be read from our score files using the :py:func:`bob.measure.load.cmc_four_column` or :py:func:`bob.measure.load.cmc_five_column` methods.
+  The structure of the ``cmc_scores`` parameter is relatively complex.
+  It contains a list of pairs of lists.
+  For each probe object, a pair of lists of negative and positive scores is required.
+
+  Keyword parameters:
+
+  cmc_scores : [([negative],[positive])]
+    The list of scores to be plotted.
+
+  logx : bool
+    Plot the rank axis in logarithmic scale? (Default: ``True``)
+
+  kwargs
+    Remaining keyword arguments passed directly to the :py:func:`matplotlib.pyplot.plot` function.
+  """
   try:
     import matplotlib.pyplot as mpl
   except ImportError:

diff --git a/bob/measure/test_io.py b/bob/measure/test_io.py
@@ -49,8 +49,8 @@ def _check_binary_identical(name1, name2):
       assert md5(f1.read()).digest() == md5(f2.read()).digest()
 
 
-def test_convert_openbr():
-  # This function tests that the conversion to the OpenBR file works as expected
+def test_openbr_verify():
+  # This function tests that the conversion to the OpenBR verify file works as expected
   temp_dir = tempfile.mkdtemp(prefix='bob_test')
 
   # define output files
@@ -84,3 +84,40 @@ def test_convert_openbr():
 
   finally:
     shutil.rmtree(temp_dir)
+
+
+def test_openbr_search():
+  # This function tests that the conversion to the OpenBR search file works as expected
+  temp_dir = tempfile.mkdtemp(prefix='bob_test')
+
+  # define output files
+  openbr_extensions = ('.mtx', '.mask')
+  matrix_file, mask_file = [os.path.join(temp_dir, "search%s") % ext for ext in openbr_extensions]
+
+  try:
+    for variant in ('4col', '5col'):
+      # get score file
+      score_file = bob.io.base.test_utils.datafile('scores-cmc-%s.txt' % variant, 'bob.measure')
+
+      # first round, do not define keyword arguments -- let the file get the gallery and probe ids automatically
+      kwargs = {}
+      for i in range(2):
+        # get the files by automatically obtaining the identities
+        bob.measure.openbr.write_matrix(score_file, matrix_file, mask_file, score_file_format = "%sumn" % variant, search=50, **kwargs)
+
+        assert os.path.isfile(matrix_file) and os.path.isfile(mask_file)
+
+        # check that they are binary identical to the reference files (which are tested to work and give the same results with OpenBR)
+        matrix_ref, mask_ref = [bob.io.base.test_utils.datafile('search%s' % ext, 'bob.measure') for ext in openbr_extensions]
+        _check_binary_identical(matrix_file, matrix_ref)
+        _check_binary_identical(mask_file, mask_ref)
+
+        # define new kwargs for second round, i.e., define model and probe names
+        # these names are identical to what is found in the score file, which in turn comes from the AT&T database
+        model_type = {"4col" : "%d", "5col" : "s%d"}[variant]
+        dev_ids = (3,4,7,8,9,13,15,18,19,22,23,25,28,30,31,32,35,37,38,40)
+        kwargs['model_names'] = [model_type % c for c in dev_ids]
+        kwargs['probe_names'] = ["s%d/%d" %(c,i) for c in dev_ids for i in (1,3,6,8,10)]
+
+  finally:
+    shutil.rmtree(temp_dir)
diff --git a/doc/guide.rst b/doc/guide.rst
@@ -273,7 +273,7 @@ This will produce an image like the following one:
 EPC
 ===
 
-Drawing an EPC requires that both the development set negatives and positives are provided alognside
+Drawing an EPC requires that both the development set negatives and positives are provided alongside
 the test (or evaluation) set ones. Because of this the API is slightly modified:
 
 .. doctest::
@@ -298,6 +298,37 @@ This will produce an image like the following one:
    pyplot.grid(True)
    pyplot.title('EPC')
 
+
+CMC
+===
+
+The Cumulative Match Characteristics (CMC) curve estimates the probability that the correct model is in the *N* models with the highest similarity to a given probe.
+A CMC curve can be plotted using the :py:func:`bob.measure.plot.cmc` function.
+The CMC can be calculated from a relatively complex data structure, which defines a pair of lists of negative and positive scores **per probe**:
+
+.. plot::
+
+   import numpy
+   import bob.measure
+   from matplotlib import pyplot
+
+   scores = []
+   for probe in range(10):
+     positives = numpy.random.normal(1, 1, 1)
+     negatives = numpy.random.normal(0, 1, 19)
+     scores.append((negatives, positives))
+   bob.measure.plot.cmc(scores, logx=False)
+   pyplot.title('CMC')
+   pyplot.xlabel('Rank')
+   pyplot.xticks([1,5,10,20])
+   pyplot.xlim([1,20])
+   pyplot.ylim([0,100])
+
+Usually, there is only a single positive score per probe, but this is not a fixed restriction.
+
+.. note::
+   The complex data structure can be read from our default 4 or 5 column score files using the :py:func:`bob.measure.load.cmc_four_column` or :py:func:`bob.measure.load.cmc_five_column` function.
+
 Fine-tunning
 ============
 
@@ -373,6 +404,50 @@ system.
 Use the ``--help`` option on the above-cited scripts to find-out about more
 options.
 
+
+Score file conversion
+---------------------
+
+Sometimes, it is required to export the score files generated by Bob to a different format, e.g., to be able to generate a plot comparing Bob's systems with other systems.
+In this package, we provide source code to convert between different types of score files.
+
+One of the supported formats is the matrix format that the National Institute of Standards and Technology (NIST) uses, and which is supported by OpenBR.
+The scores are stored in two binary matrices, where the first matrix (usually with a ``.mtx`` filename extension) contains the raw scores, while a second mask matrix (extension ``.mask``) indicates which scores are positives and which are negatives.
+
+To convert from Bob's four column or five column score file to a pair of these matrices, you can use the :py:func:`bob.measure.openbr.write_matrix` function.
+In its simplest form, this function takes a score file ``'five-column-score-file'`` and writes the pair ``'openbr.mtx', 'openbr.mask'`` of OpenBR compatible files:
+
+.. code-block:: py
+
+   >>> bob.measure.openbr.write_matrix('five-column-score-file', 'openbr.mtx', 'openbr.mask', score_file_format = '5column')
+
+In this way, the score file will be parsed and the matrices will be written in the same order that is obtained from the score file.
+
+For most of the applications, this should be sufficient, but as the identity information is lost in the matrix files, no deeper analysis is possible anymore when just using the matrices.
+To enforce an order of the models and probes inside the matrices, you can use the ``model_names`` and ``probe_names`` parameters of :py:func:`bob.measure.openbr.write_matrix`:
+
+* The ``probe_names`` parameter lists the ``path`` elements stored in the score files, which are the fourth column in a ``5column`` file, and the third column in a ``4column`` file, see :py:func:`bob.measure.load.five_column` and :py:func:`bob.measure.load.four_column`.
+
+* The ``model_names`` parameter is a bit more complicated.
+  In a ``5column`` format score file, the model names are defined by the second column of that file, see :py:func:`bob.measure.load.five_column`.
+  In a ``4column`` format score file, the model information is not contained, but only the client information of the model.
+  Hence, for the ``4column`` format, the ``model_names`` actually lists the client ids found in the first column, see :py:func:`bob.measure.load.four_column`.
+
+  .. warning::
+     The model information is lost, but required to write the matrix files.
+     In the ``4column`` format, we use client ids instead of the model information.
+     Hence, when several models exist per client, this function will not work as expected.
+
+Additionally, there are fields in the matrix files, which define the gallery and probe list files that were used to generate the matrix.
+These file names can be selected with the ``gallery_file_name`` and ``probe_file_name`` keyword parameters of :py:func:`bob.measure.openbr.write_matrix`.
+
+Finally, OpenBR defines a specific ``'search'`` score file format, which is designed to be used to compute CMC curves.
+The score matrix contains a descendingly sorted and possibly truncated list of scores, i.e., for each probe, a sorted list of all scores for the models is generated.
+To generate this special score file format, you can specify the ``search`` parameter.
+It specifies the number of highest scores per probe that should be kept.
+If the ``search`` parameter is set to a negative value, all scores will be kept.
+If the ``search`` parameter is higher than the actual number of models, ``NaN`` scores will be appended, and the corresponding mask values will be set to ``0`` (i.e., to be ignored).
+
 .. include:: links.rst
 
 .. Place youre references here:

diff --git a/doc/py_api.rst b/doc/py_api.rst
@@ -29,3 +29,7 @@ Plotting
 
 .. automodule:: bob.measure.plot
 
+OpenBR conversions
+------------------
+
+.. automodule:: bob.measure.openbr