Added function to convert Bob's 4col and 5col score files to OpenBR matrices
bioidiap · Sep 17, 2015 · fb45722 · fb45722
1 parent 58802bc
commit fb45722
Show file tree

Hide file tree

Showing 5 changed files with 175 additions and 4 deletions.
diff --git a/bob/measure/__init__.py b/bob/measure/__init__.py
@@ -9,6 +9,7 @@
 from . import plot
 from . import load
 from . import calibration
+from . import openbr
 import numpy
 
 def mse (estimation, target):

diff --git a/bob/measure/data/scores.mask b/bob/measure/data/scores.mask
@@ -0,0 +1,5 @@
+S2
+unknown-gallery.lst
+unknown-probe.lst
+MB 100 20 xV4
+˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙
diff --git a/bob/measure/data/scores.mtx b/bob/measure/data/scores.mtx
diff --git a/bob/measure/openbr.py b/bob/measure/openbr.py
@@ -0,0 +1,119 @@
+"""This file includes functionality to convert between Bob's four column or five column score files and the Matrix files used in OpenBR."""
+
+import numpy
+import logging
+logger = logging.getLogger("bob.measure")
+
+from .load import open_file, four_column, five_column
+
+def write_matrix(
+    score_file,
+    matrix_file,
+    mask_file,
+    model_names = None,
+    probe_names = None,
+    score_file_format = '4column',
+    gallery_file_name = 'unknown-gallery.lst',
+    probe_file_name = 'unknown-probe.lst'
+):
+  """Writes the OpenBR matrix and mask files (version 2), given the score file.
+  If gallery and probe names are provided, the matrices in both files will be sorted by gallery and probe names.
+  Names that are not provided are collected from the score file, in order of first appearance.
+
+  .. warning::
+    When provided with a 4-column score file, this function will work only, if there is only a single model id for each client.
+
+  Keyword parameters:
+
+  score_file : str
+    The 4 or 5 column style score file written by bob.
+
+  matrix_file : str
+    The OpenBR matrix file that should be written.
+    Usually, the file name extension is .mtx
+
+  mask_file : str
+    The OpenBR mask file that should be written.
+    The mask file defines, which values are positives, negatives or to be ignored.
+
+  model_names : [str] or ``None``
+    If given, the matrix will be written in the same order as the given model names.
+    The model names must be identical with the second column in the 5-column ``score_file``.
+
+    .. note::
+       If the score file is in four column format, the model_names must be the client ids stored in the first column.
+       In this case, there might be only a single model per client
+
+    Only the scores of the given models will be considered; score lines of other models are skipped.
+
+  probe_names : [str] or ``None``
+    If given, the matrix will be written in the same order as the given probe names (the path of the probe).
+    The probe names are identical to the third column of the 4-column (the fourth column of the 5-column) ``score_file``.
+    Only the scores of the given probe names will be considered in this case; score lines of other probes are skipped.
+
+  score_file_format : one of ``('4column', '5column')``
+    The format of the ``score_file``.
+    Any other value raises a ``KeyError``.
+
+  gallery_file_name : str
+    The name of the gallery file that will be written in the header of the OpenBR files.
+
+  probe_file_name : str
+    The name of the probe file that will be written in the header of the OpenBR files.
+  """
+
+  def _write_matrix(filename, matrix):
+    ## Helper function to write a matrix file as required by OpenBR
+    with open(filename, 'wb') as f:
+      # write the first four lines
+      # the file is opened in binary mode, so the textual header has to be encoded to bytes (required for Python 3)
+      header = "S2\n%s\n%s\nM%s %d %d " % (gallery_file_name, probe_file_name, 'B' if matrix.dtype == numpy.uint8 else 'F', matrix.shape[0], matrix.shape[1])
+      f.write(header.encode('utf-8'))
+      # write magic number
+      numpy.array(0x12345678, numpy.int32).tofile(f)
+      f.write(b"\n")
+      # write the matrix
+      matrix.tofile(f)
+
+
+  # define read functions, and which information should be read
+  read_function = {'4column' : four_column, '5column' : five_column}[score_file_format]
+  # the 5-column format has one additional column (the model name) after the first one
+  offset = {'4column' : 0, '5column' : 1}[score_file_format]
+
+  # first, read the score file and estimate model ids and probe names, if not given
+  if model_names is None or probe_names is None:
+    found_models, found_probes = [], []
+    model_set, probe_set = set(), set()
+
+    # read the score file
+    for line in read_function(score_file):
+      model, probe = line[offset], line[2+offset]
+      if model not in model_set:
+        found_models.append(model)
+        model_set.add(model)
+      if probe not in probe_set:
+        found_probes.append(probe)
+        probe_set.add(probe)
+
+    # only replace the list(s) that the caller did not specify; an explicitly given list keeps its order
+    if model_names is None:
+      model_names = found_models
+    if probe_names is None:
+      probe_names = found_probes
+
+  # create a shortcut to get indices for client and probe subset (to increase speed)
+  model_dict = {m:i for i,m in enumerate(model_names)}
+  probe_dict = {p:i for i,p in enumerate(probe_names)}
+
+  # now, create the matrices in the desired size
+  # elements that are never filled keep the score NaN and the mask value 0
+  matrix = numpy.ndarray((len(probe_names), len(model_names)), numpy.float32)
+  matrix[:] = numpy.nan
+  mask = numpy.zeros(matrix.shape, numpy.uint8)
+
+  # now, iterate through the score file and fill in the matrix
+  for line in read_function(score_file):
+    client, model, real_id, probe, score = line[0], line[offset], line[1+offset], line[2+offset], line[3+offset]
+
+    # as documented, only the scores of the requested models and probes are considered
+    if model not in model_dict or probe not in probe_dict:
+      continue
+
+    model_index = model_dict[model]
+    probe_index = probe_dict[probe]
+
+    # check, if we have already written something into that matrix element
+    if mask[probe_index, model_index]:
+      logger.warning("Overwriting existing matrix '%f' element of client '%s' and probe '%s' with '%f'", matrix[probe_index, model_index], client, probe, score)
+
+    matrix[probe_index, model_index] = score
+    # 0xff marks a pair where the ids in column 0 and column 1+offset agree, 0x7f all other filled pairs
+    mask[probe_index, model_index] = 0xff if client == real_id else 0x7f
+
+  # OK, now finally write the file in the desired format
+  _write_matrix(mask_file, mask)
+  _write_matrix(matrix_file, matrix)
diff --git a/bob/measure/test_io.py b/bob/measure/test_io.py
@@ -8,7 +8,9 @@
 """Tests the IO functionality of bob.measure."""
 
 import bob.measure
-import pkg_resources
+import tempfile, os, shutil
+
+import bob.io.base.test_utils
 
 def test_load_scores():
   # This function tests the IO functionality of loading score files in different ways
@@ -18,18 +20,62 @@ def test_load_scores():
   cols = {'4col' : 4, '5col' : 5}
 
   for variant in ('4col', '5col'):
-
     # read score file in normal way
-    normal_score_file = pkg_resources.resource_filename('bob.measure', 'data/dev-%s.txt' % variant)
+    normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure')
     normal_scores = list(load_functions[variant](normal_score_file))
 
     assert len(normal_scores) == 910
     assert all(len(s) == cols[variant] for s in normal_scores)
 
     # read the compressed score file
-    compressed_score_file = pkg_resources.resource_filename('bob.measure', 'data/dev-%s.tar.gz' % variant)
+    compressed_score_file = bob.io.base.test_utils.datafile('dev-%s.tar.gz' % variant, 'bob.measure')
     compressed_scores = list(load_functions[variant](compressed_score_file))
 
     assert len(compressed_scores) == len(normal_scores)
     assert all(len(c) == cols[variant] for c in compressed_scores)
     assert all(c[i] == s[i] for c,s in zip(compressed_scores, normal_scores) for i in range(cols[variant]))
+
+
+def _check_binary_identical(name1, name2):
+  # Asserts that the two given files have exactly the same binary content, by comparing their MD5 digests.
+  # see: http://www.peterbe.com/plog/using-md5-to-check-equality-between-files
+  # hashlib replaces the ``md5`` module, which exists in Python 2 only
+  import hashlib
+  # the files are binary, so they must be opened in binary mode (text mode may translate or fail to decode bytes)
+  with open(name1, 'rb') as f1, open(name2, 'rb') as f2:
+    assert hashlib.md5(f1.read()).digest() == hashlib.md5(f2.read()).digest()
+
+
+def test_convert_openbr():
+  # This function tests that the conversion to the OpenBR file works as expected.
+  # For each score file variant, the files are written twice: once with automatically collected
+  # model and probe names, and once with explicitly specified names; both must match the reference files.
+  temp_dir = tempfile.mkdtemp(prefix='bob_test')
+
+  # define output files
+  openbr_extensions = ('.mtx', '.mask')
+  # format the file name before joining, so that a '%' inside the temporary directory name cannot break the formatting
+  matrix_file, mask_file = [os.path.join(temp_dir, "scores%s" % ext) for ext in openbr_extensions]
+
+  try:
+    # the reference files (which are tested to work and give the same results with OpenBR)
+    matrix_ref, mask_ref = [bob.io.base.test_utils.datafile('scores%s' % ext, 'bob.measure') for ext in openbr_extensions]
+
+    # these names are identical to what is found in the score file, which in turn comes from the AT&T database
+    dev_ids = (3,4,7,8,9,13,15,18,19,22,23,25,28,30,31,32,35,37,38,40)
+    probe_names = ["s%d/%d" % (c,f) for c in dev_ids for f in (1,3,6,8,10)]
+
+    for variant in ('4col', '5col'):
+      # get score file
+      score_file = bob.io.base.test_utils.datafile('scores-cmc-%s.txt' % variant, 'bob.measure')
+
+      model_type = {"4col" : "%d", "5col" : "s%d"}[variant]
+      model_names = [model_type % c for c in dev_ids]
+
+      # first round, do not define keyword arguments -- let the file get the gallery and probe ids automatically
+      # second round, define model and probe names explicitly
+      for kwargs in ({}, {'model_names' : model_names, 'probe_names' : probe_names}):
+        bob.measure.openbr.write_matrix(score_file, matrix_file, mask_file, score_file_format = "%sumn" % variant, **kwargs)
+
+        assert os.path.isfile(matrix_file) and os.path.isfile(mask_file)
+
+        # check that they are binary identical to the reference files
+        _check_binary_identical(matrix_file, matrix_ref)
+        _check_binary_identical(mask_file, mask_ref)
+
+  finally:
+    shutil.rmtree(temp_dir)