Added support for NLTK-style segmentations.

cfournie · Jul 4, 2013 · e733ea6 · e733ea6
1 parent b74cf67
commit e733ea6
Show file tree

Hide file tree

Showing 11 changed files with 132 additions and 11 deletions.
diff --git a/segeval/__init__.py b/segeval/__init__.py
@@ -60,7 +60,8 @@
     'segeval.format':           ['BoundaryFormat',
                                  'boundary_string_from_masses',
                                  'convert_positions_to_masses',
-                                 'convert_masses_to_positions'],
+                                 'convert_masses_to_positions',
+                                 'convert_nltk_to_masses'],
 }
 
 

diff --git a/segeval/format.py b/segeval/format.py
@@ -7,7 +7,7 @@
 from .util.lang import enum
 
 
-BoundaryFormat = enum(position='position', mass='mass', sets='sets')
+BoundaryFormat = enum(position='position', mass='mass', sets='sets', nltk='nltk')
 
 
 def convert_positions_to_masses(positions):
@@ -62,3 +62,17 @@ def boundary_string_from_masses(masses):
         pos += mass
     # Return
     return tuple([frozenset(pb) for pb in string])
+
+
+def convert_nltk_to_masses(string, boundary_symbol='1'):
+    '''
+    Convert an NLTK-formatted segmentation into masses, e.g., ``000001000100000`` becomes
+    ``(6, 4, 6)`` (a string of n symbols describes n + 1 units).
+
+    :param string: NLTK-formatted segmentation.
+    :type string: str
+    :param boundary_symbol: String that represents a boundary.
+    :type boundary_symbol: str
+    '''
+    masses = [len(segment) + 1 for segment in string.split(boundary_symbol)]
+    return tuple(masses)
diff --git a/segeval/format_test.py b/segeval/format_test.py
@@ -5,7 +5,7 @@
 '''
 import unittest
 from format import (convert_positions_to_masses, convert_masses_to_positions,
-                    boundary_string_from_masses)
+                    boundary_string_from_masses, convert_nltk_to_masses)
 
 
 class TestSegeval(unittest.TestCase):
@@ -76,3 +76,17 @@ def test_boundary_string_from_masses_one(self):
         '''
         string = boundary_string_from_masses([2,3])
         self.assertEqual(string, (set(), set([1]), set(), set()))
+
+    def test_convert_nltk_to_masses_pk_ab(self):
+        '''
+        Short NLTK-style segmentations, one of which starts with a boundary.
+        '''
+        self.assertEqual(convert_nltk_to_masses('100'), (1,3))
+        self.assertEqual(convert_nltk_to_masses('010'), (2,2))
+
+    def test_convert_nltk_to_masses_pk_long(self):
+        '''
+        Longer NLTK-style segmentations with two internal boundaries.
+        '''
+        self.assertEqual(convert_nltk_to_masses('0100100000'), (2, 3, 6))
+        self.assertEqual(convert_nltk_to_masses('0101000000'), (2, 2, 7))
diff --git a/segeval/similarity/__init__.py b/segeval/similarity/__init__.py
@@ -9,7 +9,7 @@
 from ..metric import METRIC_DEFAULTS
 from ..ml import ConfusionMatrix as cm
 from ..format import (BoundaryFormat, boundary_string_from_masses,
-                      convert_positions_to_masses)
+                      convert_positions_to_masses, convert_nltk_to_masses)
 from ..util import __fnc_metric__, SegmentationMetricError
 
 
@@ -26,6 +26,11 @@ def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format, n_t
     Compute boundary similarity applying the weighting functions specified.
     '''
 
+    # Convert from NLTK types
+    if boundary_format == BoundaryFormat.nltk:
+        segs_a = convert_nltk_to_masses(segs_a)
+        segs_b = convert_nltk_to_masses(segs_b)
+        boundary_format = BoundaryFormat.mass
     # Check format
     if boundary_format == BoundaryFormat.sets:
         pass  # Correct boundary format

diff --git a/segeval/similarity/boundary_test.py b/segeval/similarity/boundary_test.py
@@ -7,8 +7,8 @@
 from decimal import Decimal
 from .boundary import boundary_similarity
 from .weight import weight_a, weight_s, weight_t
-from ..format import BoundaryFormat
 from ..util import SegmentationMetricError
+from ..format import BoundaryFormat
 from ..compute import summarize
 from ..data.samples import (MULTIPLE_BOUNDARY_TYPES, HEARST_1997_STARGAZER,
                             HYPOTHESIS_STARGAZER)
@@ -48,6 +48,13 @@ def test_one_minus(self):
         value = boundary_similarity([2, 3, 6], [2, 2, 7], one_minus=True)
         self.assertEqual(Decimal('0.25'), value)
 
+    def test_boundary_format_nltk(self):
+        '''
+        Test the nltk boundary format.
+        '''
+        value = boundary_similarity('01001000000', '01010000000', boundary_format=BoundaryFormat.nltk)
+        self.assertAlmostEqual(Decimal('0.75'), value)
+
     def test_clustered_fps(self):
         '''
         Test clustered fps.

diff --git a/segeval/similarity/segmentation_test.py b/segeval/similarity/segmentation_test.py
@@ -7,6 +7,7 @@
 from decimal import Decimal
 from .segmentation import segmentation_similarity
 from ..util import SegmentationMetricError
+from ..format import BoundaryFormat
 from ..data.samples import (HEARST_1997_STARGAZER, HYPOTHESIS_STARGAZER,
                             MULTIPLE_BOUNDARY_TYPES, KAZANTSEVA2012_G5)
 
@@ -45,6 +46,13 @@ def test_one_minus(self):
         value = segmentation_similarity([2, 3, 6], [2, 2, 7], one_minus=True)
         self.assertEqual(Decimal('0.05'), value)
 
+    def test_boundary_format_nltk(self):
+        '''
+        Test the nltk boundary format.
+        '''
+        value = segmentation_similarity('0100100000', '0101000000', boundary_format=BoundaryFormat.nltk)
+        self.assertAlmostEqual(Decimal('0.95'), value)
+
     def test_clustered_fps(self):
         '''
         Test near miss.

diff --git a/segeval/test.py b/segeval/test.py
@@ -34,8 +34,8 @@ def test_dir(self):
                               'boundary_similarity', 'segmentation_similarity',
                               'boundary_string_from_masses', 'compute_window_size',
                               'convert_masses_to_positions', 'convert_positions_to_masses',
-                              'fleiss_kappa_linear', 'fleiss_pi_linear', 'fmeasure',
-                              'input_linear_mass_json', 'input_linear_mass_tsv',
+                              'convert_nltk_to_masses', 'fleiss_kappa_linear', 'fleiss_pi_linear',
+                              'fmeasure', 'input_linear_mass_json', 'input_linear_mass_tsv',
                               'load_nested_folders_dict', 'output_linear_mass_json', 'pk',
                               'precision', 'recall', 'summarize', 'weight_t', 'weight_s_scale',
                               'weight_t_scale', 'weight_s', 'weight_a', 'window_diff']))

diff --git a/segeval/window/pk.py b/segeval/window/pk.py
@@ -8,12 +8,17 @@
 from decimal import Decimal
 from . import __compute_window_size__, WINDOW_METRIC_DEFAULTS
 from ..util import __fnc_metric__, SegmentationMetricError
-from ..format import BoundaryFormat, convert_masses_to_positions
+from ..format import (BoundaryFormat, convert_masses_to_positions, convert_nltk_to_masses)
 
 
 def __pk__(hypothesis, reference, window_size, one_minus, boundary_format,
            return_parts, fnc_round):
 
+    # Convert from NLTK types
+    if boundary_format == BoundaryFormat.nltk:
+        reference = convert_nltk_to_masses(reference)
+        hypothesis = convert_nltk_to_masses(hypothesis)
+        boundary_format = BoundaryFormat.mass
     # Convert from masses into positions
     if boundary_format == BoundaryFormat.mass:
         reference = convert_masses_to_positions(reference)
@@ -45,7 +50,7 @@ def __pk__(hypothesis, reference, window_size, one_minus, boundary_format,
             sum_differences += 1
         measurements += 1
     # Perform final division
-    value = Decimal(sum_differences) / measurements
+    value = Decimal(sum_differences) / measurements if measurements > 0 else 0
     if return_parts:
         return sum_differences, measurements
     else:

diff --git a/segeval/window/pk_test.py b/segeval/window/pk_test.py
@@ -176,6 +176,42 @@ def test_window_size_specified(self):
         value = pk([2, 3, 6], [2, 2, 7], window_size=2)
         self.assertAlmostEqual(Decimal('0.2222222'), value)
 
+    def test_boundary_format_nltk(self):
+        '''
+        Test the nltk boundary format.
+        '''
+        value = pk('0100100000', '0101000000', window_size=2, boundary_format=BoundaryFormat.nltk)
+        self.assertAlmostEqual(Decimal('0.2222222'), value)
+
+    def test_nltk(self):
+        '''
+        Runs Pk tests from https://github.com/nltk/nltk/blob/master/nltk/test/segmentation.doctest
+        '''
+        # Originally 0.0
+        self.assertAlmostEqual(
+            pk('1000100', '1000100', window_size=3, boundary_format=BoundaryFormat.nltk),
+            Decimal('0.0'))
+        # Originally 0.5
+        self.assertAlmostEqual(
+            pk('010', '100', window_size=2, boundary_format=BoundaryFormat.nltk), 
+            Decimal('0.5'))
+        # Originally 0.64
+        self.assertAlmostEqual(
+            pk('111111', '100100', window_size=2, boundary_format=BoundaryFormat.nltk), 
+            Decimal('0.4'))
+        # Originally 0.04
+        self.assertAlmostEqual(
+            pk('000000', '100100', window_size=2, boundary_format=BoundaryFormat.nltk), 
+            Decimal('0.6'))
+        # Originally 0.25
+        self.assertAlmostEqual(
+            pk('111111', '100100', window_size=3, boundary_format=BoundaryFormat.nltk), 
+            Decimal('0'))
+        # Originally 0.25
+        self.assertAlmostEqual(
+            pk('000000', '100100', window_size=3, boundary_format=BoundaryFormat.nltk), 
+            Decimal('1'))
+
 
 class TestPairwisePkMeasure(TestCase):
 

diff --git a/segeval/window/windowdiff.py b/segeval/window/windowdiff.py
@@ -10,7 +10,7 @@
 from decimal import Decimal
 from . import __compute_window_size__, WINDOW_METRIC_DEFAULTS
 from ..format import (BoundaryFormat, convert_masses_to_positions,
-                      convert_positions_to_masses)
+                      convert_positions_to_masses, convert_nltk_to_masses)
 from ..util import __fnc_metric__, SegmentationMetricError
 
 
@@ -73,7 +73,11 @@ def __window_diff__(hypothesis, reference, window_size, one_minus,
     .. note:: See :func:`segeval.convert_masses_to_positions` for an example of
               the input format.
     '''
-
+    # Convert from NLTK types
+    if boundary_format == BoundaryFormat.nltk:
+        reference = convert_nltk_to_masses(reference)
+        hypothesis = convert_nltk_to_masses(hypothesis)
+        boundary_format = BoundaryFormat.mass
     # Convert from masses into positions
     if boundary_format == BoundaryFormat.mass:
         reference = convert_masses_to_positions(reference)

diff --git a/segeval/window/windowdiff_test.py b/segeval/window/windowdiff_test.py
@@ -239,6 +239,33 @@ def test_window_size_specified(self):
         value = window_diff([2, 3, 6], [2, 2, 7], window_size=2)
         self.assertAlmostEqual(Decimal('0.2222222'), value)
 
+    def test_boundary_format_nltk(self):
+        '''
+        Test the nltk boundary format.
+        '''
+        value = window_diff('0100100000', '0101000000', window_size=2, boundary_format=BoundaryFormat.nltk)
+        self.assertAlmostEqual(Decimal('0.2222222'), value)
+
+    def test_nltk(self):
+        '''
+        Runs WD tests from https://github.com/nltk/nltk/blob/master/nltk/test/segmentation.doctest
+        '''
+        s1 = "000100000010"
+        s2 = "000010000100"
+        s3 = "100000010000"
+        # Originally 0.0
+        self.assertAlmostEqual(
+            window_diff(s1, s1, window_size=3, boundary_format=BoundaryFormat.nltk),
+            Decimal('0'))
+        # Originally 0.3
+        self.assertAlmostEqual(
+            window_diff(s2, s1, window_size=3, boundary_format=BoundaryFormat.nltk),
+            Decimal('0.3'))
+        # Originally 0.7
+        self.assertAlmostEqual(
+            window_diff(s3, s2, window_size=3, boundary_format=BoundaryFormat.nltk),
+            Decimal('0.8'))
+
 
 class TestPairwiseWindowDiff(TestCase):