Skip to content

Commit

Permalink
Added support for NLTK-style segmentations.
Browse files Browse the repository at this point in the history
  • Loading branch information
cfournie committed Jul 4, 2013
1 parent b74cf67 commit e733ea6
Show file tree
Hide file tree
Showing 11 changed files with 132 additions and 11 deletions.
3 changes: 2 additions & 1 deletion segeval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@
'segeval.format': ['BoundaryFormat',
'boundary_string_from_masses',
'convert_positions_to_masses',
'convert_masses_to_positions'],
'convert_masses_to_positions',
'convert_nltk_to_masses'],
}


Expand Down
16 changes: 15 additions & 1 deletion segeval/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .util.lang import enum


BoundaryFormat = enum(position='position', mass='mass', sets='sets')
BoundaryFormat = enum(position='position', mass='mass', sets='sets', nltk='nltk')


def convert_positions_to_masses(positions):
Expand Down Expand Up @@ -62,3 +62,17 @@ def boundary_string_from_masses(masses):
pos += mass
# Return
return tuple([frozenset(pb) for pb in string])


def convert_nltk_to_masses(string, boundary_symbol='1'):
    '''
    Convert an NLTK-formatted segmentation into masses, e.g.,
    ``000001000100000`` becomes ``(6, 4, 6)``.

    The string is split on every occurrence of ``boundary_symbol``, and each
    resulting piece yields one segment whose mass is the length of that piece
    plus one.  A string that starts (or ends) with a boundary therefore
    produces a leading (or trailing) segment of mass 1, e.g., ``100`` becomes
    ``(1, 3)``, and a string without any boundary produces a single segment.

    :param string: NLTK-formatted segmentation.
    :type string: str
    :param boundary_symbol: String that represents a boundary.
    :type boundary_symbol: str

    :returns: Segment masses, in the order the segments appear in ``string``.
    :rtype: tuple of int
    '''
    # A generator avoids building a throw-away list before the tuple is made.
    return tuple(len(segment) + 1
                 for segment in string.split(boundary_symbol))
16 changes: 15 additions & 1 deletion segeval/format_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
'''
import unittest
from format import (convert_positions_to_masses, convert_masses_to_positions,
boundary_string_from_masses)
boundary_string_from_masses, convert_nltk_to_masses)


class TestSegeval(unittest.TestCase):
Expand Down Expand Up @@ -76,3 +76,17 @@ def test_boundary_string_from_masses_one(self):
'''
string = boundary_string_from_masses([2,3])
self.assertEqual(string, (set(), set([1]), set(), set()))

def test_convert_nltk_to_masses_pk_ab(self):
    '''
    Short NLTK-style segmentations: one that starts with a boundary and one
    whose only boundary is in the middle.
    '''
    expectations = (
        ('100', (1,3)),
        ('010', (2,2)),
    )
    for nltk_string, expected_masses in expectations:
        self.assertEqual(convert_nltk_to_masses(nltk_string), expected_masses)

def test_convert_nltk_to_masses_pk_long(self):
    '''
    Longer NLTK-style segmentations containing two internal boundaries.
    '''
    expectations = (
        ('0100100000', (2, 3, 6)),
        ('0101000000', (2, 2, 7)),
    )
    for nltk_string, expected_masses in expectations:
        self.assertEqual(convert_nltk_to_masses(nltk_string), expected_masses)
7 changes: 6 additions & 1 deletion segeval/similarity/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ..metric import METRIC_DEFAULTS
from ..ml import ConfusionMatrix as cm
from ..format import (BoundaryFormat, boundary_string_from_masses,
convert_positions_to_masses)
convert_positions_to_masses, convert_nltk_to_masses)
from ..util import __fnc_metric__, SegmentationMetricError


Expand All @@ -26,6 +26,11 @@ def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format, n_t
Compute boundary similarity applying the weighting functions specified.
'''

# Convert from NLTK types
if boundary_format == BoundaryFormat.nltk:
segs_a = convert_nltk_to_masses(segs_a)
segs_b = convert_nltk_to_masses(segs_b)
boundary_format = BoundaryFormat.mass
# Check format
if boundary_format == BoundaryFormat.sets:
pass # Correct boundary format
Expand Down
9 changes: 8 additions & 1 deletion segeval/similarity/boundary_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from decimal import Decimal
from .boundary import boundary_similarity
from .weight import weight_a, weight_s, weight_t
from ..format import BoundaryFormat
from ..util import SegmentationMetricError
from ..format import BoundaryFormat
from ..compute import summarize
from ..data.samples import (MULTIPLE_BOUNDARY_TYPES, HEARST_1997_STARGAZER,
HYPOTHESIS_STARGAZER)
Expand Down Expand Up @@ -48,6 +48,13 @@ def test_one_minus(self):
value = boundary_similarity([2, 3, 6], [2, 2, 7], one_minus=True)
self.assertEqual(Decimal('0.25'), value)

def test_boundary_format_nltk(self):
    '''
    Boundary similarity computed from NLTK-style segmentation strings.
    '''
    first = '01001000000'
    second = '01010000000'
    similarity = boundary_similarity(
        first, second, boundary_format=BoundaryFormat.nltk)
    self.assertAlmostEqual(Decimal('0.75'), similarity)

def test_clustered_fps(self):
'''
Test clustered fps.
Expand Down
8 changes: 8 additions & 0 deletions segeval/similarity/segmentation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from decimal import Decimal
from .segmentation import segmentation_similarity
from ..util import SegmentationMetricError
from ..format import BoundaryFormat
from ..data.samples import (HEARST_1997_STARGAZER, HYPOTHESIS_STARGAZER,
MULTIPLE_BOUNDARY_TYPES, KAZANTSEVA2012_G5)

Expand Down Expand Up @@ -45,6 +46,13 @@ def test_one_minus(self):
value = segmentation_similarity([2, 3, 6], [2, 2, 7], one_minus=True)
self.assertEqual(Decimal('0.05'), value)

def test_boundary_format_nltk(self):
    '''
    Segmentation similarity computed from NLTK-style segmentation strings.
    '''
    first = '0100100000'
    second = '0101000000'
    similarity = segmentation_similarity(
        first, second, boundary_format=BoundaryFormat.nltk)
    self.assertAlmostEqual(Decimal('0.95'), similarity)

def test_clustered_fps(self):
'''
Test near miss.
Expand Down
4 changes: 2 additions & 2 deletions segeval/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ def test_dir(self):
'boundary_similarity', 'segmentation_similarity',
'boundary_string_from_masses', 'compute_window_size',
'convert_masses_to_positions', 'convert_positions_to_masses',
'fleiss_kappa_linear', 'fleiss_pi_linear', 'fmeasure',
'input_linear_mass_json', 'input_linear_mass_tsv',
'convert_nltk_to_masses', 'fleiss_kappa_linear', 'fleiss_pi_linear',
'fmeasure', 'input_linear_mass_json', 'input_linear_mass_tsv',
'load_nested_folders_dict', 'output_linear_mass_json', 'pk',
'precision', 'recall', 'summarize', 'weight_t', 'weight_s_scale',
'weight_t_scale', 'weight_s', 'weight_a', 'window_diff']))
Expand Down
9 changes: 7 additions & 2 deletions segeval/window/pk.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,17 @@
from decimal import Decimal
from . import __compute_window_size__, WINDOW_METRIC_DEFAULTS
from ..util import __fnc_metric__, SegmentationMetricError
from ..format import BoundaryFormat, convert_masses_to_positions
from ..format import (BoundaryFormat, convert_masses_to_positions, convert_nltk_to_masses)


def __pk__(hypothesis, reference, window_size, one_minus, boundary_format,
return_parts, fnc_round):

# Convert from NLTK types
if boundary_format == BoundaryFormat.nltk:
reference = convert_nltk_to_masses(reference)
hypothesis = convert_nltk_to_masses(hypothesis)
boundary_format = BoundaryFormat.mass
# Convert from masses into positions
if boundary_format == BoundaryFormat.mass:
reference = convert_masses_to_positions(reference)
Expand Down Expand Up @@ -45,7 +50,7 @@ def __pk__(hypothesis, reference, window_size, one_minus, boundary_format,
sum_differences += 1
measurements += 1
# Perform final division
value = Decimal(sum_differences) / measurements
value = Decimal(sum_differences) / measurements if measurements > 0 else 0
if return_parts:
return sum_differences, measurements
else:
Expand Down
36 changes: 36 additions & 0 deletions segeval/window/pk_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,42 @@ def test_window_size_specified(self):
value = pk([2, 3, 6], [2, 2, 7], window_size=2)
self.assertAlmostEqual(Decimal('0.2222222'), value)

def test_boundary_format_nltk(self):
    '''
    Pk computed from NLTK-style segmentation strings with an explicit
    window size.
    '''
    first = '0100100000'
    second = '0101000000'
    result = pk(first, second, window_size=2,
                boundary_format=BoundaryFormat.nltk)
    self.assertAlmostEqual(Decimal('0.2222222'), result)

def test_nltk(self):
    '''
    Runs Pk tests from https://github.com/nltk/nltk/blob/master/nltk/test/segmentation.doctest

    Each case is (first string, second string, window size, expected value).
    The "originally" notes are carried over from the earlier comments;
    presumably they record the values given in the NLTK doctest -- confirm
    against that file.
    '''
    cases = (
        ('1000100', '1000100', 3, '0.0'),  # Originally 0.0
        ('010', '100', 2, '0.5'),          # Originally 0.5
        ('111111', '100100', 2, '0.4'),    # Originally 0.64
        ('000000', '100100', 2, '0.6'),    # Originally 0.04
        ('111111', '100100', 3, '0'),      # Originally 0.25
        ('000000', '100100', 3, '1'),      # Originally 0.25
    )
    for first, second, window, expected in cases:
        result = pk(first, second, window_size=window,
                    boundary_format=BoundaryFormat.nltk)
        self.assertAlmostEqual(result, Decimal(expected))


class TestPairwisePkMeasure(TestCase):

Expand Down
8 changes: 6 additions & 2 deletions segeval/window/windowdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from decimal import Decimal
from . import __compute_window_size__, WINDOW_METRIC_DEFAULTS
from ..format import (BoundaryFormat, convert_masses_to_positions,
convert_positions_to_masses)
convert_positions_to_masses, convert_nltk_to_masses)
from ..util import __fnc_metric__, SegmentationMetricError


Expand Down Expand Up @@ -73,7 +73,11 @@ def __window_diff__(hypothesis, reference, window_size, one_minus,
.. note:: See :func:`segeval.convert_masses_to_positions` for an example of
the input format.
'''

# Convert from NLTK types
if boundary_format == BoundaryFormat.nltk:
reference = convert_nltk_to_masses(reference)
hypothesis = convert_nltk_to_masses(hypothesis)
boundary_format = BoundaryFormat.mass
# Convert from masses into positions
if boundary_format == BoundaryFormat.mass:
reference = convert_masses_to_positions(reference)
Expand Down
27 changes: 27 additions & 0 deletions segeval/window/windowdiff_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,33 @@ def test_window_size_specified(self):
value = window_diff([2, 3, 6], [2, 2, 7], window_size=2)
self.assertAlmostEqual(Decimal('0.2222222'), value)

def test_boundary_format_nltk(self):
    '''
    WindowDiff computed from NLTK-style segmentation strings with an
    explicit window size.
    '''
    first = '0100100000'
    second = '0101000000'
    result = window_diff(first, second, window_size=2,
                         boundary_format=BoundaryFormat.nltk)
    self.assertAlmostEqual(Decimal('0.2222222'), result)

def test_nltk(self):
    '''
    Runs WD tests from https://github.com/nltk/nltk/blob/master/nltk/test/segmentation.doctest

    Each case is (first string, second string, expected value); every case
    uses a window size of 3.  The "originally" notes are carried over from
    the earlier comments; presumably they record the values given in the
    NLTK doctest -- confirm against that file.
    '''
    s1 = "000100000010"
    s2 = "000010000100"
    s3 = "100000010000"
    cases = (
        (s1, s1, '0'),    # Originally 0.0
        (s2, s1, '0.3'),  # Originally 0.3
        (s3, s2, '0.8'),  # Originally 0.7
    )
    for first, second, expected in cases:
        result = window_diff(first, second, window_size=3,
                             boundary_format=BoundaryFormat.nltk)
        self.assertAlmostEqual(result, Decimal(expected))


class TestPairwiseWindowDiff(TestCase):

Expand Down

0 comments on commit e733ea6

Please sign in to comment.