Skip to content

Commit

Permalink
Fixed a bug where agreement could not be computed on boundary-set-for…
Browse files Browse the repository at this point in the history
…mat segmentations and segmentations containing multiple boundary types.
  • Loading branch information
cfournie committed Sep 11, 2013
1 parent 3f2fdaf commit 0510ceb
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 16 deletions.
2 changes: 1 addition & 1 deletion segeval/__init__.py
Expand Up @@ -10,7 +10,7 @@


# Package description
__version_number__ = '2.0.8'
__version_number__ = '2.0.9'
__release__ = None
__version__ = '-'.join((__version_number__, __release__)) if __release__ is not None else __version_number__
__project__ = 'SegEval'
Expand Down
31 changes: 29 additions & 2 deletions segeval/agreement/__init__.py
Expand Up @@ -7,6 +7,9 @@
from ..data import get_coders
from ..similarity import SIMILARITY_METRIC_DEFAULTS
from ..similarity.boundary import boundary_similarity
from ..format import (BoundaryFormat, boundary_string_from_masses,
convert_positions_to_masses, convert_nltk_to_masses)
from ..util import SegmentationMetricError


AGREEMENT_METRIC_DEFAULTS = dict(SIMILARITY_METRIC_DEFAULTS)
Expand All @@ -19,9 +22,34 @@
def __fnc_metric__(fnc_metric, dataset, **kwargs):
    '''
    Call an agreement metric function with a fully-populated set of keyword
    arguments.

    The keyword arguments are assembled in three layers, later layers
    overriding earlier ones:

    1. the module-level ``AGREEMENT_METRIC_DEFAULTS``;
    2. the keyword arguments supplied by the caller; and
    3. the ``boundary_types`` and ``boundary_format`` attributes of
       ``dataset``, for whichever of them the dataset actually has.

    :param fnc_metric: Callable invoked as ``fnc_metric(dataset, **kwargs)``.
    :param dataset: Data handed through to ``fnc_metric`` unchanged.
    :returns: Whatever ``fnc_metric`` returns.
    '''
    # Copy the defaults so the module-level dict is never mutated
    merged = dict(AGREEMENT_METRIC_DEFAULTS, **kwargs)
    # Properties carried by the dataset itself win over everything else
    for attribute in ('boundary_types', 'boundary_format'):
        if hasattr(dataset, attribute):
            merged[attribute] = getattr(dataset, attribute)
    return fnc_metric(dataset, **merged)


def __potential_boundaries__(segmentation, **kwargs):
    '''
    Count the potential boundaries of a single segmentation.

    The segmentation is first brought into boundary-string form (a
    ``BoundaryFormat.sets`` segmentation is used as-is; every other supported
    format is converted through masses).  The result is the length of that
    boundary string multiplied by the number of boundary types.

    :param segmentation: Segmentation expressed in the format named by the
                         ``boundary_format`` keyword argument.
    :param boundary_format: (keyword, required) A ``BoundaryFormat`` value.
    :param boundary_types: (keyword, required) Sized collection of the
                           boundary types in use.
    :returns: ``len(boundary string) * len(boundary_types)``.
    :raises SegmentationMetricError: If ``boundary_format`` is not one of
                                     nltk, sets, mass or position.
    '''
    fmt = kwargs['boundary_format']
    types = kwargs['boundary_types']
    # NLTK-style input is normalised to masses before anything else
    if fmt == BoundaryFormat.nltk:
        segmentation = convert_nltk_to_masses(segmentation)
        fmt = BoundaryFormat.mass
    if fmt == BoundaryFormat.sets:
        # Already a boundary string; nothing to convert
        boundary_string = segmentation
    else:
        if fmt == BoundaryFormat.mass:
            masses = segmentation
        elif fmt == BoundaryFormat.position:
            masses = convert_positions_to_masses(segmentation)
        else:
            raise SegmentationMetricError('Unsupported boundary format')
        boundary_string = boundary_string_from_masses(masses)
    return len(boundary_string) * len(types)


def __actual_agreement_linear__(dataset, **kwargs):
'''
Calculate actual (i.e., observed or :math:`\\text{A}_a`), segmentation
Expand Down Expand Up @@ -78,7 +106,6 @@ def __actual_agreement_linear__(dataset, **kwargs):
:mod:`segeval.data.Samples`.
'''

metric_kwargs = dict(kwargs)
del metric_kwargs['fnc_compare']
metric_kwargs['return_parts'] = True
Expand All @@ -102,7 +129,7 @@ def __actual_agreement_linear__(dataset, **kwargs):
numerator, denominator = \
fnc_compare(segs_a, segs_b, **metric_kwargs)[0:2]
# Obtain necessary values
pbs = sum(segs_a) - 1
pbs = __potential_boundaries__(segs_a, **metric_kwargs)
# Add all pbs
all_numerators.append(numerator)
all_denominators.append(denominator)
Expand Down
10 changes: 9 additions & 1 deletion segeval/agreement/bias_test.py
Expand Up @@ -7,7 +7,8 @@
from decimal import Decimal
from .bias import artstein_poesio_bias_linear
from ..data.samples import (KAZANTSEVA2012_G5, KAZANTSEVA2012_G2,
COMPLETE_AGREEMENT, LARGE_DISAGREEMENT)
COMPLETE_AGREEMENT, LARGE_DISAGREEMENT,
MULTIPLE_BOUNDARY_TYPES)


class TestBias(unittest.TestCase):
Expand Down Expand Up @@ -65,6 +66,13 @@ def test_bias_complete(self):
self.assertEqual(bias,
Decimal('0.01455229356727327645713789012'))

def test_multiple_boundary_types(self):
    '''
    Bias computed over the sample dataset that contains multiple boundary
    types.
    '''
    expected = Decimal('0.00')
    bias = artstein_poesio_bias_linear(MULTIPLE_BOUNDARY_TYPES)
    self.assertEqual(bias, expected)

def test_bias_large(self):
'''
Test bias upon a hypothetical dataset containing large disagreement.
Expand Down
10 changes: 9 additions & 1 deletion segeval/agreement/kappa_test.py
Expand Up @@ -7,7 +7,8 @@
from decimal import Decimal
from .kappa import fleiss_kappa_linear
from ..data.samples import (KAZANTSEVA2012_G5, KAZANTSEVA2012_G2,
COMPLETE_AGREEMENT, LARGE_DISAGREEMENT)
COMPLETE_AGREEMENT, LARGE_DISAGREEMENT,
MULTIPLE_BOUNDARY_TYPES)


class TestKappa(unittest.TestCase):
Expand Down Expand Up @@ -86,6 +87,13 @@ def test_fleiss_kappa_complete(self):
kappa = fleiss_kappa_linear(data_complete)
self.assertEqual(kappa, Decimal('1.0'))

def test_multiple_boundary_types(self):
    '''
    Fleiss' Kappa computed over the sample dataset that contains multiple
    boundary types.
    '''
    expected = Decimal('0.3333333333333333333333333333')
    kappa = fleiss_kappa_linear(MULTIPLE_BOUNDARY_TYPES)
    self.assertEqual(kappa, expected)

def test_exception_coders(self):
'''
Test exception.
Expand Down
18 changes: 13 additions & 5 deletions segeval/agreement/pi_test.py
Expand Up @@ -7,7 +7,8 @@
from decimal import Decimal
from .pi import fleiss_pi_linear
from ..data.samples import (KAZANTSEVA2012_G5, KAZANTSEVA2012_G2,
COMPLETE_AGREEMENT, LARGE_DISAGREEMENT)
COMPLETE_AGREEMENT, LARGE_DISAGREEMENT,
MULTIPLE_BOUNDARY_TYPES)


class TestPi(unittest.TestCase):
Expand Down Expand Up @@ -60,20 +61,20 @@ def test_fleiss_pi(self):
Test Fleiss' Pi.
'''

data1 = {'i1': {'c1': [2,8,2,1],
'c2': [2,1,7,2,1]}}
data1 = {'i1': {'c1': [2, 8, 2, 1],
'c2': [2, 1, 7, 2, 1]}}
pi1 = fleiss_pi_linear(data1)
pi1f = fleiss_pi_linear(data1)
self.assertEqual(pi1,
Decimal('0.7090909090909090909090909091'))
self.assertEqual(pi1,pi1f)
self.assertEqual(pi1, pi1f)
data2 = {'i1': {'c1': [2, 8, 2, 1],
'c2': [11, 2]}}
pi2 = fleiss_pi_linear(data2)
pi2f = fleiss_pi_linear(data2)
self.assertEqual(pi2,
Decimal('0.1111111111111111111111111111'))
self.assertEqual(pi2,pi2f)
self.assertEqual(pi2, pi2f)
self.assertTrue(pi2 < pi1)

def test_fleiss_pi_complete(self):
Expand All @@ -85,6 +86,13 @@ def test_fleiss_pi_complete(self):
pi = fleiss_pi_linear(data_complete)
self.assertEqual(pi, Decimal('1'))

def test_multiple_boundary_types(self):
    '''
    Fleiss' Pi computed over the sample dataset that contains multiple
    boundary types.
    '''
    expected = Decimal('0.3333333333333333333333333333')
    pi = fleiss_pi_linear(MULTIPLE_BOUNDARY_TYPES)
    self.assertEqual(pi, expected)

def test_exception_coders(self):
'''
Test exception.
Expand Down
66 changes: 65 additions & 1 deletion segeval/agreement/test.py
Expand Up @@ -5,7 +5,9 @@
'''
import unittest
from decimal import Decimal
from . import actual_agreement_linear
from . import actual_agreement_linear, __potential_boundaries__
from .. import BoundaryFormat
from ..util import SegmentationMetricError
from ..data.samples import (KAZANTSEVA2012_G5, KAZANTSEVA2012_G2,
COMPLETE_AGREEMENT, LARGE_DISAGREEMENT)

Expand Down Expand Up @@ -75,3 +77,65 @@ def test_disagreement_large(self):
self.assertTrue(agreement >= 0)
self.assertEqual(agreement,
Decimal('0'))

def test_potential_boundaries_mass(self):
    '''
    Potential boundaries are counted correctly for a segmentation given in
    BoundaryFormat.mass with a single boundary type.
    '''
    count = __potential_boundaries__(
        [2, 3],
        boundary_format=BoundaryFormat.mass,
        boundary_types=(1,))
    self.assertEqual(4, count)

def test_potential_boundaries_mass_multiple_types(self):
    '''
    Test counting the number of potential boundaries for BoundaryFormat.mass
    when more than one boundary type is in use.

    The count is the boundary-string length multiplied by the number of
    boundary types, so the 4 positions of masses [2, 3] with 2 types give 8.
    '''
    # This method used to be an exact duplicate of
    # test_potential_boundaries_mass; a second definition with the same name
    # replaces the first in the class body, so only one of them ever ran.
    kwargs = {
        'boundary_format': BoundaryFormat.mass,
        'boundary_types': (1, 2),
    }
    self.assertEqual(8, __potential_boundaries__([2, 3], **kwargs))


def test_potential_boundaries_position(self):
    '''
    Potential boundaries are counted correctly for a segmentation given in
    BoundaryFormat.position with a single boundary type.
    '''
    count = __potential_boundaries__(
        [1, 1, 1, 2, 2],
        boundary_format=BoundaryFormat.position,
        boundary_types=(1,))
    self.assertEqual(4, count)


def test_potential_boundaries_sets(self):
    '''
    Potential boundaries are counted correctly for a segmentation given in
    BoundaryFormat.sets with a single boundary type.
    '''
    count = __potential_boundaries__(
        [(), (), (1,), ()],
        boundary_format=BoundaryFormat.sets,
        boundary_types=(1,))
    self.assertEqual(4, count)

def test_potential_boundaries_nltk(self):
    '''
    Potential boundaries are counted correctly for a segmentation given in
    BoundaryFormat.nltk with a single boundary type.
    '''
    count = __potential_boundaries__(
        '0010',
        boundary_format=BoundaryFormat.nltk,
        boundary_types=(1,))
    self.assertEqual(4, count)

def test_potential_boundaries_exception(self):
    '''
    An unrecognised boundary format makes the potential boundary count raise
    SegmentationMetricError.
    '''
    with self.assertRaises(SegmentationMetricError):
        __potential_boundaries__(
            '0010',
            boundary_format='incorrect',
            boundary_types=(1,))
10 changes: 5 additions & 5 deletions segeval/similarity/boundary_test.py
Expand Up @@ -64,7 +64,7 @@ def test_clustered_fps(self):

def test_positions(self):
'''
Test false negative.
Test position-format.
'''
a = [1,1,1,1,1,1,1,1,1,1,1,1,1]
b = [1,1,1,1,2,2,2,2,3,3,3,3,3]
Expand All @@ -74,7 +74,7 @@ def test_positions(self):

def test_format_exception(self):
'''
Test false negative.
Test incorrect format exception.
'''
a = [1,1,1,1,1,1,1,1,1,1,1,1,1]
b = [1,1,1,1,2,2,2,2,3,3,3,3,3]
Expand All @@ -83,7 +83,7 @@ def test_format_exception(self):

def test_arg_exception(self):
'''
Test false negative.
Test incorrect argument exception.
'''
a = [1,1,1,1,1,1,1,1,1,1,1,1,1]
b = [1,1,1,1,2,2,2,2,3,3,3,3,3]
Expand All @@ -93,15 +93,15 @@ def test_arg_exception(self):

def test_weight_t(self):
'''
Test false negative.
Test transposition weighting.
'''
value = boundary_similarity([2, 3, 6], [5, 6],
weight=(weight_a, weight_s, weight_t))
self.assertEqual(0.5, value)

def test_multiple_boundary_types(self):
'''
Test false negative.
Test multiple boundaries.
'''
value = summarize(boundary_similarity(MULTIPLE_BOUNDARY_TYPES))
self.assertEqual((Decimal('0.375'),
Expand Down

0 comments on commit 0510ceb

Please sign in to comment.