diff --git a/parser/layer/terms.py b/parser/layer/terms.py
index c0dbc24..77b0827 100644
--- a/parser/layer/terms.py
+++ b/parser/layer/terms.py
@@ -1,6 +1,7 @@
 from layer import Layer
 from parser import utils
 from parser.grammar.terms import term_parser
+from parser.layer.paragraph_markers import ParagraphMarkers
 from parser.tree import struct
 import re
 
@@ -31,8 +32,14 @@ def has_definitions(self, node):
         # Definitions are only in the reg text (not appendices/interprs)
         if not node['label']['parts'][1].isdigit():
             return False
+        # Remove the leading paragraph marker as a prefix. str.strip(chars)
+        # treats its argument as a set of characters to trim from both ends,
+        # so it cannot be used to drop the marker reliably.
+        marker = ParagraphMarkers.marker(node)
+        stripped = node['text'].strip()
+        if marker and stripped.startswith(marker):
+            stripped = stripped[len(marker):].lstrip()
         return (
-            node['text'].lower().startswith('definition')
+            stripped.lower().startswith('definition')
             or ('title' in node['label']
                 and 'definition' in node['label']['title'].lower()))
 
@@ -85,7 +92,7 @@ def calculate_offsets(self, text, applicable_terms):
         larger (i.e. containing) terms."""
 
         # longer terms first
-        applicable_terms.sort(key=lambda x: x[0], reverse=True)
+        applicable_terms.sort(key=lambda x: len(x[0]), reverse=True)
 
         matches = []
         existing_defs = []
diff --git a/tests/layer_terms.py b/tests/layer_terms.py
index a68230e..c39616b 100644
--- a/tests/layer_terms.py
+++ b/tests/layer_terms.py
@@ -30,6 +30,13 @@ def test_has_definitions(self):
             label=struct.label("101-22-c", ["101", "22", "c"],
                 "But definition is in the title"))))
 
+    def test_has_definitions_p_marker(self):
+        t = Terms(None)
+        node = struct.node("(a) Definitions. \nFor purposes of this " +
+                           "section except blah",
+                           [],
+                           struct.label('88-20-a', ['88', '20', 'a']))
+        self.assertTrue(t.has_definitions(node))
     def test_node_definitions(self):
         t = Terms(None)
 
@@ -116,7 +123,7 @@ def test_calculate_offsets(self):
         matches = t.calculate_offsets(text, applicable_terms)
         self.assertEqual(3, len(matches))
         found = [False, False, False]
-        for term, ref, offsets in matches:
+        for _, ref, offsets in matches:
             if ref == 'a' and offsets == [(10,19)]:
                 found[0] = True
             if ref == 'b' and offsets == [(30,34)]:
@@ -125,6 +132,16 @@ def test_calculate_offsets(self):
                 found[2] = True
         self.assertEqual([True,True,True], found)
 
+    def test_calculate_offsets_lexical_container(self):
+        applicable_terms = [('access device', 'a'), ('device', 'd')]
+        text = "This access device is fantastic!"
+        t = Terms(None)
+        matches = t.calculate_offsets(text, applicable_terms)
+        self.assertEqual(1, len(matches))
+        _, ref, offsets = matches[0]
+        self.assertEqual('a', ref)
+        self.assertEqual([(5,18)], offsets)
+
     def test_calculate_offsets_word_part(self):
         """If a defined term is part of another word, don't include it"""
         applicable_terms = [('act', 'a')]