Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixing bug with cleaving hyphens #1

Merged
merged 1 commit into from
May 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions name_cleaver/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,12 @@ def without_extra_phrases(self):
if "-" in name:
hyphen_parts = name.rsplit("-", 1)
# if the part after the hyphen is shorter than the part before,
# AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X),
# AND the part after the hyphen either starts with whitespace or is at most four word characters,
# AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X),
# discard the hyphen and whatever follows
if len(hyphen_parts[1]) < len(hyphen_parts[0]) and re.search(r'(\w{4,}|\s+)$', hyphen_parts[0]) and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]):
if len(hyphen_parts[1]) < len(hyphen_parts[0]) \
and re.search(r'^(\s+)|^(\w{0,4})$', hyphen_parts[1]) \
and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]):
name = hyphen_parts[0].strip()

return name
Expand Down
4 changes: 3 additions & 1 deletion name_cleaver/test_name_cleaver.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,15 @@ def test_expand_with_two_tokens_to_expand(self):
def test_dont_strip_after_hyphens_too_soon_in_a_name(self):
    """Check that hyphens inside an organization name are not treated as the start of a discardable suffix."""
    # Each pair is (expected kernel, raw organization name).
    kernel_cases = (
        ('US-Russia Business Council', 'US-Russia Business Council'),
        ('Wal-Mart Stores', 'Wal-Mart Stores, Inc.'),
        ('Williams-Sonoma', 'Williams-Sonoma, Inc.'),
        ('Austin American-Statesman', 'Austin American-Statesman'),
    )
    for expected, raw_name in kernel_cases:
        self.assertEqual(expected, OrganizationNameCleaver(raw_name).parse().kernel())

    # these were new after the hyphen rewrite
    # Each pair is (expected expanded name, raw organization name).
    expand_cases = (
        ('Coca-Cola Company', 'Coca-Cola Co'),  # used to return 'Coca'
        ('Rolls-Royce PLC', 'Rolls-Royce PLC'),  # used to return 'Rolls'
    )
    for expected, raw_name in expand_cases:
        self.assertEqual(expected, OrganizationNameCleaver(raw_name).parse().expand())

def test_drop_postname_hyphen_phrases(self):
    """Check which trailing hyphenated phrases without_extra_phrases() keeps and which it drops."""
    # The long single-word suffix "ILLINOIS" directly after the hyphen is
    # expected to stay attached to the name. The earlier expectation that it
    # be stripped contradicted this assertion for the same input and has
    # been removed.
    self.assertEqual('Lawyers For Better Government-Illinois', OrganizationNameCleaver('LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases())
    # A suffix set off by whitespace around the hyphen (" - JOFPAC") is
    # expected to be dropped.
    self.assertEqual('Jobs Opportunity And Freedom Political Action Committee', OrganizationNameCleaver('JOBS OPPORTUNITY AND FREEDOM POLITICAL ACTION COMMITTEE - JOFPAC').parse().without_extra_phrases())

def test_kernel(self):
Expand Down