In [57]:
# https://stackoverflow.com/questions/34805790/how-to-avoid-nltks-sentence-tokenizer-splitting-on-abbreviations

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)
text = "is THAT what you mean, Mrs. Hussey?"
sentences = sentence_splitter.tokenize(text)

sentences

['is THAT what you mean, Mrs. Hussey?']

In [58]:
from nltk.tokenize import sent_tokenize
# from watermarkers.SemStamp.sampling_utils import tokenize_sentences

import re
from typing import *

def handle_bullet_points(sentences):
    new_sentences = []
    digit_pattern = re.compile(r'^\d+\.$')
    
    i = 0

    num_sentences = len(sentences)
    # print(f"Num sentences: {num_sentences}")

    while i < num_sentences - 1:
        if digit_pattern.match(sentences[i].strip()):
            modified_sentence = f"{sentences[i].strip()} {sentences[i + 1]}"
            new_sentences.append(modified_sentence)
            # print(f"Adding {modified_sentence}")
            i += 1  # Skip the next element as it's already added
        else:
            new_sentences.append(sentences[i])
        i += 1
        # print(f"i={i}")

    # Add the last sentence as well, if we don't want to skip it
    if i == num_sentences - 1:
        new_sentences.append(sentences[-1])
    
    return new_sentences

def tokenize_sentences(text: str) -> List[str]:
    sentences = sent_tokenize(text)
    processed_sentences = handle_bullet_points(sentences)
    return processed_sentences



In [59]:
txt = "They work with their colleagues in the Senate and the House of Representatives to draft, debate, and pass legislation."

sents = sent_tokenize(txt)
sents

['They work with their colleagues in the Senate and the House of Representatives to draft, debate, and pass legislation.']

In [60]:
txt = """As a helpful personal assistant, a United States Senator has several key responsibilities that shape the country's laws, policies, and direction. Here are the main responsibilities of an American Senator:

1."""

sents = tokenize_sentences(txt)

sents

["As a helpful personal assistant, a United States Senator has several key responsibilities that shape the country's laws, policies, and direction.",
 'Here are the main responsibilities of an American Senator:\n\n1.']

In [61]:
txt = """2. Boran"""

sents = tokenize_sentences(txt)

sents

['2. Boran']

In [62]:
sample_text_1 = """
1. Apples
2. Bananas
3. Milk
4. Bread
5. Eggs
"""

sample_text_2 = """
1. Welcome and Introductions
2. Review of Previous Meeting Minutes
3. Project Updates
4. Budget Review
5. Q&A Session
"""

sample_text_3 = """
1. Preheat the oven to 350°F (175°C).
2. Grease and flour a 9x9 inch pan.
3. In a medium bowl, mix together flour, sugar, and baking powder.
4. Add eggs, milk, and butter; beat until smooth.
5. Pour batter into the prepared pan and bake for 30-35 minutes.
"""

sample_text_4 = """
1. High-resolution display
2. Long-lasting battery life
3. Fast processor
4. Multiple camera lenses
5. 5G connectivity
"""

sents = tokenize_sentences(sample_text_1)
print(sents)

sents = tokenize_sentences(sample_text_2)
print(sents)

sents = tokenize_sentences(sample_text_3)
print(sents)

sents = tokenize_sentences(sample_text_4)
print(sents)

['1. Apples\n2.', 'Bananas\n3.', 'Milk\n4.', 'Bread\n5.', 'Eggs']
['1. Welcome and Introductions\n2. Review of Previous Meeting Minutes\n3.', 'Project Updates\n4.', 'Budget Review\n5.', 'Q&A Session']
['1. Preheat the oven to 350°F (175°C).', '2. Grease and flour a 9x9 inch pan.', '3. In a medium bowl, mix together flour, sugar, and baking powder.', '4. Add eggs, milk, and butter; beat until smooth.', '5. Pour batter into the prepared pan and bake for 30-35 minutes.']
['1. High-resolution display\n2.', 'Long-lasting battery life\n3.', 'Fast processor\n4.', 'Multiple camera lenses\n5.', '5G connectivity']


### Test Cases

In [63]:
import unittest

# Define the test cases
class TestTokenizeSentences(unittest.TestCase):

    def test_bullet_points_with_different_formats(self):
        text = """
        1. First bullet point
        2) Second bullet point
        a. Third bullet point
        b) Fourth bullet point
        - Fifth bullet point
        """
        expected = [
            "1. First bullet point",
            "2) Second bullet point",
            "a. Third bullet point",
            "b) Fourth bullet point",
            "- Fifth bullet point"
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)
    
    def test_mixed_bullet_points_and_regular_sentences(self):
        text = """
        This is a regular sentence.
        1. This is a bullet point.
        This is another regular sentence following a bullet point.
        """
        expected = [
            "This is a regular sentence.",
            "1. This is a bullet point. This is another regular sentence following a bullet point."
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)
    
    def test_single_bullet_point_with_trailing_sentence(self):
        text = """
        1. Single bullet point
        Trailing sentence not part of the bullet point.
        """
        expected = [
            "1. Single bullet point Trailing sentence not part of the bullet point."
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)
    
    def test_bullet_points_with_punctuation(self):
        text = """
        1. This is the first bullet point.
        2. This is the second bullet point; with more text.
        3. Third bullet point: continues here.
        """
        expected = [
            "1. This is the first bullet point.",
            "2. This is the second bullet point; with more text.",
            "3. Third bullet point: continues here."
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)

# Run the tests
unittest.TextTestRunner().run(unittest.makeSuite(TestTokenizeSentences))

F.FF
FAIL: test_bullet_points_with_different_formats (__main__.TestTokenizeSentences)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_1585470/1629283226.py", line 22, in test_bullet_points_with_different_formats
    self.assertEqual(result, expected)
AssertionError: Lists differ: ['1. [13 chars]point\n        2) Second bullet point\n       [84 chars]int'] != ['1. [13 chars]point', '2) Second bullet point', 'a. Third bu[57 chars]int']

First differing element 0:
'1. First bullet point\n        2) Second bullet point\n        a.'
'1. First bullet point'

Second list contains 3 additional elements.
First extra element 2:
'a. Third bullet point'

- ['1. First bullet point\n        2) Second bullet point\n        a.',
+ ['1. First bullet point',
+  '2) Second bullet point',
-  'Third bullet point\n'
?                     --

+  'a. Third bullet point',
?   +++                   +

-  '        b) Fourth bullet p

<unittest.runner.TextTestResult run=4 errors=0 failures=3>

In [64]:
tokenize_sentences("""**Positive Impacts:**

1. **Job Creation:** A climate-focused bill could lead to an increase in jobs related to renewable energy, sustainable infrastructure, to green technology, and environmental conservation. This could boost local employment rates and stimulate economic growth.
2. **Investment Attraction:** Bipartisan support for climate action can attract investors seeking to capitalize on emerging clean technologies and sustainable industries. This influx of capital can revitalize local economies and create new business opportunities.
3. **Infrastructure Development:** Climate-resilient infrastructure projects, such as sea walls, levees, and green roofs can generate construction jobs and stimulate local spending.
4. **Innovation Hubs:** Regions with strong research institutions or existing cleantech industries may become hubs for innovation, to climate-related R&D, driving economic growth through knowledge-based entrepreneurship.

**Challenges and Risks:**

1. **Transition Costs:** The shift away from fossil fuels and towards cleaner energy sources can result in short-term job losses and economic disruption in regions heavily reliant on traditional energy industries.
2. **Regulatory Burden:** Stricter environmental regulations may increase compliance costs for businesses, potentially affecting their competitiveness and profitability.
3.""")



['**Positive Impacts:**\n\n1.',
 '**Job Creation:** A climate-focused bill could lead to an increase in jobs related to renewable energy, sustainable infrastructure, to green technology, and environmental conservation.',
 'This could boost local employment rates and stimulate economic growth.',
 '2. **Investment Attraction:** Bipartisan support for climate action can attract investors seeking to capitalize on emerging clean technologies and sustainable industries.',
 'This influx of capital can revitalize local economies and create new business opportunities.',
 '3. **Infrastructure Development:** Climate-resilient infrastructure projects, such as sea walls, levees, and green roofs can generate construction jobs and stimulate local spending.',
 '4. **Innovation Hubs:** Regions with strong research institutions or existing cleantech industries may become hubs for innovation, to climate-related R&D, driving economic growth through knowledge-based entrepreneurship.',
 '**Challenges and Ri