# Dialogic Conversion

In [1]:
import nltk
import re

# Make sure the "punkt" NLTK data package is available before tokenizing;
# the chunking function below calls nltk.sent_tokenize, which relies on NLTK's
# sentence-tokenizer data. When the package is already installed, NLTK just
# reports it as up-to-date.
# NOTE(review): newer NLTK releases reportedly look for "punkt_tab" instead of
# "punkt" - confirm against the installed NLTK version.
nltk.download("punkt")

def split_text_into_chunks(text, max_tokens=128):
    """
    Group the sentences of `text` into chunks of roughly `max_tokens` tokens.

    The text is split into sentences with `nltk.sent_tokenize`, and a
    sentence's token count is its number of whitespace-separated words.
    Sentences are packed greedily, in order: a chunk is closed as soon as
    adding the next sentence would push its token count above `max_tokens`.
    Sentences are never split, so a single sentence longer than `max_tokens`
    ends up as a chunk of its own that exceeds the limit.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk (default 128).

    Returns:
        A list of strings, each one the sentences of a chunk joined by a
        single space. Empty when the tokenizer yields no sentences.
    """
    finished = []
    pending = []
    pending_tokens = 0

    for sentence in nltk.sent_tokenize(text):
        n_tokens = len(sentence.split())
        over_budget = pending_tokens + n_tokens > max_tokens

        # Flush the open chunk before it would overflow; nothing to flush if
        # the very first sentence is already over budget.
        if over_budget and pending:
            finished.append(" ".join(pending))

        if over_budget:
            # This sentence opens a brand-new chunk.
            pending, pending_tokens = [sentence], n_tokens
        else:
            pending.append(sentence)
            pending_tokens += n_tokens

    # Whatever is still open becomes the last chunk.
    if pending:
        finished.append(" ".join(pending))

    return finished

# Example usage: split a short passage with a 50-token budget and print each
# resulting chunk under a "Chunk <index>:" heading.
# The literal starts and ends with a newline and keeps its hard line breaks;
# the text is passed to the chunker as-is, without stripping.
text = """
Alan Watts was a British writer and speaker, known for interpreting and popularizing
Japanese, Chinese, and Indian traditions of Buddhist, Taoist, and Hindu philosophy
for a Western audience. He wrote extensively on personal identity, the true nature of reality,
and the pursuit of happiness. His works often tackled the tension between spirituality and
modern society, encouraging readers to explore alternative ways of perceiving life.
"""
# max_tokens=50 overrides the function's default of 128.
chunks = split_text_into_chunks(text, max_tokens=50)
for i, c in enumerate(chunks):
    # The trailing "\n" in the f-string plus print's own newline leaves a
    # blank line between consecutive chunks.
    print(f"Chunk {i}:\n{c}\n")


Chunk 0:

Alan Watts was a British writer and speaker, known for interpreting and popularizing
Japanese, Chinese, and Indian traditions of Buddhist, Taoist, and Hindu philosophy
for a Western audience. He wrote extensively on personal identity, the true nature of reality,
and the pursuit of happiness.

Chunk 1:
His works often tackled the tension between spirituality and
modern society, encouraging readers to explore alternative ways of perceiving life.



[nltk_data] Downloading package punkt to
[nltk_data]     /home/peacelovephysics/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
