In [None]:
import os
import re
import sys
from collections.abc import Callable

import dspy
from rich import print
from sentence_transformers import SentenceTransformer, util

# Put the parent of the current working directory on sys.path so that the
# `text_processing` package imported right after this can be resolved.
src_parent = os.path.abspath(os.path.dirname(os.getcwd()))
sys.path.append(src_parent)

from text_processing.examples import InputOutput, get_all_examples

In [24]:
import nltk
from nltk.tokenize import sent_tokenize

# Make sure the Punkt sentence-tokenizer data is available before
# `sent_tokenize` is used further down.
# NOTE(review): newer NLTK releases are understood to load `punkt_tab`
# instead of `punkt` - confirm against the installed NLTK version.
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/thankgod/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
text = """
The concept of sustainability was originally coined in forestry, where it means never harvesting
more than what the forest yields in new growth. The word Nachhaltigkeit (the German term for
sustainability) was first used with this meaning in 1713. The concern with preserving natural
resources for the future is perennial, of course: undoubtedly our Palaeolithic ancestors worried about
their prey becoming extinct, and early farmers must have been apprehensive about maintaining soil
fertility.
"""



In [130]:
# All local models are served from the same Ollama endpoint.
OLLAMA_API_BASE = "http://localhost:11434"

# Candidate language models: one OpenAI model (key read from the environment)
# and three models addressed through the local Ollama endpoint.
gpt4 = dspy.LM("openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
llama1b = dspy.LM("ollama_chat/llama3.2:1b", api_base=OLLAMA_API_BASE)
gemma1b = dspy.LM("ollama_chat/gemma3:1b", api_base=OLLAMA_API_BASE)  # not capable
gemma3_270 = dspy.LM("ollama_chat/gemma3:270m", api_base=OLLAMA_API_BASE)

# Make the OpenAI model the configured default LM.
dspy.configure(lm=gpt4)

# Smoke test: a one-off prediction to confirm the configured LM responds.
dspy.Predict("question -> answer")(question="say hello!")

Prediction(
    answer='Hello!'
)

In [13]:
# dspy.settings.configure(lm=lm)

Prediction(
    answer='Hello! How can I assist you today?'
)

In [15]:
get_all_examples()[0].output

['\n            The concept of sustainability was originally coined in forestry,\n            where it means never harvesting more than what the forest yields in new growth [2].\n            The word Nachhaltigkeit (the German term for sustainability)\n            was first used with this meaning in 1713 [3].\n            The concern with preserving natural resources for the future is perennial,\n            of course: undoubtedly our Palaeolithic ancestors worried about their prey\n            becoming extinct, and early farmers must have been apprehensive about maintaining soil\n            fertility. Traditional beliefs enjoined thinking in terms of stewardship and concern\n            for future generations, as expressed in the oft-quoted words of a Nigerian tribal\n            chief who saw the community as consisting of “many dead, few living and countless\n            others unborn” [4,5]. Perhaps there have always been two opposing views of the relation\n            between hum

In [16]:
#text = get_example(0).output[0]
def strip_text(text):
    """Trim ``text`` and collapse every run of whitespace into one space."""
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

In [17]:
def chunk_examples(
    examples: "Callable[[], list[InputOutput]] | list[InputOutput]",
) -> "list[dspy.Example]":
    """Convert raw input/output records into DSPy examples for the splitter.

    Args:
        examples: Either a list of ``InputOutput`` records or a zero-argument
            callable returning one (e.g. ``get_all_examples``). Passing the
            callable is the form the notebook has always used; passing the
            list directly is now accepted as well.

    Returns:
        One ``dspy.Example`` per record. ``text_block`` holds the
        whitespace-normalised ``input`` and ``splits`` the
        whitespace-normalised items of ``output``; ``text_block`` is marked
        as the only input field.
    """
    # Resolve the loader exactly once. The previous version called
    # `examples()` again for every field of every record.
    records = examples() if callable(examples) else examples
    return [
        dspy.Example(
            text_block=strip_text(record.input),
            splits=[strip_text(output) for output in record.output],
        ).with_inputs("text_block")
        for record in records
    ]

data=chunk_examples(get_all_examples)
data

[Example({'text_block': 'The concept of sustainability was originally coined in forestry, where it means never harvesting more than what the forest yields in new growth [2]. The word Nachhaltigkeit (the German term for sustainability) was first used with this meaning in 1713 [3]. The concern with preserving natural resources for the future is perennial, of course: undoubtedly our Palaeolithic ancestors worried about their prey becoming extinct, and early farmers must have been apprehensive about maintaining soil fertility. Traditional beliefs enjoined thinking in terms of stewardship and concern for future generations, as expressed in the oft-quoted words of a Nigerian tribal chief who saw the community as consisting of “many dead, few living and countless others unborn” [4,5]. Perhaps there have always been two opposing views of the relation between humankind and nature: one which stresses adaptation and harmony, and another which sees nature as something to be conquered. While this l

In [18]:
data[0].text_block

'The concept of sustainability was originally coined in forestry, where it means never harvesting more than what the forest yields in new growth [2]. The word Nachhaltigkeit (the German term for sustainability) was first used with this meaning in 1713 [3]. The concern with preserving natural resources for the future is perennial, of course: undoubtedly our Palaeolithic ancestors worried about their prey becoming extinct, and early farmers must have been apprehensive about maintaining soil fertility. Traditional beliefs enjoined thinking in terms of stewardship and concern for future generations, as expressed in the oft-quoted words of a Nigerian tribal chief who saw the community as consisting of “many dead, few living and countless others unborn” [4,5]. Perhaps there have always been two opposing views of the relation between humankind and nature: one which stresses adaptation and harmony, and another which sees nature as something to be conquered. While this latter view may have been

In [19]:
#from typing import List

# Signature for the text-splitting task: one text block in, a list of string
# splits out.
# NOTE(review): documented with `#` comments rather than a class docstring
# because DSPy is understood to use a Signature's docstring as the prompt
# instructions - adding one would change the prompt. Confirm before converting.
class TextSplitter(dspy.Signature):
    # Input: the complete passage to be split.
    text_block: str = dspy.InputField(desc="Full text block")
    # Output: the splits as a list of strings; the description asks for
    # non-paraphrased, semantically consistent pieces of roughly 100-250 words.
    splits: list[str] = dspy.OutputField(desc="No paraphrase of full text block in a list of semantically consistent splits usually 100-250 words in each split")


In [20]:
dspy.Predict(TextSplitter)(text_block=data[1].text_block).splits

["The Brundtland report speaks of two concerns that should be reconciled: development and the environment. They can also be interpreted as needs versus resources, or as the short versus the long term. Today, however, sustainability is almost always seen in terms of three dimensions: social, economic and environmental. This is embodied in the definition of sustainability adopted by the United Nations in its Agenda for Development: Development is a multidimensional undertaking to achieve a higher quality of life for all people. Economic development, social development and environmental protection are interdependent and mutually reinforcing components of sustainable development. But what are economic and social development and how are they different? Robert Gibson, a political scientist, says that the distinction is needed because 'material gains are not sufficient measures or preservers of human well-being.' The same author also suggests that the three dimensions or 'pillars' reflect the

In [21]:
data[1].splits

['The Brundtland report speaks of two concerns that should be reconciled: development and the environment. They can also be interpreted as needs versus resources, or as the short versus the long term. Today, however, sustainability is almost always seen in terms of three dimensions: social, economic and environmental [8–11]. This is embodied in the definition of sustainability adopted by the United Nations in its Agenda for Development: Development is a multidimensional undertaking to achieve a higher quality of life for all people. Economic development, social development and environmental protection are interdependent and mutually reinforcing components of sustainable development [12]. But what are economic and social development and how are they different? Robert Gibson, a political scientist, says that the distinction is needed because “material gains are not sufficient measures or preservers of human well-being” [13]. The same author also suggests that the three dimensions or ‘pil

In [105]:
text_input = data[1].text_block
print(text_input)

In [106]:
# Break the passage into sentences with NLTK's sentence tokenizer.
sentences = sent_tokenize(text_input)

# List every sentence with a 1-based index for manual inspection.
for i, sentence in enumerate(sentences, start=1):
    print(f"{i}: {sentence}")

In [None]:
# sentences[0], sentences[1]

# class CompareSentence(dspy.Signature):
#     sentence1: str = dspy.InputField(desc="Previous Sentence")
#     sentence2: str = dspy.InputField(desc="Current Sentence")
#     output: bool = dspy.OutputField(desc="Decide if sentence 2 is a likely continuation of sentence 1 up to 90 percent semantic consistency")
    

In [53]:
sentences[0:2]

['The concept of sustainability was originally coined in forestry, where it means never harvesting more than what the forest yields in new growth [2].',
 'The word Nachhaltigkeit (the German term for sustainability) was first used with this meaning in 1713 [3].']

In [64]:
# Load the sentence-embedding model; later cells reuse `model` as well.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Create embeddings
embedding1 = model.encode(sentences[0], convert_to_tensor=True)
embedding2 = model.encode(sentences[1], convert_to_tensor=True)
# Cosine similarity between the first two sentences only, unwrapped from the
# tensor with .item(). `score` therefore describes sentences[0] vs sentences[1].
score = util.pytorch_cos_sim(embedding1, embedding2).item()

In [49]:
sentences[1:3]

['The word Nachhaltigkeit (the German term for sustainability) was first used with this meaning in 1713 [3].',
 'The concern with preserving natural resources for the future is perennial, of course: undoubtedly our Palaeolithic ancestors worried about their prey becoming extinct, and early farmers must have been apprehensive about maintaining soil fertility.']

In [117]:
# Signature for judging whether two sentences belong in the same chunk, given
# both sentences and a precomputed embedding similarity.
# NOTE(review): documented with `#` comments rather than a class docstring
# because DSPy is understood to use a Signature's docstring as the prompt
# instructions - confirm before converting.
class CompareSentence(dspy.Signature):
    # Inputs: the two sentences, in reading order.
    sentence1: str = dspy.InputField(desc="Previous sentence")
    sentence2: str = dspy.InputField(desc="Next sentence")
    # Input: similarity supplied by the caller; it must be computed for this
    # exact sentence pair to be meaningful.
    similarity_score: float = dspy.InputField(desc="Cosine similarity between sentence embeddings (range -1 to 1)")
    # Outputs: a free-text relationship label and the keep/split decision.
    relation: str = dspy.OutputField(desc="Relationship type")
    keep_together: bool = dspy.OutputField(desc="True if they belong in the same chunk")


In [118]:
# Ask the LM whether two adjacent sentences belong in the same chunk.
sentence_comparison = dspy.ChainOfThought(CompareSentence)

# The sentence pair under test.
pair_first = sentences[10]
pair_second = sentences[11]

# Compute the similarity for *this* pair. The earlier `score` variable was
# computed for sentences[0] and sentences[1], so passing it here gave the LM
# a similarity value that did not describe the sentences it was shown.
pair_score = float(util.pytorch_cos_sim(
    model.encode(pair_first, convert_to_tensor=True),
    model.encode(pair_second, convert_to_tensor=True),
).item())

prediction = sentence_comparison(
    sentence1=pair_first,
    sentence2=pair_second,
    similarity_score=pair_score,
)
print(prediction)

In [131]:
# Signature for the incremental chunking decision: given the preceding
# sentences, a candidate sentence and their similarity, decide whether the
# candidate stays in the current block.
# NOTE(review): documented with `#` comments rather than a class docstring
# because DSPy is understood to use a Signature's docstring as the prompt
# instructions - confirm before converting.
class DecideChunkSplit(dspy.Signature):
    # Input: the sentences that precede the candidate, joined into one string.
    context: str = dspy.InputField(desc="Previous 3–5 sentences")
    # Input: the sentence whose placement is being decided.
    candidate_sentence: str = dspy.InputField(desc="Next sentence to consider")
    # Input: similarity supplied by the caller for candidate vs. context.
    similarity_score: float = dspy.InputField(desc="Cosine similarity between candidate and previous sentences")
    # Output: False signals that a new block should start at the candidate.
    keep_together: bool = dspy.OutputField(desc="True if sentence continues current block, else False")
    #reasoning: str = dspy.OutputField(desc="Brief explanation for the decision")


In [132]:
# Greedy LLM-assisted chunking: walk the sentences in order and, from sentence
# index `window_size` onwards, ask the LM whether each new sentence continues
# the current chunk or starts a new one.
window_size = 3
chunks = []
current_chunk = []

# Build the predictor once; it does not depend on anything inside the loop.
sentence_comparison = dspy.ChainOfThought(DecideChunkSplit)

for i, sent in enumerate(sentences):
    current_chunk.append(sent)
    if i >= window_size:
        # Context is the (up to) `window_size` sentences *before* the
        # candidate. The candidate was already appended above, so it must be
        # excluded; previously the context shown to the LM contained the
        # candidate itself while the similarity used a different window.
        context = " ".join(current_chunk[-window_size - 1:-1])
        sim_score = float(util.pytorch_cos_sim(
            model.encode(sent, convert_to_tensor=True),
            model.encode(context, convert_to_tensor=True),
        ).item())

        decision = sentence_comparison(
            context=context,
            candidate_sentence=sent,
            similarity_score=sim_score,
        )

        if not decision.keep_together:
            # Split here: close the chunk without the candidate, which then
            # becomes the first sentence of the next chunk.
            chunks.append(" ".join(current_chunk[:-1]))
            current_chunk = [sent]

# Flush the trailing chunk. Without this, every sentence after the last split
# (or the whole text when no split occurs) was silently dropped.
if current_chunk:
    chunks.append(" ".join(current_chunk))


In [133]:
chunks

['The Brundtland report speaks of two concerns that should be reconciled: development and the environment. They can also be interpreted as needs versus resources, or as the short versus the long term. Today, however, sustainability is almost always seen in terms of three dimensions: social, economic and environmental [8-11].',
 'This is embodied in the definition of sustainability adopted by the United Nations in its Agenda for Development: Development is a multidimensional undertaking to achieve a higher quality of life for all people. Economic development, social development and environmental protection are interdependent and mutually reinforcing components of sustainable development [12]. But what are economic and social development and how are they different?',
 'Robert Gibson, a political scientist, says that the distinction is needed because “material gains are not sufficient measures or preservers of human well-being” [13].',
 'The same author also suggests that the three dimens

In [107]:
# Embed all sentences with a single encode call, then compute the cosine
# similarity of every sentence against every other one and move the result
# to a NumPy array on the CPU.
embeddings = model.encode(sentences, convert_to_tensor=True)
similarities = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()
# Display the matrix as the cell output.
similarities

array([[1.        , 0.32952085, 0.41659182, ..., 0.38608938, 0.3074102 ,
        0.4594906 ],
       [0.32952085, 1.        , 0.30671704, ..., 0.3280038 , 0.36409843,
        0.41086665],
       [0.41659182, 0.30671704, 1.0000001 , ..., 0.5469956 , 0.39978334,
        0.58567995],
       ...,
       [0.38608938, 0.3280038 , 0.5469956 , ..., 1.0000001 , 0.36492258,
        0.5056557 ],
       [0.3074102 , 0.36409843, 0.39978334, ..., 0.36492258, 1.        ,
        0.3486545 ],
       [0.4594906 , 0.41086665, 0.58567995, ..., 0.5056557 , 0.3486545 ,
        1.        ]], shape=(38, 38), dtype=float32)

In [94]:
import numpy as np

In [112]:
# How many preceding sentences each sentence is compared against.
window_size = 2
# A sentence whose mean similarity to that window falls below this value
# is treated as the start of a new chunk.
threshold = 0.3
cut_points = []

for i in range(1, len(sentences)):
    # Window of up to `window_size` sentences immediately before sentence i.
    window_start = max(0, i - window_size)
    prev_window = embeddings[window_start:i]
    window_scores = util.pytorch_cos_sim(embeddings[i], prev_window).cpu().numpy()
    avg_similarity = np.mean(window_scores)

    # Only sentences that fall below the threshold become cut points.
    if not avg_similarity < threshold:
        continue
    cut_points.append(i)

print("Cut points:", cut_points)


In [99]:
# Turn the cut indices into chunks: each chunk runs from the previous cut up
# to (not including) the next one, with the end of the text as the final
# boundary.
chunks = []
start = 0
boundaries = [*cut_points, len(sentences)]
for cut in boundaries:
    chunk = " ".join(sentences[start:cut])
    chunks.append(chunk)
    start = cut

# Report each chunk together with its whitespace-delimited word count.
for idx, c in enumerate(chunks, start=1):
    print(f"Chunk {idx} ({len(c.split())} words):\n{c}\n---")


In [None]:

# Ask the LM whether sentence 1 and sentence 3 belong in the same chunk.
sentence_comparison = dspy.ChainOfThought(CompareSentence)

# `sentences[1:2]` is a one-element list, but the signature declares
# `sentence1` as a string, so join the slice into plain text.
earlier_sentence = " ".join(sentences[1:2])
later_sentence = sentences[3]

# `similarity_score` is a declared input of CompareSentence and was missing
# from the original call; compute it for exactly this pair.
pair_similarity = float(util.pytorch_cos_sim(
    model.encode(earlier_sentence, convert_to_tensor=True),
    model.encode(later_sentence, convert_to_tensor=True),
).item())

# Call the module directly. Its result is already a Prediction; the original
# passed that result to dspy.Predict, which expects a signature instead.
prediction = sentence_comparison(
    sentence1=earlier_sentence,
    sentence2=later_sentence,
    similarity_score=pair_similarity,
)
print(prediction)

In [22]:
# Run the TextSplitter signature through a chain-of-thought module on the
# text block of data[1] and print the whole prediction.
# Note: this rebinds the notebook-level name `prediction`.
prediction = dspy.ChainOfThought(TextSplitter)(text_block=data[1].text_block)
print(prediction)

In [45]:
# dspy.Predict(dspy.ChainOfThought(TextSplitter)) (text_block=data[0].text_block).splits

In [None]:
# generate_splits = dspy.Predict(TextSplitter)
# prediction = generate_splits(text_block=data[0].text_block)
# data[0].splits == prediction

False

In [30]:
data[0].splits 

['The concept of sustainability was originally coined in forestry, where it means never harvesting more than what the forest yields in new growth [2]. The word Nachhaltigkeit (the German term for sustainability) was first used with this meaning in 1713 [3]. The concern with preserving natural resources for the future is perennial, of course: undoubtedly our Palaeolithic ancestors worried about their prey becoming extinct, and early farmers must have been apprehensive about maintaining soil fertility. Traditional beliefs enjoined thinking in terms of stewardship and concern for future generations, as expressed in the oft-quoted words of a Nigerian tribal chief who saw the community as consisting of “many dead, few living and countless others unborn” [4,5]. Perhaps there have always been two opposing views of the relation between humankind and nature: one which stresses adaptation and harmony, and another which sees nature as something to be conquered. While this latter view may have bee

In [31]:
prediction.splits

["The concept of sustainability was originally coined in forestry, where it means never harvesting more than what the forest yields in new growth. The word Nachhaltigkeit (the German term for sustainability) was first used with this meaning in 1713. The concern with preserving natural resources for the future is perennial, of course: undoubtedly our Palaeolithic ancestors worried about their prey becoming extinct, and early farmers must have been apprehensive about maintaining soil fertility. Traditional beliefs enjoined thinking in terms of stewardship and concern for future generations, as expressed in the oft-quoted words of a Nigerian tribal chief who saw the community as consisting of 'many dead, few living and countless others unborn'. Perhaps there have always been two opposing views of the relation between humankind and nature: one which stresses adaptation and harmony, and another which sees nature as something to be conquered.",
 'While this latter view may have been rather d

In [17]:
data[0].splits == prediction.splits

False

In [20]:

def text_splitter_metric(example: dspy.Example, prediction, trace=None) -> bool:
    """Exact-match metric for the text splitter.

    Returns True only when the predicted ``splits`` compare equal to the
    reference ``splits`` of ``example``. ``trace`` is accepted but not used.
    """
    predicted_splits = prediction.splits
    reference_splits = example.splits
    return predicted_splits == reference_splits


# The program under evaluation. `text_splitter` is referenced below and in
# the optimizer cell but was not defined anywhere in the notebook, so a fresh
# kernel failed here with a NameError. The logged prompt for this program
# contains a `reasoning` output field, which points to a chain-of-thought
# module over TextSplitter.
text_splitter = dspy.ChainOfThought(TextSplitter)

# Evaluator that scores a program on the examples with the exact-match metric.
evaluate_correctness = dspy.Evaluate(
    devset=data,
    metric=text_splitter_metric,
    num_threads=24,
    display_progress=True,
    # Was misspelled `diplay_table`, so the results table was never enabled.
    display_table=True,
)

evaluate_correctness(text_splitter, devset=data)

Average Metric: 0.00 / 2 (0.0%): 100%|██████████| 2/2 [00:00<00:00, 1226.05it/s]

2025/08/14 17:39:04 INFO dspy.evaluate.evaluate: Average Metric: 0 / 2 (0.0%)





0.0

In [27]:
# MIPROv2 optimizer configured with the exact-match splitter metric.
# NOTE(review): constructed here but not used by the compile call below.
mipro_optimizer = dspy.MIPROv2(
    metric=text_splitter_metric,
    auto="medium",
)
# Few-shot bootstrapping optimizer using the same metric.
# NOTE(review): the variable name is spelled `bootstap_optimizer` (sic); it is
# left unchanged because other cells may refer to it.
bootstap_optimizer = dspy.BootstrapFewShot(
    metric=text_splitter_metric,
    #auto="medium",
)
# Compile `text_splitter` with the bootstrap optimizer, using `data` as the
# training set. `text_splitter` must already exist in the kernel.
optimized_text_splitter = bootstap_optimizer.compile(
    text_splitter,
    trainset=data,
    #max_bootstrapped_demos=4,
    # minibatch=False
    #requires_permission_to_run=False
)

100%|██████████| 2/2 [00:13<00:00,  6.59s/it]


Bootstrapped 0 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.


In [12]:
# Show the most recent LM call. `inspect_history` writes the formatted
# history itself, so wrapping it in `print` only appended a stray "None".
dspy.inspect_history(n=1)





[34m[2025-08-14T17:05:47.010257][0m

[31mSystem message:[0m

Your input fields are:
1. `text_block` (str): Full text block
Your output fields are:
1. `reasoning` (str): 
2. `splits` (list[str]): Exact splits of text chunks usually 100-200 words from text block
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text_block ## ]]
{text_block}

[[ ## reasoning ## ]]
{reasoning}

[[ ## splits ## ]]
{splits}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are a sustainability expert tasked with analyzing the provided text block. Your goal is to identify and produce key splits that reflect the multifaceted nature of sustainability, incorporating historical influences, economic theories, and philosophical viewpoints. Consider how these elements interact and contribute to a deeper un