In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%aimport openai, pandas, time, os, re, math, ast

import pandas as pd
import os
import ast

In [4]:
from openai import OpenAI

In [5]:
from math import ceil

In [6]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

set_working_directory("../../books/private_books")

In [7]:
# Queries requested per word of input text, one ratio per query-length class.
# calc_query_counts multiplies each ratio by the word count and rounds up.
LONG_QUERY_RATIO = 0.003
MEDIUM_QUERY_RATIO = 0.003
SHORT_QUERY_RATIO = 0.005
# Upper bound that calc_query_counts applies to each of the three counts.
MAX_QUERY_COUNT = 20

In [8]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [9]:
system_instructions = """You are a thorough, insightful, and consistent assistant generating (query, text) pairs for a project aimed at training a BERT-based search model on finding relevant passages in the works of Thich Nhat Hanh. 
Think of queries from a wide range of people: those new to and curious about the Plum Village tradition up to experienced monastics researching Thay's life, teachings, or deep Buddhist principles.
The generated queries should capture the key concepts and themes from the text (without too much semantic overlap), some broad context questions generally relevant to Plum Village (not directly from the text), and also some detailed, or complex information specific to the text. 
Metadata, such as tagged titles, quotes, etc. can be used to guide generation. Queries specific to Named Entities in the text may also be relevant.
For the longer queries, focus on more complex or philosophical aspects of the text or connections to broader Buddhist teachings.
"""

In [10]:
print(system_instructions)

You are a thorough, insightful, and consistent assistant generating (query, text) pairs for a project aimed at training a BERT-based search model on finding relevant passages in the works of Thich Nhat Hanh. 
Think of queries from a wide range of people: those new to and curious about the Plum Village tradition up to experienced monastics researching Thay's life, teachings, or deep Buddhist principles.
The generated queries should capture the key concepts and themes from the text (without too much semantic overlap), some broad context questions generally relevant to Plum Village (not directly from the text), and also some detailed, or complex information specific to the text. 
Metadata, such as tagged titles, quotes, etc. can be used to guide generation. Queries specific to Named Entities in the text may also be relevant.
For the longer queries, focus on more complex or philosophical aspects of the text or connections to broader Buddhist teachings.



You are a thorough, insightful assistant generating (query, text) pairs to train a BERT-based search model on Thich Nhat Hanh's works. 
Generate queries for a range of audiences, from beginners in the Plum Village tradition to advanced monastics. 

Use metadata, such as titles, quotes, or gathas, as cues to identify central themes or key concepts in the text. Queries should capture essential topics, themes, or questions (without too much semantic overlap), including both broad context questions and detailed insights. 

For extended queries, explore deeper, complex, or philosophical connections to Thich Nhat Hanh’s life and teachings, as well as broader Buddhist principles.

In [11]:
user_input_wrapper = """Generate:

{short_query_count} queries: 1-3 words in length
{medium_query_count} queries: 4-6 words in length
{long_query_count} queries: full-sentence questions

From this text:
{text_segment}

Provide a pure plain text output of (query, text) pairs as a Python list of tuples. 
Do not include any Python code syntax such as python, triple backticks, or queries =. 
Only return the list content, with each pair in the form (query, text) as a tuple, in a clean text format.
For example: [(text, text), (text, text), ...]
"""

In [12]:
test_input = """
<section level="3" type="exercise">
<title>Exercise 7| Parts of the Body</title>

<sutra-quote> Further, the practitioner meditates on his very own body from the soles of the feet upwards and then from the hair on top of the head downwards, a body contained inside the skin and full of all the impurities which belong to the body: ‘Here is the hair of the head, the hairs on the body, the nails, teeth, skin, flesh, sinews, bones, bone marrow, kidneys, heart, liver, diaphragm, spleen, lungs, intestines, bowels, excrement, bile, phlegm, pus, blood, sweat, fat, tears, grease, saliva, mucus, synovic fluid, urine.’</sutra-quote>

This exercise brings us into even deeper contact with our body. Here we observe the body in all its parts, from the hair on the head to the skin on the soles of the feet. In the process of our observation, we scan all the parts of the body, including the brain, heart, lungs, gall bladder, spleen, blood, urine, and so forth. The Buddha gives us the example of a farmer pouring the contents of a sack filled with a variety of seeds onto the floor and then observing and identifying each kind of seed: “This is rice, these are beans, these are sesame seeds.”

We use our conscious breathing in order to observe mindfully all the parts of the body. For example: “Breathing in, I am aware of the hair on my head. Breathing out, I know that this is the hair on my head.” Breathing consciously helps us dwell in mindfulness more easily and sustain the work of observing each part of the body. In addition to the conscious breathing, we can use the method of silently calling each part of the body by name to enable these parts to become increasingly clear in the light of mindfulness.

Why do we need to observe in mindfulness the different parts of the body? First of all, it is to be in contact with the body. We often have the impression that we’re already totally in touch with our body, but often we’re wrong. Between us and our body there can be a large separation, and our body remains a stranger to us. Sometimes we hate our body. There are even people who see their body as a prison and a place of punishment. To come back to our body is to become familiar with it and to establish harmony with it. We know that if our body isn’t happy, we’re not happy, and so we want our body to be calm and peaceful. To do so, we come back to our body and make peace with it.

We can try touching the different parts of our body to make their acquaintance. We should touch each part in an affectionate and caring way. For several decades, our eyes, feet, and heart have done their work devotedly and faithfully with us and for us, but we never really give them much attention or express our gratitude to them. It’s necessary to establish a close relationship with our body.

The second reason for mindfully observing the different parts of the body is that each part can be the door to liberation and awakening. At first we’ll only recognize the presence of the part of the body being observed, but later we’ll come to see its true nature. Every hair on our head and every cell in our body contains the entire universe. Observing the interdependent nature of a single hair can help us to see into the nature of the universe.

The exercise of observing every part of the body begins with the hair on the head and goes down to the skin on the soles of the feet. Sometimes we observe just one part of the body deeply, such as our eyes, heart, or toe. In the process of observation from the head to the feet, some observations may spring up in our mind. For example, as we pass our heart, we may think, “My friend John has a heart condition. I must visit him soon to see if he’s all right.” We can note these observations and then continue with the work of observing the remaining parts of the body. Later we can return to those observations.

</section>
"""

In [13]:
test_input = normalize_quotes(test_input)

In [14]:
def count_words(text):
    r"""Return the number of word tokens in `text`.

    A token is a maximal run of regex word characters (``\w+``), so
    punctuation and markup delimiters split tokens: "we're" counts as two.
    """
    return sum(1 for _ in re.finditer(r'\w+', text))

In [15]:
def calc_query_counts(wc):
    """Return the (long, medium, short) query counts for a text of `wc` words.

    Each count is the word count multiplied by the matching ratio constant,
    rounded up, and then limited to MAX_QUERY_COUNT.
    """
    ratios = (LONG_QUERY_RATIO, MEDIUM_QUERY_RATIO, SHORT_QUERY_RATIO)
    lq, mq, sq = (min(MAX_QUERY_COUNT, ceil(ratio * wc)) for ratio in ratios)
    return lq, mq, sq

In [16]:
calc_query_counts(5000)

(15, 15, 20)

In [17]:
def generate_messages(text_segment):
    """Build the [system, user] chat messages for one text segment.

    The number of queries requested per length class comes from
    calc_query_counts applied to the segment's word count; those counts and
    the segment itself are substituted into user_input_wrapper.
    """
    n_long, n_medium, n_short = calc_query_counts(count_words(text_segment))

    user_prompt = user_input_wrapper.format(
        text_segment=text_segment,
        long_query_count=n_long,
        medium_query_count=n_medium,
        short_query_count=n_short,
    )

    system_message = {"role": "system", "content": system_instructions}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

In [18]:
def generate_queries(text_segment):
    """Send one text segment to the chat completions API and return the result.

    Returns the completion object on success. If the API call raises, the
    error is printed and None is returned instead.
    """
    request_messages = generate_messages(text_segment)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=request_messages,
        )
    except Exception as err:
        print(f"Error: {err}")
        return None
    else:
        return response

In [19]:
def get_completion_content(completion):
    """Return the message content of the first choice in `completion`."""
    first_choice = completion.choices[0]
    return first_choice.message.content

In [20]:
messages = generate_messages(test_input)
print(messages[0]['content'])
print(messages[1]['content'])

You are a thorough, insightful, and consistent assistant generating (query, text) pairs for a project aimed at training a BERT-based search model on finding relevant passages in the works of Thich Nhat Hanh. 
Think of queries from a wide range of people: those new to and curious about the Plum Village tradition up to experienced monastics researching Thay's life, teachings, or deep Buddhist principles.
The generated queries should capture the key concepts and themes from the text (without too much semantic overlap), some broad context questions generally relevant to Plum Village (not directly from the text), and also some detailed, or complex information specific to the text. 
Metadata, such as tagged titles, quotes, etc. can be used to guide generation. Queries specific to Named Entities in the text may also be relevant.
For the longer queries, focus on more complex or philosophical aspects of the text or connections to broader Buddhist teachings.

Generate:

4 queries: 1-3 words in l

Generate:

12 queries: 1-3 words in length
11 queries: 4-7 words in length
9 queries: full-sentence questions

From this text:


<section level="3" type="exercise">
<title>Exercise 7| Parts of the Body</title>

<sutra-quote> Further, the practitioner meditates on his very own body from the soles of the feet upwards and then from the hair on top of the head downwards, a body contained inside the skin and full of all the impurities which belong to the body: ‘Here is the hair of the head, the hairs on the body, the nails, teeth, skin, flesh, sinews, bones, bone marrow, kidneys, heart, liver, diaphragm, spleen, lungs, intestines, bowels, excrement, bile, phlegm, pus, blood, sweat, fat, tears, grease, saliva, mucus, synovic fluid, urine.’</sutra-quote>

This exercise brings us into even deeper contact with our body. Here we observe the body in all its parts, from the hair on the head to the skin on the soles of the feet. In the process of our observation, we scan all the parts of the body, including the brain, heart, lungs, gall bladder, spleen, blood, urine, and so forth. The Buddha gives us the example of a farmer pouring the contents of a sack filled with a variety of seeds onto the floor and then observing and identifying each kind of seed: “This is rice, these are beans, these are sesame seeds.”

We use our conscious breathing in order to observe mindfully all the parts of the body. For example: “Breathing in, I am aware of the hair on my head. Breathing out, I know that this is the hair on my head.” Breathing consciously helps us dwell in mindfulness more easily and sustain the work of observing each part of the body. In addition to the conscious breathing, we can use the method of silently calling each part of the body by name to enable these parts to become increasingly clear in the light of mindfulness.

Why do we need to observe in mindfulness the different parts of the body? First of all, it is to be in contact with the body. We often have the impression that we’re already totally in touch with our body, but often we’re wrong. Between us and our body there can be a large separation, and our body remains a stranger to us. Sometimes we hate our body. There are even people who see their body as a prison and a place of punishment. To come back to our body is to become familiar with it and to establish harmony with it. We know that if our body isn’t happy, we’re not happy, and so we want our body to be calm and peaceful. To do so, we come back to our body and make peace with it.

We can try touching the different parts of our body to make their acquaintance. We should touch each part in an affectionate and caring way. For several decades, our eyes, feet, and heart have done their work devotedly and faithfully with us and for us, but we never really give them much attention or express our gratitude to them. It’s necessary to establish a close relationship with our body.

The second reason for mindfully observing the different parts of the body is that each part can be the door to liberation and awakening. At first we’ll only recognize the presence of the part of the body being observed, but later we’ll come to see its true nature. Every hair on our head and every cell in our body contains the entire universe. Observing the interdependent nature of a single hair can help us to see into the nature of the universe.

The exercise of observing every part of the body begins with the hair on the head and goes down to the skin on the soles of the feet. Sometimes we observe just one part of the body deeply, such as our eyes, heart, or toe. In the process of observation from the head to the feet, some observations may spring up in our mind. For example, as we pass our heart, we may think, “My friend John has a heart condition. I must visit him soon to see if he’s all right.” We can note these observations and then continue with the work of observing the remaining parts of the body. Later we can return to those observations.

</section>


Provide a pure plain text output of (query, text) pairs as a Python list of tuples. 
Do not include any Python code syntax such as python, triple backticks, or queries =. 
Only return the list content, with each pair in the form (query, text) as a tuple, in a clean text format.
For example: [(text, text), (text, text), ...]

In [None]:
#completion = generate_queries(test_input)

'[  \n  ("Body meditation", "Further, the practitioner meditates on his very own body from the soles of the feet upwards and then from the hair on top of the head downwards, a body contained inside the skin and full of all the impurities which belong to the body: \'Here is the hair of the head, the hairs on the body, the nails, teeth, skin, flesh, sinews, bones, bone marrow, kidneys, heart, liver, diaphragm, spleen, lungs, intestines, bowels, excrement, bile, phlegm, pus, blood, sweat, fat, tears, grease, saliva, mucus, synovic fluid, urine.\'"),  \n  ("Mindful observation", "Why do we need to observe in mindfulness the different parts of the body? First of all, it is to be in contact with the body."),  \n  ("Interdependence", "Every hair on our head and every cell in our body contains the entire universe. Observing the interdependent nature of a single hair can help us to see into the nature of the universe."),  \n  ("Conscious breathing", "We use our conscious breathing in order to o

In [177]:
# Pull the raw text reply out of the completion object.
output = get_completion_content(completion)
# The prompt asks for a Python list of (query, text) tuples; literal_eval
# parses that literal without executing code. It raises (ValueError or
# SyntaxError) if the reply is not a valid Python literal.
queries_list = ast.literal_eval(output)

In [179]:
queries_list

[('Body meditation',
  "Further, the practitioner meditates on his very own body from the soles of the feet upwards and then from the hair on top of the head downwards, a body contained inside the skin and full of all the impurities which belong to the body: 'Here is the hair of the head, the hairs on the body, the nails, teeth, skin, flesh, sinews, bones, bone marrow, kidneys, heart, liver, diaphragm, spleen, lungs, intestines, bowels, excrement, bile, phlegm, pus, blood, sweat, fat, tears, grease, saliva, mucus, synovic fluid, urine.'"),
 ('Mindful observation',
  'Why do we need to observe in mindfulness the different parts of the body? First of all, it is to be in contact with the body.'),
 ('Interdependence',
  'Every hair on our head and every cell in our body contains the entire universe. Observing the interdependent nature of a single hair can help us to see into the nature of the universe.'),
 ('Conscious breathing',
  'We use our conscious breathing in order to observe mindfu

In [22]:
book_xml_str = get_text_from_file("TH_working4.xml")

In [23]:
from lxml import etree

def process_sections(section):
    """
    Collects section data with paragraph and word counts.

    Args:
    - section: A <section> element. Only its direct <p> children are counted
      for this section; nested <section> children are processed recursively.

    Returns:
    - A flat list of dicts with the keys "level", "type", "title",
      "paragraph_count" and "word_count": first the entry for `section`,
      then the entries for its nested sections in document order.
    """
    # Extract section attributes
    level = section.get("level")
    section_type = section.get("type", "")
    title = section.findtext("title", default="")

    # Count paragraphs and words within this section.
    paragraphs = section.findall("p")
    paragraph_count = len(paragraphs)
    # Use itertext() rather than p.text: .text holds only the text before the
    # first child element, so words following inline markup inside a
    # paragraph would otherwise be left out of the count.
    word_count = sum(count_words("".join(p.itertext())) for p in paragraphs)

    section_data = [{
        "level": level,
        "type": section_type,
        "title": title,
        "paragraph_count": paragraph_count,
        "word_count": word_count
    }]

    # Process nested sections recursively
    for sub_section in section.findall("section"):
        section_data.extend(process_sections(sub_section))

    return section_data

# Parse the book XML held in memory
root = etree.fromstring(book_xml_str)

# Collect all section data.
# process_sections() already descends into nested sections, so start only from
# sections that are not direct children of another section. Starting from every
# match of ".//section" would report each nested section once per ancestor.
all_section_data = []
for section in root.iterfind(".//section"):
    parent = section.getparent()
    if parent is root or parent.tag != "section":
        all_section_data.extend(process_sections(section))

# Example output for debugging
for section_info in all_section_data:
    print(section_info)

{'level': '1', 'type': 'note', 'title': 'A NOTE ON THE TEXT', 'paragraph_count': 0, 'word_count': 0}
{'level': '1', 'type': 'introduction', 'title': 'Introduction', 'paragraph_count': 0, 'word_count': 0}
{'level': '1', 'type': 'sutra', 'title': 'Sutra on the Four Establishments of Mindfulness', 'paragraph_count': 0, 'word_count': 0}
{'level': '1', 'type': 'summary', 'title': 'Summary of the Sutra', 'paragraph_count': 0, 'word_count': 0}
{'level': '1', 'type': 'exercise_group', 'title': 'Mindfulness Exercises', 'paragraph_count': 0, 'word_count': 0}
{'level': '2', 'type': 'exercise_group', 'title': 'EXERCISES FOR OBSERVING THE BODY', 'paragraph_count': 0, 'word_count': 0}
{'level': '2', 'type': 'note', 'title': 'REMARKS ON THE FIRST NINE EXERCISES', 'paragraph_count': 0, 'word_count': 0}
{'level': '2', 'type': 'exercise_group', 'title': 'EXERCISES FOR OBSERVING THE FEELINGS', 'paragraph_count': 0, 'word_count': 0}
{'level': '2', 'type': 'exercise_group', 'title': 'EXERCISES FOR OBSERVIN

In [24]:
from lxml import etree
import os


def generate_chunks(xml_filename, ignore_list=None):
    """
    Split an XML book into fine-grained and broad-scope text chunks.

    Every level 2 section that passes the filter becomes a fine-grained chunk;
    every level 1 section that passes the filter becomes a broad-scope chunk
    (serialized with all of its nested content). A section is filtered out
    when its 'type' contains any ignore keyword (case insensitive), or when it
    has neither direct <p> children nor direct <section> children.

    Args:
    - xml_filename: Name of the XML file; joined to the working directory when one is set.
    - ignore_list: Optional list of keywords matched against the section 'type'.

    Returns:
    - fine_grained_chunks: List of strings, one serialized level 2 section each.
    - broad_scope_chunks: List of strings, one serialized level 1 section each.
    """
    keywords = ignore_list or []

    # Resolve the file against the working directory, if there is one
    base_dir = get_working_directory()
    xml_path = os.path.join(base_dir, xml_filename) if base_dir else xml_filename

    root = etree.parse(xml_path).getroot()

    def should_ignore(section):
        """Return True when the section must be left out of the chunk lists."""
        section_type = section.get("type", "").lower()
        for keyword in keywords:
            if keyword.lower() in section_type:
                return True
        # Keep anything that has direct paragraphs or direct subsections
        if section.findall("p") or section.findall("section"):
            return False
        return True

    def serialize_matching(path):
        """Serialize every non-ignored section matched by `path`."""
        return [
            etree.tostring(section, encoding='unicode')
            for section in root.findall(path)
            if not should_ignore(section)
        ]

    fine_grained_chunks = serialize_matching(".//section[@level='2']")
    broad_scope_chunks = serialize_matching(".//section[@level='1']")

    return fine_grained_chunks, broad_scope_chunks

# Example usage: chunk the working XML file, skipping any section whose
# 'type' attribute contains "bibliographic-data".
xml_filename = "TH_working4.xml"
ignore_keywords = ["bibliographic-data"]

# Returns (level 2 chunks, level 1 chunks), each a list of serialized XML strings.
fine_grained_chunks, broad_scope_chunks = generate_chunks(xml_filename, ignore_list=ignore_keywords)

# Print samples for inspection (first three fine-grained, first broad-scope)
print("Fine-Grained Chunks Sample:", fine_grained_chunks[:3])
print("Broad-Scope Chunks Sample:", broad_scope_chunks[:1])

Fine-Grained Chunks Sample: ['<section level="2" type="sutra-section">\n            <title>I.</title>\n            <p>I heard these words of the Buddha one time when he was living at Kammassadharma, a\n                market town of the Kuru people. The Buddha addressed the bhikkhus, "O bhikkhus."</p>\n            <p>And the bhikkhus replied, "Venerable Lord."</p>\n            <p>The Buddha said, "Bhikkhus, there is a most wonderful way to help living beings\n                realize purification, overcome directly grief and sorrow, end pain and anxiety,\n                travel the right path, and realize nirvana. This way is the Four Establishments of\n                Mindfulness.</p>\n            <p>"What are the Four Establishments?</p>\n            <list>\n                <item>1. "Bhikkhus, a practitioner remains established in the observation of the\n                    body in the body, diligent, with clear understanding, mindful, having abandoned\n                    every cravi

In [25]:
print(fine_grained_chunks[0])

<section level="2" type="sutra-section">
            <title>I.</title>
            <p>I heard these words of the Buddha one time when he was living at Kammassadharma, a
                market town of the Kuru people. The Buddha addressed the bhikkhus, "O bhikkhus."</p>
            <p>And the bhikkhus replied, "Venerable Lord."</p>
            <p>The Buddha said, "Bhikkhus, there is a most wonderful way to help living beings
                realize purification, overcome directly grief and sorrow, end pain and anxiety,
                travel the right path, and realize nirvana. This way is the Four Establishments of
                Mindfulness.</p>
            <p>"What are the Four Establishments?</p>
            <list>
                <item>1. "Bhikkhus, a practitioner remains established in the observation of the
                    body in the body, diligent, with clear understanding, mindful, having abandoned
                    every craving and every distaste for this life.</item>

In [26]:
print(broad_scope_chunks[2])

<section level="1" type="sutra">
        <title>Sutra on the Four Establishments of Mindfulness</title>
        <sutra-source>Satipatthana Sutta (Theravada) from Majjhima Nikaya, 10.</sutra-source>
        <section level="2" type="sutra-section">
            <title>I.</title>
            <p>I heard these words of the Buddha one time when he was living at Kammassadharma, a
                market town of the Kuru people. The Buddha addressed the bhikkhus, "O bhikkhus."</p>
            <p>And the bhikkhus replied, "Venerable Lord."</p>
            <p>The Buddha said, "Bhikkhus, there is a most wonderful way to help living beings
                realize purification, overcome directly grief and sorrow, end pain and anxiety,
                travel the right path, and realize nirvana. This way is the Four Establishments of
                Mindfulness.</p>
            <p>"What are the Four Establishments?</p>
            <list>
                <item>1. "Bhikkhus, a practitioner remains establ

In [27]:
fgc = [count_words(chunk) for chunk in fine_grained_chunks]

In [28]:
fgc

[226,
 2132,
 339,
 429,
 2319,
 291,
 8390,
 1915,
 2987,
 5957,
 7242,
 424,
 504,
 640,
 1325,
 508,
 1188,
 4443,
 2765,
 2047]

In [45]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")

def token_count(text):
    """Return how many tokens the module-level `encoding` yields for `text`."""
    tokens = encoding.encode(text)
    return len(tokens)

fg_tokens = [token_count(chunk) for chunk in fine_grained_chunks]

In [46]:
fg_tokens

[368,
 2973,
 462,
 606,
 3159,
 424,
 11748,
 2678,
 4064,
 8025,
 10010,
 635,
 687,
 886,
 1835,
 745,
 1788,
 6483,
 4118,
 2948]

In [30]:
fine_grained_chunks[6]

'<section level="2" type="exercise_group">\n            <title>EXERCISES FOR OBSERVING THE BODY</title>\n            <p>The First Establishment of Mindfulness is the body, which includes the breath, the\n                positions of the body, the actions of the body, the parts of the body, the four\n                elements of which the body is composed, and the dissolution of the body.</p>\n            <section level="3" type="exercise">\n                <title>Exercise 1 | Conscious Breathing</title>\n                <sutra-quote>He goes to the forest, to the foot of a tree, or to an empty room, sits\n                    down\n                    cross-legged in the lotus position, holds his body straight, and establishes\n                    mindfulness in front of him. He breathes in, aware that he is breathing in. He\n                    breathes out, aware that he is breathing out.</sutra-quote>\n                <p>The first practice is the full awareness of breathing. When we br

In [31]:
bsc = [count_words(chunk) for chunk in broad_scope_chunks]

In [32]:
broad_scope_chunks[-1]

'<section level="1" type="appendix">\n        <title>Appendix Three Versions of the Sutra</title>\n        <section level="2" type="note">\n            <title>A NOTE ON THE HISTORY OF THE TEXTS</title>\n            <p>During his lifetime, the Buddha delivered his discourses in the Ardhamagadhi\n                language. But his teachings spread far beyond the area where that language was\n                spoken, and there is no doubt that in the practice centers that lay beyond the\n                alluvial plain of the Ganges River, the monks and nuns would have studied the Dharma\n                in their own local languages. One day in the Jetavana monastery, two monks, Yamelu\n                and Tekula, asked the Buddha if they could translate all his teachings into the\n                classical meter of the Vedic language. They told the Buddha that they wanted to do\n                so in order to protect the beauty and accuracy of the Dharma. But the Buddha did not\n           

In [33]:
fg_totals = [calc_query_counts(wc) for wc in fgc]
fg_totals

[(1, 1, 2),
 (7, 7, 11),
 (2, 2, 2),
 (2, 2, 3),
 (7, 7, 12),
 (1, 1, 2),
 (20, 20, 20),
 (6, 6, 10),
 (9, 9, 15),
 (18, 18, 20),
 (20, 20, 20),
 (2, 2, 3),
 (2, 2, 3),
 (2, 2, 4),
 (4, 4, 7),
 (2, 2, 3),
 (4, 4, 6),
 (14, 14, 20),
 (9, 9, 14),
 (7, 7, 11)]

In [34]:
sum([x+y+z for (x,y,z) in fg_totals])

466

In [35]:
bc_totals = [calc_query_counts(wc) for wc in bsc]
bc_totals

[(1, 1, 2),
 (4, 4, 7),
 (18, 18, 20),
 (2, 2, 3),
 (20, 20, 20),
 (11, 11, 18),
 (3, 3, 4),
 (20, 20, 20)]

In [36]:
sum([x+y+z for (x,y,z) in bc_totals])

252

In [37]:
150*3 * 15

6750

In [38]:
count_words(test_input)

720

In [39]:
8 / 718, 10 / 718

(0.011142061281337047, 0.013927576601671309)

In [40]:
def build_messages_from_chunks(chunk_list):
    """Return one generate_messages() payload per chunk, in input order."""
    return [generate_messages(chunk) for chunk in chunk_list]

In [41]:
test = build_messages_from_chunks(broad_scope_chunks)

In [42]:
test

[[{'role': 'system',
   'content': "You are a thorough, insightful, and consistent assistant generating (query, text) pairs for a project aimed at training a BERT-based search model on finding relevant passages in the works of Thich Nhat Hanh. \nThink of queries from a wide range of people: those new to and curious about the Plum Village tradition up to experienced monastics researching Thay's life, teachings, or deep Buddhist principles.\nThe generated queries should capture the key concepts and themes from the text (without too much semantic overlap), some broad context questions generally relevant to Plum Village (not directly from the text), and also some detailed, or complex information specific to the text. \nMetadata, such as tagged titles, quotes, etc. can be used to guide generation. Queries specific to Named Entities in the text may also be relevant.\nFor the longer queries, focus on more complex or philosophical aspects of the text or connections to broader Buddhist teaching

In [43]:
print(test[1][1]['content'])

Generate:

7 queries: 1-3 words in length
4 queries: 4-6 words in length
4 queries: full-sentence questions

From this text:
<section level="1" type="introduction">
        <title>Introduction: What Is Mindfulness?</title>
        <p>We practice mindfulness in order to realize liberation, peace, and joy in our everyday
            lives. Liberation and happiness are linked to each other; if there is liberation, there
            is happiness, and greater liberation brings greater happiness. If there is liberation,
            peace and joy exist in the present moment. We don't need to wait ten or fifteen years to
            realize them. They're available as soon as we begin the practice. However modest these
            elements may be, they form the basis for greater liberation, peace, and joy in the
            future.</p>
        <p>To practice meditation is to look deeply in order to see into the essence of things. With
            insight and understanding we can realize liberat

In [44]:
import json
import os

def create_jsonl_file_for_batch(system_message, user_messages, output_file_path="batch_requests.jsonl",
                                *, model="gpt-4o", endpoint="/v1/chat/completions"):
    """
    Creates a JSONL file for batch processing, with each request using the same system message and different user messages.

    Args:
        system_message (str): The system message to be included in each request.
        user_messages (list): List of user messages (each message will create a separate request).
        output_file_path (str): The path where the .jsonl file will be saved.
        model (str): Model name written into every request body. Keyword-only.
        endpoint (str): Value written to each request's "url" field; it should
            match the endpoint the batch is later created with. Keyword-only.

    Returns:
        str: The path to the generated .jsonl file.
    """
    # An explicit encoding keeps the file independent of the platform default.
    with open(output_file_path, "w", encoding="utf-8") as f:
        # Requests are numbered from 1 and written one JSON object per line.
        for i, user_message in enumerate(user_messages, start=1):
            request_obj = {
                "custom_id": f"request-{i}",
                "method": "POST",
                "url": endpoint,
                "body": {
                    "model": model,
                    "messages": [
                        {"role": "system", "content": system_message},
                        {"role": "user", "content": user_message}
                    ]
                }
            }
            json.dump(request_obj, f)
            f.write("\n")

    return output_file_path

def upload_batch_file(file_path, endpoint="/v1/chat/completions", completion_window="24h"):
    """
    Uploads the generated .jsonl file for batch processing and creates the batch.

    Args:
        file_path (str): Path to the .jsonl file containing batch requests.
        endpoint (str): The endpoint to be used for all requests in the batch (e.g., "/v1/chat/completions").
        completion_window (str): Time frame within which the batch should be processed (currently "24h" only).

    Returns:
        The batch object returned by client.batches.create.
    """
    # Upload the JSONL file with the purpose 'batch'. The context manager
    # closes the file handle once the upload call returns or raises.
    with open(file_path, "rb") as batch_file:
        upload_response = client.files.create(file=batch_file, purpose="batch")

    # The v1 OpenAI client returns response objects rather than dicts, so the
    # file id is read as an attribute, not by subscripting.
    input_file_id = upload_response.id

    # Create the batch with the uploaded file
    batch_response = client.batches.create(
        input_file_id=input_file_id,
        endpoint=endpoint,
        completion_window=completion_window
    )
    return batch_response