In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%aimport openai, pandas, time, os, re, math, ast, tiktoken

import pandas as pd
import os
import ast
import re
import tiktoken

In [None]:
from openai import OpenAI

In [None]:
from math import ceil

In [None]:
from data_processing.text_processing import get_text_from_file, set_working_directory, get_working_directory
from data_processing.text_processing import normalize_quotes

set_working_directory("../../books/private_books")

In [None]:
# Queries requested per input token, one ratio for each query length class.
# calc_query_counts multiplies each ratio by the chunk's token count and rounds up.
LONG_QUERY_RATIO = 0.007
MEDIUM_QUERY_RATIO = 0.007
SHORT_QUERY_RATIO = 0.007
# Upper bound applied by calc_query_counts to each of the three counts separately.
MAX_QUERY_COUNT = 25

In [None]:
encoding = tiktoken.encoding_for_model("gpt-4o")

def token_count(text):
    """Return how many tokens the module-level tiktoken `encoding` yields for `text`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

In [None]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [None]:
system_instructions = """You are a thorough, insightful, and consistent assistant generating (query, text) pairs to train a BERT-based search model on Thich Nhat Hanh's works. 
Generate queries for a range of audiences, from beginners in the Plum Village tradition to advanced monastics. 

Use metadata, such as titles, quotes, or gathas, as cues to identify central themes or key concepts in the text. 
Queries should capture essential topics, themes, or questions (without too much semantic overlap), including both broad scope questions, and questions about specific insights or details. 

For extended queries, explore deeper, complex, or philosophical connections to Buddhist Teachings and Thich Nhat Hanh's life and teachings. 
Novel or unexpected queries can also be considered.
"""

### Old text: modified with GPT help Nov 5, 2024
You are a thorough, insightful, and consistent assistant generating (query, text) pairs for a project aimed at training a BERT-based search model on finding relevant passages in the works of Thich Nhat Hanh. 
Think of queries from a wide range of people: those new to and curious about the Plum Village tradition up to experienced monastics researching Thay's life, teachings, or deep Buddhist principles.
The generated queries should capture the key concepts and themes from the text (without too much semantic overlap), some broad context questions generally relevant to Plum Village (not directly from the text), and also some detailed, or complex information specific to the text. 
Metadata, such as tagged titles, quotes, etc. can be used to guide generation. Queries specific to Named Entities in the text may also be relevant.
For the longer queries, focus on more complex or philosophical aspects of the text or connections to broader Buddhist teachings.

### New system message, generated with ChatGPT's help on Nov 5, 2024

You are a thorough, insightful assistant generating (query, text) pairs to train a BERT-based search model on Thich Nhat Hanh's works. 
Generate queries for a range of audiences, from beginners in the Plum Village tradition to advanced monastics. 

Use metadata, such as titles, quotes, or gathas, as cues to identify central themes or key concepts in the text. Queries should capture essential topics, themes, or questions (without too much semantic overlap), including both broad context questions and detailed insights. 

For extended queries, explore deeper, complex, or philosophical connections to Thich Nhat Hanh’s life and teachings, as well as broader Buddhist principles.

In [None]:
print(system_instructions)

In [None]:
token_count(system_instructions)

Generate 4 queries (1-3 words), 3 queries (4-6 words), and 3 full-sentence questions based on this text:

<section level="3" type="exercise">
<title>Exercise 7| Parts of the Body</title>
[more text here]

Only return the list of pairs in this format: [(query, text), (query, text), ...]. Avoid Python syntax, backticks, or any code elements.

In [None]:
user_input_wrapper = """Generate {short_query_count} queries (1-3 words), {medium_query_count} queries (4-6 words), and {long_query_count} full-sentence questions based on this text:
---

{text_segment}

---
The output will be imported into python using ast. Do not include any Python code syntax, variables, triple backticks or other code elements. 
Only return the list content, with each pair in the form (query, text) as a tuple.
For example: [("query", "text"), ("query", "text"), ...]. Where "query" is your specific query, and "text" is the unique matching phrase or sentence from the text.
"""

In [None]:
test_input = """            <section level="3" type="exercise">
                <title>Exercise 7| Parts of the Body</title>
                <sutra-quote>Further, the practitioner meditates on his very own body from the soles
                    of the
                    feet upwards and then from the hair on top of the head downwards, a body
                    contained inside the skin and full of all the impurities which belong to the
                    body: 'Here is the hair of the head, the hairs on the body, the nails, teeth,
                    skin, flesh, sinews, bones, bone marrow, kidneys, heart, liver, diaphragm,
                    spleen, lungs, intestines, bowels, excrement, bile, phlegm, pus, blood, sweat,
                    fat, tears, grease, saliva, mucus, synovic fluid, urine.'</sutra-quote>
                <p>This exercise brings us into even deeper contact with our body. Here we observe
                    the body in all its parts, from the hair on the head to the skin on the soles of
                    the feet. In the process of our observation, we scan all the parts of the body,
                    including the brain, heart, lungs, gall bladder, spleen, blood, urine, and so
                    forth. The Buddha gives us the example of a farmer pouring the contents of a
                    sack filled with a variety of seeds onto the floor and then observing and
                    identifying each kind of seed: "This is rice, these are beans, these are sesame
                    seeds."</p>
                <p>We use our conscious breathing in order to observe mindfully all the parts of the
                    body. For example: "Breathing in, I am aware of the hair on my head. Breathing
                    out, I know that this is the hair on my head." Breathing consciously helps us
                    dwell in mindfulness more easily and sustain the work of observing each part of
                    the body. In addition to the conscious breathing, we can use the method of
                    silently calling each part of the body by name to enable these parts to become
                    increasingly clear in the light of mindfulness.</p>
                <p>Why do we need to observe in mindfulness the different parts of the body? First
                    of all, it is to be in contact with the body. We often have the impression that
                    we're already totally in touch with our body, but often we're wrong. Between us
                    and our body there can be a large separation, and our body remains a stranger to
                    us. Sometimes we hate our body. There are even people who see their body as a
                    prison and a place of punishment. To come back to our body is to become familiar
                    with it and to establish harmony with it. We know that if our body isn't happy,
                    we're not happy, and so we want our body to be calm and peaceful. To do so, we
                    come back to our body and make peace with it.</p>
                <p>We can try touching the different parts of our body to make their acquaintance.
                    We should touch each part in an affectionate and caring way. For several
                    decades, our eyes, feet, and heart have done their work devotedly and faithfully
                    with us and for us, but we never really give them much attention or express our
                    gratitude to them. It's necessary to establish a close relationship with our
                    body.</p>
                <p>The second reason for mindfully observing the different parts of the body is that
                    each part can be the door to liberation and awakening. At first we'll only
                    recognize the presence of the part of the body being observed, but later we'll
                    come to see its true nature. Every hair on our head and every cell in our body
                    contains the entire universe. Observing the interdependent nature of a single
                    hair can help us to see into the nature of the universe.</p>
                <p>The exercise of observing every part of the body begins with the hair on the head
                    and goes down to the skin on the soles of the feet. Sometimes we observe just
                    one part of the body deeply, such as our eyes, heart, or toe. In the process of
                    observation from the head to the feet, some observations may spring up in our
                    mind. For example, as we pass our heart, we may think, "My friend John has a
                    heart condition. I must visit him soon to see if he's all right." We can note
                    these observations and then continue with the work of observing the remaining
                    parts of the body. Later we can return to those observations.</p>
            </section>"""

In [None]:
token_count(test_input)

In [None]:
4 / 977

In [None]:
test_input = normalize_quotes(test_input)

In [None]:
def count_words(text):
    """Count the runs of word characters (regex ``\\w+``) found in `text`.

    Punctuation splits words, so "it's" counts as two.
    """
    return sum(1 for _ in re.finditer(r'\w+', text))

In [None]:
def calc_query_counts(tokens):
    """Return (long, medium, short) query counts for a text of `tokens` tokens.

    Each count is its ratio constant times `tokens`, rounded up, and then
    capped at MAX_QUERY_COUNT.
    """
    ratios = (LONG_QUERY_RATIO, MEDIUM_QUERY_RATIO, SHORT_QUERY_RATIO)
    lq, mq, sq = (min(MAX_QUERY_COUNT, ceil(ratio * tokens)) for ratio in ratios)
    return lq, mq, sq

In [None]:
calc_query_counts(5000)

In [None]:
def generate_messages(text_segment):
    """Build the [system, user] chat messages requesting queries for one text segment.

    The number of long, medium and short queries asked for is derived from the
    segment's token count via calc_query_counts.
    """
    long_count, medium_count, short_count = calc_query_counts(token_count(text_segment))

    user_prompt = user_input_wrapper.format(
        text_segment=text_segment,
        short_query_count=short_count,
        medium_query_count=medium_count,
        long_query_count=long_count,
    )

    system_message = {"role": "system", "content": system_instructions}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

In [None]:
def generate_queries(text_segment):
    """Request a chat completion with generated queries for `text_segment`.

    Returns the completion object, or None when the API call raises (the
    error is printed rather than propagated).
    """
    request_messages = generate_messages(text_segment)

    try:
        return client.chat.completions.create(
            messages=request_messages,
            model="gpt-4o",
        )
    except Exception as e:
        # Best effort: report the failure and let the caller see None.
        print(f"Error: {e}")

    return None

In [None]:
def get_completion_content(completion):
    """Return the message text of the first choice in a chat completion."""
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
messages = generate_messages(test_input)
print(messages[0]['content'])
print(messages[1]['content'])

In [None]:
completion = generate_queries(test_input)

In [None]:
output = get_completion_content(completion)
print(output)
# Parse the reply so the `queries_list` cell that follows has a value to show.
# The model does not always return a valid Python literal, so a parse failure
# is reported and leaves an empty list instead of stopping the notebook.
try:
    queries_list = ast.literal_eval(output)
except (SyntaxError, ValueError) as err:
    print(f"Could not parse model output as a Python literal: {err}")
    queries_list = []

In [None]:
queries_list

In [None]:
book_xml_str = get_text_from_file("TH_working4.xml")

In [None]:
from lxml import etree

def process_sections(section):
    """
    Collects section data with paragraph and word counts.

    Args:
    - section: A <section> element.

    Returns:
    - A list of dicts (keys: level, type, title, paragraph_count, word_count),
      starting with `section` itself, followed by the entries for its directly
      nested <section> children, gathered recursively.
    """
    # Only <p> elements that are direct children of this section are counted.
    paragraphs = section.findall("p")

    # itertext() walks the whole paragraph subtree. `p.text` alone stops at the
    # first child element, so words after inline markup would be missed.
    word_count = sum(count_words("".join(p.itertext())) for p in paragraphs)

    section_data = [{
        "level": section.get("level"),
        "type": section.get("type", ""),
        "title": section.findtext("title", default=""),
        "paragraph_count": len(paragraphs),
        "word_count": word_count
    }]

    # Process nested sections recursively
    for sub_section in section.findall("section"):
        section_data.extend(process_sections(sub_section))

    return section_data

# Parse XML and collect data for every section exactly once
root = etree.fromstring(book_xml_str)

all_section_data = []
for section in root.findall(".//section"):
    parent = section.getparent()
    # A section whose parent is another section found by this search is already
    # reached by process_sections' recursion from that parent. Handing it over
    # again would report it (and everything nested in it) more than once.
    if parent is not None and parent is not root and parent.tag == "section":
        continue
    all_section_data.extend(process_sections(section))

# Example output for debugging
for section_info in all_section_data:
    print(section_info)

In [None]:
from lxml import etree
import os


def generate_chunks(xml_filename, ignore_list=None):
    """
    Splits an XML book into fine-grained and broad-scope text chunks.

    Every <section level="2"> becomes a fine-grained chunk and every
    <section level="1"> (including whatever is nested in it) becomes a
    broad-scope chunk. A section is left out when its 'type' attribute contains
    one of the ignore keywords (case insensitive), or when it has neither
    direct <p> children nor direct <section> children.

    Args:
    - xml_filename: Name of the XML file, resolved against the working directory
      when one is set.
    - ignore_list: Optional list of keywords matched against the 'type' attribute.

    Returns:
    - fine_grained_chunks: List of XML strings, one per kept level 2 section.
    - broad_scope_chunks: List of XML strings, one per kept level 1 section.
    """
    base_dir = get_working_directory()
    xml_path = os.path.join(base_dir, xml_filename) if base_dir else xml_filename

    root = etree.parse(xml_path).getroot()
    keywords = ignore_list or []

    def should_ignore(section):
        """True when the section's type is on the ignore list or it holds no content."""
        section_type = section.get("type", "").lower()
        if any(keyword.lower() in section_type for keyword in keywords):
            return True
        has_paragraphs = len(section.findall("p")) > 0
        has_subsections = len(section.findall("section")) > 0
        return not has_paragraphs and not has_subsections

    def collect(xpath):
        """Serialize every section matched by `xpath` that is not ignored."""
        return [
            etree.tostring(candidate, encoding='unicode')
            for candidate in root.findall(xpath)
            if not should_ignore(candidate)
        ]

    fine_grained_chunks = collect(".//section[@level='2']")
    broad_scope_chunks = collect(".//section[@level='1']")

    return fine_grained_chunks, broad_scope_chunks

# Example usage
xml_filename = "TH_working4.xml"
ignore_keywords = ["bibliographic-data"]

fine_grained_chunks, broad_scope_chunks = generate_chunks(xml_filename, ignore_list=ignore_keywords)

# Print samples for inspection
print("Fine-Grained Chunks Sample:", fine_grained_chunks[:3])
print("Broad-Scope Chunks Sample:", broad_scope_chunks[:1])

In [None]:
print(fine_grained_chunks[0])

In [None]:
print(broad_scope_chunks[2])

In [None]:
fgc = [count_words(chunk) for chunk in fine_grained_chunks]

In [None]:
fgc

In [None]:
fg_tokens = [token_count(chunk) for chunk in fine_grained_chunks]

In [None]:
fg_tokens

In [None]:
fine_grained_chunks[6]

In [None]:
bsc = [count_words(chunk) for chunk in broad_scope_chunks]

In [None]:
bs_tokens = [token_count(chunk) for chunk in broad_scope_chunks]

In [None]:
broad_scope_chunks[-1]

In [None]:
fg_totals = [calc_query_counts(tc) for tc in fg_tokens]
fg_totals

In [None]:
sum([x+y+z for (x,y,z) in fg_totals])

In [None]:
100 * 30

In [None]:
bc_totals = [calc_query_counts(tc) for tc in bs_tokens]
bc_totals

In [None]:
sum([x+y+z for (x,y,z) in bc_totals])

In [None]:
150*3 * 15

In [None]:
count_words(test_input)

In [None]:
8 / 718, 10 / 718

In [None]:
def build_messages_from_chunks(chunk_list):
    """Return one chat message list (see generate_messages) per chunk, in order."""
    return [generate_messages(chunk) for chunk in chunk_list]

In [None]:
test = build_messages_from_chunks(broad_scope_chunks)

In [None]:
test

In [None]:
print(test[1][1]['content'])

In [None]:
import json
import os

def create_jsonl_file_for_batch(messages, output_file_path="batch_requests.jsonl",
                                model="gpt-4o", max_tokens=3000):
    """
    Creates a JSONL file for batch processing, one chat-completion request per line.

    Args:
        messages: Iterable of message lists; each list becomes the "messages"
            field of one request.
        output_file_path (str): The path where the .jsonl file will be saved.
        model (str): Model name written into every request body.
        max_tokens (int): Completion token limit written into every request body.
            A reply cut off at this limit may no longer be a complete list.

    Returns:
        str: The path to the generated .jsonl file.
    """
    with open(output_file_path, "w") as f:
        # custom_id is 1-based ("request-1", "request-2", ...) so a result can be
        # matched back to the position of its input.
        for i, message in enumerate(messages, start=1):
            request_obj = {
                "custom_id": f"request-{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": message,
                    "max_tokens": max_tokens
                },
            }
            f.write(json.dumps(request_obj))
            f.write("\n")

    return output_file_path

In [None]:
# recreate fine grained batch request file:
chunks_to_build = fine_grained_chunks
messages_fg = build_messages_from_chunks(chunks_to_build)
create_jsonl_file_for_batch(messages_fg, "batch_requests_fine_grained.jsonl")

In [None]:
chunks_to_test = broad_scope_chunks

In [None]:
len(chunks_to_test)

In [None]:
tokens = [token_count(chunk) for chunk in chunks_to_test]
sum(tokens)

In [None]:
messages = build_messages_from_chunks(chunks_to_test)
create_jsonl_file_for_batch(messages)

In [None]:
# Upload the request file for batch processing. The context manager closes the
# local file handle once the upload call has returned.
with open("batch_requests.jsonl", "rb") as batch_request_file:
    batch_input_file = client.files.create(
        file=batch_request_file,
        purpose="batch"
    )

In [None]:
batch_input_file_id = batch_input_file.id

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "first (query,text) generation task run."
    }
)
batch

In [None]:
client.batches.retrieve(batch.id)

In [None]:
import ast

def retrieve_batch_results(batch):
    """
    Retrieves the status of a batch job and returns the result if completed.
    Parses the JSON result file, collects the output messages (query-text pairs),
    and converts them to Python lists.

    Results are ordered by the numeric suffix of their "request-<n>" custom_id,
    because the output file is not guaranteed to list requests in input order.
    Entries without a usable reply are skipped with a printed notice, so the
    returned list can be shorter than the number of requests.

    Args:
    - batch (Batch): The batch object to retrieve status and results for.

    Returns:
    - If completed: A list of lists containing query-text pairs.
    - If not completed: A string with the batch status.
    """
    # Check the batch status
    batch_status = client.batches.retrieve(batch.id)
    if batch_status.status != 'completed':
        return f"Batch status: {batch_status.status}"

    # Retrieve the output file contents
    file_id = batch_status.output_file_id
    if not file_id:
        print("Batch completed without an output file; no results to parse.")
        return []
    file_response = client.files.content(file_id)

    def request_order(custom_id, position):
        """Sort key: numeric custom_id suffix first, file position as tie-breaker."""
        suffix = str(custom_id).rpartition("-")[2]
        if suffix.isdecimal():
            return (int(suffix), position)
        # Unknown id format: keep such entries after the numbered ones, in file order.
        return (float("inf"), position)

    # Parse the JSON lines in the output file
    parsed = []
    for position, line in enumerate(file_response.text.splitlines()):
        if not line.strip():
            continue
        data = json.loads(line)  # Parse each line as JSON
        custom_id = data.get("custom_id")

        # "response" may be present but null, so `.get("response", {})` is not enough.
        response_body = (data.get("response") or {}).get("body") or {}
        choices = response_body.get("choices")
        if not choices:
            print(f"Skipping {custom_id}: response has no completion choices.")
            continue

        content = choices[0]["message"]["content"]
        try:
            # Safely evaluate the string to convert it to a Python list of tuples
            query_text_pairs = ast.literal_eval(content)
        except (SyntaxError, ValueError) as err:
            print(f"Skipping {custom_id}: could not parse content ({err}).")
            continue

        if not isinstance(query_text_pairs, list):
            print(f"Skipping {custom_id}: parsed content is not a list.")
            continue

        parsed.append((request_order(custom_id, position), query_text_pairs))

    parsed.sort(key=lambda entry: entry[0])
    return [pairs for _, pairs in parsed]

In [None]:
output_bs_chunks = retrieve_batch_results(batch)

In [None]:
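# output_fg_chunks = output1
# NOTE(review): no visible cell assigns `output_fg_chunks` (or `output1`); the cells
# below that use it depend on leftover kernel state. Confirm how the fine-grained
# batch results are meant to be loaded before running this notebook top to bottom.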

In [None]:
output_fg_chunks

In [None]:
output_fg_chunks

In [None]:
output_bs_chunks

In [None]:
fine_grained_chunks

In [None]:
import json

def write_data_to_json(data, filename):
    """
    Serializes each element of `data` as JSON on its own line of `filename`
    (JSONL format). An existing file is overwritten.

    Parameters:
    data (list): JSON-serializable elements, written in order.
    filename (str): Path of the file to write.

    Example:
        write_data_to_json([{"key1": "value1"}, {"key2": "value2"}], "output.jsonl")
    """
    with open(filename, "w") as handle:
        for record in data:
            # One JSON document per line
            handle.write(json.dumps(record) + "\n")

# Example usage
# Assuming your list is already JSON-serializable
# my_data = [{"name": "Alice"}, {"name": "Bob"}, {"name": "Charlie"}]
# write_data_to_json(my_data, "output.jsonl")

In [None]:
def listify(query_text_list):
    """Turn each (query, text) pair into a two-item list, keeping the order.

    Every element must unpack into exactly two values.
    """
    rows = []
    for query, text in query_text_list:
        rows.append([query, text])
    return rows

In [None]:
[[query, text] for query, text in output_fg_chunks[0]]

In [None]:
listify(output_fg_chunks[1])

In [None]:
# Pair every fine-grained chunk with the query/text pairs generated for it.
# list() takes at most one argument, so each pair is written as a list literal.
fg_data = [[chunk, query_text_pairs] for chunk, query_text_pairs in zip(fine_grained_chunks, output_fg_chunks)]

In [None]:
def check_output_structure(output_chunks):
    """
    Checks that each element in the nested list `output_chunks` is a pair, i.e. a
    tuple or list with exactly two items.
    Prints the index and content of any elements that do not match the expected structure.

    Parameters:
    output_chunks (list): A list of lists, each containing tuples/lists expected to be of length 2.

    Returns:
    bool: True if all elements are pairs; False if any issues are found.
    """
    issues_found = False

    for i, query_text_list in enumerate(output_chunks):
        for j, item in enumerate(query_text_list):
            # The type check comes first for two reasons: a bare two-character
            # string must not count as a pair, and an unsized item (e.g. a number)
            # must be reported rather than crash len().
            is_pair = isinstance(item, (tuple, list)) and len(item) == 2
            if not is_pair:
                print(f"Problematic element at output_chunks[{i}][{j}]: {item}")
                issues_found = True

    if not issues_found:
        print("All elements have the correct structure (pairs).")
    return not issues_found  # Returns True if no issues were found, False otherwise

# Example usage:
# result = check_output_structure(output_fg_chunks)
# if result:
#     print("Structure is as expected.")
# else:
#     print("Found structural issues in the data.")

In [None]:
def repair_output_structure(output_chunks):
    """
    Repairs each element in `output_chunks` to ensure it is a tuple pair.

    - A tuple or list with exactly 2 elements is kept as a tuple.
    - A tuple or list with more than 2 elements is truncated to its first two
      elements and a warning is printed.
    - A tuple or list with fewer than 2 elements is padded with empty strings.
    - Any other item becomes (item, "").

    Parameters:
    output_chunks (list): A list of lists, each containing items expected to be tuples of length 2.

    Returns:
    list: A modified copy of `output_chunks` with all elements as tuple pairs of length 2.
    """
    repaired_chunks = []

    for query_text_list in output_chunks:
        repaired_list = []
        for item in query_text_list:
            if isinstance(item, (tuple, list)):
                if len(item) > 2:
                    print(f"Warning: Truncating item with more than 2 elements: {item}")
                head = tuple(item[:2])
                # Pad short sequences so the result always has exactly two elements.
                repaired_list.append(head + ("",) * (2 - len(head)))
            else:
                # Convert non-sequence items into a tuple pair with an empty string
                repaired_list.append((item, ""))

        repaired_chunks.append(repaired_list)

    return repaired_chunks

# Example usage:
# repaired_output_fg_chunks = repair_output_structure(output_fg_chunks)

In [None]:
check_output_structure(output_fg_chunks)

In [None]:
repaired_fg = repair_output_structure(output_fg_chunks)

In [None]:
repaired_fg[6]

In [None]:
check_output_structure(repaired_fg)

In [None]:
output_fg_chunks[6]

In [None]:
fg_out_lists = [listify(query_text_list) for query_text_list in repaired_fg]

In [None]:
bs_out_lists = [listify(query_text_list) for query_text_list in output_bs_chunks]

In [None]:
len(bs_out_lists)

In [None]:
fg_out_lists[0]

In [None]:
fg_out_lists

In [None]:
write_data_to_json(fg_out_lists, "TH_fine_grain_query_text_data.jsonl")

In [None]:
write_data_to_json(bs_out_lists, "TH_broad_scope_query_text_data.jsonl")

In [None]:
fg_out_lists[6]

In [None]:
repair_for_seg6 = [
  ("How does observing the body's processes help practitioners achieve mindfulness and understanding?", "Observing the impermanent, selfless, and interdependent nature of all that is doesn't lead us to feel aversion for life. On the contrary, it helps us see the preciousness of all that lives. Liberation doesn't mean running away from or destroying life."),
  ("What role does insight play in recognizing the impermanent and interdependent nature of the body?", "The words recognition, insight, clarity, and realization here mean that the practitioner recognizes, sees, sheds light on, and realizes the impermanent and interdependent nature of the body and all that is, by means of the mindful observation of the body."),
  ("How is the practice of observing the body articulated in different versions of the sutras?", "In the second version of the sutra, the description of each body meditation exercise is as follows: This is how the practitioner is aware of body as body, both within and without, and establishes mindfulness in the body with understanding, insight, clarity, and realization."),
  ("Why is it important to observe the impermanence and selflessness of the Five Aggregates?", "In the same way, the ordinary man caught in dualistic conceptions is accustomed to thinking that the Five Aggregates are the root of his suffering, but in fact the root of suffering is the lack of understanding about the impermanent, selfless, and interdependent nature of the Five Aggregates."),
  ("How can mindfulness of the body's impermanence increase our appreciation for life?", "To observe the impermanence of things is not to reject them, but to be in contact with them with deep understanding, without being caught in desire and attachment."),
  ("What misconceptions about Buddhism and non-attachment are addressed in this text?", "Many people present Buddhism as a path that denies life, that transcends the world of the Five Aggregates... To present Buddhism in this way is no different from saying that the object of our practice is to arrive at the absence of life or nothingness."),
  ("Why is it essential to distinguish between desire that nourishes life and desire that leads to suffering?", "So we can say that to eat and drink so that the body is strong and healthy is to walk on the path of emancipation, while to eat and drink in a way that causes our body and others to suffer is to go against the way of liberation."),
  ("How does the Buddha's appreciation of beauty relate to his teachings on impermanence?", "The Buddha was not afraid of beautiful things, because he was able to see the impermanent nature of everything, beautiful or ugly. He didn't chase after things, and he didn't run away from them either."),
  ("In what ways do misinterpretations about the root of suffering impact Buddhist practice?", "There are people who, because of their incorrect understanding of what the root of suffering is, instead of dealing with their attitude of attachment, think they have to deal with their organs of sense and the aggregates, and so they fear form, sound, smell, taste, touch, and objects of mind and feel aversion for the body, feelings, perceptions, mental formations, and consciousness."),
  ("What does the example of the dog and the clod of earth illustrate about suffering and attachment?", "In the same way, the ordinary man caught in dualistic conceptions is accustomed to thinking that the Five Aggregates are the root of his suffering, but in fact the root of suffering is the lack of understanding about the impermanent, selfless, and interdependent nature of the Five Aggregates."),
  ("Why is it said that reality is not to be found in terms of existence or nonexistence?", "In the Kaccayana Gotta Sutta, the Buddha also taught that reality is not to be found in terms of existence or nonexistence. His meaning is perfectly clear: suffering is not brought about by life, the Five Skandhas, or the selfless and interdependent nature of all that is."),
  ("How does the understanding of interdependence and selflessness contribute to liberation?", "Only when, thanks to mindful observation, we realize the impermanent, selfless, and interdependent nature of all that is, can we achieve freedom and liberation."),
  ("In what ways does recognizing impermanence liberate a practitioner from attachment and sorrow?", "Because we can see the impermanent nature of the flowers, we can appreciate all the more the beauty of each flower. To observe the impermanence of things is not to reject them, but to be in contact with them with deep understanding, without being caught in desire and attachment."),
  ("What is Thich Nhat Hanh's perspective on the relationship between Buddhism and everyday enjoyment such as eating and drinking?", "If we've had nothing to eat for three days, we feel like eating. Is that desire? Is the natural desire for the indispensable elements of life a desire we need to destroy?... To eat when hungry, to drink when thirsty, is that to go against the path which leads to emancipation?"),
  ("How does understanding the impermanent nature of desires lead to freedom?", "In identifying the mind of desire, in observing the nature of that mind and the nature of the object of desire, we'll see the impermanence, selflessness, and interdependence of it, and we'll no longer be dominated by that state of mind."),
  ("How does the practice of peace and joy fit into the broader context of mindfulness?", "The tenth exercise is taken from the second version of the sutra (see Appendix). It is a practice of peace and joy."),
  ("Why is the Buddha portrayed with a heart of love and a smile in opposition to an emaciated arhat?", "The image of the bodhisattva is very close to the image of the Buddha entering life with a heart of love and compassion and a smile on his lips."),
  ("How does mindfulness of desire differ from being dominated by desire?", "To know how to appreciate a beautiful sunset is not desire, if we 'remain established in the observation, free and not caught up in any worldly consideration.'"),
  ("How can we apply the teachings of the impermanence of flowers to our daily life?", "If we cut flowers from our garden to place on the altar, that is because we acknowledge the beauty of those flowers... When the flowers wilt in a few days, we won't suffer or feel sad.")
]

In [None]:
output_fg_chunks[6]

In [None]:
# Work on a copy of every per-chunk list, so that the slice assignment applied to
# test_repair further down does not also rewrite output_fg_chunks.
test_repair = [list(query_text_list) for query_text_list in output_fg_chunks]

In [None]:
len(repair_for_seg6)

In [None]:
test_repair[6][28:] = repair_for_seg6

In [None]:
test_repair[6][28]

In [None]:
test_repair[6][29]

In [None]:
test_repair[6][30]

In [None]:
fg_out_lists = [listify(query_text_list) for query_text_list in test_repair]

In [None]:
fg_out_lists[-1]

In [None]:
write_data_to_json(fg_out_lists, "TH_fine_grain_query_text_data.jsonl")