In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from typing import List, Dict, Optional
import logging
import json

from pydantic import BaseModel, Field
from typing import List, Optional

from tnh_scholar.utils.file_utils import get_text_from_file, write_text_to_file
from tnh_scholar.xml_processing import wrap_lines, unwrap_lines, lines_from_wrapped_text
from tnh_scholar.text_processing import process_text
from tnh_scholar.utils import iterate_subdir, load_json_into_model, save_model_to_json

In [None]:
from tnh_scholar.openai_interface import token_count

In [None]:
# Configure main logger using setup_logger
import tnh_scholar.logging_config as logging_config
from tnh_scholar.logging_config import setup_logging, get_child_logger

In [None]:
# Configure logging with "postprocessing_english.log" as the log file path and
# create the notebook-wide `logger` used by the processing cells below.
# NOTE(review): the name says "english" although the talk handled in this notebook is Vietnamese — confirm the name is intended.
setup_logging(log_filepath="postprocessing_english.log")
logger = get_child_logger("postprocessing_english")

In [None]:
from tnh_scholar import PROJECT_ROOT_DIR

In [None]:
# Root folder of the transcriptions; each talk lives in its own sub-directory here.
video_storage_dir = PROJECT_ROOT_DIR / "sandbox/video_transcriptions"

In [None]:
class Section(BaseModel):
    # One titled, contiguous span of a line-numbered transcript.
    # Documented with comments instead of a docstring on purpose: pydantic
    # copies a model's docstring into its generated schema, which would alter it.

    # Section title, in the original Vietnamese and in English translation.
    title_vi: str = Field(..., description="The title of the section in Vietnamese.")
    title_en: str = Field(
        ..., description="The translation of the title of the section in English."
    )

    # English summary of what the section covers.
    summary: str = Field(..., description="A summary of the section in English.")

    # Line numbers of the first and last transcript line of the section.
    start_line: int = Field(..., description="The starting line number of this section.")
    end_line: int = Field(..., description="The ending line number of this section.")

class DharmaTalkSections(BaseModel):
    # Sectioning result for a whole talk: an overall summary plus the ordered sections.
    # Comments are used instead of a docstring so the pydantic-generated schema
    # (which would pick up a docstring) stays exactly as it was.

    # English summary of the entire talk.
    talk_summary: str = Field(..., description="A summary of the Dharma talk in English.")

    # Ordered sections; the description below states the coverage requirement
    # (every line covered, no overlaps, no gaps).
    sections: List[Section] = Field(
        ...,
        description="An ordered list of sections with their titles and included start and end line numbers. The sequence of line ranges for the sections must cover every line from start to finish without any overlaps or gaps.",
    )

In [None]:
class TranslatedSection(Section):
    # A Section that can additionally carry its full text in both languages.
    # Both fields default to None, so they may be filled in after construction.
    # (Comments rather than a docstring: a docstring would show up in the pydantic schema.)

    # Full Vietnamese text of the section.
    content_vi: Optional[str] = Field(
        None, description="The full content of the section in Vietnamese."
    )

    # Full English translation of the section.
    content_en: Optional[str] = Field(
        None,
        description="The translation of the full content of the section in English.",
    )

In [None]:
def process_sections(output_file: Path, wrapped_transcript: str, section_object: DharmaTalkSections, instructions: str) -> List[str]:
    """
    Process each section of a transcript and write the results to an output file.

    For every section, the lines from ``start_line`` to ``end_line`` are extracted
    from ``wrapped_transcript`` (requested with ``keep_brackets=False``), passed to
    ``process_text`` together with the section-specific instructions, and appended
    to ``output_file`` between ``<document>`` and ``</document>`` markers.

    Args:
        output_file (Path): Path to the file where the processed sections are written.
            The file is overwritten at the start of the run.
        wrapped_transcript (str): The transcript with line-number wrapping.
        section_object (DharmaTalkSections): Object containing the sections to process.
            Each section supplies 'start_line', 'end_line' and 'title_en'.
        instructions (str): Instruction template for processing each section; the
            ``{section_title}`` placeholder is filled with the section's English title.

    Returns:
        List[str]: The result of ``process_text`` for each section, in section order.

    Note:
        If processing a section raises, the exception propagates and the output file
        is left without the closing ``</document>`` marker.

    Example:
        process_sections(
            output_file=Path("output.xml"),
            wrapped_transcript=my_wrapped_transcript,
            section_object=my_section_object,
            instructions="Process section titled '{section_title}' carefully."
        )
    """
    sections = section_object.sections
    sections_processed = []

    write_text_to_file(output_file, "<document>\n", overwrite=True)
    logger.info(f"Sections to process: {len(sections)}")
    for i, section in enumerate(sections):
        # Section defines title_vi / title_en only (there is no plain `title`);
        # the English title is what the other cells use for logging and prompts.
        section_title = section.title_en
        logger.info(f"Processing section {i+1}: '{section_title}'...")
        original_lines = lines_from_wrapped_text(
            wrapped_transcript,
            start=section.start_line,
            end=section.end_line,
            keep_brackets=False
        )
        section_instructions = instructions.format(section_title=section_title)

        # The instructions differ only by title, so logging them once is enough.
        if i == 0:
            logger.info(f"Processing instructions:\n{section_instructions}")

        processed_lines = process_text(original_lines, section_instructions, batch=False)
        sections_processed.append(processed_lines)
        write_text_to_file(output_file, processed_lines, append=True)
    write_text_to_file(output_file, "</document>", append=True)
    return sections_processed
    

In [None]:
from typing import List

def convert_wrapped_lines_to_xml(wrapped_lines: str) -> str:
    """
    Converts wrapped, numbered lines into <line number="x"> XML elements.

    Args:
        wrapped_lines (str): Newline-separated text in which every non-blank line
            has the form "<n:content>".

    Returns:
        str: One indented <line number="n">content</line> element per input line,
        joined by newlines. The number and the content are XML-escaped, so
        characters such as '&' and '<' in the transcript cannot break the markup.
        Blank lines are skipped; an input with no non-blank lines yields "".

    Raises:
        ValueError: If a non-blank line is not enclosed in "<" and ">", or has no
            ":" separating the line number from the content.

    Example:
        lines = "\n".join([
            "<1:Today is the 20th of November, 1994.>",
            "<2:The theme of this winter retreat is>",
        ])
        print(convert_wrapped_lines_to_xml(lines))
    """
    # Function-scope import keeps this cell self-contained.
    from xml.sax.saxutils import escape

    xml_lines = []
    for raw_line in wrapped_lines.split("\n"):
        line = raw_line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing on them.
            continue
        if not (line.startswith('<') and line.endswith('>')):
            raise ValueError(f"Invalid line format: {line}")

        # Split "<n:content>" at the first colon: the number is before it, the content after.
        number, separator, content = line[1:-1].partition(':')
        if not separator:
            raise ValueError(f"Invalid format: {line}")

        # Escape both parts; the number sits inside a double-quoted attribute,
        # so double quotes are escaped there as well.
        safe_number = escape(number, {'"': "&quot;"})
        safe_content = escape(content.strip())
        xml_lines.append(f'  <line number="{safe_number}">{safe_content}</line>')
    return '\n'.join(xml_lines)



In [None]:
# Prompt asking the model to divide a line-numbered Vietnamese transcript into sections.
# Placeholder: {section_count} — how many sections to request (filled with str.format).
# NOTE(review): "You goal" below looks like a typo for "Your goal"; the text is left
# untouched here because it is part of the prompt used at runtime.
process_section_instructions_vi = """You are a highly skilled and meticulous assistant processing an audio transcript of a Dharma Talk given by Thich Nhat Hanh in Vietnamese.

Each line of the transcript is numbered in the format: <NUM:LINE> 

You goal is to divide the entire transcript into {section_count} logical sections based on content. 

For each section, give the title in Vietnamese and English, a summary in English, and the starting and ending line numbers of the section.

Also provide a summary of the talk in English.

IMPORTANT: Every line in the transcript must belong to a section. Don't leave out any lines. Don't include lines in more than one section."""


In [None]:
# Prompt for translating one section of the talk into English, line by line.
# Placeholder: {section_title} — title of the section being translated (filled with str.format).
# NOTE(review): small wording slips ("speakers style", "the Thay's intent") are left
# untouched because this text is part of the prompt used at runtime.
section_instructions_translate_vi = """You are the world's leading expert at translating Dharma talks transcribed from spoken Vietnamese.

You are translating a section titled '{section_title}' from a Dharma talk offered by Thich Nhat Hanh (Thay) in Plum Village, France.

Lines of the transcript are numbered and are given in the format <NUM:LINE>.

Your task is to translate each line into correct, clear and typical English. Add correct punctuation to create meaning that matches the speakers style and intent.

You may have to infer the Thay's intent in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning,
while still giving clear and eloquent English. Give the best approximation or contextual guess if the transcript is difficult or unclear. Make no comments. 

Use Plum Village typical English style when making translations.

You may consider adjacent lines for corrections and context when generating a line, however each line of translation should be as close as possible a translation of the original line.

Some transcriptions may be from sounds such as a bell. These can be marked as [Bell].

You must faithfully capture Thay's style and presentation while creating a meaningful flow.

Do not leave out any content or summarize. 

The final output should match the same line structure and line numbering using <> as the original.

Your output should be a polished section.

Make no other changes; add no content.

Output the final text only."""

In [None]:
# Name of the single talk processed by hand in the following cells; it is also
# used as the name of the talk's folder and as the base name of its files.
talk_name = "Kinh Tư Lượng [TTSĐCTTĐB 01] ｜ TS Thích Nhất Hạnh (20-11-1994, Xóm Thượng, Làng Mai)"

In [None]:
# Folder of this talk inside the transcription store.
video_dir = video_storage_dir / talk_name

In [None]:
# Transcript file: same base name as the talk, with a .txt extension.
transcript_path = video_dir / f"{talk_name}.txt"

In [None]:
# Show the resolved path and check that the transcript file is present before loading it.
print(transcript_path)
transcript_path.exists()

In [None]:
# Load the transcript text from the file.
transcript = get_text_from_file(transcript_path)

In [None]:
# Preview the first 1000 characters of the raw transcript.
print(transcript[:1000])

In [None]:
# Wrap the transcript with line numbering (number=True) — presumably the
# <NUM:LINE> form that the prompts describe; verify against wrap_lines.
wrapped_transcript = wrap_lines(transcript, number=True)

In [None]:
# Preview the first 1000 characters of the wrapped transcript.
print(wrapped_transcript[:1000])

In [None]:
# Build the sectioning prompt. section_count is filled with an empty string, so
# the instructions do not state how many sections to produce.
section_instructions = process_section_instructions_vi.format(section_count="")

In [None]:
# Inspect the final sectioning prompt.
print(section_instructions)

In [None]:
# Run the sectioning request over the whole wrapped transcript, passing
# DharmaTalkSections as the response object; later cells read .talk_summary and
# .sections from the result.
section_object = process_text(wrapped_transcript, section_instructions, response_object=DharmaTalkSections, max_tokens=5000)

In [None]:
# JSON file for the sectioning result, stored in the talk's folder.
json_section_path = video_dir / f"section_{talk_name}.json"

In [None]:
# Save the sectioning result so it can be reloaded later instead of being recomputed.
save_model_to_json(json_section_path, section_object)

In [None]:
# section_object = load_json_into_model(json_section_path, DharmaTalkSections)

In [None]:
# Overall summary of the talk from the sectioning result.
print(section_object.talk_summary)

In [None]:
# Inspect the individual sections.
section_object.sections

In [None]:
# Number of sections produced.
len(section_object.sections)

In [None]:
# Destination file for the processed document of this talk.
output_xml_path = video_dir / f"formatted_{talk_name}.xml"

In [None]:
# Confirm the output path before writing to it.
print(output_xml_path)

### For repairing: (re)process only some sections, or all of them, as selected by `section_range`

In [None]:
# Translate the selected sections one by one and assemble them into a single
# <document> that is written to output_xml_path.
sections = section_object.sections
sections_processed = []

# 0-based indices of the sections to (re)process; narrow this to repair part of a talk.
section_range = range(0, 10)

# Ignore indices past the last section, so a talk with fewer sections than the
# requested range does not fail with an IndexError part-way through.
section_indices = [i for i in section_range if i < len(sections)]

logger.info(f"Sections to process: {section_indices}")
for position, i in enumerate(section_indices):
    section = sections[i]
    logger.info(f"Processing section {i+1}: '{section.title_en}'...")
    original_lines = lines_from_wrapped_text(
        wrapped_transcript,
        start=section.start_line,
        end=section.end_line,
        keep_brackets=True
    )
    # Kept under its own name so the sectioning prompt stored in
    # `section_instructions` is not overwritten by this loop.
    translate_instructions = section_instructions_translate_vi.format(section_title=section.title_en)

    # Log the instructions once, for the first section actually processed.
    if position == 0:
        logger.info(f"Processing instructions:\n{translate_instructions}")

    processed_lines = process_text(original_lines, translate_instructions, batch=False)
    sections_processed.append(f"<section>\n<title>{section.title_en}</title>\n{processed_lines}\n</section>")

output_str = "<document>\n" + "\n\n".join(sections_processed) + "\n</document>"
write_text_to_file(output_xml_path, output_str, overwrite=True)


In [None]:
# Review the assembled document text.
print(output_str)

In [None]:
# Show where the document was written.
print(output_xml_path)

In [None]:
#process_sections(output_xml_path, transcript, section_object, postprocess_format_instructions_en_2)

In [None]:
# sections_formatted = []
# sections_original = []
# sections = section_object.sections
# section_range = range(0, 2)
# output_file = test_dir / f"formatted_{talk_name}.xml"
# for i in section_range:
#     section = sections[i]
#     original_lines = lines_from_wrapped_text(wtest, section.start_line, section.end_line, keep_brackets=False)
#     format_instructions = postprocess_format_instructions_en_2.format(section_title=section.title)
#     logger.info(f"Formatting section '{section.title}'...")

#     if i == 0:
#         logger.info(f"Translation instructions:\n{format_instructions}")
    
#     translated_lines = postprocess_text(original_lines, format_instructions, batch=False)
#     sections_formatted.append(translated_lines)
#     write_text_to_file(output_file, translated_lines, append=True)
    

In [None]:
# Batch run: section and process every talk folder under video_storage_dir that
# has a transcript but no formatted output yet.
#
# NOTE(review): this loop previously referred to `postprocess_text`,
# `postprocess_section_instructions_en` and `postprocess_format_instructions_en_2`,
# none of which exist in this notebook. It now uses the imported `process_text`
# and the Vietnamese prompts defined above — confirm these are the intended prompts.
# NOTE(review): process_sections extracts lines with keep_brackets=False, while the
# manual cell above uses keep_brackets=True — confirm which form the translation
# prompt should receive.

# The sectioning prompt is the same for every talk, so build it once.
batch_section_instructions = process_section_instructions_vi.format(section_count="")

for i, video_dir in enumerate(iterate_subdir(video_storage_dir)):
    try:
        talk_name = video_dir.name

        logger.info(f"Processing talk {i+1}: '{talk_name}'")

        transcript_file = video_dir / f"{talk_name}.txt"
        section_output_path = video_dir / f"section_{talk_name}.json"
        output_xml_path = video_dir / f"formatted_{talk_name}.xml"

    except Exception as e:
        # Report the directory itself: talk_name may be unset or left over from
        # the previous iteration when setup fails.
        logger.error(f"Error in file setup for {video_dir}: {e}, skipping.")
        # Without this, the code below would run with stale or undefined paths.
        continue

    if not transcript_file.exists():
        logger.info(f"No transcript found in {transcript_file}. Skipping {talk_name}")
        continue

    logger.info(f"Transcript found: {transcript_file}")

    if output_xml_path.exists():
        logger.info(f"Formatted xml file found. Skipping {talk_name}.")
        continue

    try:
        transcript = get_text_from_file(transcript_file)
        wrapped_transcript = wrap_lines(transcript, number=True)

        logger.info(f"Starting sectioning postprocess for {talk_name}...")
        section_object = process_text(
            wrapped_transcript,
            batch_section_instructions,
            response_object=DharmaTalkSections,
            max_tokens=5000,
        )

        write_text_to_file(section_output_path, section_object.model_dump_json())
        logger.info(f"Sectioning for {talk_name} completed. Dumped section data to {section_output_path}.")

        logger.info(f"Starting postprocess for {talk_name}: section formatting sequence.")
        process_sections(output_xml_path, wrapped_transcript, section_object, section_instructions_translate_vi)
        logger.info(f"Postprocessing completed for {talk_name}")

    except Exception as e:
        # Best effort per talk: log the failure and move on to the next folder.
        logger.error(f"Error processing {talk_name}: {e}. Partial processing may have been saved. Skipping to next talk file.")

In [None]:
# Inspect the sections currently held in section_object. The batch loop above
# assigns section_object as well, so after it runs this shows the most recent
# sectioning result from that loop.
section_object.sections

In [None]:
# Token count of the string form of the sectioning result.
token_count(str(section_object))