In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from typing import List, Dict
import logging
import json

from pydantic import BaseModel, Field
from typing import List, Optional

from tnh_scholar.text_processing import wrap_lines, unwrap_lines, lines_from_wrapped_text
from tnh_scholar.text_processing import get_text_from_file, write_text_to_file
from tnh_scholar.utils import iterate_subdir, load_json_into_model, save_model_to_json

In [None]:
from tnh_scholar.text_processing import process_text

In [None]:
from tnh_scholar.openai_interface import token_count

In [None]:
# Configure main logger using setup_logger
import tnh_scholar.logging_config as logging_config
from tnh_scholar.logging_config import setup_logging, get_child_logger

In [None]:
# Configure project logging (log file path passed below) and create the
# `logger` object that every later cell in this notebook writes to.
setup_logging(log_filepath="postprocessing_english.log")
logger = get_child_logger("postprocessing_english")

In [None]:
# Root directory holding one subdirectory per talk. The path is relative, so it
# resolves against the notebook's current working directory.
video_storage_dir = Path("processed_videos/video_transcriptions")

In [None]:
# One contiguous section of a transcript: a title, a summary, and the line
# range it covers. Documented with `#` comments instead of a class docstring
# so the schema generated from this model is left unchanged.
class Section(BaseModel):
    # Human-readable heading for the section (required).
    title: str = Field(..., description="The title of the section")
    # Short description of the section's content (required).
    summary: str = Field(..., description="A summary of the section")
    # First transcript line number belonging to the section (required).
    start_line: int = Field(..., description="The starting line number of the section.")
    # Last transcript line number belonging to the section (required).
    end_line: int = Field(..., description="The ending line number of the section.")

# Sectioning result for a whole talk: an overall summary plus the ordered
# list of Section entries. Documented with `#` comments instead of a class
# docstring so the schema generated from this model is left unchanged.
class DharmaTalkSections(BaseModel):
    # Summary of the entire talk (required).
    talk_summary: str = Field(..., description="A summary of the Dharma talk content.")
    # Sections in talk order, each carrying its own line range (required).
    sections: List[Section] = Field(..., description="An ordered list of sections with their titles and included start and end line numbers.")

In [None]:
def process_sections(output_file: Path, wrapped_transcript: str, section_object: DharmaTalkSections, instructions: str) -> List[str]:
    """
    Format every section of a transcript and write the results to an output file.

    The output file is first overwritten with an opening ``<document>`` line.
    Each section's lines are then extracted from the wrapped transcript,
    processed with the section-specific instructions, and appended in order.
    A closing ``</document>`` tag is appended last.

    Args:
        output_file (Path): Path to the file where the processed sections are written.
        wrapped_transcript (str): The line-numbered (wrapped) transcript that the
            sections' ``start_line`` / ``end_line`` values index into.
        section_object (DharmaTalkSections): Object whose ``sections`` list is
            processed. Each section needs ``start_line``, ``end_line`` and ``title``.
        instructions (str): Instruction template. It is passed through
            ``str.format(section_title=<title>)`` for each section, so any other
            replacement field in the template raises ``KeyError``.

    Returns:
        List[str]: The processed text of each section, in section order, exactly
        as returned by ``process_text``.

    Example:
        process_sections(
            output_file=Path("output.xml"),
            wrapped_transcript=my_wrapped_transcript,
            section_object=my_section_object,
            instructions="Process section titled '{section_title}' carefully."
        )
    """
    sections = section_object.sections
    sections_processed = []

    # Start a fresh file; everything after this call is appended.
    write_text_to_file(output_file, "<document>\n", overwrite=True)
    logger.info(f"Sections to process: {len(sections)}")
    for i, section in enumerate(sections):
        logger.info(f"Processing section {i+1}: '{section.title}'...")
        original_lines = lines_from_wrapped_text(
            wrapped_transcript,
            start=section.start_line,
            end=section.end_line,
            keep_brackets=False
        )
        section_instructions = instructions.format(section_title=section.title)

        # Log the fully rendered instructions once, for the first section only.
        if i == 0:
            logger.info(f"Processing instructions:\n{section_instructions}")

        processed_lines = process_text(original_lines, section_instructions, batch=False)
        sections_processed.append(processed_lines)
        # Append right away so earlier sections survive a failure in a later one.
        write_text_to_file(output_file, processed_lines, append=True)
    write_text_to_file(output_file, "</document>", append=True)
    return sections_processed
    

In [None]:
# Formatting prompt, variant 1: asks the model to add <section>/<title> tags itself.
# Contains no str.format replacement fields.
postprocess_format_en_1 = """You are the world's leading expert at formatting Dharma talk audio transcriptions into written text for native, and partly fluent English speakers. 

The current text is from a Dharma Talk offered by a California-based English-speaking monastic.

Your goal is to format the text into meaningful paragraphs and sections while correcting errors (logical, transcription, or grammatical). 

Insert <section> and <title> tags where appropriate in the text to mark natural sections in the talk; give these sections appropriate titles.

Make necessary corrections to grammar to create correct English sentence structure and logical flow. 

You may have to infer the speaker's intent in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning in clear and eloquent English.

Faithfully convey the speaker’s intended meaning as accurately as possible while maintaining the original tone and style. Use the speaker's original phrasing as much as possible.

Do not leave out any content. Do not summarize. 

Output the final text only."""

In [None]:
# postprocess_format_instructions_en_2 = """You are the world's leading expert at formatting Dharma talk audio transcriptions into written text for native, and partly fluent English speakers. 

# The current text is from a Dharma Talk offered by a California-based English-speaking monastic. 

# Your goal is to process the transcription into meaningful paragraphs and sections while correcting errors (logical, transcription, or grammatical). 

# Use <p> tags to mark paragraphs. Insert <section> and <title> tags where appropriate in the text to mark natural sections in the talk; give these sections appropriate titles.

# You may have to infer the speaker's intent, and also use clues from context, in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning in clear and eloquent English.

# Faithfully convey the speaker's intended meaning as accurately as possible while maintaining the original tone and style. Use the speaker's original phrasing if it is correct.

# Do not leave out any content. Do not add any content. Do not summarize. 

# Output the final text only."""


In [None]:
# Per-section formatting prompt used by the formatting cells below.
# It is a str.format template: `{section_title}` is replaced with the title
# of the section being processed before the prompt is sent.
postprocess_format_instructions_en_2 = """You are the world's leading expert at formatting Dharma talk audio transcriptions into written text. The talks are given by native, and mostly-fluent, English speakers. 

The current text is a section entitled '{section_title}' from a Dharma Talk offered by a California-based English-speaking monastic of the Plum Village tradition established by Thich Nhat Hanh. 

Your goal is to process the section into meaningful paragraphs while correcting errors (logical, speaking, transcription, or grammatical). 

Use <p> tags to mark paragraphs. Insert <section> and <title> tags at the beginning of the text and close with a </section> tag. 

You may have to infer the speaker's intent, and also use clues from context, in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning in clear and eloquent English.

Faithfully convey the speaker's intended meaning as accurately as possible while maintaining the original tone and style. Use the speaker's original phrasing if it works well and is correct.

For corrections or language inference, you may refer to the language on the plumvillage.org website.

The final section should be polished and publication ready.

Do not leave out any content. Do not add any content. Do not summarize. 

Output the final text only."""

In [None]:
# Formatting prompt, variant 3: paragraphs only, no markup tags requested.
# Contains no str.format replacement fields.
# NOTE(review): not referenced by any other cell visible in this notebook.
postprocess_format_en_3 = """You are the world's leading expert at formatting Dharma talk audio transcriptions into written text for native, and partly fluent English speakers. 

The current text is from a Dharma Talk offered by a  California-based English-speaking monastic. 

Your goal is to process the transcription into meaningful concise paragraphs while correcting errors (logical, transcription, or grammatical). 

You may have to infer the speaker's intent, and also use clues from context, in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning in clear and eloquent English.

Faithfully convey the speaker’s intended meaning as accurately as possible while maintaining the original tone and style. Use the speaker's original phrasing if it is correct and clear.

Do not leave out any content. Do not add any content.  Do not summarize.  

Output the final text only."""

In [None]:
# Sectioning prompt. It is a str.format template: `{section_count}` must be
# filled in (e.g. `.format(section_count=4)`) before the prompt is sent.
postprocess_section_instructions_en = """You are a highly skilled and meticulous assistant processing an audio transcript of a Dharma Talk given in English.

Each line of the transcript is numbered in the format: <NUM:LINE> 

Your goal is to divide the entire transcript into {section_count} logical sections based on content. 

For each section, give the title, a brief summary, and starting and ending line numbers.

Also provide a brief summary of the whole text.

IMPORTANT: Every line in the transcript must belong to a section. Don't leave out any lines. Don't include lines in more than one section."""


In [None]:
# Talk selected for the interactive single-talk run. The same string names the
# talk's directory and is the stem of its transcript, JSON and XML files.
talk_name = "Taking Care of Our Fear ｜ Br. Phap Luu ｜ 2024-11-06"

In [None]:
# Directory for the selected talk, located under the shared storage root.
video_dir = video_storage_dir / talk_name

In [None]:
# Transcript file: "<talk_name>.txt" inside the talk's own directory.
transcript_path = video_dir / f"{talk_name}.txt"

In [None]:
# Show the resolved path; the cell's displayed value tells whether the file exists.
print(transcript_path)
transcript_path.exists()

In [None]:
# Load the transcript text for the selected talk.
transcript = get_text_from_file(transcript_path)

In [None]:
# Preview: first 1000 characters of the loaded transcript.
print(transcript[:1000])

In [None]:
# Line-numbered ("wrapped") form of the transcript. Section start/end line
# numbers are later resolved against this text via lines_from_wrapped_text.
# NOTE(review): presumably yields the <NUM:LINE> format the sectioning prompt
# describes; confirm against wrap_lines.
wrapped_transcript = wrap_lines(transcript, number=True)

In [None]:
# Preview: first 1000 characters of the line-numbered transcript.
print(wrapped_transcript[:1000])

In [None]:
# Render the sectioning prompt, requesting 4 sections for this talk.
# NOTE: the name `section_instructions` is rebound inside the repair loop further down.
section_instructions = postprocess_section_instructions_en.format(section_count=4)

In [None]:
# Inspect the rendered sectioning prompt before sending it.
print(section_instructions)

In [None]:
# Run the sectioning step on the line-numbered transcript. Later cells read
# `.talk_summary` and `.sections` from the result, so it is expected to be a
# DharmaTalkSections instance (requested via response_object).
section_object = process_text(wrapped_transcript, section_instructions, response_object=DharmaTalkSections)

In [None]:
# Where the sectioning result is stored: "section_<talk_name>.json" in the talk directory.
json_section_path = video_dir / f"section_{talk_name}.json"

In [None]:
# Persist the sectioning result so it can be reloaded instead of regenerated
# (see the commented-out load_json_into_model cell that follows).
save_model_to_json(json_section_path, section_object)

In [None]:
# section_object = load_json_into_model(json_section_path, DharmaTalkSections)

In [None]:
# Inspect the overall talk summary from the sectioning result.
print(section_object.talk_summary)

In [None]:
# Display the list of sections (titles, summaries, line ranges) for review.
section_object.sections

In [None]:
# Destination of the formatted talk: "formatted_<talk_name>.xml" in the talk directory.
output_xml_path = video_dir / f"formatted_{talk_name}.xml"

In [None]:
# Show where the formatted XML will be written.
print(output_xml_path)

### For repairing: conditionally adding sections (re-process a chosen range of sections and append them to the existing output file)

In [None]:
# Manual repair pass: format only the sections whose indices are in
# `section_range` and append them to output_xml_path.
# Unlike process_sections, this cell never writes the opening "<document>"
# line and never truncates the file; it only appends, then appends the
# closing "</document>" tag.
sections = section_object.sections
sections_processed = []

# Indices into `sections` to (re)process; every index must be < len(sections).
section_range = range(0, 4)

logger.info(f"Sections to process: {len(sections)}")
for i in section_range:
    section = sections[i]
    logger.info(f"Processing section {i+1}: '{section.title}'...")
    # Pull this section's lines out of the line-numbered transcript.
    original_lines = lines_from_wrapped_text(
        wrapped_transcript,  
        start=section.start_line,
        end=section.end_line,
        keep_brackets=False
    )
    # NOTE: rebinds the notebook-level `section_instructions` set by an earlier cell.
    section_instructions = postprocess_format_instructions_en_2.format(section_title=section.title)
    
    # Log the rendered instructions only when index 0 is part of the range.
    if i == 0:
        logger.info(f"Processing instructions:\n{section_instructions}")
    
    processed_lines = process_text(original_lines, section_instructions, batch=False)
    sections_processed.append(processed_lines)
    write_text_to_file(output_xml_path, processed_lines, append=True)
write_text_to_file(output_xml_path, "</document>", append=True)

In [None]:
# Show the output file the repair pass appended to.
print(output_xml_path)

In [None]:
# Format every section of the selected talk and (re)write the XML file.
# The line-numbered transcript is passed, not the raw one: process_sections
# resolves each section's start/end line numbers with lines_from_wrapped_text,
# and the batch loop later in this notebook passes the wrapped text as well.
process_sections(output_xml_path, wrapped_transcript, section_object, postprocess_format_instructions_en_2)

In [None]:
# sections_formatted = []
# sections_original = []
# sections = section_object.sections
# section_range = range(0, 2)
# output_file = test_dir / f"formatted_{talk_name}.xml"
# for i in section_range:
#     section = sections[i]
#     original_lines = lines_from_wrapped_text(wtest, section.start_line, section.end_line, keep_brackets=False)
#     format_instructions = postprocess_format_instructions_en_2.format(section_title=section.title)
#     logger.info(f"Formatting section '{section.title}'...")

#     if i == 0:
#         logger.info(f"Translation instructions:\n{format_instructions}")
    
#     translated_lines = postprocess_text(original_lines, format_instructions, batch=False)
#     sections_formatted.append(translated_lines)
#     write_text_to_file(output_file, translated_lines, append=True)
    

In [None]:
# Batch run over every entry yielded by iterate_subdir(video_storage_dir).
# A talk is processed only when its transcript exists and no formatted XML
# file exists yet, so re-running the cell resumes where it left off.
# NOTE: the loop rebinds notebook-level names set by earlier cells
# (video_dir, talk_name, transcript, wrapped_transcript, section_object,
# output_xml_path).

# The sectioning prompt is a template; fill its {section_count} field once.
# 4 matches the count used for the single-talk run earlier in this notebook.
batch_section_count = 4
batch_section_instructions = postprocess_section_instructions_en.format(section_count=batch_section_count)

for i, video_dir in enumerate(iterate_subdir(video_storage_dir)):
    try:
        talk_name = video_dir.name

        logger.info(f"Processing talk {i+1}: '{talk_name}'")

        transcript_file = video_dir / f"{talk_name}.txt"
        section_output_path = video_dir / f"section_{talk_name}.json"
        output_xml_path = video_dir / f"formatted_{talk_name}.xml"

    except Exception as e:
        # Report video_dir rather than talk_name: talk_name may still hold the
        # previous talk's value if setup failed before it was reassigned.
        logger.error(f"Error in file setup for {video_dir}: {e}, skipping.")
        # Actually skip; otherwise the code below would run with stale paths.
        continue

    if not transcript_file.exists():
        logger.info(f"No transcript found in {transcript_file}. Skipping {talk_name}")
        continue

    logger.info(f"Transcript found: {transcript_file}")

    if output_xml_path.exists():
        logger.info(f"Formatted xml file found. Skipping {talk_name}.")
        continue

    try:
        transcript = get_text_from_file(transcript_file)

        wrapped_transcript = wrap_lines(transcript, number=True)

        logger.info(f"Starting sectioning postprocess for {talk_name}...")

        section_object = process_text(wrapped_transcript, batch_section_instructions, response_object=DharmaTalkSections, max_tokens=5000)

        # Save the section data before formatting so it survives a later failure.
        write_text_to_file(section_output_path, section_object.model_dump_json())
        logger.info(f"Sectioning for {talk_name} completed. Dumped section data to {section_output_path}.")

        logger.info(f"Starting postprocess for {talk_name}: section formatting sequence.")
        process_sections(output_xml_path, wrapped_transcript, section_object, postprocess_format_instructions_en_2)
        logger.info(f"Postprocessing completed for {talk_name}")

    except Exception as e:
        # Best-effort batch: log the failure and move on to the next talk.
        logger.error(f"Error processing {talk_name}: {e}. Partial processing may have been saved. Skipping to next talk file.")

In [None]:
# Display the sections of the current `section_object`. If the batch loop
# processed any talk, this is the last talk it sectioned, not the
# interactively selected one.
section_object.sections

In [None]:
# Token count of the string form of the current `section_object`.
token_count(str(section_object))