In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from typing import List, Dict
import logging
import json

from pydantic import BaseModel, Field
from typing import List, Optional

from data_processing.xml_processing import wrap_lines, unwrap_lines, lines_from_wrapped_text
from data_processing.text_processing import get_text_from_file, write_text_to_file

In [None]:
from dp_video_processing import get_youtube_urls_from_file, download_audio_yt, detect_boundaries, split_audio_at_boundaries, process_audio_chunks, split_on_silence, postprocess_text

In [None]:
from data_processing.gpt_processing import token_count

In [None]:
# Import the logging helpers (setup_logging, get_child_logger) used to configure the main logger
import logging_config
from logging_config import setup_logging
from logging_config import get_child_logger

In [None]:
# Configure logging (log file name "audio_extract_testing.log", level DEBUG) first,
# then obtain the named logger that the cells below use.
setup_logging(log_filename="audio_extract_testing.log", log_level=logging.DEBUG)
logger = get_child_logger("audio_extract_testing")

In [None]:
# Root directory (relative to the notebook's working directory) that holds one
# sub-directory per talk; audio_working_dir is derived from it.
audio_storage_dir = Path("processed_videos/video_transcriptions")

In [None]:
# Talk to process. The same string is used as the talk's directory name and as the
# base of every file name derived below.
# NOTE(review): the slash-like and bar-like characters appear to be non-ASCII look-alikes
# rather than "/" and "|"; retyping them as ASCII would change the resulting paths.
talk_name = "PTTT 04⧸08⧸2024 ｜ Pháp Thoại Hoà Thượng Thích Phước Tịnh ｜ TV Lộc Uyển (Làng Mai tại Mỹ)"

In [None]:
# Alternative talk: uncomment the line below (and re-run the path cells) to process it instead.
# talk_name = "'Awakening Together' Family Retreat Dharma Talk ｜ Sister Kính Nghiêm ｜ 2024-06-27"

In [None]:
# Per-talk working directory: <storage root>/<talk name>.
audio_working_dir = audio_storage_dir.joinpath(talk_name)
print(audio_working_dir)

In [None]:
# Audio file for the talk, stored in the working directory as "<talk name>.mp3".
audio_file_path = audio_working_dir.joinpath(f"{talk_name}.mp3")
print(audio_file_path)

In [None]:
# Transcript text file for the talk, stored next to the audio as "<talk name>.txt".
transcript_path = audio_working_dir.joinpath(f"{talk_name}.txt")
print(transcript_path)

In [None]:
# Displays a (bool, bool) tuple: whether the audio file and the transcript exist on disk.
audio_file_path.exists(), transcript_path.exists()

In [None]:
# Recomputes transcript_path with the same expression used when it was first defined,
# then shows only the file name component.
transcript_path = audio_working_dir.joinpath(f"{talk_name}.txt")
print(transcript_path.name)

In [None]:
# Destination for the line-wrapped transcript ("wrap_<talk name>.txt").
wrap_transcript_path = audio_working_dir.joinpath(f"wrap_{talk_name}.txt")

In [None]:
# Destination for the sectioning result serialized as JSON ("section_<talk name>.json").
section_output_path = audio_working_dir.joinpath(f"section_{talk_name}.json")
print(section_output_path)

In [None]:
# Destination for the formatted transcript ("format_<talk name>.txt").
post_output_path = audio_working_dir.joinpath(f"format_{talk_name}.txt")

In [None]:
# Instructions for the sectioning pass over the numbered Vietnamese transcript.
# Passed to postprocess_text together with response_object=DharmaTalkSections.
postprocess_section_vi = """You are a highly skilled and meticulous assistant processing an audio transcript of a Dharma Talk given in Vietnamese.
Each line of the transcript is numbered in the format: <NUM:LINE> 
You goal is to divide the entire transcript into logical sections based on content. 
For each section, give the title, both in Vietnamese and English, and starting and ending line numbers.
Also provide a brief summary of the text in English.
IMPORTANT: Every line in the transcript must belong to a section. Don't leave out any lines. Don't include lines in more than one section."""


### sectioning test messages:
"""You are a highly skilled and meticulous assistant processing an audio transcript of a Dharma Talk given in Vietnamese.
Each line of the transcript is numbered in the format: <NUM:LINE> 
Your goal is to divide the entire transcript into logical sections based on content. 
For each section, give the title, both in Vietnamese and English, and starting and ending line numbers.
Also provide a brief summary of the text in English.
IMPORTANT: Every line in the transcript must belong to a section. Don't leave out any lines. Don't include lines in more than one section."""

You are a highly skilled and meticulous assistant processing an audio transcript of a Dharma Talk given in Vietnamese.
Your goal is to divide the entire transcript into logical sections based on content. 
For each section insert a <section> tag, and close the section with </section> 
IMPORTANT: Every line in the transcript must belong to a section. Don't leave out any lines. Don't include lines in more than one section.

In [None]:
# Translation prompt, variant 1: translate a section as free-flowing text
# (this variant does not mention line numbering).
# Contains a {section_title} placeholder that must be filled with str.format before use.
# Not referenced by any other cell visible in this notebook.
instructions_translate_vi_1 = """You are the world's leading expert at translating Dharma talks transcribed from spoken Vietnamese.

You are translating a section titled '{section_title}' from a Dharma talk offered by a Venerable Vietnamese Monastic at Deer Park Monastery in California.

Some transcriptions may be from sounds such as a bell. These can be marked as [Bell].

You may have to infer the speaker's intent in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning,
while still giving clear and eloquent English.

Your goal is to translate the text and to correct errors (logical, transcription, or grammatical). 
You must faithfully capture the speaker's style and presentation while creating a meaningful flow using common, clear, and typical English. 
Translate faithfully and as carefully as possible. 

Do not leave out any content or summarize. 

The final output should match approximately the length and detail of the original.

Your output should be a polished section.

Make no other changes; add no content.

Output the final text only."""

In [None]:
# Translation prompt, variant 2: line-by-line translation of <NUM:LINE>-numbered input,
# asking for output with the same line structure as the original.
# Contains a {section_title} placeholder; this is the variant the translation loop
# formats with each section's English title.
instructions_translate_vi_2 = """You are the world's leading expert at translating Dharma talks transcribed from spoken Vietnamese.

You are translating a section titled '{section_title}' from a Dharma talk offered by a Venerable Vietnamese Monastic at Deer Park Monastery in California.

Lines of the transcript are numbered and are given in the format <NUM:LINE>.

Your task is to translate each line into correct, clear and typical English. Add correct punctuation to create meaning that matches the speakers style and intent.

You may have to infer the speaker's intent in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning,
while still giving clear and eloquent English. Give the best approximation or contextual guess if the transcript is difficult or unclear. Make no comments.

You may consider adjacent lines for corrections and context when generating a line, however each line of translation should be as close as possible a translation of the original line.

Some transcriptions may be from sounds such as a bell. These can be marked as [Bell].

You must faithfully capture the speaker's style and presentation while creating a meaningful flow.

Do not leave out any content or summarize. 

The final output should match the same line structure as the original.

Your output should be a polished section.

Make no other changes; add no content.

Output the final text only."""

In [None]:
# Translation prompt, variant 3: like variant 2, but additionally asks for punctuation
# that creates flow between lines and allows capitalization to be adjusted.
# Contains a {section_title} placeholder that must be filled with str.format before use.
# Not referenced by any other cell visible in this notebook.
instructions_translate_vi_3 = """You are the world's leading expert at translating Dharma talks transcribed from spoken Vietnamese.

You are translating a section titled '{section_title}' from a Dharma talk offered by a Venerable Vietnamese Monastic at Deer Park Monastery in California.

Lines of the transcript are numbered and are given in the format <NUM:LINE>.

Your task is to translate each line into correct, clear and typical English. 

Add correct punctuation to create meaning that matches the speakers style and intent and creates flow between lines. You may adjust capitalization as needed for correctness.

You may have to infer the speaker's intent in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning,
while still giving clear and eloquent English. Give the best approximation or contextual guess if the transcript is difficult or unclear. Make no comments.

You may consider adjacent lines for corrections and context when generating a line, however each line of translation should be based on the original line.

Some transcriptions may be from sounds such as a bell. These can be marked as [Bell].

You must faithfully capture the speaker's style and presentation while creating a meaningful flow.

Do not leave out any content or summarize. 

The final output should match the same line structure as the original.

Your output should be a polished section.

Make no other changes; add no content.

Output the final text only."""

You are the world's leading expert at translating Dharma talks transcribed from spoken Vietnamese.

You are translating a section titled '{section_title}' from a Dharma talk offered by a Venerable Vietnamese Monastic at Deer Park Monastery in California.

Lines of the transcript are numbered and are given in the format <NUM:LINE>.

Your task is to translate each line into correct, clear and typical English.

You may have to infer the speaker's intent in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning,
while still giving clear and eloquent English.

You may consider adjacent lines for corrections and context when creating a line, however each line of translation should be as close as possible a translation of the original line.

Some transcriptions may be from sounds such as a bell. These can be marked as [Bell].

You must faithfully capture the speaker's style and presentation while creating a meaningful flow.

Translate faithfully and as carefully as possible. 

Do not leave out any content or summarize. 

The final output should match the same line structure as the original.

Your output should be a polished section.

Make no other changes; add no content.

Output the final text only.

You are the world's leading expert at translating Dharma talks transcribed from spoken Vietnamese.

You are translating a section titled '{section_title}' from a Dharma talk offered by a Venerable Vietnamese Monastic at Deer Park Monastery in California.

Lines of the transcript are numbered and are given in the format <NUM:LINE>.

Your task is to translate each line into correct, clear and typical English.

You may have to infer the speaker's intent in order to correct transcription or speaking errors and to generate a translation
that matches the speaker's meaning and intent within the context of the section, and creates a meaningful, logical and correct flow in English.

Consider adjacent lines and the context of the section for corrections when translating a line, however, each line of translation should be based on the original line.

Some transcription lines may be from sounds such as a bell. These can be marked as [Bell].
 
You must faithfully capture the speaker's style, presentation and intent while creating coherent, eloquent content across all lines.

Do not leave out any content or summarize. 

The final output should match exactly the same line structure as the original.

Make no other changes; add no content.

Output the final text only.

In [None]:
# Formatting prompt for an English-language talk: correct grammar, split into paragraphs,
# and insert <section>/<title> tags. Used by the final postprocess_text call in this notebook.
# NOTE(review): the prompt describes an English-speaking monastic, while talk_name currently
# selects a Vietnamese talk — confirm this is the intended pairing.
postprocess_format_en = """You are the world's leading expert at formatting Dharma talk audio transcriptions into written text for native, and partly fluent English speakers. 

The current text is from a Dharma Talk offered by a Southern Californian English-speaking monastic.

Make necessary corrections to grammar to create correct English sentence structure and logical flow. 

Insert <section> and <title> tags where appropriate in the text to mark natural sections in the talk; give these sections appropriate titles.

You may have to infer the speaker's intent in order to correct transcription or speaking errors and to generate a text that most closely matches the speaker's meaning in clear and eloquent English.

Your goal is to format the text into meaningful paragraphs and sections while correcting errors (logical, transcription, or grammatical). 

Faithfully to convey the speaker’s intended meaning as accurately as possible while maintaining the original tone and style. Use the speaker's original phrasing as much as possible.

Do not leave out any content. Do not summarize. 

Output the final text only."""

In [None]:
# Load the raw transcript from transcript_path.
text = get_text_from_file(transcript_path)

In [None]:
# Wrap the transcript with numbering enabled.
# NOTE(review): presumably this yields the <NUM:LINE> format the prompts describe —
# confirm in data_processing.xml_processing.wrap_lines.
wtext = wrap_lines(text, number=True)

In [None]:
# Print the entire wrapped transcript for inspection (the output can be very long).
print(wtext)

In [None]:
# Extract lines 1 through 100 of the wrapped transcript as a smaller sample.
# wtext_test is not referenced by any later cell visible in this notebook.
wtext_test = lines_from_wrapped_text(wtext, 1, 100)

In [None]:
# Display the path the wrapped transcript is written to in the next cell.
wrap_transcript_path

In [None]:
# Save the wrapped transcript; the sectioning call later receives this path as its input.
write_text_to_file(wrap_transcript_path, wtext)

In [None]:
class Section(BaseModel):
    # One titled span of the numbered transcript, as returned by the sectioning prompt.
    # Documented with comments rather than a docstring: pydantic can surface a model's
    # docstring in its generated schema, and the schema should stay as it is.
    title_vi: str = Field(..., description="The title of the section in Vietnamese.")
    title_en: str = Field(..., description="The translation of the title of the section in English.")
    summary: str = Field(..., description="A summary of the section in English.")
    # Line numbers delimiting the section within the numbered transcript.
    start_line: int = Field(..., description="The starting line number of this section.")
    end_line: int = Field(..., description="The ending line number of this section.")

class DharmaTalkSections(BaseModel):
    # Structured response for the sectioning pass: an overall summary plus the ordered
    # list of sections. Comments are used instead of a docstring so the schema generated
    # from this model is not altered.
    talk_summary: str = Field(..., description="A summary of the Dharma talk in English.")
    sections: List[Section] = Field(
        ...,
        description=(
            "An ordered list of sections with their titles and included start and end line numbers. The sequence of line ranges for the sections must cover every line from start to finish without any overlaps or gaps."
        ),
    )

In [None]:
# Display the wrapped-transcript path that is about to be passed to the sectioning call.
wrap_transcript_path

In [None]:
# Show the exact sectioning instructions before they are sent.
print(postprocess_section_vi)

In [None]:
# Run the sectioning prompt over the wrapped transcript file in a single (non-batch) call,
# requesting a DharmaTalkSections response; later cells read .sections and .talk_summary from it.
# NOTE(review): max_tokens=5000 presumably caps the response length — confirm it is enough
# for long talks.
section_object = postprocess_text(wrap_transcript_path, postprocess_section_vi, response_object=DharmaTalkSections, batch=False, max_tokens=5000)

In [None]:
# Inspect the returned sections (titles, summaries, and line ranges).
section_object.sections

In [None]:
# Number of sections returned by the sectioning pass.
len(section_object.sections)

In [None]:
# Translate every section found by the sectioning pass, one model call per section.
sections = section_object.sections
# Derive the range from the data instead of hard-coding a count: a fixed range(0, 16)
# raises IndexError for talks with fewer sections and silently skips the tail of longer ones.
section_range = range(len(sections))
sections_original = []    # wrapped source lines, one entry per processed section
sections_translated = []  # model output, aligned index-for-index with sections_original
# The backup is written in append mode, so re-running this cell adds to an existing file.
backup_file = Path("backup_section_data.txt")

for i in section_range:
    section = sections[i]
    original_lines = lines_from_wrapped_text(wtext, section.start_line, section.end_line, keep_brackets=True)
    translate_instructions = instructions_translate_vi_2.format(section_title=section.title_en)
    logger.info(f"Translating section '{section.title_en}'...")

    if i == 0:
        # Log the fully formatted prompt once, for reference.
        logger.info(f"Translation instructions:\n{translate_instructions}")

    translated_lines = postprocess_text(original_lines, translate_instructions, batch=False)
    # Record both halves together so the two lists stay aligned even if the backup write fails.
    sections_original.append(original_lines)
    sections_translated.append(translated_lines)
    write_text_to_file(backup_file, translated_lines, append=True)



In [None]:
# Left over from the translation loop: the wrapped source lines of the last section processed.
original_lines

In [None]:
from itertools import zip_longest

# Print each section's translation next to its source, line by line, for a visual check.
# The two result lists are paired directly: their entries are appended in processing order,
# so indexing them with section_range would only be correct when that range starts at 0.
for source_section, translated_section in zip(sections_original, sections_translated):
    source_lines = unwrap_lines(source_section, number=True).split('\n')
    target_lines = unwrap_lines(translated_section, number=True).split('\n')

    # The translation is the left-hand column, so pad to its longest line.
    translated_width = max(len(line) for line in target_lines)

    # zip_longest keeps unmatched lines visible (a plain zip would silently drop them);
    # a line-count mismatch is exactly what this comparison should reveal.
    for j, (o_l, t_l) in enumerate(zip_longest(source_lines, target_lines, fillvalue="")):
        print(f"{j:>4}: {t_l:<{translated_width + 5}}<<<   {o_l}")

In [None]:
# Show the translation prompt formatted for the most recently processed section.
# Uses instructions_translate_vi_2, the template the translation loop formats; the name
# previously used here (postprocess_translate_vi) is not defined in any visible cell.
print(instructions_translate_vi_2.format(section_title=section.title_en))

In [None]:
# Overall English summary of the talk returned by the sectioning pass.
section_object.talk_summary

In [None]:
# Preview the JSON serialization that the next cell writes to disk.
section_object.model_dump_json()

In [None]:
# Persist the sectioning result as JSON at section_output_path.
write_text_to_file(section_output_path, section_object.model_dump_json())

In [None]:
# Run the English formatting prompt over the raw (unwrapped) transcript.
# NOTE(review): the other postprocess_text calls in this notebook pass the instructions as
# the second positional argument and never pass an output path; here post_output_path sits
# in that slot and the prompt comes third. Confirm against the current postprocess_text
# signature — this call may predate an API change.
postprocess_text(transcript_path, post_output_path, postprocess_format_en, batch=False)