In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage
from textwrap import dedent
import translation_helper
import json


# Example payload shapes. Several of these are serialised with json.dumps and
# embedded verbatim in the system prompts below, so their keys and placeholder
# strings are part of the prompt text.

# Raw input shape: a list of text lines under the "lines" key (the shape that
# lines_list_to_line_dict reads).
line_list_format = {
    "lines": ["<japanese text line 1>", "<japanese text line 2>"]
}

# One entry per line, keyed "line_<index>" (the shape produced by
# lines_list_to_line_dict). Shown to the model as the input format of the
# repair, word-separation and translation prompts.
line_dict_format = {
    "line_0": "<japanese text line 1>",
    "line_1": "<japanese text line 2>"
}

# Response shape requested by the translation prompt: a single string value.
translation_output_format = {"translation": "<english translation>"}

# Response shape requested by the word-separation prompt: each line key maps
# to the list of words found in that line.
word_dict_format = {
    "line_0": ["<japanese word 1>", "<japanese word 2>"],
    "line_1": ["<japanese word 3>", "<japanese word 4>", "<japanese word 5>"]
}

# Input shape shown in the word-detail prompt: every word wrapped in a
# {"word": ...} object (the shape produced by word_dict_to_word_detail).
word_detail_input_format = {
    "line_0": [{"word": "<japanese word 1>"}, {"word": "<japanese word 2>"}],
    "line_1": [{"word": "<japanese word 3>"}, {"word": "<japanese word 4>"}, {"word": "<japanese word 5>"}]
}

# Per-word detail shape: the input objects extended with readings, dictionary
# form, meaning and sentence form.
# NOTE(review): looks like the intended response schema for the word-detail
# step - confirm that word_detail_chain actually references it.
word_detail_output_format = {
    "line_0": [
        {
            "word": "<japanese word 1>",
            "hiragana": "<hiragana for japanese word 1>",
            "dictionary_entry": "<dictionary entry for japanese word 1>",
            "dictionary_hiragana": "<hiragana for dictionary entry of japanese word 1>",
            "meaning": "<english translation for japanese word 1>",
            "sentence_form": "<sentence form for japanese word 1>"
        }, 
        {
            "word": "<japanese word 2>",
            "hiragana": "<hiragana for japanese word 2>",
            "dictionary_entry": "<dictionary entry for japanese word 2>",
            "dictionary_hiragana": "<hiragana for dictionary entry of japanese word 2>",
            "meaning": "<english translation for japanese word 2>",
            "sentence_form": "<sentence form for japanese word 2>"
        }, 
    ],
    "line_1": [
        {
            "word": "<japanese word 3>",
            "hiragana": "<hiragana for japanese word 3>",
            "dictionary_entry": "<dictionary entry for japanese word 3>",
            "dictionary_hiragana": "<hiragana for dictionary entry of japanese word 3>",
            "meaning": "<english translation for japanese word 3>",
            "sentence_form": "<sentence form for japanese word 3>"
        }, 
    ]
}


# Final per-block result shape: a list of lines, each holding its list of word
# objects, plus the full translation (the shape built by
# word_detail_to_output_format).
output_format = {
    "lines": [{
        "words": [{
            "word": "<japanese word>",
            "hiragana": "<japanese word in hiragana>",
            "dictionary_entry": "<japanese dictionary entry>",
            "dictionary_hiragana": "<japanese dictionary entry in hiragana>",
            "meaning": "<english translation>",
            "sentence_form": "<sentence form>"
        }]
    }],
    "translation": "<translation>"
}

# OpenAI-compatible local server
# Module-level chat model handle; the processing loop passes it to
# translation_pipeline, which forwards it to every chain builder.
llm = ChatOpenAI(
   model="gpt-4.1",
   # Endpoint on this machine (port 4141) instead of the client's default URL.
   base_url="http://localhost:4141",
   api_key="not_required",   # anything non-empty if your server doesn't check
   # NOTE(review): low temperature presumably chosen for more repeatable JSON
   # output - confirm.
   temperature=0.2,
)



In [None]:
def lines_list_to_line_dict(lines):
    """Re-key a ``{"lines": [...]}`` payload by line position.

    Args:
        lines: Mapping whose ``"lines"`` entry is an iterable of text lines.

    Returns:
        A dict mapping ``"line_<index>"`` (0-based, in iteration order) to the
        corresponding line.
    """
    keyed_lines = {}
    for index, text in enumerate(lines["lines"]):
        keyed_lines[f"line_{index}"] = text
    return keyed_lines

def word_dict_to_word_detail(dict):
    """Wrap every word of every line in a ``{"word": ...}`` object.

    Args:
        dict: Mapping of line key to an iterable of words. (The parameter name
            shadows the builtin; it is kept because it is part of the
            function's signature.)

    Returns:
        A new mapping with the same keys, in the same order, where each word
        ``w`` has become ``{"word": w}``.
    """
    detail = {}
    for line_key, line_words in dict.items():
        detail[line_key] = [{"word": entry} for entry in line_words]
    return detail

def word_detail_to_output_format(word_detail, translation):
    """Assemble the final result from per-line word details and a translation.

    Args:
        word_detail: Mapping of line key to that line's list of word objects.
            Only the values are used; the keys are dropped.
        translation: Translation value stored unchanged under ``"translation"``.

    Returns:
        ``{"lines": [{"words": ...}, ...], "translation": translation}`` with
        one ``"lines"`` entry per value of ``word_detail``, in mapping order.
    """
    line_entries = [{"words": line_words} for line_words in word_detail.values()]
    return {"lines": line_entries, "translation": translation}

In [None]:
def character_repair_chain(llm):
    """Build the chain that asks the model to correct OCR character errors.

    The system prompt embeds ``line_dict_format`` as the example shape for both
    the request and the response. The model's reply is parsed as JSON.

    Args:
        llm: Chat model piped after the prompt.

    Returns:
        The runnable ``prompt | llm | JsonOutputParser()``. Invoke it with
        ``{"input": <JSON string of the lines>}``.
    """
    # Dedent the static template first and insert the JSON afterwards.
    # Interpolating the multi-line JSON before dedent() puts its closing "}"
    # at column 0, which makes the common margin empty, so dedent() would
    # remove nothing and the prompt would keep the source-code indentation.
    system_prompt = dedent("""\
        I will provide an json dictionary of japanese words, which represent lines of text in a japanese manga.
        These have been digitally converted using OCR tooling.
        If you believe there are any character errors in the OCR output, correct them.
        Only correct errors if they are clearly due to the OCR tool misinterpreting the characters. Do not correct anything else.

        This input and output format will be as follows:
        {line_format}

        Respond only in the output format.
    """).format(line_format=json.dumps(line_dict_format, indent=2))
    # SystemMessage (not a template tuple) keeps the JSON braces literal; only
    # the human message is a template with an {input} variable.
    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt),
        ("human", "{input}")
    ])

    chain = prompt | llm | JsonOutputParser()
    return chain


In [None]:
def separate_word_chain(llm):
    """Build the chain that splits each line into individual words.

    The system prompt shows ``line_dict_format`` as the input shape and
    ``word_dict_format`` as the required response shape. The model's reply is
    parsed as JSON.

    Args:
        llm: Chat model piped after the prompt.

    Returns:
        The runnable ``prompt | llm | JsonOutputParser()``. Invoke it with
        ``{"input": <JSON string of the lines>}``.
    """
    # Dedent the static template first and insert the JSON afterwards.
    # Interpolating the multi-line JSON before dedent() puts its closing "}"
    # at column 0, which makes the common margin empty, so dedent() would
    # remove nothing and the prompt would keep the source-code indentation.
    system_prompt = dedent("""\
        I will provide an array of japanese words, which represent lines of input text in a japanese manga.
        I want you to separate the lines out into individual words.
        Each word should represent a single dictionary word. Do not join words together.
        Particles should represent also their own words. Do not join them with other words.

        The input will be of the format:
        {input_format}

        Respond only in the format:
        {output_format}
    """).format(
        input_format=json.dumps(line_dict_format, indent=2),
        output_format=json.dumps(word_dict_format, indent=2),
    )
    # SystemMessage (not a template tuple) keeps the JSON braces literal; only
    # the human message is a template with an {input} variable.
    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt),
        ("human", "{input}")
    ])

    chain = prompt | llm | JsonOutputParser()
    return chain


In [None]:
def translation_chain(llm):
    """Build the chain that translates the lines into one English sentence.

    The system prompt shows ``line_dict_format`` as the input shape and
    ``translation_output_format`` as the required response shape. The model's
    reply is parsed as JSON.

    Args:
        llm: Chat model piped after the prompt.

    Returns:
        The runnable ``prompt | llm | JsonOutputParser()``. Invoke it with
        ``{"input": <JSON string of the lines>}``.
    """
    # Dedent the static template first and insert the JSON afterwards.
    # Interpolating the multi-line JSON before dedent() puts its closing "}"
    # at column 0, which makes the common margin empty, so dedent() would
    # remove nothing and the prompt would keep the source-code indentation.
    system_prompt = dedent("""\
        I will provide an array of japanese words, which represent lines of input text in a japanese manga.
        I want you to provide an english transation for the full sentence.
        Only provide the translation, do not include any additional information.
        If possible, preference a transaltion that fits in with the original japanese grammer and sentence order.
        Provide the translations as a single string value in a json dictionary as the output format specified below.
        Do not respond as an array.

        The input will be of the format:
        {input_format}

        Respond only in the format:
        {output_format}
    """).format(
        input_format=json.dumps(line_dict_format, indent=2),
        output_format=json.dumps(translation_output_format, indent=2),
    )
    # SystemMessage (not a template tuple) keeps the JSON braces literal; only
    # the human message is a template with an {input} variable.
    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt),
        ("human", "{input}")
    ])

    chain = prompt | llm | JsonOutputParser()
    return chain


In [None]:
def word_detail_chain(llm):
    """Build the chain that annotates every word with readings and meaning.

    The system prompt shows ``word_detail_input_format`` as the input shape
    and ``word_detail_output_format`` as the required response shape. The
    model's reply is parsed as JSON.

    Args:
        llm: Chat model piped after the prompt.

    Returns:
        The runnable ``prompt | llm | JsonOutputParser()``. Invoke it with
        ``{"input": <JSON string of the per-line word objects>}``.
    """
    # Dedent the static template first and insert the JSON afterwards.
    # Interpolating the multi-line JSON before dedent() puts its closing "}"
    # at column 0, which makes the common margin empty, so dedent() would
    # remove nothing and the prompt would keep the source-code indentation.
    system_prompt = dedent("""\
        I will provide an lines of japanese words, which represent lines of input text in a japanese manga.
        For each word in the line, I want you to add:
            - The hirigana representation of the word
            - The dictionary form of the word
            - The hirigana for the dictionary entry of the word
            - The translated meaning of the word
            - It's sentence form (eg present continuous, past, etc )

        The input will be of the format:
        {input_format}

        Respond only in the format:
        {output_format}
    """).format(
        input_format=json.dumps(word_detail_input_format, indent=2),
        # The response must follow the per-word detail schema. Asking for the
        # {"translation": ...} shape here contradicts the instructions above
        # and the per-line structure the caller iterates over.
        output_format=json.dumps(word_detail_output_format, indent=2),
    )
    # SystemMessage (not a template tuple) keeps the JSON braces literal; only
    # the human message is a template with an {input} variable.
    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt),
        ("human", "{input}")
    ])

    chain = prompt | llm | JsonOutputParser()
    return chain


In [None]:
def translation_pipeline(llm, input):
    """Run OCR repair, translation, word separation and word detail in order.

    Args:
        llm: Chat model handed to each chain builder.
        input: Mapping with a ``"lines"`` list, as read by
            ``lines_list_to_line_dict``.

    Returns:
        The dict built by ``word_detail_to_output_format`` from the word-detail
        response and the ``"translation"`` value of the translation response.
    """
    def as_json(payload):
        # ensure_ascii=False keeps Japanese text readable in the prompt.
        return json.dumps(payload, indent=2, ensure_ascii=False)

    keyed_lines = lines_list_to_line_dict(input)

    # Step 1: fix OCR mistakes; both later steps work on the repaired lines.
    repaired = character_repair_chain(llm).invoke({"input": as_json(keyed_lines)})

    # Step 2: whole-sentence translation.
    translated = translation_chain(llm).invoke({"input": as_json(repaired)})

    # Step 3: split each line into words, then wrap them for the detail step.
    split_words = separate_word_chain(llm).invoke({"input": as_json(repaired)})
    detail_request = word_dict_to_word_detail(split_words)

    # Step 4: annotate each word.
    detail_response = word_detail_chain(llm).invoke({"input": as_json(detail_request)})

    return word_detail_to_output_format(detail_response, translated["translation"])

In [None]:
#translation_pipeline(llm, test_input)

In [None]:
# Path of the volume file to process. The processing cell loads it as JSON and
# writes the updated JSON back to this same path.
# NOTE(review): "...mokuro" looks like a placeholder - set the real path before running.
file = "...mokuro"

In [None]:
from ipywidgets import IntProgress, HBox, Label
from IPython.display import display
import json
import os


def _save_volume(volume, path):
    """Write *volume* to *path* as UTF-8 JSON without truncating on failure."""
    # Serialise before touching the disk so a serialisation error cannot leave
    # a half-written file behind.
    payload = json.dumps(volume, indent=2, ensure_ascii=False)
    # Write next to the target and swap it in. Opening the target with "w"
    # truncates it first, so an interrupt mid-write would destroy all progress.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as out:
        out.write(payload)
    os.replace(tmp_path, path)


# Explicit UTF-8: the file holds Japanese text and the platform's default
# encoding is not guaranteed to be able to decode or encode it.
with open(file, encoding="utf-8") as f:
    volume = json.load(f)

# Every block counts towards progress, including ones that get skipped.
total_blocks = sum(len(page["blocks"]) for page in volume["pages"])

pbar = IntProgress(min=0, max=total_blocks, value=0, description="Blocks")
status = Label(value=f"0 / {total_blocks}")

# Error widgets
errors = 0
errbar = IntProgress(min=0, max=total_blocks, value=0, description="Errors")
errbar.bar_style = "danger"
err_status = Label(value="0 errors")

display(HBox([pbar, status, errbar, err_status]))

# Retries after the first attempt, i.e. up to max_retries + 1 attempts in total.
max_retries = 3

processed = 0
for page in volume["pages"]:
    for block in page["blocks"]:
        # Blocks that already carry both results are skipped, which makes the
        # cell resumable. Blocks without text have nothing to translate.
        already_done = "line_translations" in block and "translation" in block
        if not already_done and block["lines"]:
            for attempt in range(max_retries + 1):
                try:
                    result = translation_pipeline(llm, {"lines": block["lines"]})
                    block["line_translations"] = result["lines"]
                    block["translation"] = result["translation"]
                except Exception as e:
                    # Best effort: any failure (network, malformed JSON from
                    # the model, missing keys) is retried. Only the final
                    # failure is recorded on the block and counted.
                    if attempt == max_retries:
                        block["translation_error"] = str(e)
                        errors += 1
                        errbar.value = errors
                        err_status.value = f"{errors} errors"
                else:
                    # Drop an error left by an earlier run so a successful
                    # block is not still flagged as failed.
                    block.pop("translation_error", None)
                    break
            # Persist after every attempted block so an interrupted run can be
            # resumed from the file.
            _save_volume(volume, file)
        processed += 1
        pbar.value = processed
        status.value = f"{processed} / {total_blocks}"

if processed == total_blocks:
    pbar.bar_style = "success"