In [None]:
import os
import google.generativeai as genai
import time
import re
# Configure the API key. Prefer supplying it through the GOOGLE_API_KEY
# environment variable so the secret never has to be saved in this file;
# a key pasted into API_KEY below still takes precedence.
API_KEY = ""
genai.configure(api_key=API_KEY or os.environ.get("GOOGLE_API_KEY", ""))

# Model setup
# Sampling parameters passed unchanged to the model on every request.
generation_config = {
    "temperature": 0.2,
    "top_p": 1,
    "top_k": 1,
    "max_output_tokens": 4096,
}

# System instruction shared by every request made through `model`
# (glossary, summary and translation prompts are all sent on top of it).
system_prompt = (
    "You are a multilingual translator. You will be given a text in the input language. Your task is to translate the text into the output language. "
    "You must provide a translation that is accurate, consistent, and fluent in the output language. The output must only be the translation of the text, with no explanation."
)
# Every listed harm category is set to BLOCK_NONE
# (presumably so that source text is not rejected by the safety filters).
safety_settings = {
    "HARM_CATEGORY_HARASSMENT": "BLOCK_NONE",
    "HARM_CATEGORY_HATE_SPEECH": "BLOCK_NONE",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT": "BLOCK_NONE",
    "HARM_CATEGORY_DANGEROUS_CONTENT": "BLOCK_NONE",
}

# Single shared model handle used by all helper functions below.
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",  # Adjust model name as needed
    generation_config=generation_config,
    system_instruction=system_prompt,
    safety_settings=safety_settings
)

def generate_glossary(text, max_retries=3, min_chunk_size=500):
    """
    Generate a glossary from the text, handling API rejections by splitting recursively.

    When a request fails, the chunk is cut in two (at the line break nearest
    its midpoint when one exists) and each half is retried with one fewer
    retry remaining. Results from both halves are merged; when both halves
    define the same term, the first half's translation wins.

    Args:
        text (str): Input text to extract terms from.
        max_retries (int): Maximum retries before giving up on a chunk.
        min_chunk_size (int): Minimum chunk size to avoid infinite splitting.

    Returns:
        dict: Glossary mapping Japanese terms to Simplified Chinese translations.
            Empty when the text is blank or every request failed.
    """
    # Nothing can be extracted from blank input, so skip the API call entirely.
    if not text or not text.strip():
        return {}

    # Each sentence ends with a space so the concatenated prompt stays readable.
    # The language pair is stated explicitly because nothing else in the
    # prompt tells the model what "input" and "output" language mean.
    glossary_prompt = (
        "You are a multilingual glossary generator. I will provide a text in the input language. You will generate a glossary for this text, listing special terms that require consistent translation, along with their translations in the output language. "
        "Extract special terms (single words only) from the following text, such as names of characters, places, unique items, etc., which may not have a single correct translation and could be inconsistent across multiple translations. "
        "Provide the special terms in the input language and their most appropriate translations in the output language. Each term should correspond to only one translation. The output should be a list of input language terms mapped to their output language translations. "
        "The input language is Japanese and the output language is Simplified Chinese. "
        "Output format: term1: translation1\nterm2: translation2\n...\n\n"
    )

    def parse_glossary(raw):
        """Turn 'term: translation' lines into a dict, ignoring malformed lines."""
        entries = {}
        for line in raw.split('\n'):
            # Accept the full-width colon as well as the ASCII one.
            parts = re.split(r'[:：]', line, maxsplit=1)
            if len(parts) != 2:
                continue
            term, translation = parts[0].strip(), parts[1].strip()
            # Headers such as "Glossary:" have nothing after the colon; skip
            # them instead of recording an empty translation.
            if term and translation:
                entries[term] = translation
        return entries

    def split_chunk(chunk):
        """Cut chunk in two non-empty parts, preferring the line break nearest the middle."""
        mid = len(chunk) // 2
        newline_before = chunk.rfind('\n', 0, mid)
        newline_after = chunk.find('\n', mid)
        # Candidate cut points sit just after a newline and must leave both
        # halves non-empty.
        cuts = [
            idx + 1
            for idx in (newline_before, newline_after)
            if idx != -1 and 0 < idx + 1 < len(chunk)
        ]
        cut = min(cuts, key=lambda pos: abs(pos - mid)) if cuts else mid
        return chunk[:cut], chunk[cut:]

    def process_chunk(chunk, retries_left):
        """Return (glossary, fully_succeeded) for one chunk."""
        if not chunk.strip():
            return {}, True
        try:
            # Reading .text is kept inside the try because it may raise as
            # well as the request itself (e.g. when no usable text came back).
            raw = model.generate_content([glossary_prompt + chunk]).text
        except Exception as e:  # API boundary: any failure triggers the split-and-retry path
            if retries_left <= 0:
                print(f"Failed to process chunk after {max_retries} retries: {e}")
                return {}, False
            print(f"Error processing chunk: {e}. Retrying with split...")
            time.sleep(2)

            # max(..., 1) guarantees termination even if min_chunk_size <= 0.
            if len(chunk) <= max(min_chunk_size, 1):
                print(f"Chunk too small to split further: {chunk[:20]}...")
                return {}, False

            first_half, second_half = split_chunk(chunk)
            glossary1, success1 = process_chunk(first_half, retries_left - 1)
            glossary2, success2 = process_chunk(second_half, retries_left - 1)
            merged_glossary = glossary1.copy()
            for term, translation in glossary2.items():
                merged_glossary.setdefault(term, translation)
            # Both halves must succeed for the chunk to count as complete;
            # otherwise the caller warns that terms may be missing.
            return merged_glossary, success1 and success2
        return parse_glossary(raw), True

    glossary, success = process_chunk(text, max_retries)
    if not success:
        print("Warning: Glossary generation partially failed. Some terms may be missing.")
    return glossary

def generate_summary(text, glossary_str):
    """
    Generate a brief Chinese summary of the text using the glossary for consistency.

    Args:
        text (str): Input text to summarize.
        glossary_str (str): Glossary string for term consistency.

    Returns:
        str: Generated summary in Chinese, or an empty string when the text
            is blank or the request fails.
    """
    # Blank input has nothing to summarize; avoid a pointless API call.
    if not text or not text.strip():
        return ""

    # Sentences are separated by spaces, and the output language is named
    # explicitly because nothing else in the prompt specifies it.
    summary_prompt = (
        "You are a multilingual text summarizer. I will provide a text in the input language. You will summarize this text briefly in the output language. "
        "The output language is Chinese. "
        f"Using the following glossary {glossary_str}, summarize the following text {text} in no more than 3 sentences. Only output the summary in the output language."
    )
    try:
        response = model.generate_content([summary_prompt])
        return response.text
    except Exception as e:  # API boundary: the summary is optional, so degrade to ""
        print(f"Error generating summary: {e}")
        return ""

def contains_japanese(text, threshold=0.25):
    """
    Report whether kana make up at least `threshold` of the text.

    Only characters in the Hiragana and Katakana blocks (U+3040 to U+30FF)
    are counted; the ratio is taken over every character in the text.

    Args:
        text (str): Text to check.
        threshold (float): Proportion of kana at or above which the text is
            considered to contain Japanese (default 0.25).

    Returns:
        bool: True if the kana proportion is at least `threshold`, False
            otherwise (always False for empty text).
    """
    total = len(text) if text else 0
    if total == 0:
        return False

    # The Hiragana (U+3040-U+309F) and Katakana (U+30A0-U+30FF) blocks are
    # adjacent, so one code-point range covers both.
    kana_total = 0
    for ch in text:
        if 0x3040 <= ord(ch) <= 0x30FF:
            kana_total += 1

    ratio = kana_total / total
    return not ratio < threshold

def translate_with_splitting(fixed_prompt, chunk, min_chunk_size=500):
    """
    Translate a chunk, splitting recursively if the API rejects it.

    One attempt is made on the whole chunk. If the request raises, or the
    result still contains too much Japanese (see `contains_japanese`), the
    chunk is halved and each half is translated separately with the same
    `min_chunk_size`. A chunk at or below `min_chunk_size` that still fails
    yields an empty string.

    Args:
        fixed_prompt (str): Fixed part of the prompt (summary, glossary, context).
        chunk (str): Text chunk to translate.
        min_chunk_size (int): Minimum chunk size.

    Returns:
        str: Translated text.
    """
    # Blank input has nothing to translate; hand it back unchanged so layout
    # whitespace survives and no API call is made.
    if not chunk.strip():
        return chunk

    prompt = fixed_prompt + f"Translate the following Japanese text to Chinese: {chunk}"

    def try_translate():
        """Return the translation, or None when the attempt must be split."""
        try:
            response = model.generate_content([prompt])
            translated = response.text
            if not contains_japanese(translated):
                return translated
            print("Too much Japanese detected, retrying...")
            time.sleep(1)
            return None
        except Exception as e:  # API boundary: any failure falls through to splitting
            print(f"Error: {e}")
            time.sleep(10)
            return None

    translated = try_translate()
    if translated is not None:
        return translated

    # max(..., 1) guarantees termination even when min_chunk_size <= 0.
    if len(chunk) <= max(min_chunk_size, 1):
        print(f"Chunk too small to split further: {chunk[:20]}...")
        return ""

    mid = len(chunk) // 2
    # Propagate min_chunk_size so a caller-supplied limit also applies to the
    # recursive calls instead of silently reverting to the default.
    translated_first = translate_with_splitting(fixed_prompt, chunk[:mid], min_chunk_size)
    translated_second = translate_with_splitting(fixed_prompt, chunk[mid:], min_chunk_size)
    return translated_first + translated_second

def split_into_chunks(text, chunk_size=3192, overlap=0):
    """
    Split text into chunks with optional overlap.

    Consecutive chunks start `chunk_size - overlap` characters apart.
    Splitting stops as soon as a chunk reaches the end of the text, so no
    trailing chunk made up solely of already-covered characters is emitted.

    Args:
        text (str): Text to split.
        chunk_size (int): Size of each chunk.
        overlap (int): Overlap between chunks.

    Returns:
        list: List of text chunks (empty for empty text).

    Raises:
        ValueError: If `chunk_size` is not positive, or `overlap` is negative
            or not smaller than `chunk_size` (the start position would never
            advance, or text would be skipped).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already covers the tail of the text.
            break
    return chunks

def translate_long_text(text, chunk_size=3192, context_mode='chinese_only', context_lines=10):
    """
    Translate long text with glossary, summary, and context options.

    A glossary and a summary are generated once for the whole text. The text
    is then cut into fixed-size chunks; each chunk is translated with a
    prompt made of the summary, optional context from the preceding chunks,
    and the glossary entries whose term occurs in that chunk.

    Args:
        text (str): Text to translate.
        chunk_size (int): Size of each chunk.
        context_mode (str): 'jp_ch', 'chinese_only', or 'no_context'.
            Any other value behaves like 'no_context'.
        context_lines (int): Number of context lines. Zero or a negative
            value disables context.

    Returns:
        str: Translated text (empty string for empty input).
    """
    # Empty input yields no chunks; return early to avoid the glossary and
    # summary API calls.
    if not text:
        return ""

    def format_glossary(entries):
        """Render glossary entries as "'term': 'translation'" pairs."""
        return ", ".join(f"'{term}': '{translation}'" for term, translation in entries.items())

    def recent(lines):
        """Return the last `context_lines` lines, or none when context is disabled."""
        # lines[-0:] would return the whole list, so zero is handled explicitly.
        return lines[-context_lines:] if context_lines > 0 else []

    glossary = generate_glossary(text)
    # Drop over-long "terms"; the prompt asks for single words only.
    glossary = {term: translation for term, translation in glossary.items() if len(term) <= 25}
    glossary_str = format_glossary(glossary)
    print(f"Generated glossary: {glossary_str}")
    summary = generate_summary(text, glossary_str)
    print(f"Generated summary: {summary}")

    translated_chunks = []
    prev_jp_lines = []
    prev_ch_lines = []

    for chunk in split_into_chunks(text, chunk_size, overlap=0):
        # Only send glossary entries that actually occur in this chunk.
        filtered_glossary = {term: translation for term, translation in glossary.items() if term in chunk}

        fixed_prompt = f"Summary: {summary}\n\n"
        jp_tail = recent(prev_jp_lines)
        ch_tail = recent(prev_ch_lines)
        if context_mode == 'jp_ch' and (jp_tail or ch_tail):
            jp_context = '\n'.join(jp_tail)
            ch_context = '\n'.join(ch_tail)
            fixed_prompt += f"Previous Japanese context: {jp_context}\nPrevious Chinese context: {ch_context}\n\n"
        elif context_mode == 'chinese_only' and ch_tail:
            ch_context = '\n'.join(ch_tail)
            fixed_prompt += f"Previous context: {ch_context}\n\n"
        # 'no_context' adds nothing to fixed_prompt beyond summary

        fixed_prompt += f"Glossary: {format_glossary(filtered_glossary)}\n\n"
        translated = translate_with_splitting(fixed_prompt, chunk)
        translated_chunks.append(translated)

        # Update context, keeping only the lines that can still be used so
        # the history does not grow with the length of the document.
        prev_jp_lines = recent(prev_jp_lines + chunk.split('\n'))
        prev_ch_lines = recent(prev_ch_lines + translated.split('\n'))

    return "".join(translated_chunks)


In [None]:
def reduce_repetitive_characters(text):
    """
    Collapse long runs of a repeated unit down to two copies of that unit.

    Three passes run in order, widest unit first: any 3-character unit, then
    any 2-character unit, then any single character. A unit repeated three
    or more times in a row is replaced by exactly two copies. Newlines count
    as ordinary characters.

    Args:
        text (str): Text to clean up.

    Returns:
        str: Text with repetitive runs shortened.
    """
    collapse_passes = (
        r'(.{3})\1{2,}',
        r'(.{2})\1{2,}',
        r'(.)\1{2,}',
    )
    result = text
    for pattern in collapse_passes:
        # DOTALL lets '.' match newlines so runs of blank lines collapse too.
        result = re.compile(pattern, re.DOTALL).sub(r'\1\1', result)
    return result

# Move file to finished folder
def move_file_to_finished(file_path):
    """
    Move a processed file into the "finished_workflow" folder without overwriting.

    The folder is created if it does not exist. If a file with the same name
    is already there, a numeric suffix ("_2", "_3", ...) is appended before
    the extension until an unused name is found.

    Args:
        file_path (str): Path of the file to move.

    Returns:
        str: Path the file was moved to.
    """
    finished_dir = "finished_workflow"
    os.makedirs(finished_dir, exist_ok=True)
    finished_path = os.path.join(finished_dir, os.path.basename(file_path))

    # os.rename silently replaces an existing destination on POSIX and only
    # raises FileExistsError on Windows, so the collision is checked
    # explicitly. (Not race-free; acceptable for a single-user script.)
    target = finished_path
    base_name, extension = os.path.splitext(finished_path)
    suffix = 2
    while os.path.exists(target):
        target = f"{base_name}_{suffix}{extension}"
        suffix += 1

    os.rename(file_path, target)
    if target != finished_path:
        print(f"Renamed to {target} due to existing file")
    return target


In [None]:

# Process all .txt files
def _read_text_file(path):
    """Read `path` as text, trying strict UTF-8, then UTF-16, then lossy UTF-8."""
    # UTF-8 is tried first because its strict decoder rejects UTF-16 data,
    # whereas the UTF-16 decoder accepts many UTF-8 files without error and
    # returns garbage. 'utf-8-sig' also strips a leading BOM if present.
    for encoding in ('utf-8-sig', 'utf-16'):
        try:
            with open(path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeError:
            continue
    # Neither encoding decodes cleanly: keep what can be read and mark the rest.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        text = f.read()
    print(f"Warning: Encoding issues detected in {path}, using replacement characters")
    return text


# When True, long runs of repeated characters are collapsed before translation.
preprocessing = True
for file in os.listdir():
    # Only regular files with a .txt extension in the current directory.
    if not file.endswith(".txt") or not os.path.isfile(file):
        continue
    print(f"Processing {file}...")
    text = _read_text_file(file)
    if preprocessing:
        text = reduce_repetitive_characters(text)
    translated_text = translate_long_text(text)
    output_file = f"translated_workflow/{file[:-4]}_translated.txt"
    os.makedirs("translated_workflow", exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(translated_text)
    print(f"{file} translated to {output_file}")
    # Move the source out of the working directory so a re-run skips it.
    move_file_to_finished(file)
