
[Feature]: A better way to translate LaTeX, with the potential to support ChatGLM. Part of the code is done; help needed #1038

Open
azwphy opened this issue Aug 13, 2023 · 4 comments

Comments

@azwphy

azwphy commented Aug 13, 2023

Class | Type

Function plugin

Feature Request

The basic idea: use the parser tooling in the pylatexenc package to turn the LaTeX source into an abstract syntax tree (AST), walk the whole tree to find the plain text that needs translation, and record each piece's position (pos) in the AST. Hand the text to ChatGLM for translation, rebuild the result into LaTeX code, add the ctex package to the preamble, and compile directly.
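
A minimal sketch of that core idea with pylatexenc (the LatexWalker API is real; the sample string and printout are just an illustration):

from pylatexenc.latexwalker import LatexWalker, LatexCharsNode

src = r"Some text. \emph{More text.} Final text."
nodelist, pos, length = LatexWalker(src).get_latex_nodes()
for node in nodelist:
    if isinstance(node, LatexCharsNode):
        # every chars node carries plain text plus its offset in the source
        print(node.pos, repr(node.chars))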

The plugin code under development is as follows (using pylatexenc==3.0a19):

'crazy_functions/latex_fns/latex_parser_ver.py'

from toolbox import update_ui, update_ui_lastest_msg    # refresh the Gradio frontend UI
from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone
from .latex_toolbox import PRESERVE, TRANSFORM
from .latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace
from .latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process
from .latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout
from pylatexenc.latexnodes import *
from pylatexenc.latexnodes import parsers
from pylatexenc.latexwalker import *
import os, shutil
import re, itertools
import numpy as np
from ..crazy_utils import request_gpt_model_in_new_thread_with_ui_alive

pj = os.path.join
white_macroname_list = [  # macros whose argument text should still be translated
    "emph",
    "textbf"
]
def parse_latex(latex_string):
    # Create a LatexWalker object to parse the LaTeX source
    latex_walker = LatexWalker(latex_string)
    # Obtain the parsed AST
    nodelist, parsing_state_delta = latex_walker.parse_content(parsers.LatexGeneralNodesParser())
    npos = nodelist.pos
    nlen = nodelist.len
    return nodelist, npos, nlen

def beginning_of_document(nodes):
    # Locate the top-level {document} environment node
    for node in nodes:
        if isinstance(node, LatexEnvironmentNode):
            if node.environmentname == "document":
                return node

def get_note_from_nodes(nodes, res_list):
    # Walk the AST and collect plain-text nodes worth translating
    if isinstance(nodes, list):
        for node in nodes:
            get_note_from_nodes(node, res_list)
    elif isinstance(nodes, LatexEnvironmentNode) or isinstance(nodes, LatexGroupNode):
        for child in nodes.nodelist:
            get_note_from_nodes(child, res_list)
    elif isinstance(nodes, LatexMacroNode):
        if nodes.macroname in white_macroname_list:
            get_note_from_nodes(nodes.nodeargd, res_list)
    elif isinstance(nodes, ParsedArguments):
        get_note_from_nodes(nodes.argnlist, res_list)
    elif isinstance(nodes, LatexCharsNode):
        if "_" not in nodes.chars:
            # record [text, pos in source, word-index placeholder, chunk-index placeholder]
            res_list.append([(nodes.chars).replace("\n", ""), nodes.pos, 0, 0])

def split_text_to_words(res_list, words):
    # Split each collected text into words, keeping the source position of its line
    for line in res_list:
        current_line = line[0].split()
        current_line = [[x, line[1], line[2], line[3]] for x in current_line]
        for i in current_line:
            words.append(i)
    for index, element in enumerate(words):
        element[2] = index
    return words

def split_counted_res(split_res_list, max_limited_word=200):
    # Group words into chunks of at most max_limited_word words, rewinding each
    # chunk boundary to the word containing the last sentence-ending period
    last_period_index = 0
    index = 0
    token_counter = 0
    line_counter = 0
    while True:
        if index == len(split_res_list):
            break
        split_res_list[index][3] = line_counter
        if "." in split_res_list[index][0]:
            last_period_index = index
        if token_counter > max_limited_word:
            line_counter += 1
            token_counter = 0
            index = last_period_index
        index += 1
        token_counter += 1
    return split_res_list

def combine_by_pos(finally_counted_res):
    # Group words first by chunk index, then by source position within each chunk
    combine_counted_res = []
    mid_res = [list(v) for k, v in itertools.groupby(finally_counted_res, key=lambda x: x[3])]
    for element in mid_res:
        combine_counted_res.append([list(v) for k, v in itertools.groupby(element, key=lambda x: x[1])])
    # merge once more: join the words of each fragment back into a string
    finally_combine_counted_res = []
    for element in combine_counted_res:
        midmid_res = []
        for element2 in element:
            midmid_res.append([' '.join([x[0] for x in element2]), element2[0][1]])
        finally_combine_counted_res.append(midmid_res)
    return finally_combine_counted_res

def translate_func(combine_counted_res, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]):
    # Translation routine
    #  <-------- switch the prompt as needed ---------->
    inputs_array = []
    for element in combine_counted_res:
        for element2 in element:
            prompt = ""
            # prompt += "Context:\n"
            # for element3 in element:
            #     prompt += element3[0]
            prompt += "Below is a section from an academic paper, translate this section to Chinese."
            prompt += "Answer me only with the revised text:"
            prompt += element2[0]
            inputs_array.append([prompt, element2[1]])
    length_of_inputs_array = len(inputs_array)
    sys_prompt = "You are a professional translator."
    #  <-------- single-threaded GPT requests ---------->
    gpt_response_collection = []
    for element in inputs_array:
        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
            inputs=element[0],             # the question, as seen by the LLM
            inputs_show_user=element[0],   # the question shown to the user (verbose details can be hidden)
            llm_kwargs=llm_kwargs,         # internal LLM parameters
            chatbot=chatbot,               # chat window handle, passed through as-is
            history=[],                    # previous chat content; only needed when it contains information worth extracting
            sys_prompt=sys_prompt
            )
        gpt_response_collection.append([gpt_say, element[1]])
    return gpt_response_collection
    

def return_note_from_nodes(nodes, trans_res_list):
    # Walk the AST again; for each chars node, replace the recorded pos in
    # trans_res_list with the original text, yielding [translation, original] pairs
    if isinstance(nodes, list):
        for node in nodes:
            return_note_from_nodes(node, trans_res_list)
    elif isinstance(nodes, LatexEnvironmentNode) or isinstance(nodes, LatexGroupNode):
        for child in nodes.nodelist:
            return_note_from_nodes(child, trans_res_list)
    elif isinstance(nodes, LatexMacroNode):
        if nodes.macroname in white_macroname_list:
            return_note_from_nodes(nodes.nodeargd, trans_res_list)
    elif isinstance(nodes, ParsedArguments):
        return_note_from_nodes(nodes.argnlist, trans_res_list)
    elif isinstance(nodes, LatexCharsNode):
        for i in trans_res_list:
            if i[1] == nodes.pos:
                i[1] = nodes.chars

def ProcessLaTeXMain(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]):
    import time, os, re
    from .latex_actions import LatexPaperFileGroup, LatexPaperSplit

    #  <-------- locate the main tex file ---------->
    maintex = find_main_tex_file(file_manifest, mode)
    chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。'))
    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
    time.sleep(3)

    #  <-------- read the Latex files and merge the multi-file tex project into one giant tex ---------->
    main_tex_basename = os.path.basename(maintex)
    assert main_tex_basename.endswith('.tex')
    main_tex_basename_bare = main_tex_basename[:-4]
    may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl')
    if os.path.exists(may_exist_bbl):
        shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl'))
        shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl'))
        shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl'))

    with open(maintex, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
        merged_content = merge_tex_files(project_folder, content, mode)

    with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f:
        f.write(merged_content)
    # Read the merged file back in
    with open(project_folder + '/merge.tex', 'r', encoding='utf-8') as f:
        latex_code = f.read()
    # Parse the LaTeX source
    nodes, _, _ = parse_latex(latex_code)
    trnode = list(nodes)
    res_list = []
    get_note_from_nodes(beginning_of_document(trnode), res_list)
    res_list = sorted(res_list, key=lambda x: x[2])
    res_list = [x for x in res_list if x[0] != ""]
    words = []
    split_res_list = split_text_to_words(res_list, words)
    finally_counted_res = split_counted_res(split_res_list)
    finally_combine_counted_res = combine_by_pos(finally_counted_res)
    gpttranres = yield from translate_func(finally_combine_counted_res, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[])
    chatbot.append(("翻译结果", str(gpttranres)))
    trans_res_list = list(itertools.chain.from_iterable(gpttranres))
    # Write the translated text back against the AST records
    return_note_from_nodes(trnode, trans_res_list)
    with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f:
        # NOTE: unfinished -- rebuilding LaTeX from the modified AST is not yet
        # implemented, so for now only the raw translation pairs are dumped
        f.write(str(trans_res_list))
    #  <-------- tidy up the results and exit ---------->
    chatbot.append((f"完成了吗?", 'GPT结果已输出, 即将编译PDF'))
    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
    #  <-------- return ---------->
    return project_folder + f'/merge_{mode}.tex'

'crazy_functions/TeX论文翻译'

import os
from toolbox import update_ui, trimmed_format_exc, get_conf, objdump, objload, promote_file_to_downloadzone
from toolbox import CatchException, report_execption, update_ui_lastest_msg, zip_result, gen_time_str
from functools import partial
from pylatexenc.latexnodes import *
from pylatexenc.latexwalker import *
from .latex_fns.latex_parser_ver import *
import glob, os, requests, time
pj = os.path.join
def import_requirements(chatbot, history, txt):
    try:
        from pylatexenc.latexwalker import LatexWalker, LatexCharsNode # try importing the dependency
    except ImportError:
        # if pylatexenc is missing, suggest how to install it
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pylatexenc```。")
        yield chatbot, history, '正常'
        return
ARXIV_CACHE_DIR = os.path.expanduser("~/arxiv_cache/")

# =================================== utility functions ===============================================
专业词汇声明 = 'If the term "agent" is used in this section, it should be translated to "智能体". '
def switch_prompt(translate_list, mode, more_requirement):
    """
    Generate user prompts and system prompts based on the mode, for proofreading or translating.
    Args:
    - translate_list: A list of text fragments to process.
    - mode: A string specifying the mode, either 'proofread_en' or 'translate_zh'.
    - more_requirement: Extra requirements appended to each prompt (e.g. terminology rules).

    Returns:
    - inputs_array: A list of prompt strings for the model to respond to.
    - sys_prompt_array: A list of system prompt strings.
    """
    n_split = len(translate_list)
    if mode == 'proofread_en':
        inputs_array = [r"Below is a section from an academic paper, proofread this section. " +
                        r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + more_requirement +
                        r"Answer me only with the revised text:" +
                        f"\n\n{frag}" for frag in translate_list]
        sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)]
    elif mode == 'translate_zh':
        inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese. " + more_requirement +
                        r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " +
                        r"Answer me only with the translated text:" +
                        f"\n\n{frag}" for frag in translate_list]
        sys_prompt_array = ["You are a professional translator." for _ in range(n_split)]
    else:
        assert False, "未知指令"
    return inputs_array, sys_prompt_array

def desend_to_extracted_folder_if_exist(project_folder):
    """ 
    Descend into the extracted folder if it exists, otherwise return the original folder.

    Args:
    - project_folder: A string specifying the folder path.

    Returns:
    - A string specifying the path to the extracted folder, or the original folder if there is no extracted folder.
    """
    maybe_dir = [f for f in glob.glob(f'{project_folder}/*') if os.path.isdir(f)]
    if len(maybe_dir) == 0: return project_folder
    if maybe_dir[0].endswith('.extract'): return maybe_dir[0]
    return project_folder

def move_project(project_folder, arxiv_id=None):
    """ 
    Create a new work folder and copy the project folder to it.

    Args:
    - project_folder: A string specifying the folder path of the project.

    Returns:
    - A string specifying the path to the new work folder.
    """
    import shutil, time
    time.sleep(2)   # avoid time string conflict
    if arxiv_id is not None:
        new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder')
    else:
        new_workfolder = f'gpt_log/{gen_time_str()}'
    shutil.rmtree(new_workfolder, ignore_errors=True)

    # align subfolder if there is a folder wrapper
    items = glob.glob(pj(project_folder,'*'))
    if len(glob.glob(pj(project_folder,'*.tex'))) == 0 and len(items) == 1:
        if os.path.isdir(items[0]): project_folder = items[0]

    shutil.copytree(src=project_folder, dst=new_workfolder)
    return new_workfolder

def arxiv_download(chatbot, history, txt):
    def check_cached_translation_pdf(arxiv_id):
        translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation')
        if not os.path.exists(translation_dir):
            os.makedirs(translation_dir)
        target_file = pj(translation_dir, 'translate_zh.pdf')
        if os.path.exists(target_file):
            promote_file_to_downloadzone(target_file, rename_file=None, chatbot=chatbot)
            return target_file
        return False
    def is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False
    if ('.' in txt) and ('/' not in txt) and is_float(txt): # is arxiv ID
        txt = 'https://arxiv.org/abs/' + txt.strip()
    if ('.' in txt) and ('/' not in txt) and is_float(txt[:10]): # is arxiv ID
        txt = 'https://arxiv.org/abs/' + txt[:10]
    if not txt.startswith('https://arxiv.org'): 
        return txt, None
    
    # <-------------- inspect format ------------->
    chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...']) 
    yield from update_ui(chatbot=chatbot, history=history)
    time.sleep(1) # 刷新界面

    url_ = txt   # https://arxiv.org/abs/1707.06690
    if not txt.startswith('https://arxiv.org/abs/'): 
        msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}"
        yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # refresh the UI
        return msg, None
    # <-------------- set format ------------->
    arxiv_id = url_.split('/abs/')[-1]
    if 'v' in arxiv_id: arxiv_id = arxiv_id[:10]
    cached_translation_pdf = check_cached_translation_pdf(arxiv_id)
    if cached_translation_pdf: return cached_translation_pdf, arxiv_id

    url_tar = url_.replace('/abs/', '/e-print/')
    translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print')
    extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract')
    os.makedirs(translation_dir, exist_ok=True)
    
    # <-------------- download arxiv source file ------------->
    dst = pj(translation_dir, arxiv_id+'.tar')
    if os.path.exists(dst):
        yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history)  # 刷新界面
    else:
        yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history)  # 刷新界面
        proxies, = get_conf('proxies')
        r = requests.get(url_tar, proxies=proxies)
        with open(dst, 'wb+') as f:
            f.write(r.content)
    # <-------------- extract file ------------->
    yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history)  # 刷新界面
    from toolbox import extract_archive
    extract_archive(file_path=dst, dest_dir=extract_dst)
    return extract_dst, arxiv_id
# ========================================= plugin main program =====================================================
@CatchException
def 更好的Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    # <-------------- information about this plugin ------------->
    chatbot.append([
        "函数插件功能?",
        "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky, azwphy。"])
    yield from update_ui(chatbot=chatbot, history=history) # refresh the UI

    # <-------------- more requirements ------------->
    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
    more_req = plugin_kwargs.get("advanced_arg", "")
    _switch_prompt_ = partial(switch_prompt, more_requirement=more_req)

    # <-------------- check deps ------------->
    yield from import_requirements(chatbot, history, txt)
    try:
        import glob, os, time, subprocess, pylatexenc, itertools
        subprocess.Popen(['pdflatex', '-version'])
    except Exception as e:
        chatbot.append([ f"解析项目: {txt}",
            f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return
    # <-------------- clear history and read input ------------->
    txt, arxiv_id = yield from arxiv_download(chatbot, history, txt)
    if txt.endswith('.pdf'):
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return
    

    # <-------------- clear history and read input ------------->
    history = []
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return
    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
    if len(file_manifest) == 0:
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history) # refresh the UI
        return

    # <-------------- if is a zip/tar file ------------->
    project_folder = desend_to_extracted_folder_if_exist(project_folder)


    # <-------------- move latex project away from temp folder ------------->
    project_folder = move_project(project_folder, arxiv_id=None)


    # <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
    if not os.path.exists(project_folder + '/merge_translate_zh.tex'):
        res_test_file = yield from ProcessLaTeXMain(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
    else:
        res_test_file = project_folder + '/merge_translate_zh.tex'

    promote_file_to_downloadzone(file=res_test_file, chatbot=chatbot)
    # # <-------------- compile PDF ------------->
    # success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread_en', 
    #                          work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
    

    # <-------------- zip PDF ------------->
    # zip_res = zip_result(project_folder)
    # if success:
    #     chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
    #     yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # refresh the UI
    #     promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
    # else:
    #     chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
    #     yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # refresh the UI
    #     promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)

    # # <-------------- we are done ------------->
    # return success

Please note that part of the code is unfinished. The main problems I ran into:

Every word carries a pos (position) value, so the translated fragments must carry the same position information. For example, \emph{} used for emphasis splits a sentence into three parts. To make sure the translated text can still be emphasized in the right place, each fragment of the sentence needs its own translation (rather than one translation of the whole passage). To keep the context coherent, I tried the following approach: split the text into chunks by a token limit, with each chunk ending on a period; give the whole passage to GLM and ask it to translate only one specific fragment, repeating this for every fragment. But no matter how I write the prompt, GLM never seems to understand my intent, so I am opening this issue to ask for help.
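
For what it's worth, a hedged sketch of one way to phrase that "translate only this fragment" request (the tag convention is purely hypothetical, and GLM is not guaranteed to honor it):

def build_fragment_prompt(fragments, target_index):
    # fragments: the pieces of one sentence, split apart by formatting macros
    context = " ".join(fragments)
    target = fragments[target_index]
    return (
        "Below is a sentence from an academic paper, split into fragments by "
        "formatting commands. The full sentence, for context only:\n"
        f"{context}\n\n"
        "Translate ONLY the fragment between the <frag> tags into Chinese. "
        "Answer with nothing but the translated fragment:\n"
        f"<frag>{target}</frag>"
    )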

Finally, one more problem with the pylatexenc package: it does not support rebuilding LaTeX code from modifications made to the AST. The alternative I can think of is to replace the collected English text spans with the corresponding Chinese text, one by one in source order. That approach should be workable; the main remaining problem is still how to translate coherently.
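
A minimal sketch of that fallback, assuming each collected entry keeps its source offset: splice the translations back into the original string from right to left, so earlier offsets stay valid (the (pos, original, translated) triples are assumed to come from the AST walk above):

def splice_translations(latex_code, pieces):
    # pieces: (pos, original_text, translated_text) triples collected from the AST
    for pos, original, translated in sorted(pieces, key=lambda p: p[0], reverse=True):
        assert latex_code[pos:pos + len(original)] == original
        latex_code = latex_code[:pos] + translated + latex_code[pos + len(original):]
    return latex_code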

@binary-husky
Owner

Cases like AAAAA\emph{BBBBBBB}CCCCCCCCC are indeed not easy to handle.

@binary-husky
Owner

binary-husky commented Aug 13, 2023

An idea I had earlier was to use pandoc to build the AST; code written that way could be reused for Word, PPT, and other document formats.

But there are too many difficult problems to solve, and it would take a lot of time, so it has to be put on hold for now.
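
For reference, a rough sketch of that pandoc route (assuming pandoc is on PATH; the actual translation pass over the AST's "Str" nodes is left out):

import json, subprocess

def latex_to_ast(tex_source):
    # LaTeX -> pandoc's JSON AST
    out = subprocess.run(["pandoc", "-f", "latex", "-t", "json"],
                         input=tex_source, capture_output=True, text=True, check=True)
    return json.loads(out.stdout)

def ast_to_latex(ast):
    # pandoc's JSON AST -> LaTeX
    out = subprocess.run(["pandoc", "-f", "json", "-t", "latex"],
                         input=json.dumps(ast), capture_output=True, text=True, check=True)
    return out.stdout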

@azwphy
Author

azwphy commented Aug 16, 2023

An idea I had earlier was to use pandoc to build the AST; code written that way could be reused for Word, PPT, and other document formats.

But there are too many difficult problems to solve, and it would take a lot of time, so it has to be put on hold for now.

pandoc still feels somewhat limited: converting LaTeX to Markdown is easy, but converting the translation back to LaTeX is hard.
If only the translated result is needed, though, perhaps pandoc would do?

@reonokiy
Contributor

reonokiy commented Sep 9, 2023

My idea is to use the AST to split the document into paragraph-sized chunks (below GPT's length limit), hand each chunk directly to GPT, then validate the LaTeX syntax of the output, and fall back to plain-text mode when it is invalid.
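
A rough sketch of that validate-then-fallback loop, using pylatexenc's non-tolerant parser as a cheap syntax check (the translate and strip_to_plain_text callables are hypothetical placeholders):

from pylatexenc.latexwalker import LatexWalker, LatexWalkerParseError

def is_valid_latex(fragment):
    try:
        LatexWalker(fragment, tolerant_parsing=False).get_latex_nodes()
        return True
    except LatexWalkerParseError:
        return False

def translate_chunk(chunk, translate, strip_to_plain_text):
    out = translate(chunk)  # hand the whole chunk, LaTeX commands included, to the model
    if is_valid_latex(out):
        return out
    return translate(strip_to_plain_text(chunk))  # fall back to plain-text mode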

Here is an example of me directly steering GPT's output; I am not sure whether it meets the requirements:

Latex翻译专家

In this example, only the first passage was typed by me; for the following ones it recognized the LaTeX commands in each passage and left them untranslated (notably, it translated the contents of emph but left cite alone).

Another example uses few-shot prompting; the idea is to steer GPT into outputting JSON directly:

词典API

In that example, the first instruction and the replies for the two words hello and meaning were written by me by hand. When I then entered further English words, it correctly replied in JSON format.
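
A sketch of that few-shot setup in chat-message form (the exact wording and JSON fields are hypothetical; only the pattern matters):

few_shot_messages = [
    {"role": "user", "content": 'Reply only in JSON of the form {"word": ..., "translation": ...}. Word: hello'},
    {"role": "assistant", "content": '{"word": "hello", "translation": "你好"}'},
    {"role": "user", "content": "Word: meaning"},
    {"role": "assistant", "content": '{"word": "meaning", "translation": "含义"}'},
    {"role": "user", "content": "Word: gradient"},  # a new query then follows the established pattern
]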

Related links:
