Skip to content

Commit

Permalink
Merge pull request #881 from OverKit/master
Browse files Browse the repository at this point in the history
update latex_utils.py
  • Loading branch information
binary-husky committed Jun 27, 2023
2 parents ea624b1 + e18bef2 commit 601a95c
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 53 deletions.
6 changes: 4 additions & 2 deletions crazy_functions/crazy_functions_test.py
Expand Up @@ -190,9 +190,11 @@ def test_Latex():
# txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE
# txt = r"https://arxiv.org/abs/2002.09253"
# txt = r"https://arxiv.org/abs/2306.07831"
# txt = r"https://arxiv.org/abs/2212.10156"
txt = r"https://arxiv.org/abs/2212.10156"
# txt = r"https://arxiv.org/abs/2211.11559"
txt = r"https://arxiv.org/abs/2303.08774"
# txt = r"https://arxiv.org/abs/2303.08774"
# txt = r"https://arxiv.org/abs/2303.12712"
# txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder"


for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
Expand Down
112 changes: 61 additions & 51 deletions crazy_functions/latex_utils.py
Expand Up @@ -8,24 +8,31 @@
"""
========================================================================
Part One
Latex segmentation to a linklist
Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1)
========================================================================
"""
PRESERVE = 0
TRANSFORM = 1

def split_worker(text, mask, pattern, flags=0):
def set_forbidden_text(text, mask, pattern, flags=0):
"""
Add a preserve text area in this paper
e.g. with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}"
you can mask out (mask = PRESERVE so that text become untouchable for GPT)
everything between "\begin{equation}" and "\end{equation}"
"""
if isinstance(pattern, list): pattern = '|'.join(pattern)
pattern_compile = re.compile(pattern, flags)
for res in pattern_compile.finditer(text):
mask[res.span()[0]:res.span()[1]] = PRESERVE
return text, mask

def split_worker_careful_brace(text, mask, pattern, flags=0):
def set_forbidden_text_careful_brace(text, mask, pattern, flags=0):
"""
Move area into preserve area
Add a preserve text area in this paper (text become untouchable for GPT).
count the number of the braces so as to catch compelete text area.
e.g.
\caption{blablablablabla\texbf{blablabla}blablabla.}
"""
pattern_compile = re.compile(pattern, flags)
for res in pattern_compile.finditer(text):
Expand All @@ -40,9 +47,12 @@ def split_worker_careful_brace(text, mask, pattern, flags=0):
mask[begin:end] = PRESERVE
return text, mask

def split_worker_reverse_careful_brace(text, mask, pattern, flags=0):
def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True):
"""
Move area out of preserve area
Move area out of preserve area (make text editable for GPT)
count the number of the braces so as to catch compelete text area.
e.g.
\caption{blablablablabla\texbf{blablabla}blablabla.}
"""
pattern_compile = re.compile(pattern, flags)
for res in pattern_compile.finditer(text):
Expand All @@ -55,9 +65,12 @@ def split_worker_reverse_careful_brace(text, mask, pattern, flags=0):
p += 1
end = p
mask[begin:end] = TRANSFORM
if forbid_wrapper:
mask[res.regs[0][0]:begin] = PRESERVE
mask[end:res.regs[0][1]] = PRESERVE
return text, mask

def split_worker_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
"""
Find all \begin{} ... \end{} text block that with less than limit_n_lines lines.
Add it to preserve area
Expand Down Expand Up @@ -154,6 +167,7 @@ def rm_comments(main_file):
else:
new_file_remove_comment_lines.append(l)
main_file = '\n'.join(new_file_remove_comment_lines)
# main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令
main_file = re.sub(r'(?<!\\)%.*', '', main_file) # 使用正则表达式查找半行注释, 并替换为空字符串
return main_file

Expand Down Expand Up @@ -200,9 +214,11 @@ def merge_tex_files(project_foler, main_file, mode):
main_file = re.sub(r"\\documentclass\[(.*?)\]{(.*?)}", r"\\documentclass[\1,fontset=windows,UTF8]{\2}",main_file)
main_file = re.sub(r"\\documentclass{(.*?)}", r"\\documentclass[fontset=windows,UTF8]{\1}",main_file)
# find paper abstract
pattern = re.compile(r'\\begin\{abstract\}.*\n')
match = pattern.search(main_file)
assert match is not None, "Cannot find paper abstract section!"
pattern_opt1 = re.compile(r'\\begin\{abstract\}.*\n')
pattern_opt2 = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
match_opt1 = pattern_opt1.search(main_file)
match_opt2 = pattern_opt2.search(main_file)
assert (match_opt1 is not None) or (match_opt2 is not None), "Cannot find paper abstract section!"
return main_file


Expand Down Expand Up @@ -283,46 +299,33 @@ def split_subprocess(txt, project_folder, return_dict, opts):
mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM

# 吸收title与作者以上的部分
text, mask = split_worker(text, mask, r"(.*?)\\maketitle", re.DOTALL)
# 删除iffalse注释
text, mask = split_worker(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
# 吸收在25行以内的begin-end组合
text, mask = split_worker_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25)
text, mask = set_forbidden_text(text, mask, r"(.*?)\\maketitle", re.DOTALL)
# 吸收iffalse注释
text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
# 吸收在42行以内的begin-end组合
text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42)
# 吸收匿名公式
text, mask = split_worker(text, mask, r"\$\$(.*?)\$\$", re.DOTALL)
text, mask = set_forbidden_text(text, mask, [ r"\$\$(.*?)\$\$", r"\\\[.*?\\\]" ], re.DOTALL)
# 吸收其他杂项
text, mask = split_worker(text, mask, r"\\section\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\section\*\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\subsection\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\subsubsection\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\bibliography\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\bibliographystyle\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{figure\}(.*?)\\end\{figure\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{multline\}(.*?)\\end\{multline\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{table\}(.*?)\\end\{table\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{table\*\}(.*?)\\end\{table\*\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{minipage\}(.*?)\\end\{minipage\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{align\*\}(.*?)\\end\{align\*\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{align\}(.*?)\\end\{align\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{equation\}(.*?)\\end\{equation\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL)
text, mask = split_worker(text, mask, r"\\includepdf\[(.*?)\]\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\item ")
text, mask = split_worker(text, mask, r"\\label\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\begin\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\vspace\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\hspace\{(.*?)\}")
text, mask = split_worker(text, mask, r"\\end\{(.*?)\}")
text, mask = split_worker_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
text, mask = split_worker_reverse_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, [ r"\\section\{(.*?)\}", r"\\section\*\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ])
text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ])
text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\begin\{figure\}(.*?)\\end\{figure\}", r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\begin\{multline\}(.*?)\\end\{multline\}", r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\begin\{table\}(.*?)\\end\{table\}", r"\\begin\{table\*\}(.*?)\\end\{table\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\begin\{minipage\}(.*?)\\end\{minipage\}", r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\begin\{align\*\}(.*?)\\end\{align\*\}", r"\\begin\{align\}(.*?)\\end\{align\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\begin\{equation\}(.*?)\\end\{equation\}", r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}"], re.DOTALL)
text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"])
text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}", r"\\item "])
text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
# reverse 操作必须放在最后
text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\abstract\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
root = convert_to_linklist(text, mask)

# 修复括号
Expand Down Expand Up @@ -396,7 +399,7 @@ def break_check(string):
prev_node = node
node = node.next
if node is None: break

# 输出html调试文件,用红色标注处保留区(PRESERVE),用黑色标注转换区(TRANSFORM)
with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
segment_parts_for_gpt = []
nodes = []
Expand Down Expand Up @@ -448,8 +451,14 @@ def merge_result(self, arr, mode, msg):
if mode == 'translate_zh':
pattern = re.compile(r'\\begin\{abstract\}.*\n')
match = pattern.search(result_string)
assert match is not None, "Cannot find paper abstract section!"
position = match.end()
if not match:
# match \abstract{xxxx}
pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
match = pattern_compile.search(result_string)
position = match.regs[1][0]
else:
# match \begin{abstract}xxxx\end{abstract}
position = match.end()
result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
return result_string

Expand All @@ -468,6 +477,7 @@ def split(self, txt, project_folder, opts):
args=(txt, project_folder, return_dict, opts))
p.start()
p.join()
p.close()
self.nodes = return_dict['nodes']
self.sp = return_dict['segment_parts_for_gpt']
return self.sp
Expand Down

0 comments on commit 601a95c

Please sign in to comment.