auto prompt lang

binary-husky · Jan 21, 2024 · b55d573 · b55d573
1 parent 06b0e80
commit b55d573
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 19 deletions.
diff --git a/core_functional.py b/core_functional.py
@@ -4,18 +4,27 @@
 import importlib
 from toolbox import clear_line_break
 from toolbox import build_gpt_academic_masked_string
+from toolbox import apply_gpt_academic_string_mask_langbased
+from toolbox import build_gpt_academic_masked_string_langbased
 from textwrap import dedent
 
 def get_core_functions():
     return {
 
-        "英语学术润色": {
-            # [1*] 前缀，会被加在你的输入之前。例如，用来描述你的要求，例如翻译、解释代码、润色等等
-            "Prefix":   r"Below is a paragraph from an academic paper. Polish the writing to meet the academic style, "
-                        r"improve the spelling, grammar, clarity, concision and overall readability. When necessary, rewrite the whole sentence. "
-                        r"Firstly, you should provide the polished paragraph. "
-                        r"Secondly, you should list all your modification and explain the reasons to do so in markdown table." + "\n\n",
-            # [2*] 后缀，会被加在你的输入之后。例如，配合前缀可以把你的输入内容用引号圈起来
+        "学术语料润色": {
+            # [1*] 前缀字符串，会被加在你的输入之前。例如，用来描述你的要求，例如翻译、解释代码、润色等等。
+            #      这里填一个提示词字符串就行了，这里为了区分中英文情景搞复杂了一点
+            "Prefix":   build_gpt_academic_masked_string_langbased(
+                            text_show_english=
+                                r"Below is a paragraph from an academic paper. Polish the writing to meet the academic style, "
+                                r"improve the spelling, grammar, clarity, concision and overall readability. When necessary, rewrite the whole sentence. "
+                                r"Firstly, you should provide the polished paragraph. "
+                                r"Secondly, you should list all your modification and explain the reasons to do so in markdown table.",
+                            text_show_chinese=
+                                r"作为一名中文学术论文写作改进助理，你的任务是改进所提供文本的拼写、语法、清晰、简洁和整体可读性，"
+                                r"同时分解长句，减少重复，并提供改进建议。请先提供文本的更正版本，然后在markdown表格中列出修改的内容，并给出修改的理由:"
+                        ) + "\n\n",
+            # [2*] 后缀字符串，会被加在你的输入之后。例如，配合前缀可以把你的输入内容用引号圈起来
             "Suffix":   r"",
             # [3] 按钮颜色 (可选参数，默认 secondary)
             "Color":    r"secondary",
@@ -33,6 +42,7 @@ def get_core_functions():
             "Prefix":   r"",
             # 后缀，会被加在你的输入之后。例如，配合前缀可以把你的输入内容用引号圈起来
             "Suffix":
+                # dedent() 函数用于去除多行字符串的缩进
                 dedent("\n"+f'''
                     ==============================
 
@@ -85,14 +95,22 @@ def get_core_functions():
 
 
         "学术英中互译": {
-            "Prefix":   r"I want you to act as a scientific English-Chinese translator, " +
-                        r"I will provide you with some paragraphs in one language " +
-                        r"and your task is to accurately and academically translate the paragraphs only into the other language. " +
-                        r"Do not repeat the original provided paragraphs after translation. " +
-                        r"You should use artificial intelligence tools, " +
-                        r"such as natural language processing, and rhetorical knowledge " +
-                        r"and experience about effective writing techniques to reply. " +
-                        r"I'll give you my paragraphs as follows, tell me what language it is written in, and then translate:" + "\n\n",
+            "Prefix":   build_gpt_academic_masked_string_langbased(
+                            text_show_chinese=
+                                r"I want you to act as a scientific English-Chinese translator, "
+                                r"I will provide you with some paragraphs in one language "
+                                r"and your task is to accurately and academically translate the paragraphs only into the other language. "
+                                r"Do not repeat the original provided paragraphs after translation. "
+                                r"You should use artificial intelligence tools, "
+                                r"such as natural language processing, and rhetorical knowledge "
+                                r"and experience about effective writing techniques to reply. "
+                                r"I'll give you my paragraphs as follows, tell me what language it is written in, and then translate:",
+                            text_show_english=
+                                r"你是经验丰富的翻译，请把以下学术文章段落翻译成中文，"
+                                r"并同时充分考虑中文的语法、清晰、简洁和整体可读性，"
+                                r"必要时，你可以修改整个句子的顺序以确保翻译后的段落符合中文的语言习惯。"
+                                r"你需要翻译的文本如下："
+                        ) + "\n\n",
             "Suffix":   r"",
         },
 
@@ -142,7 +160,11 @@ def handle_core_functionality(additional_fn, inputs, history, chatbot):
         if "PreProcess" in core_functional[additional_fn]:
             if core_functional[additional_fn]["PreProcess"] is not None:
                 inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数（如果有的话）
-        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
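+        # Add the prefix and suffix defined above, then resolve any language mask; the input without prefix/suffix serves as the language reference.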
+        inputs = apply_gpt_academic_string_mask_langbased(
+            string = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"],
+            lang_reference = inputs,
+        )
         if core_functional[additional_fn].get("AutoClearHistory", False):
             history = []
         return inputs, history

diff --git a/shared_utils/text_mask.py b/shared_utils/text_mask.py
@@ -10,18 +10,25 @@
 # - () 括号在正则表达式中表示捕获组。
 # - 在这个例子中，(.*?)表示捕获任意长度的文本，直到遇到括号外部最近的限定符，即</show_llm>和</show_render>。
 
-# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 const_extract_re = re.compile(
     r"<gpt_academic_string_mask><show_llm>(.*?)</show_llm><show_render>(.*?)</show_render></gpt_academic_string_mask>"
 )
-
+# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+const_extract_langbased_re = re.compile(
+    r"<gpt_academic_string_mask><lang_english>(.*?)</lang_english><lang_chinese>(.*?)</lang_chinese></gpt_academic_string_mask>",
+    flags=re.DOTALL,
+)
 
 @lru_cache(maxsize=128)
 def apply_gpt_academic_string_mask(string, mode="show_all"):
     """
-    根据字符串要给谁看（大模型，还是web渲染），对字符串进行处理，返回处理后的字符串
+    当字符串中有掩码tag时（<gpt_academic_string_mask><show_...>），根据字符串要给谁看（大模型，还是web渲染），对字符串进行处理，返回处理后的字符串
     示意图：https://mermaid.live/edit#pako:eNqlkUtLw0AUhf9KuOta0iaTplkIPlpduFJwoZEwJGNbzItpita2O6tF8QGKogXFtwu7cSHiq3-mk_oznFR8IYLgrGbuOd9hDrcCpmcR0GDW9ubNPKaBMDauuwI_A9M6YN-3y0bODwxsYos4BdMoBrTg5gwHF-d0mBH6-vqFQe58ed5m9XPW2uteX3Tubrj0ljLYcwxxR3h1zB43WeMs3G19yEM9uapDMe_NG9i2dagKw1Fee4c1D9nGEbtc-5n6HbNtJ8IyHOs8tbs7V2HrlDX2w2Y7XD_5haHEtQiNsOwfMVa_7TzsvrWIuJGo02qTrdwLk9gukQylHv3Afv1ML270s-HZUndrmW1tdA-WfvbM_jMFYuAQ6uCCxVdciTJ1CPLEITpo_GphypeouzXuw6XAmyi7JmgBLZEYlHwLB2S4gHMUO-9DH7tTnvf1CVoFFkBLSOk4QmlRTqpIlaWUHINyNFXjaQWpCYRURUKiWovBYo8X4ymEJFlECQUpqaQkJmuvWygPpg
     """
+    if "<gpt_academic_string_mask>" not in string: # No need to process
+        return string
+
     if mode == "show_all":
         return string
     if mode == "show_llm":
@@ -41,6 +48,50 @@ def build_gpt_academic_masked_string(text_show_llm="", text_show_render=""):
     return f"<gpt_academic_string_mask><show_llm>{text_show_llm}</show_llm><show_render>{text_show_render}</show_render></gpt_academic_string_mask>"
 
 
+@lru_cache(maxsize=128)
+def apply_gpt_academic_string_mask_langbased(string, lang_reference):
+    """
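+    When the string contains a mask tag (<gpt_academic_string_mask><lang_...>), keep only the prompt variant that matches the language of lang_reference and return the resulting string.
+    lang_reference counts as Chinese if it contains at least one character in U+4E00-U+9FFF (then only the Chinese variant is kept); otherwise it counts as English and only the English variant is kept.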
+    举例：
+        输入1
+            string = "注意，lang_reference这段文字是：<gpt_academic_string_mask><lang_english>英语</lang_english><lang_chinese>中文</lang_chinese></gpt_academic_string_mask>"
+            lang_reference = "hello world"
+        输出1
+            "注意，lang_reference这段文字是：英语"
+            
+        输入2
+            string = "注意，lang_reference这段文字是中文"   # 注意这里没有掩码tag，所以不会被处理
+            lang_reference = "hello world"
+        输出2
+            "注意，lang_reference这段文字是中文"            # 原样返回
+    """
+
+    if "<gpt_academic_string_mask>" not in string: # No need to process
+        return string
+
+    def contains_chinese(string):
+        chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
+        return chinese_regex.search(string) is not None
+
+    mode = "english" if not contains_chinese(lang_reference) else "chinese"
+    if mode == "english":
+        string = const_extract_langbased_re.sub(r"\1", string)
+    elif mode == "chinese":
+        string = const_extract_langbased_re.sub(r"\2", string)
+    else:
+        raise ValueError("Invalid mode")
+    return string
+
+
+@lru_cache(maxsize=128)
+def build_gpt_academic_masked_string_langbased(text_show_english="", text_show_chinese=""):
+    """
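+    Build a language-masked string: wrap the English and the Chinese prompt variants in gpt_academic_string_mask / lang_english / lang_chinese tags. No language is selected here; apply_gpt_academic_string_mask_langbased later keeps one of the two variants.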
+    """
+    return f"<gpt_academic_string_mask><lang_english>{text_show_english}</lang_english><lang_chinese>{text_show_chinese}</lang_chinese></gpt_academic_string_mask>"
+
+
 if __name__ == "__main__":
     # Test
     input_string = (

diff --git a/toolbox.py b/toolbox.py
@@ -22,6 +22,8 @@
 from shared_utils.connect_void_terminal import get_chat_default_kwargs
 from shared_utils.text_mask import apply_gpt_academic_string_mask
 from shared_utils.text_mask import build_gpt_academic_masked_string
+from shared_utils.text_mask import apply_gpt_academic_string_mask_langbased
+from shared_utils.text_mask import build_gpt_academic_masked_string_langbased
 
 pj = os.path.join
 default_user_name = "default_user"