In [2]:
from typing import List, Dict
from autocoder.common import AutoCoderArgs, SourceCode
from byzerllm.utils.client.code_utils import extract_code
import json
from loguru import logger

import byzerllm
from byzerllm import ByzerLLM

# Initialize ByzerLLM
llm = ByzerLLM.from_default_model("deepseek_chat")

@byzerllm.prompt()
def extract_relevance_range_from_docs_with_conversation(
    conversations: List[Dict[str, str]], documents: List[str]
) -> str:
    """
    根据提供的文档和对话历史提取相关信息范围。

    输入:
    1. 文档内容:
    {% for doc in documents %}
    {{ doc }}
    {% endfor %}

    2. 对话历史:
    {% for msg in conversations %}
    <{{ msg.role }}>: {{ msg.content }}
    {% endfor %}

    任务:
    1. 分析最后一个用户问题及其上下文。
    2. 在文档中找出与问题相关的一个或多个重要信息段。
    3. 对每个相关信息段，确定其起始行号(start_line)和结束行号(end_line)。
    4. 信息段数量不超过4个。

    输出要求:
    1. 返回一个JSON数组，每个元素包含"start_line"和"end_line"。
    2. start_line和end_line必须是整数，表示文档中的行号。
    3. 行号从1开始计数。
    4. 如果没有相关信息，返回空数组[]。

    输出格式:
    严格的JSON数组，不包含其他文字或解释。

    示例:
    1.  文档：
        1 这是这篇动物科普文。
        2 大象是陆地上最大的动物之一。
        3 它们生活在非洲和亚洲。
        问题：大象生活在哪里？
        返回：[{"start_line": 2, "end_line": 3}]

    2.  文档：
        1 地球是太阳系第三行星，
        2 有海洋、沙漠，温度适宜，
        3 是已知唯一有生命的星球。
        4 太阳则是太阳系的唯一恒心。
        问题：地球的特点是什么？
        返回：[{"start_line": 1, "end_line": 3}]

    3.  文档：
        1 苹果富含维生素。
        2 香蕉含有大量钾元素。
        问题：橙子的特点是什么？
        返回：[]        
    """  

# result = extract_relevance_range_from_docs_with_conversation.with_llm(llm).run(conversations=conversations, documents=documents)


# {"conversation":conversations, "doc":[doc.source_code]}
conversations = None
documents = None
with open("/tmp/rag.json", "r") as f:
    lines = f.read().split("\n")
    for i, line in enumerate(lines):
        if line:
            v = json.loads(line)
            conversations = v["conversation"]
            documents = v["doc"]

def process_range_doc(doc, max_retries=3):
    for attempt in range(max_retries):
        content = ""
        try:
            source_code_with_line_number = ""          
            source_code_lines = doc.source_code.split("\n")  
            for idx,line in enumerate(source_code_lines):
                source_code_with_line_number += f"{idx+1} {line}\n"                

            extracted_info = extract_relevance_range_from_docs_with_conversation.with_llm(
                llm
            ).run(
                conversations, [source_code_with_line_number]
            )
                                                                        
            json_str = extract_code(extracted_info)[0][1]

            print(json_str)
            json_objs = json.loads(json_str)                                    
                                                
            for json_obj in json_objs:
                start_line = json_obj["start_line"] - 1
                end_line = json_obj["end_line"] - 1 
                chunk = "\n".join(source_code_lines[start_line:end_line])
                content +=  chunk + "\n"
                print(f"{start_line} - {end_line} : {chunk}")
            
            return SourceCode(
                module_name=doc.module_name, source_code=content.strip()
            )
        except Exception as e:
            e.print_exc() 
            if attempt < max_retries - 1:
                logger.warning(f"Error processing doc {doc.module_name}, retrying... (Attempt {attempt + 1}) attempts: {str(e)}")                                                       
            else:
                logger.error(f"Failed to process doc {doc.module_name} after {max_retries} attempts: {str(e)}")                
                return SourceCode(
                module_name=doc.module_name, source_code=content.strip()
            )

# print(documents[0])
m = process_range_doc(SourceCode(
                module_name="test", source_code=documents[0]
            ))


print(m.source_code)


[32m2024-08-29 10:55:36.486[0m | [1mINFO    [0m | [36mbyzerllm.utils.connect_ray[0m:[36mconnect_cluster[0m:[36m48[0m - [1mJDK 21 will be used (/Users/allwefantasy/.auto-coder/jdk-21.0.2.jdk/Contents/Home)...[0m
2024-08-29 10:55:36,529	INFO worker.py:1564 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...
2024-08-29 10:55:36,530	INFO worker.py:1582 -- Calling ray.init() again after it has already been called.


[
    {"start_line": 21, "end_line": 22},
    {"start_line": 29, "end_line": 36},
    {"start_line": 49, "end_line": 50},
    {"start_line": 52, "end_line": 54}
]
20 - 21 : 文本:GMP基本要求
28 - 35 : "2、机构与人员","5条","3、机构与人员","4节22条"
"3、厂房与设施","23条","4、厂房与设施","5节33条"
"4、设备","7条","5、设备","6节31条"
"5、物料","10条","6、物料与产品","7节36条"
"6、卫生","9条","",""
"7、验证","4条","7、确认与验证","12条"
"8、文件","5条","8、文件管理","6节34条"
48 - 49 : 文本:药品GMP基本要求
51 - 53 : #/Users/allwefantasy/Downloads/docs/新版GMP解读.pptx#395
文本:主要修订内容
文本:GMP基本要求
"2、机构与人员","5条","3、机构与人员","4节22条"
"3、厂房与设施","23条","4、厂房与设施","5节33条"
"4、设备","7条","5、设备","6节31条"
"5、物料","10条","6、物料与产品","7节36条"
"6、卫生","9条","",""
"7、验证","4条","7、确认与验证","12条"
"8、文件","5条","8、文件管理","6节34条"
文本:药品GMP基本要求
#/Users/allwefantasy/Downloads/docs/新版GMP解读.pptx#395
文本:主要修订内容
