In [40]:
import fitz  # PyMuPDF
import os
import re
import tempfile
import camelot
import pandas as pd
from pdf2image import convert_from_path
from typing import List, Optional
def pdf_to_markdown(pdf_path, output_md, image_dir, dpi=500, table_flavor='lattice'):
    """Convert a PDF to Markdown, emitting one "# Page N" section per page.

    Per page, in this order: embedded images are written to ``image_dir`` and
    linked; tables are extracted (Camelot first, pdfplumber as fallback) and
    rendered as fenced Markdown tables; the page text is emitted with simple
    heading / list heuristics.

    Args:
        pdf_path: Path of the PDF to convert.
        output_md: Path of the UTF-8 Markdown file to write.
        image_dir: Directory receiving extracted images (created if missing).
        dpi: Kept only for backward compatibility. Earlier revisions rasterised
            every page at this resolution but never used the result, so the
            rendering step was removed.
        table_flavor: Camelot flavor passed to ``camelot.read_pdf``.

    Returns:
        The Markdown text that was written to ``output_md``.
    """
    # Initialize
    os.makedirs(image_dir, exist_ok=True)
    markdown_content: List[str] = []
    image_count = 0

    # "1." / "2)" list markers. The negative lookahead keeps decimals and
    # section numbers ("3.14", "3.1 Title") from being treated as list items.
    numbered_item = re.compile(r'^\d+[\.\)](?!\d)\s*(.*)$')

    def extract_tables_camelot(pdf_path: str, page_num: int) -> Optional[List[pd.DataFrame]]:
        """Extract the tables of one 1-based page with Camelot; None on failure or no tables."""
        try:
            tables = camelot.read_pdf(
                pdf_path,
                pages=str(page_num),
                flavor=table_flavor,
                suppress_stdout=True,
                strip_text='\n',
                backend='poppler'  # Ensure poppler is installed
            )
            return [t.df for t in tables] if tables else None
        except Exception as e:
            print(f"⚠️ Camelot failed on page {page_num}: {str(e)}")
            return None

    def extract_tables_fallback(page_num: int) -> Optional[List[pd.DataFrame]]:
        """Fallback extraction of one 1-based page with pdfplumber.

        The page number is an explicit argument; the previous version took an
        unused ``page`` argument and silently read the enclosing loop variable.
        """
        try:
            import pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                tables = pdf.pages[page_num - 1].extract_tables()
                # First row of each extracted table is used as the header.
                return [pd.DataFrame(t[1:], columns=t[0]) for t in tables if t]
        except ImportError:
            print("ℹ️ Install pdfplumber for fallback table extraction: pip install pdfplumber")
            return None
        except Exception as e:
            print(f"⚠️ Fallback extraction failed: {str(e)}")
            return None

    # Main processing. Iterate the document directly so that every page is
    # converted; zipping with a separately rendered image list could drop pages.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Page header
            markdown_content.append(f"\n\n# Page {page_num}\n\n")

            # 1. Image extraction
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_filename = f"{image_dir}/image_{page_num}_{img_index}.{base_image['ext']}"
                    with open(image_filename, "wb") as f:
                        f.write(base_image["image"])
                    markdown_content.append(
                        f"![Image {image_count}]({image_filename}) "
                        f"{{width={base_image['width']}px, height={base_image['height']}px}}\n\n"
                    )
                    image_count += 1
                except Exception as e:
                    print(f"⚠️ Image {img_index} extraction failed: {str(e)}")

            # 2. Table extraction (try camelot first, then fallback)
            tables = extract_tables_camelot(pdf_path, page_num) or \
                    extract_tables_fallback(page_num)

            if tables:
                markdown_content.append(f"## 📊 Tables (Page {page_num})\n\n")
                for i, df in enumerate(tables, 1):
                    try:
                        md_table = df.to_markdown(index=False)
                        markdown_content.append(f"### Table {i}\n```markdown\n{md_table}\n```\n\n")
                    except Exception as e:
                        print(f"⚠️ Table {i} formatting failed: {str(e)}")
                        markdown_content.append(f"### Table {i}\n[Table content could not be converted]\n\n")

            # 3. Text extraction and formatting
            text = page.get_text("text").strip()
            if text:
                formatted_lines = []
                for line in (l.strip() for l in text.split('\n') if l.strip()):
                    numbered = numbered_item.match(line)
                    # Heading detection: short all-caps lines or roman-numeral prefixes
                    if len(line) < 50 and (line.isupper() or re.match(r'^[IVX]+\.', line)):
                        formatted_lines.append(f"## {line}")
                    # List detection
                    elif line.startswith(('•', '-', '◦', '∙')):
                        formatted_lines.append(f"- {line[1:].strip()}")
                    elif numbered:
                        # Keep everything after the marker, even when no space follows it.
                        formatted_lines.append(f"1. {numbered.group(1)}")
                    else:
                        formatted_lines.append(line)

                markdown_content.append("\n".join(formatted_lines) + "\n")

    # Post-processing: collapse runs of three or more newlines
    final_content = re.sub(r'\n{3,}', '\n\n', "\n".join(markdown_content)).strip()

    # Write output
    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(final_content)

    return final_content

# Convert the textbook PDF; the notebook displays the Markdown string returned by the call.
pdf_path = (
    '/Users/zimoshen/Desktop/Talkweb_intern/KnowledgeGraphConstruction/Data/Original/教材/高中教材人教版/'
    '高中思想政治必修2_经济与社会.pdf'
)
output_path = './test/高中思想政治必修2_经济与社会高中思想政治必修2_经济与社会.md'
image_dir = './test/Image/'

pdf_to_markdown(pdf_path=pdf_path, output_md=output_path, image_dir=image_dir, dpi=500)

⚠️ Camelot failed on page 1: module 'camelot' has no attribute 'read_pdf'
MuPDF error: format error: cannot find object in xref (2151 0 R)

MuPDF error: format error: cannot find object in xref (2151 0 R)

⚠️ Camelot failed on page 2: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 3: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 4: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 5: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 6: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 7: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 8: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 9: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 10: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 11: module 'camelot' has no attribute 'read_pdf'
⚠️ Camelot failed on page 12: module 'camelot' has n

'# Page 1\n\n![Image 0](./test/Image//image_1_1.jpeg) {width=920px, height=1148px}\n\n![Image 1](./test/Image//image_1_2.png) {width=720px, height=314px}\n\n![Image 2](./test/Image//image_1_3.png) {width=324px, height=314px}\n\n![Image 3](./test/Image//image_1_4.png) {width=116px, height=87px}\n\n## 📊 Tables (Page 1)\n\n### Table 1\n```markdown\n|    |    |    |    |    |    |    |    |    |    |    |\n|----|----|----|----|----|----|----|----|----|----|----|\n```\n\n### Table 2\n```markdown\n|    |    |    |    |    |    |    |    |    |    |    |\n|----|----|----|----|----|----|----|----|----|----|----|\n```\n\n### Table 3\n```markdown\n|                           |    |    | None   |\n|:--------------------------|:---|:---|:-------|\n| ®                         | 普 |    |        |\n|                           | 通 |    |        |\n|                           | 高 |    |        |\n|                           | 中 |    |        |\n|                           | 教 |    |        |\n|        

In [39]:
def pdf_to_markdown(pdf_path, output_md, image_dir, dpi=300):
    """Enhanced PDF-to-Markdown converter (tables and text only).

    For every page a page heading is written, followed by any tables found
    (Camelot first, pdfplumber as fallback) and the page text with simple
    heading / list detection.

    Args:
        pdf_path: Path of the PDF to convert.
        output_md: Path of the UTF-8 Markdown file to write.
        image_dir: Directory that is created if missing; this version writes
            no images into it.
        dpi: Accepted for signature compatibility; not used by this version.

    Returns:
        None. The Markdown is written to ``output_md``.
    """
    import fitz
    import os
    import re
    import pandas as pd

    # Initialization
    os.makedirs(image_dir, exist_ok=True)
    markdown = []

    def safe_extract_tables(page):
        """Best-effort table extraction for one page, trying several libraries.

        Returns a list of DataFrames, empty when nothing could be extracted.
        """
        # Method 1: Camelot
        try:
            import camelot
            tables = camelot.read_pdf(
                pdf_path,
                pages=f"{page.number+1}",
                flavor="lattice",
                suppress_stdout=True
            )
            frames = [t.df for t in tables]
            if frames:
                return frames
            # Camelot ran but found nothing: still give pdfplumber a chance.
        except Exception:
            # Deliberately best-effort. Catch Exception rather than using a bare
            # except so that KeyboardInterrupt / SystemExit still propagate.
            pass

        # Method 2: pdfplumber
        try:
            import pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                return [
                    pd.DataFrame(table[1:], columns=table[0])
                    for table in pdf.pages[page.number].extract_tables()
                    if table
                ]
        except Exception:
            return []

    # Main processing
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Page heading
            markdown.append(f"\n# 第 {page.number+1} 页\n")

            # Tables
            tables = safe_extract_tables(page)
            if tables:
                markdown.append(f"\n## 表格列表（共{len(tables)}个）\n")
                for i, df in enumerate(tables, 1):
                    markdown.append(f"### 表格{i}\n```markdown\n{df.to_markdown()}\n```\n")

            # Text
            text = page.get_text("text").strip()
            if text:
                # Enhanced heading detection
                lines = []
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue
                    # Section headings such as "3.1 <title>"
                    if re.match(r'^\d+\.\d+\s+.+$', line):
                        lines.append(f"## {line}")
                    # List items
                    elif line.startswith(('•','-','*','→')):
                        lines.append(f"- {line[1:].strip()}")
                    else:
                        lines.append(line)
                markdown.append("\n".join(lines))

    # Write the result
    with open(output_md, 'w', encoding='utf-8') as f:
        f.write("\n".join(markdown))

# Convert the textbook PDF with the converter defined in this cell.
pdf_path = (
    '/Users/zimoshen/Desktop/Talkweb_intern/KnowledgeGraphConstruction/Data/Original/教材/高中教材人教版/'
    '高中思想政治必修2_经济与社会.pdf'
)
output_path = './test/高中思想政治必修2_经济与社会高中思想政治必修2_经济与社会.md'
image_dir = './test/Image/'

pdf_to_markdown(pdf_path=pdf_path, output_md=output_path, image_dir=image_dir, dpi=500)

MuPDF error: format error: cannot find object in xref (2151 0 R)

MuPDF error: format error: cannot find object in xref (2151 0 R)



In [15]:
import re

class Node:
    """A heading in a Markdown outline together with the text collected under it."""

    def __init__(self, title, level, parent=None):
        # Heading text and depth (number of leading '#').
        self.title, self.level = title, level
        # Body text accumulated through add_content().
        self.content = ""
        # Tree links: child headings and a back-reference to the parent.
        self.children = []
        self.parent = parent

    def add_child(self, node):
        """Make ``node`` the last child of this node and set its parent link."""
        node.parent = self
        self.children.append(node)

    def add_content(self, text):
        """Append ``text``; a newline separates it from content already present."""
        self.content = self.content + "\n" + text if self.content else text

class MD_parser:
    """Parse a Markdown file into a forest of heading nodes.

    ``file_judge`` is a dict whose 'file_type' entry selects the input file:
    'Book' reads ``book_path``; any other value reads ``paper_path``.
    """

    def __init__(self, file_judge):
        self.paper_path = ''
        self.book_path = './test/高中思想政治必修2_经济与社会高中思想政治必修2_经济与社会.md'
        self.file_judge = file_judge

    def parse_markdown_to_linked_lists(self):
        """Read the selected Markdown file and build its heading tree.

        Every line starting with one or more '#' opens a Node whose level is
        the number of '#'. A node becomes a child of the closest preceding
        heading with a smaller level, or a root when there is none. Non-blank
        lines that are not headings are appended to the content of the most
        recent heading; text before the first heading is ignored.

        Returns:
            list of root Node objects in document order.
        """
        # Use the configuration given to this instance. The previous version read
        # a notebook-global `file_judge`, which only worked when a global of that
        # name happened to exist.
        if self.file_judge['file_type'] == 'Book':
            source_path = self.book_path
        else:
            source_path = self.paper_path
        with open(source_path, 'r', encoding='utf-8') as f:
            markdown_text = f.read()

        lines = markdown_text.split('\n')
        roots = []
        stack = []  # chain of currently open headings, outermost first
        for line in lines:
            if line.strip() == '':
                continue
            match = re.match(r'^(#+)\s*(.*)', line)
            if match:
                level = len(match.group(1))
                title = match.group(2).strip()
                node = Node(title, level)
                # Close headings at the same or a deeper level than the new one.
                while stack and stack[-1].level >= level:
                    stack.pop()
                if not stack:  # root node
                    roots.append(node)
                else:  # child node
                    stack[-1].add_child(node)
                stack.append(node)
            else:
                if stack:
                    stack[-1].add_content(line)
        return roots

# Parser configuration: 'file_type' is the key MD_parser checks to choose between the book and paper paths.
file_judge = {
    'file_type': 'Book', 
    'is ENG': False 
}
Parser = MD_parser(file_judge)
BookTree = Parser.parse_markdown_to_linked_lists()

# Print the titles of the headings directly under the first root node.
a = BookTree[0].children
for c in a:
    print(c.title)

3.1 函数的概念及其表示
3.2 函数的基本性质
3.3 幂函数
3.4 函数的应用 (一)
小结


In [814]:
from openai import OpenAI
import pandas as pd
import re
import queue
import requests
import json
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class LumberChunker:
    def __init__(self, BookTree):
        self.client = OpenAI(api_key = 'sk-16528769d4794f44a01c283443c88bec', base_url="https://api.deepseek.com")
        self.nlp_Chi = spacy.load("zh_core_web_sm")
        self.nlp_Eng = spacy.load("en_core_web_sm")
        self.subject = '数学'
        self.output_path = './chunked_book.csv'
        self.BookTree = BookTree

    def lumberchunker(self):
        """Chunk every top-level chapter in self.BookTree.

        For each level-1 node a {chapter: {"content": [...], "sections": {...}}}
        skeleton is created. Each level-2 child gets a list under "sections"
        that is filled with text chunks and with {node: [...]} dicts for the
        nodes classified as knowledge points. Text of nodes not classified as
        knowledge is pooled in data_separate['题目'], split into sentences and
        used to pad or extend chunks.

        Returns:
            list: one nested dict per level-1 node, in BookTree order.
        """
        chunked_data = []
        # data_separate is the flat, hierarchy-agnostic classification; NewBookTree keeps the hierarchy.
        for chapter1 in self.BookTree: # level 1
            NewBookTree = self._initialize_chapter_structure(chapter1)
            # Question text left over from the level-2 sections of this chapter.
            extra_question = queue.Queue()

            for chapter2 in chapter1.children: # level 2
                chapter2_content_list = []
                data_separate = self._initialize_data_separate(chapter2)
                
                NewBookTree[chapter1]["sections"][chapter2] = chapter2_content_list   
                # Reminder: the level-2 content is handled further down, padded with the question text left
                # over from level-3 nodes; leftovers below level 2 are kept and end up in the chapter content.
                for chapter3 in chapter2.children:
                    # Classify the node: knowledge nodes become a dict in chapter2's list; anything else is
                    # pooled as question text and processed together for chapter2 afterwards.

                    self._classify_node(chapter3, data_separate, chapter2_content_list)

                    if chapter3 in data_separate['知识']:
                        chapter3_content_dict = {chapter3:[]}
                        chapter2_content_list.append(chapter3_content_dict)
                        # Child chapters: handle the level-4 headings.
                        self._process_child_chapters(chapter3, data_separate, chapter3_content_dict[chapter3])
                        # All knowledge nodes are stored at this point; their content is handled next.

                        chapter3_content_list = chapter3_content_dict[chapter3]
                        # NOTE(review): anything pushed onto `remaining` is never read back in this method.
                        remaining = queue.Queue()
                        for entry in chapter3_content_list:
                            for key, value in entry.items():
                                self._process_chapter_content(key, entry[key], question_queue=remaining)

                # question content
                question_content = self._clean_question_content(data_separate['题目'])
                question_content_queue = self._split_sentences_general(question_content)

                # Chunk the knowledge nodes
                for entry in chapter2_content_list:
                    for key, value in entry.items():
                        self._chunk_all_nodes(key, question_content_queue, entry)

                # Handle chapter2.content:
                self._process_chapter_content(chapter2, chapter2_content_list, question_content_queue)

                if not question_content_queue.empty(): # Remaining question content
                    self._handle_remaining_questions(question_content_queue, chapter2_content_list, extra_question)

            self._process_top_level_chapter_content(chapter1, NewBookTree, extra_question)            

            chunked_data.append(NewBookTree)
        return chunked_data

    def _process_chapter_content(self, chapter, content_list, question_queue):
        check = self._check_len(chapter.content)
        if check == 'OK':
            content_list.append(chapter.content)
        elif check == 'LARGE':
            content_list.append(chapter.content[:800])
            remaining = chapter.content[800:]
            if remaining:
                question_queue.put(remaining)
        else:  # SMALL
            combined = chapter.content
            while not question_queue.empty() and len(combined) < 600:
                combined += "\n" + question_queue.get()
            
            if len(combined) > 1000:
                content_list.append(combined[:800])
                if len(combined) > 800:
                    question_queue.put(combined[800:])
            else:
                content_list.append(combined)

    def _process_top_level_chapter_content(self, chapter, book_tree, extra_questions):
        """Chunk a top-level chapter's own text into book_tree[chapter]['content'].

        Args:
            chapter: node whose ``content`` string is chunked.
            book_tree: dict holding ``book_tree[chapter]['content']``, the list
                that receives the chunks.
            extra_questions: queue of leftover question strings; it is fully
                drained by the time this method returns.
        """
        chapter_content = chapter.content
        check = self._check_len(chapter_content)
        content_list = book_tree[chapter]['content']
        
        if check == 'OK':
            content_list.append(chapter_content)
        elif check == 'LARGE':
            # Rebuild chunks sentence by sentence.
            sentences = self._split_sentences_general(chapter_content)
            tmp_chunk = ''
            while not sentences.empty():
                sentence = sentences.get()
                if 600 < len(tmp_chunk) + len(sentence) < 1000:
                    # Adding the sentence lands inside the target window: emit the chunk.
                    tmp_chunk += sentence + "\n"
                    content_list.append(tmp_chunk)
                    tmp_chunk = ''
                elif len(tmp_chunk) + len(sentence) < 600:
                    tmp_chunk += sentence + "\n"
                else:
                    # Combined length is exactly 600 or at least 1000: emit what is buffered
                    # and start a new buffer with this sentence.
                    # NOTE(review): the slice drops any buffered text beyond 1000 characters,
                    # and an empty buffer is appended as '' here.
                    content_list.append(tmp_chunk[:1000])
                    tmp_chunk = sentence + "\n"
            
            if tmp_chunk:
                content_list.append(tmp_chunk)
        else:  # SMALL
            # Pad the short chapter text with leftover questions until it reaches 600 characters.
            while not extra_questions.empty() and len(chapter_content) < 600:
                chapter_content += "\n" + extra_questions.get()
            
            if len(chapter_content) > 1000:
                content_list.append(chapter_content[:1000])
                remaining = chapter_content[1000:]
                if remaining:
                    extra_questions.put(remaining)
            else:
                content_list.append(chapter_content)

        # Top up the last chunk with leftover questions while it stays within 1000 characters.
        if not extra_questions.empty() and content_list:
            last_chunk = content_list[-1]
            while not extra_questions.empty():
                next_question = extra_questions.get()
                combined_length = len(last_chunk) + len(next_question) + 1 
                
                if combined_length <= 1000:
                    last_chunk += "\n" + next_question
                    content_list[-1] = last_chunk 
                else:
                    # Does not fit: put it back (at the end of the FIFO queue) and stop topping up.
                    extra_questions.put(next_question)
                    break
        # Pack whatever is still queued into additional chunks.
        tmp_chunk = ''
        while not extra_questions.empty():
            question = extra_questions.get()
            if len(tmp_chunk) + len(question) + 1 <= 1000: 
                tmp_chunk += question + "\n"
                if len(tmp_chunk) >= 600:
                    content_list.append(tmp_chunk.strip())
                    tmp_chunk = ''
            else:
                if tmp_chunk:
                    content_list.append(tmp_chunk.strip())
                    tmp_chunk = question + "\n"
                else:
                    # A single item longer than a chunk: emit its first 1000 characters, re-queue the rest.
                    content_list.append(question[:1000])
                    remaining = question[1000:]
                    if remaining:
                        extra_questions.put(remaining)
        if tmp_chunk:
            content_list.append(tmp_chunk.strip())

    def _initialize_chapter_structure(self, chapter):
        return {
                    chapter: {
                        "content": [],
                        "sections": {}
                    }
                }

    def _initialize_data_separate(self, chapter):
        return {
                    '知识': [chapter],
                    '题目': ''
                }

    def _process_child_chapters(self, parent_chapter, data_separate, parent_content_list:list):
        # parent_chapter：relative一级
        if not parent_chapter.children:
            return

        for child in parent_chapter.children:  # relative二级
            self._classify_node(child, data_separate, parent_content_list)           
            if child in data_separate['知识']:
                parent_content_list.append({child:[]})

    def _clean_question_content(self, question_content):
        return "\n".join([line for line in question_content.splitlines() if line.strip()])

    def _chunk_all_nodes(self, chapter:dict, question_content_queue:queue.Queue, chapter_dict):
        check = self._check_len(chapter.content)
        if check == 'OK':
            chapter_dict[chapter].append(chapter.content)
        elif check == 'SMALL':
            self._handle_small_chunk(chapter_dict, chapter, question_content_queue)
        elif check == 'LARGE':
            self._handle_large_chunk(chapter_dict, chapter, question_content_queue)

    def _handle_small_chunk(self, chapter_dict, chapter:dict, question_queue:queue.Queue):
        content = chapter.content
        max_iterations = 100  # Prevent infinite loops
        iteration = 0
        
        while (not question_queue.empty() and 
               len(content) < 600 and 
               iteration < max_iterations):
            content += "\n" + "\n" + question_queue.get()
            iteration += 1

        if len(content) > 1000:
            chapter_dict[chapter].append(content[:800])
            remaining = content[800:]
            if remaining:
                question_queue.put(remaining)
        else:
            chapter_dict[chapter].append(content)

    def _handle_large_chunk(self, chapter_dict, chapter, question_queue: queue.Queue):
        tmp_queue = self._split_sentences_general(chapter.content)
        tmp_chunk = ''
        max_iterations = 200
        iteration = 0
        
        while not tmp_queue.empty() and iteration < max_iterations:
            sentence = tmp_queue.get()
            iteration += 1
            
            if len(tmp_chunk) + len(sentence) > 800:
                if len(tmp_chunk) >= 600:
                    chapter_dict[chapter].append(tmp_chunk)
                    tmp_chunk = sentence + "\n" + "\n"
                else:
                    tmp_chunk += sentence + "\n"
                    if len(tmp_chunk) > 1200:
                        chapter_dict[chapter].append(tmp_chunk[:len(tmp_chunk)//2])
                        chapter_dict[chapter].append(tmp_chunk[len(tmp_chunk)//2:])
                        tmp_chunk = ''
            else: 
                tmp_chunk += sentence + "\n"
        
        if tmp_chunk:
            if len(tmp_chunk) >= 600:
                chapter_dict[chapter].append(tmp_chunk)
            else:
                if not question_queue.empty() and len(tmp_chunk) < 800:
                    additional = question_queue.get()
                    if len(tmp_chunk) + len(additional) > 800:
                        remaining = 800 - len(tmp_chunk)
                        tmp_chunk += additional[:remaining]
                        chapter_dict[chapter].append(tmp_chunk)
                        question_queue.put(additional[remaining:])
                    else:
                        tmp_chunk += "\n" + additional
                        chapter_dict[chapter].append(tmp_chunk)
                else:
                    question_queue.put(tmp_chunk)

    def _handle_remaining_questions(self, question_queue, chapter_content_list, extra_question:queue.Queue):
        # chapter_content_dict: NewBookTree[chapter1]['sections']
        # Handle remaining questions in the queue
        if question_queue.empty():
            return
            
        tmp_chunk = ''
        node = Node('习题与思考', 2)
        node_dict = {node: []}
        chapter_content_list.append(node_dict)
        max_iterations = 200
        iteration = 0
        
        while not question_queue.empty() and iteration < max_iterations:
            tmp_chunk += question_queue.get() + "\n"
            iteration += 1
            
            if len(tmp_chunk) >= 600:
                if len(tmp_chunk) > 1000:
                    node_dict[node].append(tmp_chunk[:800])
                    remaining = tmp_chunk[800:]
                    question_queue.put(remaining)
                    tmp_chunk = ''
                else:
                    node_dict[node].append(tmp_chunk)
                    tmp_chunk = ''
        
        if tmp_chunk:
            extra_question.put(tmp_chunk.strip())

    def _classify_node(self, ChapterNode: Node, data_separate, parent_content_list:list):
        title = ChapterNode.title
        prompt = f'你觉得“{title}“这是个知识点的标题吗，如果是返回1，不是返回0'
        response = self.client.chat.completions.create(
            model='deepseek-chat',
            messages=[
                {"role": "system", "content": "你是个聪明的老师"},
                {"role": "user", "content": prompt}
            ],
            stream=False
        )
        judge = response.choices[0].message.content
        
        if judge == '1':
            data_separate['知识'].append(ChapterNode)
            parent_content_list.append({ChapterNode: []})

        else:
            content = ChapterNode.content
            data_separate['题目'] += content
        
    def _check_len(self, chunk):
        length = len(chunk)
        if 600 <= length <= 800:
            return 'OK'
        elif length < 600:
            return 'SMALL'
        elif length > 800:
            return 'LARGE'

    def _split_sentences_general(self, text: str) -> queue.Queue:
        replacements = {
            "tables": [],
            "latex": [],
            "images": []
        }

        def replace_table(match):
            replacements["tables"].append(match.group(0))
            return f"@@TABLE{len(replacements['tables']) - 1}@@"

        def replace_latex(match):
            expr = match.group(0)
            replacements["latex"].append(expr)
            return f"@@LATEX{len(replacements['latex']) - 1}@@"

        def replace_image(match):
            replacements["images"].append(match.group(0))
            return f"@@IMAGE{len(replacements['images']) - 1}@@"

        text = re.sub(r'<table.*?>.*?</table>', replace_table, text, flags=re.DOTALL)

        latex_pattern = r'\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\)'
        text = re.sub(latex_pattern, replace_latex, text, flags=re.DOTALL)

        text = re.sub(r'!\[.*?\]\((.*?)\)', replace_image, text)
        protected_text = (
            text.replace('@@LATEX', 'LATEXPROTECTED')
                .replace('@@IMAGE', 'IMAGEPROTECTED')
                .replace('@@TABLE', 'TABLEPROTECTED')
        )
        if self.subject == 'English':
            doc = self.nlp_Eng(protected_text)
        else:
            doc = self.nlp_Chi(protected_text)
        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
        restored_sentences = queue.Queue()
        for sent in sentences:
            sent = (
                sent.replace('LATEXPROTECTED', '@@LATEX')
                    .replace('IMAGEPROTECTED', '@@IMAGE')
                    .replace('TABLEPROTECTED', '@@TABLE')
            )
            for i, table in enumerate(replacements["tables"]):
                sent = sent.replace(f"@@TABLE{i}@@", table)
            for j, latex in enumerate(replacements["latex"]):
                sent = sent.replace(f"@@LATEX{j}@@", latex)
            for k, img in enumerate(replacements["images"]):
                sent = sent.replace(f"@@IMAGE{k}@@", img)
            restored_sentences.put(sent)

        return restored_sentences

    def text_to_table(self, Chunked_book):
        data = []

        for parent1, value in Chunked_book.items():  # 章节: 第一章
            parent1_title = parent1.title
            subcha = Chunked_book.get(parent1)

            parent1_content = subcha['content']
            for entry in parent1_content:
                if entry and str(entry).strip():
                    data.append({
                        'parent': parent1_title,
                        'relation1': '同位',
                        'chapter_title': parent1_title,
                        'relation2': '文本块',
                        'content': entry
                    })

            parent1_sections = subcha['sections']
            for parent2, value in parent1_sections.items():  # 课: 1.1: []
                parent2_title = parent2.title
                subsubcha = parent1_sections.get(parent2)

                for entry in subsubcha:  # 课的 content
                    if isinstance(entry, str):
                        if entry and str(entry).strip(): 
                            data.append({
                                'parent': parent1_title,
                                'relation1': '上位',
                                'chapter_title': parent2_title,
                                'relation2': '文本块',
                                'content': entry
                            })
                    elif isinstance(entry, dict):
                        for parent3, value in entry.items():  # 课的小节
                            parent3_title = parent3.title
                            for entry in value:
                                if isinstance(entry, dict):
                                    for parent4, value in entry.items():
                                        parent4_title = parent4.title
                                        for entry in value:
                                            if entry and str(entry).strip(): 
                                                data.append({
                                                    'parent': parent3_title,
                                                    'relation1': '上位',
                                                    'chapter_title': parent4_title,
                                                    'relation2': '文本块',
                                                    'content': entry
                                                })
                                elif isinstance(entry, str):
                                    if entry and str(entry).strip():
                                        data.append({
                                            'parent': parent2_title,
                                            'relation1': '上位',
                                            'chapter_title': parent3_title,
                                            'relation2': '文本块',
                                            'content': entry
                                        })

        df = pd.DataFrame(data, columns=['parent', 'relation1', 'chapter_title', 'relation2', 'content'])
        Entity_list_self = []
        Entity_list_father = []
        for ind, row in df.iterrows():
            parent = row['parent']
            itself = row['chapter_title']

            Entity_list_father.append(self.link_book_with_entity(parent))
            Entity_list_self.append(self.link_book_with_entity(itself))
            
        df['Entity_father'] = Entity_list_father
        df['Entity_self'] = Entity_list_self



        return df
    
    def link_book_with_entity(self, chunk):
        headers = {
            "Authorization": f"Bearer {'app-ICVmh2I94iQ6hFxkvJ7wevLX'}",
            "Content-Type": 'application/json'
        }
        request_data = {
            "inputs": {
                "title": chunk, 
            },
            "user": 'Zimo'
        }
        response = requests.post('http://localhost/v1/workflows/run', headers=headers, json=request_data, timeout=30)
        response_text = response.text
        response_json = json.loads(response_text)
        responses = response_json['data']['outputs']['entity']
        entity_list = [item['content'] for item in responses]
        candidate = {}
        distance_list = []
        for entry in entity_list:
            distance = self._tfidf_cosine_distance(chunk, entry)
            candidate[distance] = entry
            distance_list.append(distance)

        best_dist = min(distance_list)
        best = candidate[best_dist]
        
        return best
    
    def _tfidf_cosine_distance(self, str1, str2):
        """Return 1 minus the cosine similarity of the character-level TF-IDF vectors of two strings."""
        pair = [str1, str2]
        char_vectorizer = TfidfVectorizer(analyzer='char').fit(pair)
        vectors = char_vectorizer.transform(pair)
        first, second = vectors[0:1], vectors[1:2]
        similarity = cosine_similarity(first, second)[0][0]
        return 1 - similarity

    
    def main(self):
        Book = self.lumberchunker()
        Chunked_book = Book[0]
        df = self.text_to_table(Chunked_book)

        return Chunked_book, df
    
# Chunk the parsed book tree; main() returns the first chapter's chunk dict and its row table.
Chunker = LumberChunker(BookTree)
Chunked_book, df = Chunker.main()


In [819]:
# Persist the chunk table as CSV without the index column.
df.to_csv('./Data/Updated/TextBooks/MAT/Book1/chunked_book.csv', index=False)

In [817]:
# Distinct entities that the chunk titles were linked to.
set(df['Entity_self'])

{'subject: 逻辑与思维',
 '函数及其表示',
 '函数的图象',
 '函数的基本性质',
 '函数的奇偶性',
 '函数的应用',
 '函数的性质',
 '函数的表示方法',
 '利用函数单调性求最值或值域',
 '幂函数的图象',
 '归纳法',
 '空集'}

In [45]:
import requests
import json
import os
# Report only whether the key is configured: printing the value itself would
# store the secret in the saved notebook output.
print("MINERU_API_KEY is set" if os.getenv("MINERU_API_KEY") else "MINERU_API_KEY is not set")


None
{'User-Agent': 'python-requests/2.32.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


In [68]:

chunks = "['$\\overrightarrow { E C } \\cdot \\overrightarrow { E D } = \\mathrm { ~ ( ~ \\Gamma ~ ) ~ }$', '$\\sqrt { 5 }$', '【分析】方法一：以 $\\left\\{ { \\overrightarrow { A B } } , { \\overrightarrow { A D } } \\right\\}$ 为基底向量表示 $\\mathsf { \\Pi } _ { E C , E D } ^ { \\mathsf { u u } }$ ，再结合数量积的运算律运算求解；方法二：建系，利用平面向量的坐标运算求解；方法三：利用余弦定理求cos $\\angle D E C$ ，进而根据数量积的定义运算求解.【详解】方法一：以 $\\left\\{ { \\overrightarrow { A B } } , { \\overrightarrow { A D } } \\right\\}$ 为基底向量，可知 $\\left| \\overset { \\mathbf { u u } } { A B } \\right| = \\left| \\overset { \\mathbf { u u } } { A D } \\right| = 2 , \\overset { \\mathbf { u u } } { A B } \\cdot \\overset { \\mathbf { u u } } { A D } = 0$ ，  \n则 $\\stackrel { \\mathrm { \\tiny ~ { U I M } ~ } } { \\cal E } = \\stackrel { \\mathrm { \\tiny ~ { U } ~ } } { \\cal E } B + \\stackrel { \\mathrm { \\tiny ~ { U I } ~ } } { \\cal B } C = \\frac { 1 } { 2 } \\stackrel { \\mathrm { \\tiny ~ { U I } ~ } } { \\cal A } B + \\stackrel { \\mathrm { \\tiny ~ { U I } ~ } } { \\cal A } D , \\stackrel { \\mathrm { \\tiny ~ { U I } ~ } } { \\cal E } D = \\stackrel { \\mathrm { \\tiny ~ { U } ~ } } { \\cal E } A + \\stackrel { \\mathrm { \\tiny ~ { U I } ~ } } { \\cal A } D = - \\frac { 1 } { 2 } \\stackrel { \\mathrm { \\tiny ~ { U I } ~ } } { \\cal A } B + \\stackrel { \\mathrm { \\tiny ~ { U I } ~ } } { \\cal A } D ,$   \n所以 ${ \\stackrel { \\mathrm { U I O } } { E C } } \\cdot { \\stackrel { \\mathrm { U I O } } { E D } } = \\left( { \\frac { 1 } { 2 } } { \\stackrel { \\mathrm { U I O } } { A B } } + { \\stackrel { \\mathrm { U I O } } { A D } } \\right) \\cdot \\left( - { \\frac { 1 } { 2 } } { \\stackrel { \\mathrm { U I O } } { A B } } + { \\stackrel { \\mathrm { U I O } } { A D } } \\right) = - { \\frac { 1 } { 4 } } { \\stackrel { \\mathrm { U I O } } { A B } } + { \\stackrel { \\mathrm { U I D } } { A D } } = - 1 + 4 = 3 , \\ldots ,$', '$E D = E C = \\sqrt { 5 } , C D = 2$ ，在VCDE 中，由余弦定理可得cosÐDEC = DE2 +CE2 - DC2 $\\angle D E C = \\frac { 
D E ^ { 2 } + C E ^ { 2 } - D C ^ { 2 } } { 2 D E \\cdot C E } = \\frac { 5 + 5 - 4 } { 2 \\times \\sqrt { 5 } \\times \\sqrt { 5 } } = \\frac { 3 } { 5 } ,$ 所以 $\\stackrel { \\mathrm { \\bf { U I I } } } { E C } \\cdot \\stackrel { \\mathrm { \\bf { U I I } } } { E D } = \\left| \\stackrel { \\mathrm { \\bf { U I I } } } { E C } \\right| \\left| \\stackrel { \\mathrm { \\bf { U I I } } } { E D } \\right| \\cos \\angle D E C = \\sqrt { 5 } \\times \\sqrt { 5 } \\times \\frac { 3 } { 5 } = 3 .$']"

# The bearer token may be supplied via CORRECTION_WORKFLOW_API_KEY; the literal
# is kept only as a fallback so the cell still runs without any setup.
# NOTE(review): the fallback token should be removed from the notebook.
correction_api_key = os.getenv('CORRECTION_WORKFLOW_API_KEY', 'app-iwGbYYWNZ1aK9TgXSLU9uIJq')
headers = {
    "Authorization": f"Bearer {correction_api_key}",
    "Content-Type": 'application/json'
}

# `chunks` is passed as-is (a single string) under the "File" input.
request_data = {
    "inputs": {
        "File": chunks,
    },
    "user": 'Zimo'
}
api_url = 'http://localhost/v1/workflows/run'
response = requests.post(api_url, headers=headers, json=request_data, timeout=60)
# Last expression of the cell: shows the response status.
response

<Response [200]>

In [63]:
# Parse the body of `response` from the request cell. The original passed the
# name `re`, which is the regex module imported at the top of the notebook
# unless some other cell rebinds it, so the cell failed on a fresh kernel.
response_json = json.loads(response.text)
output = response_json['data']['outputs']['corrected']
print(output)

以下是纠正后的LaTeX表达式列表：

```python
[
    r"\overrightarrow{EC} \cdot \overrightarrow{ED} = \Gamma"
]
```

修改说明：
1. 移除了多余的波浪线符号`~`和括号`(~)`
2. 将`\mathrm`中的内容改为标准的数学符号`\Gamma`
3. 保持了向量表示法的正确语法`\overrightarrow{EC}`
4. 保持了点乘符号`\cdot`的正确用法
以下是纠正后的 LaTeX 表达式列表：

```python
["\\sqrt{5}"]
```
以下是纠正后的LaTeX公式列表：

1. `\left\{ { \overrightarrow { A B } } , { \overrightarrow { A D } } \right\}`  
2. `\left| \overrightarrow { A B } \right| = \left| \overrightarrow { A D } \right| = 2`  
3. `\overrightarrow { A B } \cdot \overrightarrow { A D } = 0`  
4. `\overrightarrow { E C } = \overrightarrow { E B } + \overrightarrow { B C } = \frac { 1 } { 2 } \overrightarrow { A B } + \overrightarrow { A D }`  
5. `\overrightarrow { E D } = \overrightarrow { E A } + \overrightarrow { A D } = - \frac { 1 } { 2 } \overrightarrow { A B } + \overrightarrow { A D }`  
6. `\overrightarrow { E C } \cdot \overrightarrow { E D } = \left( { \frac { 1 } { 2 } } \overrightarrow { A B } + \overrightarrow { A D } \right) \cdot \le

In [807]:
# Decode the JSON body of `response` and collect the 'content' field of every
# item under data -> outputs -> entity.
# NOTE(review): `response` is not created in this cell — confirm it holds the
# reply that carries an 'entity' output before running.
response_text = response.text
response_json = json.loads(response_text)
responses = response_json['data']['outputs']['entity']
entity_list = [item['content'] for item in responses]
# Bare expression in the middle of the cell; by default IPython only displays
# a cell's last expression, so this line shows nothing.
entity_list
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_cosine_distance(str1, str2):
    """Return one minus the cosine similarity of two strings.

    A character-level TF-IDF vectorizer is fitted on just the two inputs,
    both are transformed with it, and the cosine similarity of the two
    resulting rows is subtracted from 1.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``1 - cosine_similarity`` for the pair; smaller means more alike.
    """
    pair = [str1, str2]
    char_model = TfidfVectorizer(analyzer='char')
    char_model.fit(pair)
    vectors = char_model.transform(pair)
    # Slices keep each row 2-D, which cosine_similarity expects.
    first_row, second_row = vectors[0:1], vectors[1:2]
    similarity = cosine_similarity(first_row, second_row)[0][0]
    return 1 - similarity

# NOTE(review): `chunk` is not assigned in this cell — it must already exist in
# the kernel; confirm which cell defines it.
# Score every candidate once; a smaller distance means a closer match.
distance_list = [tfidf_cosine_distance(chunk, entry) for entry in entity_list]
# Distance -> entry mapping, kept for any cell that still reads it. Entries
# with equal distances collapse onto one key, so it is not used for selection.
candidate = dict(zip(distance_list, entity_list))

best_dist = min(distance_list)
# Look the winner up by position so the first best-scoring entry is chosen
# even when several entries share the minimum distance.
best = entity_list[distance_list.index(best_dist)]
best

'函数的表示方法'