In [None]:
import os
from dotenv import load_dotenv

os.environ["LANGSMITH_TRACING"] = "true"
load_dotenv("../config/.env")

In [2]:
import pprint

from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyMuPDFParser, RapidOCRBlobParser

file_path = "./assets/01中国经济的三驾马车.pdf"
loader = GenericLoader(
    blob_loader=FileSystemBlobLoader(
        path="./assets/macro_economy/",
        glob="*.pdf",
    ),
    blob_parser=PyMuPDFParser(
        images_inner_format="markdown-img",
        images_parser=RapidOCRBlobParser(),
        extract_tables="markdown",
    ),
)
docs = loader.load()

pprint.pprint(docs)

[Document(metadata={'producer': 'macOS Version 15.6.1 (Build 24G90) Quartz PDFContext, AppendMode 1.1', 'creator': '', 'creationdate': "D:20250414135535Z00'00'", 'source': 'assets/macro_economy/01中国经济的三驾马车.pdf', 'file_path': 'assets/macro_economy/01中国经济的三驾马车.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': "D:20250908150709Z00'00'", 'trapped': '', 'modDate': "D:20250908150709Z00'00'", 'creationDate': "D:20250414135535Z00'00'", 'page': 0}, page_content='一、内容介绍 01:31\n1. 课程背景与⽬的\n\uf06c\n课程背景:\n 宏观经济灰⾊难懂，投资选择复杂多变，学员需要⼀⻔实战的投资配置课\n来掌握经济形势分析和投资⽅法。\n\uf06c\n课程⽬的:\n 帮助学员快速掌握分析经济形势和投资配置的⽅法，实现财富增⻓，掌控\n⾃⼰的⼈⽣。\n\uf06c\n课程特⾊:\n 涵盖六⼤经济指标、投资时钟和⼤类资产投资⼿册、投资⼤师的经典思想\n和案例、新能源⼈⼯智能半导体等热⻔产业链。\n\uf06c\n课程形式:\n 配有书课件、直播课程和专属课程助理，确保学员能够知⾏合⼀，将知识\n应⽤到实战中。\n2. 学习投资时钟和⼤类资产投资⼿册 02:17\n\uf06c\n投资时钟:\n ⼀种投资框架，帮助学员理解经济周期与资产配置的关系，快速⼊⻔投资\n框架。\n\uf06c\n⼤类资产投资⼿册:\n 涵盖房地产、股市、基⾦、商品、汇率、⻩⾦等投资⽅法，为学\n员提供全⾯的投资指导。\n3. 学习投资⼤师的经典思想和案例 02:26\n\uf06c\n投资⼤师思想:\n 涵盖价值派、成⻓

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

pprint.pprint(all_splits)

[Document(metadata={'producer': 'macOS Version 15.6.1 (Build 24G90) Quartz PDFContext, AppendMode 1.1', 'creator': '', 'creationdate': "D:20250414135535Z00'00'", 'source': 'assets/macro_economy/01中国经济的三驾马车.pdf', 'file_path': 'assets/macro_economy/01中国经济的三驾马车.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': "D:20250908150709Z00'00'", 'trapped': '', 'modDate': "D:20250908150709Z00'00'", 'creationDate': "D:20250414135535Z00'00'", 'page': 0, 'start_index': 0}, page_content='一、内容介绍 01:31\n1. 课程背景与⽬的\n\uf06c\n课程背景:\n 宏观经济灰⾊难懂，投资选择复杂多变，学员需要⼀⻔实战的投资配置课\n来掌握经济形势分析和投资⽅法。\n\uf06c\n课程⽬的:\n 帮助学员快速掌握分析经济形势和投资配置的⽅法，实现财富增⻓，掌控\n⾃⼰的⼈⽣。\n\uf06c\n课程特⾊:\n 涵盖六⼤经济指标、投资时钟和⼤类资产投资⼿册、投资⼤师的经典思想\n和案例、新能源⼈⼯智能半导体等热⻔产业链。\n\uf06c\n课程形式:\n 配有书课件、直播课程和专属课程助理，确保学员能够知⾏合⼀，将知识\n应⽤到实战中。\n2. 学习投资时钟和⼤类资产投资⼿册 02:17\n\uf06c\n投资时钟:\n ⼀种投资框架，帮助学员理解经济周期与资产配置的关系，快速⼊⻔投资\n框架。\n\uf06c\n⼤类资产投资⼿册:\n 涵盖房地产、股市、基⾦、商品、汇率、⻩⾦等投资⽅法，为学\n员提供全⾯的投资指导。\n3. 学习投资⼤师的经典思想和案例 02:26\n\uf06c\n

In [None]:
from langchain_huggingface import HuggingFaceEndpointEmbeddings

embeddings = HuggingFaceEndpointEmbeddings(
    model="Qwen/Qwen3-Embedding-0.6B",
)

vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

assert len(vector_1) == len(vector_2)
print(len(vector_1))

ValidationError: 1 validation error for HuggingFaceEndpointEmbeddings
endpoint_url
  Extra inputs are not permitted [type=extra_forbidden, input_value='https://hf-mirror.com/ap...en/Qwen3-Embedding-0.6B', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/extra_forbidden