In [12]:
from langchain_openai import ChatOpenAI
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_community.embeddings import DashScopeEmbeddings 

from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import bs4
from langchain import hub
import os
from langchain.chat_models import init_chat_model

from dotenv import load_dotenv
import os
load_dotenv()
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

In [13]:
#### INDEXING ####

# Load Documents
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()










In [14]:
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embeddings
embeddings = DashScopeEmbeddings(model="text-embedding-v1")
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# promote

prompt = hub.pull("rlm/rag-prompt")

# llm
# llm = ChatOpenAI(
#     base_url="https://api.deepseek.com/v1",
#     api_key=os.getenv('DEEPSEEK_API_KEY'),
#     model="deepseek-chat"
# )
# 初始化 DeepSeek 聊天模型
# 初始化 DeepSeek 聊天模型
llm = init_chat_model(
    model="deepseek-chat", 
    model_provider="deepseek",
    api_key=os.getenv('DEEPSEEK_API_KEY')  # 添加这行
)
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# 被 retriever 用来检索文档。
# 被 RunnablePassthrough 原样传递。
# 两者结果汇集成字典。
# 字典填充 prompt。
# prompt 发给 llm。
# llm 的回答被 StrOutputParser 解析。
# 最终，invoke 方法会返回一个最终的、干净的字符串答案。
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# question
rag_chain.invoke("What is Task Decomposition?")






'Task decomposition is the process of breaking down a complex task into smaller, more manageable steps. Techniques like Chain of Thought (CoT) and Tree of Thoughts (ToT) use step-by-step reasoning to achieve this, enabling better planning and execution. CoT focuses on linear decomposition, while ToT explores multiple reasoning paths in a tree structure.'

## 索引

In [16]:
# Documents
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

Count tokens considering ~4 char / token

In [17]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(question, "cl100k_base")

8

Text embedding models

In [22]:
from langchain_community.embeddings import DashScopeEmbeddings


embd = DashScopeEmbeddings(model="text-embedding-v1")
query_result = embd.embed_query("What is Task Decomposition?")
document_result = embd.embed_query("Task Decomposition is a technique that breaks down a complex task into smaller, more manageable sub-tasks.")
len(query_result)
len(document_result)

print(len(query_result))
print(len(document_result))










1536
1536


Cosine similarity is reccomended (1 indicates identical) 

In [24]:
# 计算两者的余弦相似度
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

similarity = cosine_similarity(query_result, document_result)
print(similarity)











0.9082681489114193


Document Loaders

In [27]:
# 文档加载
### INDEXING ####

# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()


Splitter

This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

In [28]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

Vectorstores

In [36]:
# Index
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=DashScopeEmbeddings())

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [37]:
docs = retriever.invoke("What is Task Decomposition?")

In [38]:
len(docs)

5