In [None]:
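# ! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain
# The imports used in this notebook additionally need: langchain-classic langchain-text-splitters langchain-google-genai beautifulsoup4 numpy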

In [None]:
import os
# LangChain tracing configuration (endpoint: api.smith.langchain.com), applied in one call.
# NOTE: '<your_api_key>' is a placeholder - substitute a real key before running,
# and avoid committing real credentials together with the notebook.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
    'LANGCHAIN_API_KEY': '<your_api_key>',
})

In [None]:
# Google API key plus gRPC logging variables, set together.
# NOTE: '<your_api_key>' is a placeholder - substitute a real key before running.
os.environ.update({
    'GOOGLE_API_KEY': '<your_api_key>',
    "GRPC_VERBOSITY": "ERROR",  # presumably limits gRPC log output to errors - confirm
    "GRPC_TRACE": "",           # empty value, i.e. no trace categories listed
})

In [None]:
# Noise-reduction switches set through the environment.
# ANONYMIZED_TELEMETRY is presumably read by Chroma and TF_CPP_MIN_LOG_LEVEL by
# TensorFlow's native logging - confirm against the libraries actually loaded.
os.environ.update({
    "ANONYMIZED_TELEMETRY": "FALSE",
    "TF_CPP_MIN_LOG_LEVEL": "3",
})

In [None]:
# Identify this notebook's HTTP requests; set before the document loader is imported below.
os.environ.update(USER_AGENT="rag-from-scratch/1.0")

In [None]:
# The placeholder has to be a quoted string: a bare <your-api-key> token is a SyntaxError.
# Replace it with a real key before running. The cells visible in this notebook use
# Google Gemini models, so this key only matters if OpenAI-backed components are used.
os.environ['OPENAI_API_KEY'] = '<your-api-key>'

In [None]:
from langchain_classic.hub import pull
import bs4
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

#### INDEXING ####

# Load Documents
# Fetch the blog post and restrict HTML parsing, via the SoupStrainer filter, to
# elements whose class is "post-content", "post-title" or "post-header".
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

# Performs the actual fetch; `docs` is rebound later in the notebook by a retrieval cell.
docs = loader.load()

# Split
# chunk_size=1000 with chunk_overlap=200; sizes are presumably counted in
# characters (the splitter's default length function) - TODO confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Chroma client settings: telemetry disabled, persistence enabled under ./chroma_db.
client_settings = Settings(
    anonymized_telemetry=False,
    is_persistent=True,               # optional
    persist_directory="./chroma_db",   # optional
)

# Embedding model shared by the indexing step here and the embed_query cells further down.
embedding = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Embed
# NOTE(review): with persistence enabled, re-running this cell presumably adds the same
# chunks to the "rag_demo" collection again - confirm, and clear ./chroma_db if so.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding = embedding,
                                    client_settings=client_settings,
                                    collection_name="rag_demo")
# Retriever with default search settings; this is the object wired into rag_chain below.
retriever = vectorstore.as_retriever()


#### RETRIEVAL and GENERATION ####

# Prompt
# Pulled from the hub by its name, "rlm/rag-prompt".
prompt = pull("rlm/rag-prompt")          # original note: may be a str - TODO confirm the returned type

# LLM
# temperature=0 is set on the Gemini chat model used by both chains in this notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    convert_system_message_to_human=False  # important
)

# Post-processing
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line;
        an empty string when ``docs`` yields nothing.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Chain
# The invoked input is passed through unchanged as "question"; the same input is
# also given to the retriever, whose results format_docs joins into "context".
# The filled prompt goes to the LLM and StrOutputParser parses the model output.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
# Last expression of the cell, so the answer is displayed as the cell output.
rag_chain.invoke("What is Task Decomposition?")

In [None]:
# Documents
# A toy query/document pair reused by the token-count and embedding cells that follow.
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that ``string`` encodes to under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the sequence produced by the encoding's ``encode`` method.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

# Token count of the sample question under the "cl100k_base" encoding (shown as cell output).
num_tokens_from_string(question, "cl100k_base")

In [None]:
# Embed both sample texts with the same embedding model.
# Note that the document is embedded through embed_query as well, not a document-specific call.
query_result = embedding.embed_query(question)
document_result = embedding.embed_query(document)

In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector, as a sequence of numbers or a NumPy array.
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product of the inputs divided by the product of their norms.
        There is no guard for zero-length vectors: the denominator is then zero.
    """
    magnitude_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / magnitude_product

# Compare the embedding of the sample question with the embedding of the sample document.
similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

In [None]:
#### INDEXING ####

# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader
# Same URL and class filter as the loader in the first indexing cell; `loader` is
# rebound here and the loaded result is kept under the separate name `blog_docs`.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

In [None]:
# Split
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter built through the tiktoken-based constructor; chunk_size and chunk_overlap
# are presumably counted in tokens here rather than characters - TODO confirm.
# Rebinds `text_splitter` from the first indexing cell.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
# Rebinds `splits`; the vector store created earlier was built from the previous value.
splits = text_splitter.split_documents(blog_docs)

In [None]:
# Index
# NOTE(review): nothing is indexed in this cell - it reuses the `vectorstore` built in
# the first indexing cell, so the token-based `splits` created above are not searched.
# Rebinds `retriever` to one that is configured with k=1.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [None]:
# Query the k=1 retriever; rebinds `docs`, which previously held the loaded blog documents.
docs = retriever.invoke("What is Task Decomposition?")

In [None]:
# Number of retrieved documents; a single result is expected given k=1 on the retriever.
len(docs)

In [None]:
# Prompt
from langchain_core.prompts import ChatPromptTemplate
# Prompt text with two placeholders, {context} and {question}, filled at invoke time.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

# Rebinds `prompt`, which earlier held the prompt pulled from the hub.
prompt = ChatPromptTemplate.from_template(template)
# Last expression of the cell, so the prompt object is displayed.
prompt

In [None]:
# Chain
# Prompt piped into the LLM; there is no retriever or output parser in this chain,
# so the caller supplies the context itself.
chain = prompt | llm

In [None]:
# Run
# NOTE(review): `docs` is passed as-is (the retriever's result, not text joined by
# format_docs) - confirm this is the intended way to render the context.
chain.invoke({"context":docs,"question":"What is Task Decomposition?"})

In [None]:
# Re-run the first chain. It was composed from the retriever and prompt objects that
# existed when it was built, so the later rebinding of those names does not change it.
rag_chain.invoke("What is Task Decomposition?")