向量化存储 GitHub 仓库代码
并基于该代码库进行问答（QA）

In [None]:
# Basic configuration: chat model, embedding model and Supabase client
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
from langchain_community.embeddings.cloudflare_workersai import CloudflareWorkersAIEmbeddings
from supabase.client import Client, create_client

# Load settings from a local .env file. override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Chat model reached through the OpenAI-compatible client; the endpoint and
# key are taken from the DASHSCOPE_* environment variables.
qw_llm_openai = ChatOpenAI(
    openai_api_base=os.getenv('DASHSCOPE_API_BASE'),
    openai_api_key=os.getenv('DASHSCOPE_API_KEY'),
    model_name="qwen2-1.5b-instruct",
    temperature=0.7,
    streaming=True,
)
# Embedding model served by Cloudflare Workers AI.
# NOTE(review): the model name suggests an English-only BGE model — confirm it
# suits the content of the repository being indexed.
embeddings = CloudflareWorkersAIEmbeddings(
    account_id=os.getenv('CF_ACCOUNT_ID'),
    api_token=os.getenv('CF_API_TOKEN'),
    model_name="@cf/baai/bge-large-en-v1.5",
)

# Supabase connection settings; both are None when the variable is missing.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

# Client handed to the vector store further down in the notebook.
supabase: Client = create_client(supabase_url, supabase_key)

In [6]:

import os
from langchain.document_loaders import TextLoader

root_dir = '/Users/pangmengting/Documents/workspace/crab-blog-second'

# Directories that must never be indexed: installed dependencies and
# version-control metadata would flood the vector store with unrelated text.
SKIP_DIRS = frozenset({'node_modules', '.git'})


def iter_repo_files(root, skip_dirs=SKIP_DIRS):
    """Yield the path of every file under *root*, skipping unwanted directories.

    Args:
        root: Directory to walk recursively.
        skip_dirs: Directory names to prune wherever they occur in the tree.

    Yields:
        The path of each remaining file, joined onto the directory that
        contains it.
    """
    for dirpath, dirnames, filenames in os.walk(root):
        # `node_modules` is a directory, so it shows up in `dirnames`, never in
        # `filenames`. A top-down os.walk only descends into the names left in
        # `dirnames`, so editing the list in place skips those subtrees.
        dirnames[:] = [name for name in dirnames if name not in skip_dirs]
        for filename in filenames:
            yield os.path.join(dirpath, filename)


docs = []
skipped = []  # (path, error) for every file that could not be loaded as text
for path in iter_repo_files(root_dir):
    try:
        docs.extend(TextLoader(path, encoding='utf-8').load_and_split())
    except (RuntimeError, OSError, UnicodeDecodeError) as exc:
        # Best effort: binary or non-UTF-8 files cannot be read as text.
        # Keep going, but remember what was skipped instead of hiding it.
        skipped.append((path, exc))

print(f"Loaded {len(docs)} document chunks; skipped {len(skipped)} unreadable files")

In [ ]:
# Sanity check: how many documents the loading cell collected.
len(docs)

In [ ]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents into chunks of at most ~1000 characters with no
# overlap between neighbouring chunks; these chunks are what gets embedded.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splits = text_splitter.split_documents(docs)

In [ ]:
# Sanity check: number of chunks that will be embedded and stored.
len(splits)

In [ ]:
from langchain_community.vectorstores import SupabaseVectorStore

# Embed every chunk with `embeddings` and store it in the Supabase table
# `bge_large_vector`; `bge_large_match_documents` names the database function
# used for similarity queries.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm before executing it more than once against the same table.
vectorstore = SupabaseVectorStore.from_documents(
    splits,
    embeddings,
    client=supabase,
    table_name="bge_large_vector",
    query_name="bge_large_match_documents",
)

In [ ]:
# Retrieve with maximal marginal relevance (MMR): fetch the 100 nearest chunks,
# then keep the 10 that best balance relevance against diversity.
#
# With SupabaseVectorStore, MMR is selected through `search_type`; keys such as
# `distance_metric` or `maximal_marginal_relevance` in `search_kwargs` are not
# parameters of its search methods. The distance metric is whatever the SQL
# function named by `query_name` implements.
# NOTE(review): MMR on Supabase needs that SQL function to return each row's
# embedding alongside the document — confirm `bge_large_match_documents` does.
retriever = vectorstore.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 10, 'fetch_k': 100},
)

In [ ]:
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain

# from langchain.chains import ConversationalRetrievalChain

# Conversational QA chain: answers a question using `qw_llm_openai` together
# with the chunks returned by `retriever`, taking prior chat history as input.
qa = ConversationalRetrievalChain.from_llm(qw_llm_openai, retriever=retriever)

In [ ]:
# Sample questions asked in order. Each answer is appended to `chat_history`
# so the chain receives the earlier turns when answering the next question.
# NOTE(review): these questions are about Chroma, while `root_dir` points at
# the `crab-blog-second` repository — confirm they match the indexed code.
questions = [
    "What does Chroma do?",
    "How to use Chroma?"
]
chat_history = []

for question in questions:
    # `invoke` replaces calling the chain object directly, which is deprecated.
    result = qa.invoke({"question": question, "chat_history": chat_history})
    answer = result['answer']
    chat_history.append((question, answer))
    print(f"Question:\n {question} \n")
    print(f"Answer:\n {answer} \n\n")

In [ ]:
def ask(question: str, chat_history: list) -> None:
    """Ask the QA chain one question, print the exchange and record the turn.

    Uses the module-level `qa` chain.

    Args:
        question: The question to send to the chain.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. The new turn is appended in place so that later calls
            see it as context.
    """
    # `invoke` replaces calling the chain object directly, which is deprecated.
    response = qa.invoke({"question": question, "chat_history": chat_history})
    answer = response['answer']
    # Without this the conversation would never grow beyond the history that
    # was passed in, and follow-up questions would lose their context.
    chat_history.append((question, answer))
    print(f"Question:\n {question}\n")
    print(f"Answer:\n {answer}\n")

In [ ]:
# Follow-up question; passes the history built up by the cells above.
ask("What's the main programming language used in Chroma?", chat_history)

In [ ]:
# Code-level question about a specific class, using the same chat history.
ask('Show me the public functions of class Client', chat_history)