### Embedding（向量化）

In [2]:
import os
os.environ["OPENAI_API_KEY"] = "sk-xxx"
os.environ["OPENAI_API_BASE"] = "https://api.chatanywhere.tech/v1"
os.environ["OPENAI_API_MODEL"] = "gpt-4-turbo"

In [3]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()

#### 文档向量化

In [4]:
embeddings = embeddings_model.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!"
    ]
)
len(embeddings), len(embeddings[0])

(5, 1536)

#### 查询向量化

In [5]:
embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
embedded_query[:5]

[0.005384807424727803,
 -0.0005522561790177143,
 0.03896066510130952,
 -0.002939867294003907,
 -0.008987877434176596]

#### 向量化缓存

与向量存储一起使用

In [6]:
! pip install --upgrade --quiet  langchain-openai faiss-cpu

使用本地文件系统存储嵌入并使用 FAISS 向量存储进行检索的示例

In [7]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

underlying_embeddings = OpenAIEmbeddings()

store = LocalFileStore("./cache/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)

In [8]:
list(store.yield_keys())

[]

加载文档并嵌入

In [9]:
raw_documents = TextLoader("mit.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 21186, which is longer than the specified 500
Created a chunk of size 14520, which is longer than the specified 500


In [10]:
%%time
db = FAISS.from_documents(documents, cached_embedder)

CPU times: user 74.7 ms, sys: 9.01 ms, total: 83.7 ms
Wall time: 1.59 s


In [11]:
%%time
db2 = FAISS.from_documents(documents, cached_embedder)

CPU times: user 2.52 ms, sys: 944 µs, total: 3.46 ms
Wall time: 2.74 ms


查看嵌入

In [12]:
list(store.yield_keys())[:5]

['text-embedding-ada-0020799b664-19cd-5b1a-a672-fc51aaa2bcde',
 'text-embedding-ada-00247281925-89d9-5865-a7bd-401dd4a0b9bf',
 'text-embedding-ada-0027d63be3c-5f51-56c3-809e-add6364c3275',
 'text-embedding-ada-002f0e6775e-7569-528c-b90f-c15a33843d30',
 'text-embedding-ada-0025cca1aeb-95c6-5838-9b36-cc1fae584544']