In [1]:
# load environment variables from .env file (requires `python-dotenv`)
from dotenv import load_dotenv
import os

load_dotenv()

True

In [2]:
from langchain_community.embeddings import DashScopeEmbeddings
import os

embedding_model = DashScopeEmbeddings(
    model="text-embedding-v4",
    )

text = "This is a test query"
query_result = embedding_model.embed_query(text)
print(query_result)
print(len(query_result))


[-0.00958594772964716, 0.016300806775689125, 0.03246745094656944, -0.03783397004008293, 0.023974930867552757, -0.02443108521401882, 0.05887073278427124, 0.019145062193274498, -0.06509589403867722, 0.13137242197990417, 0.031581975519657135, -0.0040986803360283375, 0.001002868521027267, 0.020942848175764084, -0.02125142142176628, -0.028818216174840927, 0.002307603834196925, -0.05817308649420738, 0.003612339263781905, 0.03163563832640648, 0.013107727281749249, 0.010659252293407917, -0.023371197283267975, 0.034748222678899765, -0.01583123579621315, 0.055060502141714096, 0.020916014909744263, -0.0391756035387516, 0.032118625938892365, -0.016891123726963997, -0.04395180568099022, 0.006265413016080856, -0.018299836665391922, -0.019131647422909737, 0.016904540359973907, 0.03244061768054962, 0.004880179651081562, 0.07507762312889099, -0.0055409325286746025, 0.01673012785613537, -0.018004676327109337, 0.02830839715898037, -0.05090145021677017, 0.020795268937945366, -0.03888044133782387, 0.004756

In [5]:
## 计算相似度

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# 定义文本
text1 = "我喜欢吃苹果"
text2 = "我最爱吃的水果是苹果"
text3 = "今天天气不错"

# 获取文本向量
vector1 = np.array(embedding_model.embed_query(text1)).reshape(1, -1)
vector2 = np.array(embedding_model.embed_query(text2)).reshape(1, -1)
vector3 = np.array(embedding_model.embed_query(text3)).reshape(1, -1)

# 计算余弦相似度
similarity12 = cosine_similarity(vector1, vector2)[0][0]
similarity23 = cosine_similarity(vector2, vector3)[0][0]

print(similarity12)
print(similarity23)



0.8187081789976591
0.29377201534315184


## 向量数据持久化

In [6]:
from redis import Redis

r = Redis.from_url("redis://localhost:6379")
r.ft("fruit").dropindex(delete_documents=True)

b'OK'

In [9]:
from langchain_redis import RedisConfig, RedisVectorStore

config = RedisConfig(
    index_name="fruit",
    redis_url="redis://localhost:6379",
)

vector_store = RedisVectorStore(embedding_model, config=config)

vector_store.add_texts(["香蕉很长", "苹果很甜", "西瓜又大又圆"])

15:23:03 redisvl.index.index INFO   Index already exists, not overwriting.


['fruit::01JZHW8MF0FGM06NNP63R4G57N',
 'fruit::01JZHW8MF09J78PDT28M8XFC49',
 'fruit::01JZHW8MF0Z5NXTPY50N4Z0CTT']

In [10]:
scored_results = vector_store.similarity_search_with_score("又圆又大的水果是什么", k=3)
for doc, score in scored_results:
    print(f"{doc.page_content} - {score}")

西瓜又大又圆 - 0.266252994537
苹果很甜 - 0.459183335304
香蕉很长 - 0.478137373924


In [11]:
retriver = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

retriver.invoke("长长的水果是什么？")

[Document(metadata={}, page_content='香蕉很长'),
 Document(metadata={}, page_content='西瓜又大又圆'),
 Document(metadata={}, page_content='苹果很甜')]

In [13]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("human", "{question}")
])

# prompt.invoke 返回的是 PromptValue，而 retriver.invoke 需要传入 str，这里做个转换
def format_prompt_value(prompt_value):
    return prompt_value.to_string()

chain = prompt | format_prompt_value | retriver

documents = chain.invoke({"question": "又长又甜的水果是什么？"})
for document in documents:
    print(document.page_content)

苹果很甜
香蕉很长
西瓜又大又圆


In [None]:
""