# 基于Embedding的相似搜索

In [None]:
from dotenv import load_dotenv
import os
from openai import OpenAI
import numpy as np
import pandas as pd
import ast
# Configure the OpenAI client. load_dotenv() runs first so that the
# credentials can be read from environment variables below.
load_dotenv()
api_key = os.getenv('XIAOAI_API_KEY')
base_url = os.getenv('XIAOAI_BASE_URL')

client = OpenAI(base_url=base_url, api_key=api_key)
model = 'text-embedding-ada-002'  # embedding model used for every request

# Load the stored embeddings. The 'embedding' column holds each vector as a
# string, so parse every cell back into a Python object with literal_eval.
df = pd.read_csv('datas/embedding_output_caojm.csv')
df['embedding_vec'] = df['embedding'].map(ast.literal_eval)

# Create an embedding for a piece of text
def embedding_text(text,print_flag=True):
    """Request an embedding for ``text`` and return the first result's vector.

    Args:
        text: Input forwarded to ``client.embeddings.create``.
        print_flag: When true, print response metadata (model, object type,
            token usage) plus the length and index of the first embedding.

    Returns:
        ``data[0].embedding`` taken from the API response.
    """
    response = client.embeddings.create(model=model, input=text)

    # Quiet mode: skip the diagnostic dump entirely.
    if not print_flag:
        return response.data[0].embedding

    print("完整响应信息:")
    print(f"- model: {response.model}")  # model ID that served the request
    print(f"- object: {response.object}")  # object type of the response
    print("- usage:")
    token_usage = response.usage
    print(f"  - prompt_tokens: {token_usage.prompt_tokens}")  # tokens in the input
    print(f"  - total_tokens: {token_usage.total_tokens}")  # total tokens billed

    # Details of the first embedding entry
    print("\n第一个embedding数据:")
    first_item = response.data[0]
    print(f"- embedding长度: {len(first_item.embedding)}")
    print(f"- index: {first_item.index}")
    return first_item.embedding

# Compute cosine similarity
def consin_distance(a,b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Despite the function name, the value is a similarity, not a distance:
    it is ``dot(a, b) / (|a| * |b|)``, so vectors pointing the same way
    score 1.0 and opposite vectors score -1.0.

    Args:
        a: First vector (any sequence accepted by ``numpy.asarray``).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to NaN
        with a NumPy runtime warning).
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Zero-length vector: avoid the 0/0 division.
        return 0.0
    return np.dot(a, b) / denominator

# Similarity search
def search_by_word(work_key,n_result=1,print_flag=True):
    """Return the ``n_result`` rows of ``df`` most similar to ``work_key``.

    The query is embedded with ``embedding_text`` and scored against every
    vector in ``df['embedding_vec']`` using ``consin_distance``. The scores
    are written to ``df['similarity']`` on the module-level DataFrame, so
    that column is overwritten on every call.

    Args:
        work_key: Query text to search for.
        n_result: Number of top-scoring rows to return.
        print_flag: When true, print diagnostics (including the embedding
            response details) and each matching text. When false, the
            function is silent.

    Returns:
        A pandas Series taken from the ``combined`` column of the top rows,
        with the literal markers ``'Title:'`` and ``' Content:'`` removed.
        The Series keeps the original row index of ``df``.
    """
    # Forward print_flag so that a quiet search does not dump the response.
    word_embedding = embedding_text(work_key, print_flag=print_flag)
    df['similarity'] = df['embedding_vec'].apply(
        lambda vec: consin_distance(vec, word_embedding)
    )
    res = (
        df.sort_values('similarity', ascending=False)
        .head(n_result)
        # regex=False: the markers are plain text, so match them literally
        # regardless of the pandas version's default for ``regex``.
        .combined.str.replace('Title:', '', regex=False)
        .str.replace(' Content:', '', regex=False)
    )
    if print_flag:
        print(f"数据类型: {type(res)}")
        print(f"\n数据内容示例:")
        print(res)
        print(f"\n索引信息:")
        print(res.index)
        print(50 * '--')
        # Print each full text, since the Series repr above truncates it.
        for r in res:
            print(r)
            print()
    return res


# Demo run when executed as a script
if __name__ == '__main__':
    # Alternative query: search_by_word('Maui Coffee')
    search_by_word('bad', n_result=5)

## 代码说明：`search_by_word('bad',5)` 的运行输出
```text
完整响应信息:
- model: text-embedding-ada-002
- object: list
- usage:
  - prompt_tokens: 1
  - total_tokens: 1

第一个embedding数据:
- embedding长度: 1536
- index: 0
数据类型: <class 'pandas.core.series.Series'>

数据内容示例:
809    Busted;The bag came broken. Product was leakin...
845    Disappointed;The metal cover has severely disf...
303    pretty good but not the best;being raised in g...
603    Just Bad;Watery and unpleasant.  Like Yoohoo m...
601    God Awful;As a dabbler who enjoys spanning the...
Name: combined, dtype: object

索引信息:
Index([809, 845, 303, 603, 601], dtype='int64')
----------------------------------------------------------------------------------------------------
Busted;The bag came broken. Product was leaking out of the box, due to poor packing standards.<br />Hope next items arrive unscathed. Quinoa tasted good.

Disappointed;The metal cover has severely disformed. And most of the cookies inside have been crushed into small pieces. Shopping experience is awful. I'll never buy it online again.

pretty good but not the best;being raised in germany, i always had the great marzipan. when i got this box it actually did taste pretty good but it was somewhat dry, though i think the toasting is the cause of that. it didnt have an intense flavor and was more sweet than almondy, another reason it wasnt my favorite. i love marzipan but i cant afford the real expensive stuff so i buy this stuff and its delicious either way, since i rarely get to try it.<br />shipping was on time and it was fresh. so all in all i rate this four, for the good service, and the good tasting candy.<br />warning- it was smaller than i expected.

Just Bad;Watery and unpleasant.  Like Yoohoo mixed with dirty dish water.  I find it quite odd that Keurig would release a product like this.  I'm sure they can come up with a decent hot chocolate and not this swill.  I had one pod in a sample pack so at least I didn't buy a whole box of them.

God Awful;As a dabbler who enjoys spanning the entire spectrum of taste, I am more than willing to try anything once.  Both as a food aficionado and a lover of bacon, I just had to pick this up.  One taste caused me to throw out my sandwich, and subsequently throw out the entire jar of unused mayonnaise.<br /><br />I would give this less than 1 star, if I could.<br /><br />Steer clear from this unless you're a major tool who has no sensibility past buying bacon-everything.
```