In [46]:
"""
依赖包准备

pandas：用于数据处理/分析，提供了 DataFrame 数据结构，方便进行数据的读取、处理、分析等操作
tiktoken：由 openai 开发，将文本转换成 token，示例：https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
openai
chardet
matplotlib
plotly
scipy
"""
%pip install tiktoken chardet openai pandas matplotlib plotly scikit-learn numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [47]:
"""
数据集准备

亚马逊美食评论数据集：https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews
该数据集包含截至2012年10月用户在亚马逊上留下的共计568,454条美食评论。
这些评论都是用英语撰写的，并且倾向于积极或消极。每个评论都有一个产品ID、用户ID、评分、标题（摘要）和正文。

查看数据集编码方式，便于 pandas 可以以正确的编码 read_csv
"""
!chardetect "data/fine_food_reviews.csv"

data/fine_food_reviews.csv: utf-8 with confidence 0.99


In [48]:
import pandas as pd

input_datapath = "data/fine_food_reviews.csv"

# Read the CSV (first column as the index), keep only the review fields,
# drop rows with missing values, and build "combined" from the stripped
# "Summary" and "Text" fields.
df = (
    pd.read_csv(input_datapath, index_col=0)
    .loc[:, ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
    .dropna()
    .assign(
        combined=lambda frame: (
            "Title: "
            + frame["Summary"].str.strip()
            + "; Content: "
            + frame["Text"].str.strip()
        )
    )
)
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [49]:
df["combined"]

Id
1         Title: Good Quality Dog Food; Content: I have ...
2         Title: Not as Advertised; Content: Product arr...
3         Title: "Delight" says it all; Content: This is...
4         Title: Cough Medicine; Content: If you are loo...
5         Title: Great taffy; Content: Great taffy at a ...
                                ...                        
568450    Title: Will not do without; Content: Great for...
568451    Title: disappointed; Content: I'm disappointed...
568452    Title: Perfect for our maltipoo; Content: Thes...
568453    Title: Favorite Training and reward treat; Con...
568454    Title: Great Honey; Content: I am very satisfi...
Name: combined, Length: 568427, dtype: object

In [50]:
import tiktoken

# Officially recommended embedding model (per the original notes: max input of
# 8191 tokens, 1536-dimensional output vectors).
embedding_model = "text-embedding-ada-002"
# cl100k_base is the tokenizer that corresponds to text-embedding-ada-002.
encoding_name = "cl100k_base"

# Upper bound on the number of tokens per input.
max_tokens = 8000
# Number of reviews to keep in the end.
top_n = 1000

# Tokenizer object for the chosen encoding.
encoding = tiktoken.get_encoding(encoding_name)

# Take the 2 * top_n most recent reviews by "Time", then discard the "Time" column.
df = df.sort_values("Time").tail(top_n * 2).drop(columns="Time")

# Token count of each "combined" text, stored in the "n_tokens" column.
df["n_tokens"] = df["combined"].map(lambda text: len(encoding.encode(text)))

# Keep only rows within the token limit, then the last top_n of those.
df = df.loc[df["n_tokens"] <= max_tokens].tail(top_n)

len(df)

1000

In [51]:
# Import get_embedding (and cosine_similarity) from openai.embeddings_utils.
# Per the original notes, get_embedding returns the embedding vector produced
# by the model, i.e. the model's internal representation of the input.
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity

output_datapath = "data/fine_food_reviews_with_embeddings_1k.csv"

# Generating the embeddings takes a few minutes.
# Reminder: this step is optional; the embedding file
# fine_food_reviews_with_embeddings_1k shipped with the project can be reused.
df["embedding"] = df["combined"].map(
    lambda text: get_embedding(text, engine=embedding_model)
)

df.to_csv(output_datapath)

ModuleNotFoundError: No module named 'scipy.spatial'