## 让AI生成点实验数据

In [1]:
import openai, os

# Completion model used by every text-generation call in this notebook.
COMPLETION_MODEL = "text-davinci-003"

# The API key comes from the environment so it never appears in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_data_by_prompt(prompt, temperature=0.5, max_tokens=2048, top_p=1):
    """Send ``prompt`` to the completion model and return the generated text.

    Args:
        prompt: Instruction text passed to the model.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling parameter forwarded to the API.

    Returns:
        The ``text`` of the first choice in the response.

    Warns:
        RuntimeWarning: when the API reports ``finish_reason == "length"``,
            i.e. generation stopped at ``max_tokens`` and the last line of the
            returned text is probably incomplete.
    """
    import warnings

    response = openai.Completion.create(
        engine=COMPLETION_MODEL,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
    )
    choice = response.choices[0]
    # Surface truncation instead of silently returning a half-finished list.
    if getattr(choice, "finish_reason", None) == "length":
        warnings.warn(
            "Completion stopped at max_tokens; the returned text is truncated.",
            RuntimeWarning,
            stacklevel=2,
        )
    return choice.text

# Ask for 3C / digital product titles, one per line, with promotional wording.
prompt = (
    "请你生成50条淘宝网里的商品的标题，每条在30个字左右，"
    "品类是3C数码产品，标题里往往也会有一些促销类的信息，每行一条。"
)
data = generate_data_by_prompt(prompt=prompt)
print(data)



1.新款Apple/苹果iPhoneXS/XR手机套装优惠
2.正品华为Mate20Pro/P30Pro手机最新折扣
3.2020新款iPad Pro/Air/Mini全系列特价
4.惠普/戴尔笔记本电脑超低折扣
5.小米/魅族手机新品特惠
6.荣耀/OPPO新款手机热销
7.苹果AirPods/Beats耳机大促
8.索尼/佳能/尼康相机限时优惠
9.华硕/惠普/戴尔台式电脑特惠
10.苹果/三星智能电视超低价
11.苹果/三星/华为智能手表特惠
12.苹果/三星/华硕笔记本电脑折扣
13.小米/乐视/索尼智能电视特价
14.荣耀/小米/OPPO新品手机特惠
15.小米/乐视/索尼智能电视特价
16.Apple/苹果MacBook Pro/Air笔记本特惠
17.荣耀/小米/OPPO新品手机特价
18.华为/OPPO/小米智能手表最新折扣
19.华硕/惠普/戴尔台式电脑超低价
20.Apple/苹果iPad Pro/Air/Mini全系列特惠
21.索尼/佳能/尼康相机限时特价
22.华为/OPPO/小米智能手表最新折扣
23.苹果AirPods/Beats耳机大促销
24.新款Apple/苹果iPhoneXS/XR手机特惠
25.正品华为Mate20Pro/P30Pro手机热销
26.苹果/三星智能电视超低折扣
27.小米/魅族手机新品特价
28.荣耀/OPPO新款手机大促
29.Apple/苹果MacBook Pro/Air笔记本特价
30.小米/乐视/索尼智能电视超低折扣
31.惠普/戴尔笔记本电脑最新优惠
32.华硕/惠普/戴尔台式电脑超低价
33.苹果/三星/华为智能手表特惠
34.苹果/三星/华硕笔记本电脑特价
35.新款Apple/苹果iPhoneXS/XR手机大促
36.正品华为Mate20Pro/P30Pro手机特惠
37.2020新款iPad Pro/Air/Mini全系列限时优惠
38.小米/乐视/索尼智能电视特惠
39.荣耀/OPPO新款手机超低折扣
40.华为/OPPO/小米智能手表最新特惠
41.苹果AirPods/Beats耳机大促销
42.苹果/三星智能电视特惠
43.苹果/三星/华硕笔记本电脑特价
44.小米/魅族手机新品超低折扣
45.惠普/戴尔笔记本电脑限时优惠
46.索尼/佳能/尼康相机特惠
47.华硕/惠普/戴尔台式电

为了让数据和真实情况更加接近一点，我们可以好好设计一下我们的提示语。比如，我这里就指明了是淘宝的商品，品类是3C，并且标题里要包含一些促销信息。

我们把拿到的返回结果，按行分割，加载到一个DataFrame里面，看看结果会是怎么样的：

In [2]:
import pandas as pd

# One title per line. Blank lines in the model output are dropped so they do
# not become empty rows that break the numbering clean-up that follows.
product_names = [line.strip() for line in data.splitlines() if line.strip()]
df = pd.DataFrame({'product_name': product_names})
df.head()

Unnamed: 0,product_name
0,1.新款Apple/苹果iPhoneXS/XR手机套装优惠
1,2.正品华为Mate20Pro/P30Pro手机最新折扣
2,3.2020新款iPad Pro/Air/Mini全系列特价
3,4.惠普/戴尔笔记本电脑超低折扣
4,5.小米/魅族手机新品特惠


In [3]:
# Remove only the leading "N." list numbering. An anchored regex keeps any
# later '.' inside the title and leaves lines without numbering unchanged,
# where split('.')[1] would truncate the title or raise IndexError.
df.product_name = df.product_name.str.replace(r'^\s*\d+\.\s*', '', regex=True).str.strip()
df.head()

Unnamed: 0,product_name
0,新款Apple/苹果iPhoneXS/XR手机套装优惠
1,正品华为Mate20Pro/P30Pro手机最新折扣
2,2020新款iPad Pro/Air/Mini全系列特价
3,惠普/戴尔笔记本电脑超低折扣
4,小米/魅族手机新品特惠


In [4]:
# Same generation step as for the 3C titles, for women's clothing and bags.
clothes_prompt = """请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是女性的服饰箱包等等，标题里往往也会有一些促销类的信息，每行一条。"""
clothes_data = generate_data_by_prompt(clothes_prompt)
# Drop blank lines so they do not become empty rows in the DataFrame.
clothes_product_names = [line.strip() for line in clothes_data.splitlines() if line.strip()]


In [5]:
clothes_df = pd.DataFrame({'product_name': clothes_product_names})
# Remove only the leading "N." list numbering; later '.' characters in a title
# are kept and lines without numbering pass through unchanged.
clothes_df.product_name = clothes_df.product_name.str.replace(r'^\s*\d+\.\s*', '', regex=True).str.strip()
clothes_df.head()

Unnamed: 0,product_name
0,【新款】真皮时尚单肩斜挎包
1,【精选】简约时尚女士包包
2,【优惠】百搭高贵气质手提包
3,【热卖】时尚百搭贝壳包
4,【特惠】经典款双肩斜挎包


In [6]:
# Stack the clothing titles under the 3C titles and renumber the rows from 0.
# NOTE: re-running this cell appends clothes_df to df a second time.
df = pd.concat([df, clothes_df], axis=0, ignore_index=True)
print(df.product_name)

0      新款Apple/苹果iPhoneXS/XR手机套装优惠
1       正品华为Mate20Pro/P30Pro手机最新折扣
2     2020新款iPad Pro/Air/Mini全系列特价
3                   惠普/戴尔笔记本电脑超低折扣
4                      小米/魅族手机新品特惠
                  ...             
95                   【热销】真皮时尚单肩斜挎包
96                   【新品】百搭高贵气质手提包
97                     【特惠】时尚百搭贝壳包
98                    【限量】经典款双肩斜挎包
99                   【热销】简约活力拼接手提包
Name: product_name, Length: 100, dtype: object


In [7]:
# Rich (HTML) rendering of the combined title table.
display(df)

Unnamed: 0,product_name
0,新款Apple/苹果iPhoneXS/XR手机套装优惠
1,正品华为Mate20Pro/P30Pro手机最新折扣
2,2020新款iPad Pro/Air/Mini全系列特价
3,惠普/戴尔笔记本电脑超低折扣
4,小米/魅族手机新品特惠
...,...
95,【热销】真皮时尚单肩斜挎包
96,【新品】百搭高贵气质手提包
97,【特惠】时尚百搭贝壳包
98,【限量】经典款双肩斜挎包


## 通过Embedding进行语义搜索

In [8]:
from openai.embeddings_utils import get_embeddings
import openai, os, backoff

# Embedding model, and the number of titles sent per embeddings request.
embedding_model = "text-embedding-ada-002"
batch_size = 100

# The API key comes from the environment so it never appears in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def _get_embeddings_batch(batch, engine):
    """Embed one batch of texts, retrying with exponential backoff on rate limits."""
    return get_embeddings(list_of_text=batch, engine=engine)


def get_embeddings_with_backoff(prompts, engine):
    """Embed ``prompts`` in chunks of ``batch_size``, retrying per chunk.

    The retry wraps each individual request rather than the whole loop, so a
    rate-limit error on a later chunk no longer discards and re-requests the
    chunks that already succeeded.

    Args:
        prompts: List of texts to embed.
        engine: Name of the embedding model.

    Returns:
        A list with one embedding per prompt, in input order.
    """
    embeddings = []
    for start in range(0, len(prompts), batch_size):
        embeddings.extend(_get_embeddings_batch(prompts[start:start + batch_size], engine))
    return embeddings

prompts = df.product_name.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

# Embed batch by batch, keeping the results in the same order as the titles.
embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

df["embedding"] = embeddings
# to_parquet does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs("data", exist_ok=True)
df.to_parquet("data/taobao_product_title.parquet", index=False)

In [9]:
from openai.embeddings_utils import get_embedding, cosine_similarity

def search_product(df, query, n=3, pprint=True):
    """Return the ``n`` product names most similar to a free-text query.

    The query is embedded with ``embedding_model`` and compared with every
    vector in ``df.embedding`` using cosine similarity.

    Side effect: the per-row scores are written to ``df["similarity"]``,
    replacing any existing column of that name.

    Args:
        df: DataFrame with ``product_name`` and ``embedding`` columns.
        query: Text to search for.
        n: Number of top matches to return.
        pprint: When true, also print each matching product name.

    Returns:
        A Series holding the top ``n`` product names, best match first.
    """
    query_vector = get_embedding(query, engine=embedding_model)

    def score(candidate):
        return cosine_similarity(candidate, query_vector)

    df["similarity"] = df.embedding.apply(score)
    ranked = df.sort_values("similarity", ascending=False)
    results = ranked["product_name"].head(n)

    if pprint:
        for name in results:
            print(name)
    return results


# Semantic search with a free-text query: prints and keeps the 3 closest titles.
results = search_product(df, "自然淡雅背包", n=3)

【新品】百搭高贵气质手提包
【新品】简约时尚女士包包
【精选】简约时尚女士包包


## 利用Embedding信息进行商品推荐的冷启动

In [11]:
def recommend_product(df, product_name, n=3, pprint=True):
    """Return the ``n`` product names most similar to an existing product.

    The stored embedding of the first row whose ``product_name`` equals the
    argument is compared with every vector in ``df.embedding`` using cosine
    similarity. The seed product is not excluded from the ranking, so it is
    itself part of the returned names.

    Side effect: the per-row scores are written to ``df["similarity"]``,
    replacing any existing column of that name.

    Args:
        df: DataFrame with ``product_name`` and ``embedding`` columns.
        product_name: Exact title of the product to recommend from.
        n: Number of top matches to return.
        pprint: When true, also print each matching product name.

    Returns:
        A Series holding the top ``n`` product names, best match first.

    Raises:
        IndexError: if ``product_name`` does not occur in ``df``.
    """
    matches = df[df['product_name'] == product_name]
    if matches.empty:
        # Same exception type as the bare .iloc[0] raised, with a usable message.
        raise IndexError(f"product_name {product_name!r} not found in df")
    product_embedding = matches.iloc[0].embedding
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

# Cold-start recommendation: look up titles closest to an existing product title.
results = recommend_product(df, "Apple/苹果MacBook Pro/Air笔记本特惠", n=3)

Apple/苹果MacBook Pro/Air笔记本特惠
Apple/苹果MacBook Pro/Air笔记本特价
Apple/苹果iPad Pro/Air/Mini全系列特惠


## 通过FAISS加速搜索过程

In [1]:
# Install FAISS from the conda-forge channel using the %conda magic.
%conda install -c conda-forge faiss

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/xuwenhao/miniconda3/envs/py310

  added / updated specs:
    - faiss


The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    pkgs/main::ca-certificates-2023.01.10~ --> conda-forge::ca-certificates-2022.12.7-h033912b_0 
  certifi            pkgs/main/osx-64::certifi-2022.12.7-p~ --> conda-forge/noarch::certifi-2022.12.7-pyhd8ed1ab_0 
  openssl              pkgs/main::openssl-1.1.1t-hca72f7f_0 --> conda-forge::openssl-1.1.1t-hfd90126_0 



Downloading and Extracting Packages

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [12]:
import faiss
import numpy as np

def load_embeddings_to_faiss(df):
    """Build a flat L2 FAISS index from the ``embedding`` column of ``df``.

    Args:
        df: DataFrame whose ``embedding`` column holds equal-length vectors.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per row, in row order.
    """
    vectors = np.array(df['embedding'].tolist(), dtype='float32')
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)
    return flat_index

# Build the FAISS index once from the embeddings stored in df.
index = load_embeddings_to_faiss(df)

In [13]:
def search_index(index, df, query, k=5):
    """Look up the ``k`` nearest product titles to ``query`` in a FAISS index.

    Args:
        index: FAISS index whose vector positions line up with the rows of ``df``.
        df: DataFrame with a ``product_name`` column.
        query: Text to embed with ``embedding_model`` and search for.
        k: Number of neighbours to request from the index.

    Returns:
        A list with one ``(distances, product_names)`` tuple per query vector
        (a single tuple here). ``distances`` is a float32 array aligned with
        the ``product_names`` list. Fewer than ``k`` entries are returned when
        the index cannot supply ``k`` neighbours.
    """
    query_vector = np.array(get_embedding(query, engine=embedding_model)).reshape(1, -1).astype('float32')
    distances, indexes = index.search(query_vector, k)

    results = []
    for row_distances, row_indexes in zip(distances, indexes):
        # FAISS pads missing neighbours with label -1; df.iloc[-1] would
        # silently return the last row, so padded slots are dropped.
        found = row_indexes >= 0
        product_names = df.iloc[row_indexes[found]]['product_name'].values.tolist()
        results.append((row_distances[found], product_names))
    return results

# Query the FAISS index for the 3 nearest titles to a free-text query.
products = search_index(index, df, "自然淡雅背包", k=3)

In [14]:
# Print every hit next to the distance reported by the index.
for distances, product_names in products:
    for i, distance in enumerate(distances):
        print(product_names[i], distance)


【新品】百搭高贵气质手提包 0.23767906
【新品】简约时尚女士包包 0.23816529
【精选】简约时尚女士包包 0.23835516
