## 让AI生成点实验数据

In [1]:
import openai, os

# Read the API key from the environment so it is never hard-coded in the notebook.
# os.environ.get returns None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Model name passed as `engine` to the completion call in generate_data_by_prompt.
# NOTE(review): this is a legacy completion model name — confirm it is still served.
COMPLETION_MODEL = "text-davinci-003"

def generate_data_by_prompt(prompt):
    """Ask the completion model to answer ``prompt`` and return the raw text.

    Args:
        prompt: Instruction text sent to the model.

    Returns:
        The ``text`` of the first choice in the completion response.

    The request caps the answer at 2048 tokens, so a long answer may come
    back cut off mid-line.
    """
    request_options = {
        "engine": COMPLETION_MODEL,
        "prompt": prompt,
        "temperature": 0.5,
        "max_tokens": 2048,
        "top_p": 1,
    }
    completion = openai.Completion.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.text

# Prompt (in Chinese) asking for 50 Taobao-style product titles in the 3C/digital
# category, about 30 characters each, with promotional wording, one title per line.
prompt = """请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是3C数码产品，标题里往往也会有一些促销类的信息，每行一条。"""
# Raw completion text; later cells split it into one title per line.
data = generate_data_by_prompt(prompt)
print(data)



1.【新品上市】苹果AirPods Pro无线蓝牙耳机
2.【限时特惠】华为Mate30 Pro 5G全网通版
3.【热销爆款】荣耀V30 Pro 5G双模双待手机
4.【热卖抢购】三星Galaxy S20 Ultra 5G手机
5.【爆款抢购】小米10 Pro 5G双模双待手机
6.【新品上市】苹果iPad Pro 11英寸平板电脑
7.【限时特惠】华为MateBook X Pro笔记本电脑
8.【热销爆款】荣耀MagicBook Pro笔记本电脑
9.【热卖抢购】华硕ZenBook Pro Duo笔记本电脑
10.【爆款抢购】联想Yoga Pro 7笔记本电脑
11.【新品上市】苹果Apple Watch Series 5智能手表
12.【限时特惠】华为GT2 Pro智能手表
13.【热销爆款】三星Galaxy Watch Active2智能手表
14.【热卖抢购】小米Amazfit GTR智能手表
15.【爆款抢购】荣耀Watch Magic智能手表
16.【新品上市】苹果AirPods 2无线蓝牙耳机
17.【限时特惠】华为FreeBuds 3无线蓝牙耳机
18.【热销爆款】三星Galaxy Buds+无线蓝牙耳机
19.【热卖抢购】小米Redmi AirDots无线蓝牙耳机
20.【爆款抢购】荣耀FlyPods 3无线蓝牙耳机
21.【新品上市】苹果iPhone 11 Pro Max全网通手机
22.【限时特惠】华为P30 Pro 5G双模双待手机
23.【热销爆款】三星Galaxy Note 10 Plus 5G手机
24.【热卖抢购】小米CC9 Pro 5G双模双待手机
25.【爆款抢购】荣耀V20 5G双模双待手机
26.【新品上市】苹果iPhone 11 Pro全网通手机
27.【限时特惠】华为Mate 20 Pro 5G双模双待手机
28.【热销爆款】三星Galaxy S10 Plus 5G手机
29.【热卖抢购】小米CC9 5G双模双待手机
30.【爆款抢购】荣耀20 Pro 5G双模双待手机
31.【新品上市】苹果iPad Pro 12.9英寸平板电脑
32.【限时特惠】华为MateBook 13笔记本电脑
33.【热销爆款】荣耀MagicBook 14笔记本电脑
34.【热卖抢购】华硕ZenBook 13笔记本电脑
35.【爆款抢购】联

为了让数据和真实情况更加接近一点，我们可以好好设计一下我们的提示语。比如，我这里就指明了是淘宝的商品，品类是3C，并且标题里要包含一些促销信息。

我们把拿到的返回结果，按行分割，加载到一个DataFrame里面，看看结果会是怎么样的：

In [2]:
import pandas as pd

# Trim surrounding whitespace, then split the completion into one string per line.
product_names = data.strip().split('\n')
# One-column DataFrame; each row is still prefixed with the model's "N." numbering.
df = pd.DataFrame({'product_name': product_names})
df.head()

Unnamed: 0,product_name
0,1.【新品上市】苹果AirPods Pro无线蓝牙耳机
1,2.【限时特惠】华为Mate30 Pro 5G全网通版
2,3.【热销爆款】荣耀V30 Pro 5G双模双待手机
3,4.【热卖抢购】三星Galaxy S20 Ultra 5G手机
4,5.【爆款抢购】小米10 Pro 5G双模双待手机


In [3]:
# Remove only the leading "N." list numbering the model puts in front of each title.
# Splitting on every '.' and taking element [1] would cut off titles that contain a
# dot themselves (e.g. "...iPad Pro 12.9英寸平板电脑" -> "...iPad Pro 12") and would
# raise IndexError on lines without a dot, including when this cell is re-run.
df["product_name"] = (
    df["product_name"]
    .str.replace(r"^\s*\d+\s*\.\s*", "", regex=True)  # anchored: prefix only
    .str.strip()
)
df.head()

Unnamed: 0,product_name
0,【新品上市】苹果AirPods Pro无线蓝牙耳机
1,【限时特惠】华为Mate30 Pro 5G全网通版
2,【热销爆款】荣耀V30 Pro 5G双模双待手机
3,【热卖抢购】三星Galaxy S20 Ultra 5G手机
4,【爆款抢购】小米10 Pro 5G双模双待手机


In [4]:
# Same request as the 3C prompt, but for the women's clothing / bags category.
clothes_prompt = """请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是女性的服饰箱包等等，标题里往往也会有一些促销类的信息，每行一条。"""
clothes_data = generate_data_by_prompt(clothes_prompt)
# One string per generated line; numbering prefixes are still present here.
clothes_product_names = clothes_data.strip().split('\n')


In [5]:
clothes_df = pd.DataFrame({'product_name': clothes_product_names})
# Remove only the leading "N." list numbering. Splitting on every '.' and taking
# element [1] would truncate any title that itself contains a dot and would raise
# IndexError for a line that has no dot at all.
clothes_df["product_name"] = (
    clothes_df["product_name"]
    .str.replace(r"^\s*\d+\s*\.\s*", "", regex=True)  # anchored: prefix only
    .str.strip()
)
clothes_df.head()

Unnamed: 0,product_name
0,【新款】时尚拼接真皮女包
1,潮流撞色腰带女士手提包
2,【特价】经典款小方包女士手抓包
3,【热销】简约百搭链条挎包
4,【精选】百搭双肩包百搭挎包


In [6]:
# Stack the clothing titles under the 3C titles and renumber the rows 0..n-1
# in the same step (ignore_index discards both original indexes).
df = pd.concat([df, clothes_df], axis=0, ignore_index=True)
print(df.product_name)

0         【新品上市】苹果AirPods Pro无线蓝牙耳机
1         【限时特惠】华为Mate30 Pro 5G全网通版
2          【热销爆款】荣耀V30 Pro 5G双模双待手机
3     【热卖抢购】三星Galaxy S20 Ultra 5G手机
4           【爆款抢购】小米10 Pro 5G双模双待手机
                  ...              
95                  【促销】简约百搭真皮女士手抓包
96                     【特价】百搭时尚拼接挎包
97                     【新款】潮流款百搭斜挎包
98                     【热销】经典款拼接双肩包
99                    【精选】时尚真皮女士手提包
Name: product_name, Length: 100, dtype: object


In [7]:
# Rich-render the combined DataFrame (3C titles followed by clothing titles).
display(df)

Unnamed: 0,product_name
0,【新品上市】苹果AirPods Pro无线蓝牙耳机
1,【限时特惠】华为Mate30 Pro 5G全网通版
2,【热销爆款】荣耀V30 Pro 5G双模双待手机
3,【热卖抢购】三星Galaxy S20 Ultra 5G手机
4,【爆款抢购】小米10 Pro 5G双模双待手机
...,...
95,【促销】简约百搭真皮女士手抓包
96,【特价】百搭时尚拼接挎包
97,【新款】潮流款百搭斜挎包
98,【热销】经典款拼接双肩包


## 通过Embedding进行语义搜索

In [8]:
from openai.embeddings_utils import get_embeddings
import openai, os, backoff

# Read the API key from the environment; None if OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Model name passed as `engine` to every embedding request in this notebook.
embedding_model = "text-embedding-ada-002"

# Number of titles sent per embeddings request (slice width used when batching).
batch_size = 100

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def _get_embeddings_batch_with_backoff(batch, engine):
    """Embed a single batch, retrying with exponential backoff on rate limits."""
    return get_embeddings(list_of_text=batch, engine=engine)


def get_embeddings_with_backoff(prompts, engine):
    """Embed ``prompts`` in chunks of ``batch_size`` and return all vectors.

    Args:
        prompts: List of texts to embed.
        engine: Embedding model name forwarded to ``get_embeddings``.

    Returns:
        A list with one embedding per prompt, in the same order as ``prompts``.

    The retry is applied per batch rather than around the whole loop, so a
    rate-limit error on a later batch no longer throws away (and re-requests)
    the batches that already succeeded.
    """
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        embeddings += _get_embeddings_batch_with_backoff(batch, engine)
    return embeddings

# Embed every product title, one request per batch_size-sized chunk.
prompts = df.product_name.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

# One embedding per row, aligned with the order of product_name.
df["embedding"] = embeddings
# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs("data", exist_ok=True)
df.to_parquet("data/taobao_product_title.parquet", index=False)

In [9]:
from openai.embeddings_utils import get_embedding, cosine_similarity

def search_product(df, query, n=3, pprint=True):
    """Return the ``n`` product titles whose embeddings are closest to ``query``.

    Args:
        df: DataFrame with ``product_name`` and ``embedding`` columns.
        query: Free-text search string; it is embedded with ``embedding_model``.
        n: Number of titles to return.
        pprint: When true, also print each returned title on its own line.

    Returns:
        A Series of the top ``n`` ``product_name`` values, best match first.

    Side effect: writes/overwrites a ``similarity`` column on ``df``.
    """
    query_embedding = get_embedding(query, engine=embedding_model)

    def _score(candidate_embedding):
        # Cosine similarity between one stored title vector and the query vector.
        return cosine_similarity(candidate_embedding, query_embedding)

    df["similarity"] = df.embedding.apply(_score)

    ranked = df.sort_values("similarity", ascending=False)
    results = ranked.head(n).product_name
    if pprint:
        for title in results:
            print(title)
    return results


# Semantic search demo: the query text need not literally appear in any title.
results = search_product(df, "自然淡雅背包", n=3)

【新品】潮流款简约挎包
潮流撞色腰带女士手提包
【新款】潮流款真皮双肩包


## 利用Embedding信息进行商品推荐的冷启动

In [11]:
def recommend_product(df, product_name, n=3, pprint=True):
    """Return the ``n`` titles most similar to an existing product's title.

    Args:
        df: DataFrame with ``product_name`` and ``embedding`` columns.
        product_name: Exact title of a product already present in ``df``; the
            first matching row supplies the reference embedding.
        n: Number of titles to return.
        pprint: When true, also print each returned title on its own line.

    Returns:
        A Series of the top ``n`` ``product_name`` values, best match first.
        The reference product itself is part of the ranking.

    Side effect: writes/overwrites a ``similarity`` column on ``df``.
    """
    matching_rows = df[df['product_name'] == product_name]
    reference_embedding = matching_rows.iloc[0].embedding

    def _score(candidate_embedding):
        # Cosine similarity between one stored title vector and the reference.
        return cosine_similarity(candidate_embedding, reference_embedding)

    df["similarity"] = df.embedding.apply(_score)

    ranked = df.sort_values("similarity", ascending=False)
    results = ranked.head(n).product_name
    if pprint:
        for title in results:
            print(title)
    return results

# Cold-start recommendation demo: rank all titles against one known product title.
results = recommend_product(df, "【限时特惠】华为Mate30 Pro 5G全网通版", n=3)

【限时特惠】华为Mate30 Pro 5G全网通版
【限时特惠】华为Mate 20 Pro 5G双模双待手机
【限时特惠】华为P30 Pro 5G双模双待手机


## 通过FAISS加速搜索过程

In [1]:
# Install the faiss package from the conda-forge channel into the kernel's conda
# environment; the kernel may need a restart before `import faiss` works.
%conda install -c conda-forge faiss

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/xuwenhao/miniconda3/envs/py310

  added / updated specs:
    - faiss


The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    pkgs/main::ca-certificates-2023.01.10~ --> conda-forge::ca-certificates-2022.12.7-h033912b_0 
  certifi            pkgs/main/osx-64::certifi-2022.12.7-p~ --> conda-forge/noarch::certifi-2022.12.7-pyhd8ed1ab_0 
  openssl              pkgs/main::openssl-1.1.1t-hca72f7f_0 --> conda-forge::openssl-1.1.1t-hfd90126_0 



Downloading and Extracting Packages

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [16]:
import faiss
import numpy as np

# Stack the per-row embedding lists into one 2-D float32 matrix (rows = products).
embeddings_test = np.array(df['embedding'].tolist()).astype('float32')
# Vector dimensionality, i.e. the number of columns of the matrix.
print(embeddings_test.shape[1])
# Flat L2 index built with that dimensionality; all product vectors are added to it.
# NOTE(review): earlier cells rank by cosine similarity while this index ranks by
# L2 distance; the two orderings agree only for unit-length vectors — confirm that
# holds for this embedding model.
index_test = faiss.IndexFlatL2(embeddings_test.shape[1])
index_test.add(embeddings_test)
print(index_test)

1536
<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7f4541ae1720> >


In [19]:
def search_index(index, df, query, k=5):
    """Look up the ``k`` nearest product titles to ``query`` in a FAISS index.

    Args:
        index: FAISS index whose row order matches the row order of ``df``.
        df: DataFrame with a ``product_name`` column.
        query: Free-text search string; it is embedded with ``embedding_model``.
        k: Maximum number of neighbours to return.

    Returns:
        A list with one ``(distances, product_names)`` tuple per query vector
        (a single tuple here, since one query is embedded). ``distances`` is a
        NumPy array and ``product_names`` a list of the same length.
    """
    query_vector = np.array(get_embedding(query, engine=embedding_model)).reshape(1, -1).astype('float32')
    distances, indexes = index.search(query_vector, k)

    results = []
    for row_distances, row_indexes in zip(distances, indexes):
        # FAISS pads the labels with -1 when fewer than k neighbours exist.
        # Drop those slots, otherwise iloc[-1] would silently return the
        # last row of df as a bogus hit.
        found = row_indexes >= 0
        product_names = df.iloc[row_indexes[found]]['product_name'].values.tolist()
        results.append((row_distances[found], product_names))
    return results

# Query the FAISS index built above. It is bound to `index_test`; no variable
# named `index` is defined in this notebook, so using that name raises
# NameError on a fresh kernel.
products = search_index(index_test, df, "自然淡雅背包", k=3)

In [21]:
# Print every hit with its rank, title and distance to the query.
for distances, product_names in products:
    for i, distance in enumerate(distances):
        print(f"number is {i}", product_names[i], distance)


number is 0 【新品】潮流款简约挎包 0.22879115
number is 1 潮流撞色腰带女士手提包 0.2348077
number is 2 【新款】潮流款真皮双肩包 0.2386006
