In [3]:
pip install tiktoken

Looking in indexes: http://mirrors.aliyun.com/pypi/simple/
Collecting tiktoken
  Downloading http://mirrors.aliyun.com/pypi/packages/5c/76/03b8286cd264f9f5550229fe21f72abc89d431a9a3c887fc365763acc5a4/tiktoken-0.3.0-cp39-cp39-macosx_10_9_x86_64.whl (735 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m735.4/735.4 kB[0m [31m256.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting blobfile>=2
  Downloading http://mirrors.aliyun.com/pypi/packages/c1/35/6b92aa0d86f26f0a8ab6959dd29ac4c7e96d5c1d948d4347bba12e07695a/blobfile-2.0.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m214.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pycryptodomex~=3.8
  Downloading http://mirrors.aliyun.com/pypi/packages/78/db/ec162a8fa1c7c8e03488616a01de59bb752b985f1c507ffb127b40b9d456/pycryptodomex-3.17-cp35-abi3-macosx_10_9_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1

## 认识数据集

In [3]:
# imports
import os

import numpy as np
import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding

In [4]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8191  # the maximum for text-embedding-ada-002 is 8191

In [5]:
# Path to the reviews CSV that is loaded with pd.read_csv in the next cell.
input_file = './data/fine_food_reviews_1k.csv'

In [6]:
# Load the reviews, keep only the columns used below, order them by time,
# and discard rows with any missing value.
df = (
    pd.read_csv(input_file, index_col=0)[
        ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]
    ]
    .sort_values("Time")
    .dropna()
)
print(df.shape)

# Reviews sharing the same summary and text are duplicates; keep the latest one.
df = df.drop_duplicates(subset=["Summary", "Text"], keep="last")
print(df.shape)

# Single text field (title + body) that will be sent to the embedding model.
summary_text = df["Summary"].str.strip()
review_text = df["Text"].str.strip()
df["Combined"] = "Title: " + summary_text + "; Content: " + review_text
print(df.shape)

(1000, 6)
(762, 6)


In [9]:
# Preview the first 10 rows, including the new Combined column.
df.head(10)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,Combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
295,1351123200,B000LKTTTW,A14MQ40CCU8B13,5,Best tomato soup,I have a hard time finding packaged food of an...,Title: Best tomato soup; Content: I have a har...
292,1351123200,B002JA06Z8,A3ESIUM1JTR7KK,5,These fresh berries are truly MIRACULOUS!!!,I have ordered from Ethans on three separate o...,Title: These fresh berries are truly MIRACULOU...
291,1351123200,B002HQNCBO,A1UW65ZMZ3UWD3,5,Baconnaise,If you are a fan of bacon you're going to like...,Title: Baconnaise; Content: If you are a fan o...
289,1351123200,B0048GRNZM,AXG287OY16WWL,1,Cute,"For some reason I thought that you got three ""...",Title: Cute; Content: For some reason I though...
288,1351123200,B006IMC3LS,A332LFCZ6ZPJPW,4,Great Monster product,Kudos to Monster for making this coffee flavor...,Title: Great Monster product; Content: Kudos t...
286,1351123200,B0002YGSJQ,AAUVIV5KLSC8A,5,spicy,It is a too spicy grocery in japan.<br /><br /...,Title: spicy; Content: It is a too spicy groce...
284,1351123200,B005CT8R90,A19SLJ981ULZ03,5,Excellent but Price?,I first heard about this on America's Test Kit...,Title: Excellent but Price?; Content: I first ...
283,1351123200,B0000ESTGX,A214U5SCYVJ7G4,4,Smooth tasting mousse,This was very good and it was smoothe and good...,Title: Smooth tasting mousse; Content: This wa...
282,1351123200,B007JT7ARQ,A17K6WB5PGT3UV,5,"Great Shampoo, Use it all the time now",This is one of those products I would have nev...,"Title: Great Shampoo, Use it all the time now;..."


In [11]:
# Tokenizer for the configured encoding, used to measure each review's length.
encoding = tiktoken.get_encoding(embedding_encoding)
top_n = 100

# Number of tokens in every combined review text.
df["n_tokens"] = df["Combined"].map(lambda text: len(encoding.encode(text)))

# omit reviews that are too long to embed, then keep only the last `top_n` rows
fits_model_limit = df["n_tokens"] <= max_tokens
df = df[fits_model_limit].tail(top_n)
df.shape

(100, 8)

In [12]:
# Preview the first 5 of the retained rows, now with the n_tokens column.
df.head(5)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,Combined,n_tokens
650,1351209600,B0051O6P36,A1VC6419THHIET,5,Good for all cats.,I just got these treats last week and they're ...,Title: Good for all cats.; Content: I just got...,81
651,1351209600,B001EO5RSQ,A33W5JAFGHYRQZ,5,Love this Cereal!,There is nothing else like this on the market....,Title: Love this Cereal!; Content: There is no...,55
652,1351209600,B0045H264C,A3IYSIAKYOMKTO,5,Wild Honey,This really is unfiltered honey made from wild...,Title: Wild Honey; Content: This really is unf...,107
679,1351209600,B000UBD88A,AWRFQYLG7LQKJ,2,Not very strong,Not as strong as the regular dark coffee. Dis...,Title: Not very strong; Content: Not as strong...,45
654,1351209600,B001XWRMAU,A1KWVBDHBG50VZ,5,Outstanding product!.....,Great flavor.....lotsa &#34;heat&#34;....I use...,Title: Outstanding product!.....; Content: Gre...,43


In [13]:
# Read the API key from the environment instead of hardcoding it: a key stored
# in a notebook is exposed to everyone who can read the file or its history.
# NOTE: the key that was previously committed here must be treated as leaked
# and revoked. A missing variable raises KeyError rather than failing later.
openai.api_key = os.environ["OPENAI_API_KEY"]

# One embedding request per review; the returned vector is stored in a new column.
df['embedding'] = df.Combined.apply(lambda x: get_embedding(x, engine=embedding_model))

In [19]:
# Inspect one embedding (second row): its length, then the raw vector.
sample_embedding = df.iloc[1]["embedding"]
print(len(sample_embedding))
print(sample_embedding)

1536
[-0.012850540690124035, -0.00849321112036705, -0.005164966452866793, 0.0071384659968316555, -0.01005637925118208, 0.004018643405288458, 0.01457653846591711, -0.026352401822805405, -0.012629092670977116, -0.0014548858162015676, 0.023382384330034256, 0.009203149937093258, -0.015306017361581326, -0.03061203472316265, 0.0003667745040729642, 0.025688055902719498, 0.031002825126051903, -0.02800675481557846, 0.013560479506850243, 0.001993038924410939, -0.006281980313360691, 0.03717733919620514, -0.0031002825126051903, -0.007346888072788715, -0.004630884155631065, -0.0017895014025270939, 0.004692759830504656, -0.02218395471572876, 0.009567889384925365, 0.0030025846790522337, 0.041111309081315994, 0.014172720722854137, -0.021845268085598946, -0.0032956786453723907, -0.025531738996505737, -0.005041216034442186, -0.0029521072283387184, -0.0008605563780292869, 0.024685023352503777, -0.024190019816160202, 0.0166868157684803, 0.023069750517606735, 0.0114046111702919, 0.012427182868123055, -0.03

## embedding

- dimension
- norm

In [20]:
# Per-row dimension and Euclidean (L2) norm of the stored embedding vectors.
embedding_column = df["embedding"]
df["embed_len"] = embedding_column.map(len)
df["embed_norm"] = embedding_column.map(np.linalg.norm)

In [21]:
# Show the frame with the new embed_len / embed_norm columns.
df

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,Combined,n_tokens,embedding,embed_len,embed_norm
650,1351209600,B0051O6P36,A1VC6419THHIET,5,Good for all cats.,I just got these treats last week and they're ...,Title: Good for all cats.; Content: I just got...,81,"[-0.020426278933882713, -0.022335030138492584,...",1536,1.0
651,1351209600,B001EO5RSQ,A33W5JAFGHYRQZ,5,Love this Cereal!,There is nothing else like this on the market....,Title: Love this Cereal!; Content: There is no...,55,"[-0.012850540690124035, -0.00849321112036705, ...",1536,1.0
652,1351209600,B0045H264C,A3IYSIAKYOMKTO,5,Wild Honey,This really is unfiltered honey made from wild...,Title: Wild Honey; Content: This really is unf...,107,"[0.0019596752244979143, -0.010193675756454468,...",1536,1.0
679,1351209600,B000UBD88A,AWRFQYLG7LQKJ,2,Not very strong,Not as strong as the regular dark coffee. Dis...,Title: Not very strong; Content: Not as strong...,45,"[-0.0016112793236970901, -0.026598896831274033...",1536,1.0
654,1351209600,B001XWRMAU,A1KWVBDHBG50VZ,5,Outstanding product!.....,Great flavor.....lotsa &#34;heat&#34;....I use...,Title: Outstanding product!.....; Content: Gre...,43,"[-0.00573874544352293, 0.007031316868960857, 0...",1536,1.0
...,...,...,...,...,...,...,...,...,...,...,...
623,1351209600,B0000CFXYA,A3GS4GWPIBV0NT,1,Strange inflammation response,Truthfully wasn't crazy about the taste of the...,Title: Strange inflammation response; Content:...,110,"[0.00011091353371739388, -0.00466986745595932,...",1536,1.0
624,1351209600,B0001BH5YM,A1BZ3HMAKK0NC,5,My favorite and only MUSTARD,You've just got to experience this mustard... ...,Title: My favorite and only MUSTARD; Content:...,80,"[-0.020842211320996284, -0.013073143549263477,...",1536,1.0
625,1351209600,B0009ET7TC,A2FSDQY5AI6TNX,5,My furbabies LOVE these!,Shake the container and they come running. Eve...,Title: My furbabies LOVE these!; Content: Shak...,47,"[-0.009749102406203747, -0.0068712360225617886...",1536,1.0
619,1351209600,B007PA32L2,A15FF2P7RPKH6G,5,got this for the daughter,all i have heard since she got a kuerig is why...,Title: got this for the daughter; Content: all...,50,"[-0.00521062919870019, 0.0009606690146028996, ...",1536,1.0


## Semantic search based on text embeddings

In [29]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# search through the reviews for a specific product
def search_reviews(df, query, n=3, pprint=True):
    """Rank the reviews in ``df`` by embedding similarity to ``query``.

    Args:
        df: DataFrame with an ``embedding`` column (one vector per row) and a
            ``Combined`` text column. A ``similarity`` column is added to it,
            or overwritten, in place.
        query: Free-text search string; it is embedded with ``embedding_model``.
        n: Number of best-matching reviews to return.
        pprint: When true, print each returned review under a ``topK:`` header.

    Returns:
        Series holding the ``n`` most similar ``Combined`` texts, best match
        first, with the leading "Title: " removed and the "; Content:"
        separator rewritten as ": ".
    """
    query_embed = get_embedding(query, engine=embedding_model)
    # Side effect kept on purpose: the scores stay available on the caller's frame.
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, query_embed))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        # Literal, first-occurrence-only replacements: the outcome no longer
        # depends on the pandas-version-specific `regex` default, and the same
        # substrings appearing later in the review body are left untouched.
        .Combined.str.replace("Title: ", "", n=1, regex=False)
        .str.replace("; Content:", ": ", n=1, regex=False)
    )
    if pprint:
        # Separate rank counter so the `n` argument is not reused for display.
        for rank, review in enumerate(results, start=1):
            print('top{}:'.format(rank))
            print(review)
            print('==================================')
    return results


# Example query: print the three reviews most similar to "good coffee".
results = search_reviews(df, "good coffee", n=3)

top1:
super coffee:  Great coffee and so easy to brew.  This coffee has great aroma and is good to the last drop.  I actually like all the brands.  This is the way coffee should taste!!
top2:
Delicious!!!!:  A coffee treat. Now that my husband and I drink this coffee, there is no going back to the plain stuff ;).
top3:
Full- bodied without a bitter after-taste:  This is my everyday coffee choice...a good all around crowd pleaser.  Green mountain Sumatra would be my back-up-for-a-change-of-pace second choice...nice to have both on hand!


In [30]:
# Sentiment check: in the recorded output the top hit for "bad rice" is a
# glowing rice review, i.e. the match is on topic rather than on sentiment.
results = search_reviews(df, "bad rice", n=3)

top1:
Unbelievable! the best rice I have ever had.:  I hardly ever write reviews, this rice is so good that I have to write about it.<br /><br />I live in Hawaii and have never seen this Caribbean rice mix for sale in any store. I came across this while searching Amazon, so glad I gave it a try!.. "try it" I do not think you'll be disappointed.<br /><br />I have added the chopped up chicken as suggested but i also added small chopped onion, it is unbelievable! Another great way to make it even better is try adding fresh Mango while it is simmering or if you cant find it fresh, use canned mango nectar, substitute 1/4 cup of the water for the mango juice. Super Good!. Also instead of 2 1/2 cups water as suggested  use only 2 1/4 cups water, brings out the flavor a little bit more I think (2 cups water / 1/4 cup mango nectar).
top2:
I love this stuff:  Hard to find in the grocery. I buy it by the case online. One box makes four lunches with some blackened chicken for protein. There is som

In [33]:
# In the recorded output none of the top three reviews mentions hamburgers;
# the nearest matches are lunch-snack, rice-mix and dog-treat reviews.
results = search_reviews(df, "hamburger", n=3)

top1:
Great for HS lunch:  Great for HS lunch, kid enjoy as a snack also, will buy again. Salted chips are good too, tried them too.
top2:
I love this stuff:  Hard to find in the grocery. I buy it by the case online. One box makes four lunches with some blackened chicken for protein. There is something about the sweet pineapple and curry that makes this rice mix delicious.
top3:
Dogs love it.:  This is the "all gone" treat after dinner.  It's the only treat that the dogs work for; and I did run the chance of losing a hand.  They know a new command now:  "be gentle" when taking liver treats.
