In [1]:
import os
from typing import List
from dotenv import load_dotenv
import numpy as np
from openai import OpenAI

# Load environment variables (via python-dotenv) before the API key is read below.
load_dotenv()

# OpenAI client used by the embedding calls in this notebook; the key comes
# from OPENAI_API_KEY and the client is configured with max_retries=5.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    max_retries=5,
)

def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:
    """Return the embedding vector for a single piece of text.

    Args:
        text: The text to embed. Newlines are turned into spaces first, since
            they can negatively affect performance.
        model: Name of the embedding model passed to the API.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.embeddings.create``.

    Returns:
        The ``embedding`` of the first item in the API response.
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(input=[single_line], model=model, **kwargs)
    return result.data[0].embedding


def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero (which would produce
    NaN together with a RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Get embeddings from dataset

This notebook gives an example of how to get embeddings from a large dataset.


## 1. Load the dataset

The dataset used in this example is [fine-food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews) from Amazon. The dataset contains a total of 568,454 food reviews Amazon users left up to October 2012. We will use a subset of this dataset, consisting of 1,000 most recent reviews for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary) and review body (Text).

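Note that the code below actually loads a product-information CSV (`Data/Product_Information_Dataset.csv`) rather than the review data described above. For each product we will combine the title, description, features, details, categories, store and main category into a single combined text. The model will encode this combined text and it will output a single vector embedding.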

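To run this notebook, you will need to install: pandas, openai, transformers, plotly, matplotlib, scikit-learn, torch (transformer dep), torchvision, and scipy. The code cells also import `dotenv` (python-dotenv), `numpy`, `tiktoken`, `boto3`, `pinecone`, and `tqdm`.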

In [2]:
import pandas as pd
import tiktoken

# Embedding model passed to get_embedding when the dataset is embedded.
embedding_model = "text-embedding-3-small"
# Name of the tiktoken encoding used to count tokens before embedding.
embedding_encoding = "cl100k_base"
# Rows whose combined text has more tokens than this are dropped before embedding.
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [34]:
# # To add an index column
# # load & inspect dataset
# input_datapath = "Data/Product_Information_Dataset.csv"  # to save space, we provide a pre-filtered dataset
# df = pd.read_csv(input_datapath)
# df.reset_index(inplace=True)
# df.to_csv("Data/Product_Information_Dataset_with_index.csv", index=False)

In [None]:
# # To replace empty values with "None"
# # load & inspect dataset
# input_datapath = "Data/Product_Information_Dataset_with_index.csv"  # to save space, we provide a pre-filtered dataset
# df = pd.read_csv(input_datapath)
# df.fillna("None", inplace=True)
# df.to_csv("Data/Product_Information_Dataset_with_index_and_no_empty_values.csv", index=False)

  df.fillna("None", inplace=True)


In [85]:
# load & inspect dataset
input_datapath = "Data/Product_Information_Dataset.csv"
df = pd.read_csv(input_datapath)

# only include these columns, which is all of the columns
# (.copy() so the column assignments below write to an independent frame
# instead of a selection of the frame returned by read_csv)
df = df[
    [
        "main_category",
        "title",
        "average_rating",
        "rating_number",
        "features",
        "description",
        "price",
        "store",
        "categories",
        "details",
        "parent_asin",
    ]
].copy()

# Text columns that are concatenated into the string that gets embedded.
text_columns = [
    "title",
    "description",
    "features",
    "details",
    "categories",
    "store",
    "main_category",
]

# Ensure all text columns are strings before concatenation so it doesn't throw
# an error. Missing values are replaced with "" first: a bare astype(str)
# would turn them into the literal text "nan" inside the embedded string.
for column in text_columns:
    df[column] = df[column].fillna("").astype(str)

# TODO experiment with less columns included
df["combined"] = (
    "Title: "
    + df["title"].str.strip()
    + "; Description: "
    + df["description"].str.strip()
    + "; Features: "
    + df["features"].str.strip()
    + "; Details: "
    + df["details"].str.strip()
    + "; Categories: "
    + df["categories"].str.strip()
    + "; Store: "
    + df["store"].str.strip()
    + "; Main Category: "
    + df["main_category"].str.strip()
)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly (display() is provided by IPython/Jupyter).
display(df.head(2))
len(df)

5000

In [86]:
# Upper bound on how many of the last rows to keep; with this value
# .tail(top_n) keeps every row that passes the token filter.
top_n = 100_000_000_000

encoding = tiktoken.get_encoding(embedding_encoding)

# Count the tokens of each combined text, then omit entries that are too long to embed.
df["n_tokens"] = df["combined"].map(lambda text: len(encoding.encode(text)))
df = df.loc[df["n_tokens"].le(max_tokens)].tail(top_n)
len(df)

5000

## 2. Get embeddings and save them for future reuse

In [87]:
# The OpenAI API key must be set in your environment, per the README:
# https://github.com/openai/openai-python#usage

# get_embedding is called once per row, so this may take a few minutes.
df["embedding"] = df["combined"].map(
    lambda combined_text: get_embedding(combined_text, model=embedding_model)
)
df.to_csv("Data/Product_Information_Dataset_with_embeddings.csv")

In [91]:
# Embed a sample product query with the same model used for the dataset.
query = get_embedding("BOYA BYM1", model=embedding_model)

## 3. Semantic search using embeddings

In [4]:
import pandas as pd
import numpy as np
from ast import literal_eval
import boto3
# boto3 S3 client used by the download helper below.
s3 = boto3.client("s3")


def download_file_from_s3(bucket_name, object_name, file_name):
    """Download ``object_name`` from ``bucket_name`` to the local path ``file_name``."""
    s3.download_file(Bucket=bucket_name, Key=object_name, Filename=file_name)

# S3 location of the CSV with embeddings, and the local path it is saved to / read from.
bucket_name = "chatbot-store-genailabs"
object_name = "Product_Information_Dataset_with_embeddings.csv"
file_name = "Data/Product_Information_Dataset_with_embeddings.csv"  # local file name
datafile_path = "Data/Product_Information_Dataset_with_embeddings.csv"

download_file_from_s3(bucket_name, object_name, file_name)

# Each embedding is read from the CSV as text; parse it back into a NumPy array.
df = pd.read_csv(datafile_path)
df["embedding"] = df["embedding"].map(lambda raw: np.array(literal_eval(raw)))

In [5]:
# Rank the rows of a DataFrame by similarity to a text query.
def search_embeddings(df, query, n=3, pprint=True):
    """Return the ``n`` rows of ``df`` most similar to ``query``.

    The query is embedded with "text-embedding-3-small" and compared against
    every entry of ``df["embedding"]`` using cosine similarity. The scores are
    written to ``df["similarities"]``, i.e. the passed-in frame is modified.

    Args:
        df: DataFrame with an ``embedding`` column.
        query: Text to search for.
        n: Number of top rows to return.
        pprint: Currently unused.

    Returns:
        The first ``n`` rows of ``df`` after sorting by descending similarity.
    """
    query_vector = get_embedding(query, model="text-embedding-3-small")

    def score(candidate):
        return cosine_similarity(candidate, query_vector)

    df["similarities"] = df["embedding"].map(score)
    ranked = df.sort_values("similarities", ascending=False)
    return ranked.head(n)

In [8]:
# Take up to 10,000 top matches for the query, drop the bulky text and vector
# columns, and save the ranking to CSV.
res = search_embeddings(df, "BOYA BYM1 Microphone", n=10000).drop(
    columns=["combined", "embedding"]
)
res.to_csv("search_results_embeddings.csv", index=False)

## Alternatively, instead of storing the embeddings in the CSV, we could use Pinecone

In [5]:
import os
import time

# Pinecone API key, read from the environment (None if the variable is unset).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [4]:
# Set up the Pinecone client and choose the index name. This cell does not
# create or delete an index itself; `pc` and `index_name` are what the
# index-management code below refers to.
from pinecone import ServerlessSpec,Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "chatbot-store"  # put in the name of your pinecone index here. When creating the index in pinecone.io, the Dimensions have to be the same as the result length.

In [6]:
# # To empty the index. DON'T RUN THIS CELL IF YOU WANT TO KEEP THE INDEX
# if index_name in pc.list_indexes().names():
#     pc.delete_index(index_name)

# pc.create_index(
#     name=index_name,
#     dimension=1536,
#     metric="cosine",
#     spec=ServerlessSpec(cloud="aws", region="us-east-1"),
# )
# while not pc.describe_index(index_name).status["ready"]:
#     time.sleep(1)

# index = pc.Index(index_name)

## Then we create a vector embedding for each text using OpenAI (as demonstrated earlier), and upsert the ID, vector embedding, and original text for each entry to Pinecone.

In [None]:
from tqdm.auto import tqdm

# Handle to the Pinecone index that receives the vectors.
index = pc.Index(index_name)

# Texts to embed and upsert: the combined text column of the DataFrame.
texts = df["combined"].tolist()

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(texts), batch_size)):
    # set end position of batch
    i_end = min(i + batch_size, len(texts))
    # get batch of lines and IDs (the ID is the position of the text in `texts`)
    lines_batch = texts[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    response = client.embeddings.create(input=lines_batch, model=embedding_model)
    embeds = [record.embedding for record in response.data]
    # prep metadata and upsert batch
    meta = [{"text": line} for line in lines_batch]
    # upsert (id, vector, metadata) tuples to Pinecone
    index.upsert(vectors=list(zip(ids_batch, embeds, meta)))