<a href="https://colab.research.google.com/github/andrewgcodes/lightspeedEmbeddings/blob/main/demoLightspeedEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [1]:
!pip install openai
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->openai)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->openai)
  Downloadin

# Multithreading

In [2]:
# imports
import ast
import concurrent.futures
import logging
import os
import time

import numpy as np
import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
from requests.exceptions import HTTPError
from tqdm import tqdm

# Set your OpenAI API key through the OPENAI_API_KEY environment variable
# (preferred, so the secret never ends up in a saved or shared notebook),
# or replace the placeholder below.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-INSERTAPIKEY")

# embedding model parameters

# you can change the embeddings model but ada-002 is the best in quality and cost
embedding_model = "text-embedding-ada-002"

# probably don't touch this
embedding_encoding = "cl100k_base"
# built from embedding_encoding so the two settings cannot drift apart
encoding = tiktoken.get_encoding(embedding_encoding)

# Texts longer than max_tokens are either dropped (method="filter") or cut down
# to max_tokens (method="truncate") by load_data.
# 4,000 tokens is very roughly 3,000 English words.
# there is a limit of ~8,000 tokens to be embedded by the OpenAI models
# you should not exceed 8,000 here
max_tokens = 4000

def load_data(df, method='filter'):
    """Count tokens per text and enforce the ``max_tokens`` limit.

    Args:
        df: DataFrame with a ``combined`` column holding the texts to embed.
        method: ``'filter'`` drops rows whose text exceeds ``max_tokens``;
            ``'truncate'`` keeps every row and cuts over-long texts down to
            their first ``max_tokens`` tokens.

    Returns:
        A new DataFrame (the input is not modified) with an added ``n_tokens``
        column. ``n_tokens`` is the token count of the text *before* any
        truncation.

    Raises:
        ValueError: if ``method`` is neither ``'filter'`` nor ``'truncate'``.
    """
    # Validate up front so a typo fails before any tokenization work is done.
    if method not in ('filter', 'truncate'):
        raise ValueError(f'Invalid method {method}. Choose "filter" or "truncate".')

    encoding = tiktoken.get_encoding(embedding_encoding)

    # Work on a copy: the caller's frame stays untouched and later column
    # assignments do not hit pandas' SettingWithCopyWarning.
    df = df.copy()

    # Tokenize each text exactly once; both the counts and the truncation reuse it.
    token_lists = [encoding.encode(text) for text in df["combined"]]
    df["n_tokens"] = [len(tokens) for tokens in token_lists]

    if method == 'filter':
        return df[df["n_tokens"] <= max_tokens].copy()

    # method == 'truncate': only texts over the limit are rewritten; assigning a
    # plain list is positional, so it also works with a non-unique index.
    df["combined"] = [
        encoding.decode(tokens[:max_tokens]) if len(tokens) > max_tokens else text
        for text, tokens in zip(df["combined"], token_lists)
    ]
    return df

def get_embeddings(text, max_retries=5, retry_delay=30):
    """Fetch the embedding for ``text``, retrying when the API rate-limits us.

    Args:
        text: The string to embed with ``embedding_model``.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between attempts after a rate-limit error.

    Returns:
        Whatever ``get_embedding`` returns for ``text``.

    Raises:
        HTTPError: re-raised unchanged for any HTTP error other than 429.
        Exception: when every attempt was rate-limited; the last rate-limit
            error is attached as ``__cause__``.
    """
    # NOTE(review): get_embedding may already retry internally and wrap its final
    # failure in another exception type - confirm against the installed openai version.
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            return get_embedding(text, engine=embedding_model)
        except openai.error.RateLimitError as e:
            # The openai 0.x client reports HTTP 429 with its own exception class
            # rather than requests' HTTPError.
            last_error = e
        except HTTPError as e:
            # e.response can be None, so read the status code defensively.
            if getattr(e.response, "status_code", None) != 429:
                raise
            last_error = e

        # Only wait if another attempt will actually follow.
        if attempt < max_retries:
            print('Rate limit exceeded. Sleeping for a while before retrying...')
            time.sleep(retry_delay)

    raise Exception('Failed to get embedding after multiple retries') from last_error

def get_embeddings_parallel(df, n_threads=10, logfile='embeddings_errors.log'):
    """Embed every text in ``df['combined']`` using a pool of worker threads.

    Args:
        df: DataFrame with a ``combined`` column; it is modified in place.
        n_threads: Number of worker threads issuing embedding requests.
        logfile: File that receives an error entry for each text that failed.

    Returns:
        The same ``df`` with an ``embedding`` column. Each value is the
        embedding converted with ``str()``, or NaN for rows whose request
        failed.
    """
    logging.basicConfig(filename=logfile, level=logging.ERROR)
    logger = logging.getLogger()

    texts = df['combined'].tolist()
    # One slot per row, pre-filled with NaN so failed rows are marked as missing.
    embeddings = [float('nan')] * len(texts)

    with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # Track each future by row position: storing a result is then O(1) instead
        # of a full-column string comparison, and duplicate texts cannot collide.
        future_to_position = {
            executor.submit(get_embeddings, text): position
            for position, text in enumerate(texts)
        }
        for future in tqdm(concurrent.futures.as_completed(future_to_position), total=len(future_to_position), desc='Embedding texts'):
            position = future_to_position[future]
            try:
                embeddings[position] = str(future.result())
            except Exception as e:
                # Best effort: report and log the failure, then keep going.
                error_message = f'Failed to get embedding for text: {texts[position]}. Exception: {e}'
                print(error_message)
                logger.error(error_message)  # Log the error message

    # Single positional assignment; the column exists even if every request failed.
    df['embedding'] = embeddings
    return df

def process_data(input_datapath, output_datapath, logfile, method='filter'):
    """Run the whole pipeline: prepare the texts, embed them, save the result.

    Args:
        input_datapath: Handed directly to ``load_data`` as its ``df`` argument,
            so despite the name this must be a DataFrame with a ``combined``
            column, not a file path.
        output_datapath: Destination passed to ``DataFrame.to_csv``.
        logfile: Error log file forwarded to ``get_embeddings_parallel``.
        method: ``'filter'`` or ``'truncate'``; forwarded to ``load_data``.
    """
    prepared = load_data(input_datapath, method=method)
    embedded = get_embeddings_parallel(prepared, logfile=logfile)
    embedded.to_csv(output_datapath)


if __name__ == "__main__":
    # Files used by this run.
    input_datapath = "fine_food_reviews_1k.csv" #REPLACE WITH YOUR CSV FILE
    output_file_name = "fine_food_reviews_with_embeddings_1k.csv" #REPLACE WITH DESIRED OUTPUT FILE NAME
    log_file_name = "embedding_errors.log"  # Log file name

    # Everything in this chain is SPECIFIC to the Fine Food Reviews dataset;
    # adapt it to your own data. LightspeedEmbeddings itself only needs a
    # dataframe with a column called "combined" and embeds each cell of it.
    df = (
        pd.read_csv(input_datapath, index_col=0)
        .loc[:, ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
        .dropna()  # "Time" is still present here, so rows missing it are dropped too
        .assign(
            combined=lambda reviews: (
                "Title: " + reviews.Summary.str.strip() + "; Content: " + reviews.Text.str.strip()
            )
        )
        .drop(columns="Time")
    )

    # Time the embedding run.
    start_time = time.time()

    # This call does the actual embedding work.
    # method = "filter" removes texts that are longer than max_tokens;
    # method = "truncate" cuts too-long texts down to max_tokens instead.
    process_data(df, output_file_name, log_file_name, method = "truncate")

    end_time = time.time()

    # Elapsed wall-clock time in seconds.
    print(end_time-start_time)


Embedding texts: 100%|██████████| 1000/1000 [00:10<00:00, 91.49it/s]


11.78279447555542


# Sanity check the results here. Try different queries.

In [3]:
# Each embedding vector was saved as a string in one 'cell' of the CSV.
# ast.literal_eval turns that string back into a list; unlike eval it only accepts
# Python literals, so it cannot execute code that happens to be in the file.
# .apply(np.array) then converts each list into a vector.
# This step takes a while.
df = pd.read_csv('fine_food_reviews_with_embeddings_1k.csv')
# Rows whose embedding request failed have an empty cell (NaN) here; drop them,
# since they can be neither parsed nor searched.
df = df.dropna(subset=["embedding"])
df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)

## Here is code directly taken from the OpenAI Cookbook tutorial on semantic search.

[OpenAI Cookbook: Semantic text search using embeddings](https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb)

In [4]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    """Return the ``n`` reviews most similar to ``product_description``.

    Args:
        df: DataFrame with an ``embedding`` column of vectors and a ``combined``
            text column. A ``similarity`` column is added to it (or overwritten)
            as a side effect.
        product_description: Free-text query to embed and compare against.
        n: Number of top matches to return.
        pprint: When true, print the first 200 characters of each match.

    Returns:
        A Series with the ``combined`` text of the top ``n`` rows, most similar
        first, with the "Title: " / "; Content:" markers rewritten for display.
    """
    # Embed the query with the same model that produced the stored embeddings.
    product_embedding = get_embedding(
        product_description,
        engine=embedding_model
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    # regex=False: the markers are literal text, and the default for `regex`
    # differs between pandas versions.
    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .combined.str.replace("Title: ", "", regex=False)
        .str.replace("; Content:", ": ", regex=False)
    )
    if pprint:
        for r in results:
            print(r[:200])
            print()
    return results


results = search_reviews(df, "ramen", n=5)


Instant noodle with best taste and texture -- see Recall Info:  I just researched about the korean noodle recall because I love this Shin Ramyun noodle -- the best taste and texture instant noodle I h

Best cup of noodles ever!:  Tried many (from all over the world) different types of packaged quick-meal type noodles.  These are by far the best I've had.

Great stuff:  I use this to make a broth for noodles and soup. it reminds me of the days I spent in Japan. easy to use.

Fine for a microwave dinner:  The Barilla Mezze Penne with spicy marinara sauce is easy to prepare and tastes better than similar products. The sauce is not as spicy as I expected it to be but does ha



# You can download your embeddings CSV file from the Files panel on the left-hand side of Google Colab.